diff options
Diffstat (limited to 'drivers/md')
43 files changed, 3987 insertions, 1098 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 09c0c6e49ab5..2158377a1359 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig | |||
@@ -257,6 +257,17 @@ config DM_MIRROR | |||
257 | Allow volume managers to mirror logical volumes, also | 257 | Allow volume managers to mirror logical volumes, also |
258 | needed for live data migration tools such as 'pvmove'. | 258 | needed for live data migration tools such as 'pvmove'. |
259 | 259 | ||
260 | config DM_LOG_USERSPACE | ||
261 | tristate "Mirror userspace logging (EXPERIMENTAL)" | ||
262 | depends on DM_MIRROR && EXPERIMENTAL && NET | ||
263 | select CONNECTOR | ||
264 | ---help--- | ||
265 | The userspace logging module provides a mechanism for | ||
266 | relaying the dm-dirty-log API to userspace. Log designs | ||
267 | which are more suited to userspace implementation (e.g. | ||
268 | shared storage logs) or experimental logs can be implemented | ||
269 | by leveraging this framework. | ||
270 | |||
260 | config DM_ZERO | 271 | config DM_ZERO |
261 | tristate "Zero target" | 272 | tristate "Zero target" |
262 | depends on BLK_DEV_DM | 273 | depends on BLK_DEV_DM |
@@ -275,6 +286,25 @@ config DM_MULTIPATH | |||
275 | ---help--- | 286 | ---help--- |
276 | Allow volume managers to support multipath hardware. | 287 | Allow volume managers to support multipath hardware. |
277 | 288 | ||
289 | config DM_MULTIPATH_QL | ||
290 | tristate "I/O Path Selector based on the number of in-flight I/Os" | ||
291 | depends on DM_MULTIPATH | ||
292 | ---help--- | ||
293 | This path selector is a dynamic load balancer which selects | ||
294 | the path with the least number of in-flight I/Os. | ||
295 | |||
296 | If unsure, say N. | ||
297 | |||
298 | config DM_MULTIPATH_ST | ||
299 | tristate "I/O Path Selector based on the service time" | ||
300 | depends on DM_MULTIPATH | ||
301 | ---help--- | ||
302 | This path selector is a dynamic load balancer which selects | ||
303 | the path expected to complete the incoming I/O in the shortest | ||
304 | time. | ||
305 | |||
306 | If unsure, say N. | ||
307 | |||
278 | config DM_DELAY | 308 | config DM_DELAY |
279 | tristate "I/O delaying target (EXPERIMENTAL)" | 309 | tristate "I/O delaying target (EXPERIMENTAL)" |
280 | depends on BLK_DEV_DM && EXPERIMENTAL | 310 | depends on BLK_DEV_DM && EXPERIMENTAL |
diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 45cc5951d928..1dc4185bd781 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile | |||
@@ -8,6 +8,8 @@ dm-multipath-y += dm-path-selector.o dm-mpath.o | |||
8 | dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ | 8 | dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ |
9 | dm-snap-persistent.o | 9 | dm-snap-persistent.o |
10 | dm-mirror-y += dm-raid1.o | 10 | dm-mirror-y += dm-raid1.o |
11 | dm-log-userspace-y \ | ||
12 | += dm-log-userspace-base.o dm-log-userspace-transfer.o | ||
11 | md-mod-y += md.o bitmap.o | 13 | md-mod-y += md.o bitmap.o |
12 | raid456-y += raid5.o | 14 | raid456-y += raid5.o |
13 | raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ | 15 | raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ |
@@ -36,8 +38,11 @@ obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o | |||
36 | obj-$(CONFIG_DM_CRYPT) += dm-crypt.o | 38 | obj-$(CONFIG_DM_CRYPT) += dm-crypt.o |
37 | obj-$(CONFIG_DM_DELAY) += dm-delay.o | 39 | obj-$(CONFIG_DM_DELAY) += dm-delay.o |
38 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o | 40 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o |
41 | obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o | ||
42 | obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o | ||
39 | obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o | 43 | obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o |
40 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o | 44 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o |
45 | obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o | ||
41 | obj-$(CONFIG_DM_ZERO) += dm-zero.o | 46 | obj-$(CONFIG_DM_ZERO) += dm-zero.o |
42 | 47 | ||
43 | quiet_cmd_unroll = UNROLL $@ | 48 | quiet_cmd_unroll = UNROLL $@ |
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 56df1cee8fb3..3319c2fec28e 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c | |||
@@ -232,7 +232,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, | |||
232 | target = rdev->sb_start + offset + index * (PAGE_SIZE/512); | 232 | target = rdev->sb_start + offset + index * (PAGE_SIZE/512); |
233 | 233 | ||
234 | if (sync_page_io(rdev->bdev, target, | 234 | if (sync_page_io(rdev->bdev, target, |
235 | roundup(size, bdev_hardsect_size(rdev->bdev)), | 235 | roundup(size, bdev_logical_block_size(rdev->bdev)), |
236 | page, READ)) { | 236 | page, READ)) { |
237 | page->index = index; | 237 | page->index = index; |
238 | attach_page_buffers(page, NULL); /* so that free_buffer will | 238 | attach_page_buffers(page, NULL); /* so that free_buffer will |
@@ -287,7 +287,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | |||
287 | int size = PAGE_SIZE; | 287 | int size = PAGE_SIZE; |
288 | if (page->index == bitmap->file_pages-1) | 288 | if (page->index == bitmap->file_pages-1) |
289 | size = roundup(bitmap->last_page_size, | 289 | size = roundup(bitmap->last_page_size, |
290 | bdev_hardsect_size(rdev->bdev)); | 290 | bdev_logical_block_size(rdev->bdev)); |
291 | /* Just make sure we aren't corrupting data or | 291 | /* Just make sure we aren't corrupting data or |
292 | * metadata | 292 | * metadata |
293 | */ | 293 | */ |
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 53394e863c74..9933eb861c71 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c | |||
@@ -1132,6 +1132,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1132 | goto bad_crypt_queue; | 1132 | goto bad_crypt_queue; |
1133 | } | 1133 | } |
1134 | 1134 | ||
1135 | ti->num_flush_requests = 1; | ||
1135 | ti->private = cc; | 1136 | ti->private = cc; |
1136 | return 0; | 1137 | return 0; |
1137 | 1138 | ||
@@ -1189,6 +1190,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, | |||
1189 | union map_info *map_context) | 1190 | union map_info *map_context) |
1190 | { | 1191 | { |
1191 | struct dm_crypt_io *io; | 1192 | struct dm_crypt_io *io; |
1193 | struct crypt_config *cc; | ||
1194 | |||
1195 | if (unlikely(bio_empty_barrier(bio))) { | ||
1196 | cc = ti->private; | ||
1197 | bio->bi_bdev = cc->dev->bdev; | ||
1198 | return DM_MAPIO_REMAPPED; | ||
1199 | } | ||
1192 | 1200 | ||
1193 | io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin); | 1201 | io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin); |
1194 | 1202 | ||
@@ -1305,9 +1313,17 @@ static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | |||
1305 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | 1313 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); |
1306 | } | 1314 | } |
1307 | 1315 | ||
1316 | static int crypt_iterate_devices(struct dm_target *ti, | ||
1317 | iterate_devices_callout_fn fn, void *data) | ||
1318 | { | ||
1319 | struct crypt_config *cc = ti->private; | ||
1320 | |||
1321 | return fn(ti, cc->dev, cc->start, data); | ||
1322 | } | ||
1323 | |||
1308 | static struct target_type crypt_target = { | 1324 | static struct target_type crypt_target = { |
1309 | .name = "crypt", | 1325 | .name = "crypt", |
1310 | .version= {1, 6, 0}, | 1326 | .version = {1, 7, 0}, |
1311 | .module = THIS_MODULE, | 1327 | .module = THIS_MODULE, |
1312 | .ctr = crypt_ctr, | 1328 | .ctr = crypt_ctr, |
1313 | .dtr = crypt_dtr, | 1329 | .dtr = crypt_dtr, |
@@ -1318,6 +1334,7 @@ static struct target_type crypt_target = { | |||
1318 | .resume = crypt_resume, | 1334 | .resume = crypt_resume, |
1319 | .message = crypt_message, | 1335 | .message = crypt_message, |
1320 | .merge = crypt_merge, | 1336 | .merge = crypt_merge, |
1337 | .iterate_devices = crypt_iterate_devices, | ||
1321 | }; | 1338 | }; |
1322 | 1339 | ||
1323 | static int __init dm_crypt_init(void) | 1340 | static int __init dm_crypt_init(void) |
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 559dbb52bc85..4e5b843cd4d7 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c | |||
@@ -197,6 +197,7 @@ out: | |||
197 | mutex_init(&dc->timer_lock); | 197 | mutex_init(&dc->timer_lock); |
198 | atomic_set(&dc->may_delay, 1); | 198 | atomic_set(&dc->may_delay, 1); |
199 | 199 | ||
200 | ti->num_flush_requests = 1; | ||
200 | ti->private = dc; | 201 | ti->private = dc; |
201 | return 0; | 202 | return 0; |
202 | 203 | ||
@@ -278,8 +279,9 @@ static int delay_map(struct dm_target *ti, struct bio *bio, | |||
278 | 279 | ||
279 | if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { | 280 | if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { |
280 | bio->bi_bdev = dc->dev_write->bdev; | 281 | bio->bi_bdev = dc->dev_write->bdev; |
281 | bio->bi_sector = dc->start_write + | 282 | if (bio_sectors(bio)) |
282 | (bio->bi_sector - ti->begin); | 283 | bio->bi_sector = dc->start_write + |
284 | (bio->bi_sector - ti->begin); | ||
283 | 285 | ||
284 | return delay_bio(dc, dc->write_delay, bio); | 286 | return delay_bio(dc, dc->write_delay, bio); |
285 | } | 287 | } |
@@ -316,9 +318,26 @@ static int delay_status(struct dm_target *ti, status_type_t type, | |||
316 | return 0; | 318 | return 0; |
317 | } | 319 | } |
318 | 320 | ||
321 | static int delay_iterate_devices(struct dm_target *ti, | ||
322 | iterate_devices_callout_fn fn, void *data) | ||
323 | { | ||
324 | struct delay_c *dc = ti->private; | ||
325 | int ret = 0; | ||
326 | |||
327 | ret = fn(ti, dc->dev_read, dc->start_read, data); | ||
328 | if (ret) | ||
329 | goto out; | ||
330 | |||
331 | if (dc->dev_write) | ||
332 | ret = fn(ti, dc->dev_write, dc->start_write, data); | ||
333 | |||
334 | out: | ||
335 | return ret; | ||
336 | } | ||
337 | |||
319 | static struct target_type delay_target = { | 338 | static struct target_type delay_target = { |
320 | .name = "delay", | 339 | .name = "delay", |
321 | .version = {1, 0, 2}, | 340 | .version = {1, 1, 0}, |
322 | .module = THIS_MODULE, | 341 | .module = THIS_MODULE, |
323 | .ctr = delay_ctr, | 342 | .ctr = delay_ctr, |
324 | .dtr = delay_dtr, | 343 | .dtr = delay_dtr, |
@@ -326,6 +345,7 @@ static struct target_type delay_target = { | |||
326 | .presuspend = delay_presuspend, | 345 | .presuspend = delay_presuspend, |
327 | .resume = delay_resume, | 346 | .resume = delay_resume, |
328 | .status = delay_status, | 347 | .status = delay_status, |
348 | .iterate_devices = delay_iterate_devices, | ||
329 | }; | 349 | }; |
330 | 350 | ||
331 | static int __init dm_delay_init(void) | 351 | static int __init dm_delay_init(void) |
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index a2e26c242141..c3ae51584b12 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c | |||
@@ -178,7 +178,7 @@ static int set_chunk_size(struct dm_exception_store *store, | |||
178 | } | 178 | } |
179 | 179 | ||
180 | /* Validate the chunk size against the device block size */ | 180 | /* Validate the chunk size against the device block size */ |
181 | if (chunk_size_ulong % (bdev_hardsect_size(store->cow->bdev) >> 9)) { | 181 | if (chunk_size_ulong % (bdev_logical_block_size(store->cow->bdev) >> 9)) { |
182 | *error = "Chunk size is not a multiple of device blocksize"; | 182 | *error = "Chunk size is not a multiple of device blocksize"; |
183 | return -EINVAL; | 183 | return -EINVAL; |
184 | } | 184 | } |
@@ -216,7 +216,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, | |||
216 | return -EINVAL; | 216 | return -EINVAL; |
217 | } | 217 | } |
218 | 218 | ||
219 | type = get_type(argv[1]); | 219 | type = get_type(&persistent); |
220 | if (!type) { | 220 | if (!type) { |
221 | ti->error = "Exception store type not recognised"; | 221 | ti->error = "Exception store type not recognised"; |
222 | r = -EINVAL; | 222 | r = -EINVAL; |
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index 0a2e6e7f67b3..2442c8c07898 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h | |||
@@ -111,7 +111,7 @@ struct dm_exception_store { | |||
111 | /* | 111 | /* |
112 | * Functions to manipulate consecutive chunks | 112 | * Functions to manipulate consecutive chunks |
113 | */ | 113 | */ |
114 | # if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) | 114 | # if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) |
115 | # define DM_CHUNK_CONSECUTIVE_BITS 8 | 115 | # define DM_CHUNK_CONSECUTIVE_BITS 8 |
116 | # define DM_CHUNK_NUMBER_BITS 56 | 116 | # define DM_CHUNK_NUMBER_BITS 56 |
117 | 117 | ||
@@ -156,7 +156,7 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) | |||
156 | */ | 156 | */ |
157 | static inline sector_t get_dev_size(struct block_device *bdev) | 157 | static inline sector_t get_dev_size(struct block_device *bdev) |
158 | { | 158 | { |
159 | return bdev->bd_inode->i_size >> SECTOR_SHIFT; | 159 | return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; |
160 | } | 160 | } |
161 | 161 | ||
162 | static inline chunk_t sector_to_chunk(struct dm_exception_store *store, | 162 | static inline chunk_t sector_to_chunk(struct dm_exception_store *store, |
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index e73aabd61cd7..3a2e6a2f8bdd 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c | |||
@@ -22,6 +22,7 @@ struct dm_io_client { | |||
22 | /* FIXME: can we shrink this ? */ | 22 | /* FIXME: can we shrink this ? */ |
23 | struct io { | 23 | struct io { |
24 | unsigned long error_bits; | 24 | unsigned long error_bits; |
25 | unsigned long eopnotsupp_bits; | ||
25 | atomic_t count; | 26 | atomic_t count; |
26 | struct task_struct *sleeper; | 27 | struct task_struct *sleeper; |
27 | struct dm_io_client *client; | 28 | struct dm_io_client *client; |
@@ -107,8 +108,11 @@ static inline unsigned bio_get_region(struct bio *bio) | |||
107 | *---------------------------------------------------------------*/ | 108 | *---------------------------------------------------------------*/ |
108 | static void dec_count(struct io *io, unsigned int region, int error) | 109 | static void dec_count(struct io *io, unsigned int region, int error) |
109 | { | 110 | { |
110 | if (error) | 111 | if (error) { |
111 | set_bit(region, &io->error_bits); | 112 | set_bit(region, &io->error_bits); |
113 | if (error == -EOPNOTSUPP) | ||
114 | set_bit(region, &io->eopnotsupp_bits); | ||
115 | } | ||
112 | 116 | ||
113 | if (atomic_dec_and_test(&io->count)) { | 117 | if (atomic_dec_and_test(&io->count)) { |
114 | if (io->sleeper) | 118 | if (io->sleeper) |
@@ -360,7 +364,9 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, | |||
360 | return -EIO; | 364 | return -EIO; |
361 | } | 365 | } |
362 | 366 | ||
367 | retry: | ||
363 | io.error_bits = 0; | 368 | io.error_bits = 0; |
369 | io.eopnotsupp_bits = 0; | ||
364 | atomic_set(&io.count, 1); /* see dispatch_io() */ | 370 | atomic_set(&io.count, 1); /* see dispatch_io() */ |
365 | io.sleeper = current; | 371 | io.sleeper = current; |
366 | io.client = client; | 372 | io.client = client; |
@@ -377,6 +383,11 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, | |||
377 | } | 383 | } |
378 | set_current_state(TASK_RUNNING); | 384 | set_current_state(TASK_RUNNING); |
379 | 385 | ||
386 | if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) { | ||
387 | rw &= ~(1 << BIO_RW_BARRIER); | ||
388 | goto retry; | ||
389 | } | ||
390 | |||
380 | if (error_bits) | 391 | if (error_bits) |
381 | *error_bits = io.error_bits; | 392 | *error_bits = io.error_bits; |
382 | 393 | ||
@@ -397,6 +408,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, | |||
397 | 408 | ||
398 | io = mempool_alloc(client->pool, GFP_NOIO); | 409 | io = mempool_alloc(client->pool, GFP_NOIO); |
399 | io->error_bits = 0; | 410 | io->error_bits = 0; |
411 | io->eopnotsupp_bits = 0; | ||
400 | atomic_set(&io->count, 1); /* see dispatch_io() */ | 412 | atomic_set(&io->count, 1); /* see dispatch_io() */ |
401 | io->sleeper = NULL; | 413 | io->sleeper = NULL; |
402 | io->client = client; | 414 | io->client = client; |
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 823ceba6efa8..7f77f18fcafa 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c | |||
@@ -276,7 +276,7 @@ retry: | |||
276 | up_write(&_hash_lock); | 276 | up_write(&_hash_lock); |
277 | } | 277 | } |
278 | 278 | ||
279 | static int dm_hash_rename(const char *old, const char *new) | 279 | static int dm_hash_rename(uint32_t cookie, const char *old, const char *new) |
280 | { | 280 | { |
281 | char *new_name, *old_name; | 281 | char *new_name, *old_name; |
282 | struct hash_cell *hc; | 282 | struct hash_cell *hc; |
@@ -333,7 +333,7 @@ static int dm_hash_rename(const char *old, const char *new) | |||
333 | dm_table_put(table); | 333 | dm_table_put(table); |
334 | } | 334 | } |
335 | 335 | ||
336 | dm_kobject_uevent(hc->md); | 336 | dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie); |
337 | 337 | ||
338 | dm_put(hc->md); | 338 | dm_put(hc->md); |
339 | up_write(&_hash_lock); | 339 | up_write(&_hash_lock); |
@@ -680,6 +680,9 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) | |||
680 | 680 | ||
681 | __hash_remove(hc); | 681 | __hash_remove(hc); |
682 | up_write(&_hash_lock); | 682 | up_write(&_hash_lock); |
683 | |||
684 | dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr); | ||
685 | |||
683 | dm_put(md); | 686 | dm_put(md); |
684 | param->data_size = 0; | 687 | param->data_size = 0; |
685 | return 0; | 688 | return 0; |
@@ -715,7 +718,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size) | |||
715 | return r; | 718 | return r; |
716 | 719 | ||
717 | param->data_size = 0; | 720 | param->data_size = 0; |
718 | return dm_hash_rename(param->name, new_name); | 721 | return dm_hash_rename(param->event_nr, param->name, new_name); |
719 | } | 722 | } |
720 | 723 | ||
721 | static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) | 724 | static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) |
@@ -842,8 +845,11 @@ static int do_resume(struct dm_ioctl *param) | |||
842 | if (dm_suspended(md)) | 845 | if (dm_suspended(md)) |
843 | r = dm_resume(md); | 846 | r = dm_resume(md); |
844 | 847 | ||
845 | if (!r) | 848 | |
849 | if (!r) { | ||
850 | dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr); | ||
846 | r = __dev_status(md, param); | 851 | r = __dev_status(md, param); |
852 | } | ||
847 | 853 | ||
848 | dm_put(md); | 854 | dm_put(md); |
849 | return r; | 855 | return r; |
@@ -1044,6 +1050,12 @@ static int populate_table(struct dm_table *table, | |||
1044 | next = spec->next; | 1050 | next = spec->next; |
1045 | } | 1051 | } |
1046 | 1052 | ||
1053 | r = dm_table_set_type(table); | ||
1054 | if (r) { | ||
1055 | DMWARN("unable to set table type"); | ||
1056 | return r; | ||
1057 | } | ||
1058 | |||
1047 | return dm_table_complete(table); | 1059 | return dm_table_complete(table); |
1048 | } | 1060 | } |
1049 | 1061 | ||
@@ -1089,6 +1101,13 @@ static int table_load(struct dm_ioctl *param, size_t param_size) | |||
1089 | goto out; | 1101 | goto out; |
1090 | } | 1102 | } |
1091 | 1103 | ||
1104 | r = dm_table_alloc_md_mempools(t); | ||
1105 | if (r) { | ||
1106 | DMWARN("unable to allocate mempools for this table"); | ||
1107 | dm_table_destroy(t); | ||
1108 | goto out; | ||
1109 | } | ||
1110 | |||
1092 | down_write(&_hash_lock); | 1111 | down_write(&_hash_lock); |
1093 | hc = dm_get_mdptr(md); | 1112 | hc = dm_get_mdptr(md); |
1094 | if (!hc || hc->md != md) { | 1113 | if (!hc || hc->md != md) { |
@@ -1513,6 +1532,7 @@ static const struct file_operations _ctl_fops = { | |||
1513 | static struct miscdevice _dm_misc = { | 1532 | static struct miscdevice _dm_misc = { |
1514 | .minor = MISC_DYNAMIC_MINOR, | 1533 | .minor = MISC_DYNAMIC_MINOR, |
1515 | .name = DM_NAME, | 1534 | .name = DM_NAME, |
1535 | .devnode = "mapper/control", | ||
1516 | .fops = &_ctl_fops | 1536 | .fops = &_ctl_fops |
1517 | }; | 1537 | }; |
1518 | 1538 | ||
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 79fb53e51c70..9184b6deb868 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c | |||
@@ -53,6 +53,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
53 | goto bad; | 53 | goto bad; |
54 | } | 54 | } |
55 | 55 | ||
56 | ti->num_flush_requests = 1; | ||
56 | ti->private = lc; | 57 | ti->private = lc; |
57 | return 0; | 58 | return 0; |
58 | 59 | ||
@@ -81,7 +82,8 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio) | |||
81 | struct linear_c *lc = ti->private; | 82 | struct linear_c *lc = ti->private; |
82 | 83 | ||
83 | bio->bi_bdev = lc->dev->bdev; | 84 | bio->bi_bdev = lc->dev->bdev; |
84 | bio->bi_sector = linear_map_sector(ti, bio->bi_sector); | 85 | if (bio_sectors(bio)) |
86 | bio->bi_sector = linear_map_sector(ti, bio->bi_sector); | ||
85 | } | 87 | } |
86 | 88 | ||
87 | static int linear_map(struct dm_target *ti, struct bio *bio, | 89 | static int linear_map(struct dm_target *ti, struct bio *bio, |
@@ -132,9 +134,17 @@ static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | |||
132 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | 134 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); |
133 | } | 135 | } |
134 | 136 | ||
137 | static int linear_iterate_devices(struct dm_target *ti, | ||
138 | iterate_devices_callout_fn fn, void *data) | ||
139 | { | ||
140 | struct linear_c *lc = ti->private; | ||
141 | |||
142 | return fn(ti, lc->dev, lc->start, data); | ||
143 | } | ||
144 | |||
135 | static struct target_type linear_target = { | 145 | static struct target_type linear_target = { |
136 | .name = "linear", | 146 | .name = "linear", |
137 | .version= {1, 0, 3}, | 147 | .version = {1, 1, 0}, |
138 | .module = THIS_MODULE, | 148 | .module = THIS_MODULE, |
139 | .ctr = linear_ctr, | 149 | .ctr = linear_ctr, |
140 | .dtr = linear_dtr, | 150 | .dtr = linear_dtr, |
@@ -142,6 +152,7 @@ static struct target_type linear_target = { | |||
142 | .status = linear_status, | 152 | .status = linear_status, |
143 | .ioctl = linear_ioctl, | 153 | .ioctl = linear_ioctl, |
144 | .merge = linear_merge, | 154 | .merge = linear_merge, |
155 | .iterate_devices = linear_iterate_devices, | ||
145 | }; | 156 | }; |
146 | 157 | ||
147 | int __init dm_linear_init(void) | 158 | int __init dm_linear_init(void) |
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c new file mode 100644 index 000000000000..e69b96560997 --- /dev/null +++ b/drivers/md/dm-log-userspace-base.c | |||
@@ -0,0 +1,696 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2006-2009 Red Hat, Inc. | ||
3 | * | ||
4 | * This file is released under the LGPL. | ||
5 | */ | ||
6 | |||
7 | #include <linux/bio.h> | ||
8 | #include <linux/dm-dirty-log.h> | ||
9 | #include <linux/device-mapper.h> | ||
10 | #include <linux/dm-log-userspace.h> | ||
11 | |||
12 | #include "dm-log-userspace-transfer.h" | ||
13 | |||
14 | struct flush_entry { | ||
15 | int type; | ||
16 | region_t region; | ||
17 | struct list_head list; | ||
18 | }; | ||
19 | |||
20 | struct log_c { | ||
21 | struct dm_target *ti; | ||
22 | uint32_t region_size; | ||
23 | region_t region_count; | ||
24 | char uuid[DM_UUID_LEN]; | ||
25 | |||
26 | char *usr_argv_str; | ||
27 | uint32_t usr_argc; | ||
28 | |||
29 | /* | ||
30 | * in_sync_hint gets set when doing is_remote_recovering. It | ||
31 | * represents the first region that needs recovery. IOW, the | ||
32 | * first zero bit of sync_bits. This can be useful to limit | ||
33 | * traffic for calls like is_remote_recovering and get_resync_work, | ||
34 | * but take care in its use for anything else. | ||
35 | */ | ||
36 | uint64_t in_sync_hint; | ||
37 | |||
38 | spinlock_t flush_lock; | ||
39 | struct list_head flush_list; /* only for clear and mark requests */ | ||
40 | }; | ||
41 | |||
42 | static mempool_t *flush_entry_pool; | ||
43 | |||
44 | static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data) | ||
45 | { | ||
46 | return kmalloc(sizeof(struct flush_entry), gfp_mask); | ||
47 | } | ||
48 | |||
49 | static void flush_entry_free(void *element, void *pool_data) | ||
50 | { | ||
51 | kfree(element); | ||
52 | } | ||
53 | |||
54 | static int userspace_do_request(struct log_c *lc, const char *uuid, | ||
55 | int request_type, char *data, size_t data_size, | ||
56 | char *rdata, size_t *rdata_size) | ||
57 | { | ||
58 | int r; | ||
59 | |||
60 | /* | ||
61 | * If the server isn't there, -ESRCH is returned, | ||
62 | * and we must keep trying until the server is | ||
63 | * restored. | ||
64 | */ | ||
65 | retry: | ||
66 | r = dm_consult_userspace(uuid, request_type, data, | ||
67 | data_size, rdata, rdata_size); | ||
68 | |||
69 | if (r != -ESRCH) | ||
70 | return r; | ||
71 | |||
72 | DMERR(" Userspace log server not found."); | ||
73 | while (1) { | ||
74 | set_current_state(TASK_INTERRUPTIBLE); | ||
75 | schedule_timeout(2*HZ); | ||
76 | DMWARN("Attempting to contact userspace log server..."); | ||
77 | r = dm_consult_userspace(uuid, DM_ULOG_CTR, lc->usr_argv_str, | ||
78 | strlen(lc->usr_argv_str) + 1, | ||
79 | NULL, NULL); | ||
80 | if (!r) | ||
81 | break; | ||
82 | } | ||
83 | DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete"); | ||
84 | r = dm_consult_userspace(uuid, DM_ULOG_RESUME, NULL, | ||
85 | 0, NULL, NULL); | ||
86 | if (!r) | ||
87 | goto retry; | ||
88 | |||
89 | DMERR("Error trying to resume userspace log: %d", r); | ||
90 | |||
91 | return -ESRCH; | ||
92 | } | ||
93 | |||
94 | static int build_constructor_string(struct dm_target *ti, | ||
95 | unsigned argc, char **argv, | ||
96 | char **ctr_str) | ||
97 | { | ||
98 | int i, str_size; | ||
99 | char *str = NULL; | ||
100 | |||
101 | *ctr_str = NULL; | ||
102 | |||
103 | for (i = 0, str_size = 0; i < argc; i++) | ||
104 | str_size += strlen(argv[i]) + 1; /* +1 for space between args */ | ||
105 | |||
106 | str_size += 20; /* Max number of chars in a printed u64 number */ | ||
107 | |||
108 | str = kzalloc(str_size, GFP_KERNEL); | ||
109 | if (!str) { | ||
110 | DMWARN("Unable to allocate memory for constructor string"); | ||
111 | return -ENOMEM; | ||
112 | } | ||
113 | |||
114 | for (i = 0, str_size = 0; i < argc; i++) | ||
115 | str_size += sprintf(str + str_size, "%s ", argv[i]); | ||
116 | str_size += sprintf(str + str_size, "%llu", | ||
117 | (unsigned long long)ti->len); | ||
118 | |||
119 | *ctr_str = str; | ||
120 | return str_size; | ||
121 | } | ||
122 | |||
123 | /* | ||
124 | * userspace_ctr | ||
125 | * | ||
126 | * argv contains: | ||
127 | * <UUID> <other args> | ||
128 | * Where 'other args' is the userspace implementation specific log | ||
129 | * arguments. An example might be: | ||
130 | * <UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync] | ||
131 | * | ||
132 | * So, this module will strip off the <UUID> for identification purposes | ||
133 | * when communicating with userspace about a log; but will pass on everything | ||
134 | * else. | ||
135 | */ | ||
136 | static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, | ||
137 | unsigned argc, char **argv) | ||
138 | { | ||
139 | int r = 0; | ||
140 | int str_size; | ||
141 | char *ctr_str = NULL; | ||
142 | struct log_c *lc = NULL; | ||
143 | uint64_t rdata; | ||
144 | size_t rdata_size = sizeof(rdata); | ||
145 | |||
146 | if (argc < 3) { | ||
147 | DMWARN("Too few arguments to userspace dirty log"); | ||
148 | return -EINVAL; | ||
149 | } | ||
150 | |||
151 | lc = kmalloc(sizeof(*lc), GFP_KERNEL); | ||
152 | if (!lc) { | ||
153 | DMWARN("Unable to allocate userspace log context."); | ||
154 | return -ENOMEM; | ||
155 | } | ||
156 | |||
157 | lc->ti = ti; | ||
158 | |||
159 | if (strlen(argv[0]) > (DM_UUID_LEN - 1)) { | ||
160 | DMWARN("UUID argument too long."); | ||
161 | kfree(lc); | ||
162 | return -EINVAL; | ||
163 | } | ||
164 | |||
165 | strncpy(lc->uuid, argv[0], DM_UUID_LEN); | ||
166 | spin_lock_init(&lc->flush_lock); | ||
167 | INIT_LIST_HEAD(&lc->flush_list); | ||
168 | |||
169 | str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); | ||
170 | if (str_size < 0) { | ||
171 | kfree(lc); | ||
172 | return str_size; | ||
173 | } | ||
174 | |||
175 | /* Send table string */ | ||
176 | r = dm_consult_userspace(lc->uuid, DM_ULOG_CTR, | ||
177 | ctr_str, str_size, NULL, NULL); | ||
178 | |||
179 | if (r == -ESRCH) { | ||
180 | DMERR("Userspace log server not found"); | ||
181 | goto out; | ||
182 | } | ||
183 | |||
184 | /* Since the region size does not change, get it now */ | ||
185 | rdata_size = sizeof(rdata); | ||
186 | r = dm_consult_userspace(lc->uuid, DM_ULOG_GET_REGION_SIZE, | ||
187 | NULL, 0, (char *)&rdata, &rdata_size); | ||
188 | |||
189 | if (r) { | ||
190 | DMERR("Failed to get region size of dirty log"); | ||
191 | goto out; | ||
192 | } | ||
193 | |||
194 | lc->region_size = (uint32_t)rdata; | ||
195 | lc->region_count = dm_sector_div_up(ti->len, lc->region_size); | ||
196 | |||
197 | out: | ||
198 | if (r) { | ||
199 | kfree(lc); | ||
200 | kfree(ctr_str); | ||
201 | } else { | ||
202 | lc->usr_argv_str = ctr_str; | ||
203 | lc->usr_argc = argc; | ||
204 | log->context = lc; | ||
205 | } | ||
206 | |||
207 | return r; | ||
208 | } | ||
209 | |||
210 | static void userspace_dtr(struct dm_dirty_log *log) | ||
211 | { | ||
212 | int r; | ||
213 | struct log_c *lc = log->context; | ||
214 | |||
215 | r = dm_consult_userspace(lc->uuid, DM_ULOG_DTR, | ||
216 | NULL, 0, | ||
217 | NULL, NULL); | ||
218 | |||
219 | kfree(lc->usr_argv_str); | ||
220 | kfree(lc); | ||
221 | |||
222 | return; | ||
223 | } | ||
224 | |||
225 | static int userspace_presuspend(struct dm_dirty_log *log) | ||
226 | { | ||
227 | int r; | ||
228 | struct log_c *lc = log->context; | ||
229 | |||
230 | r = dm_consult_userspace(lc->uuid, DM_ULOG_PRESUSPEND, | ||
231 | NULL, 0, | ||
232 | NULL, NULL); | ||
233 | |||
234 | return r; | ||
235 | } | ||
236 | |||
237 | static int userspace_postsuspend(struct dm_dirty_log *log) | ||
238 | { | ||
239 | int r; | ||
240 | struct log_c *lc = log->context; | ||
241 | |||
242 | r = dm_consult_userspace(lc->uuid, DM_ULOG_POSTSUSPEND, | ||
243 | NULL, 0, | ||
244 | NULL, NULL); | ||
245 | |||
246 | return r; | ||
247 | } | ||
248 | |||
249 | static int userspace_resume(struct dm_dirty_log *log) | ||
250 | { | ||
251 | int r; | ||
252 | struct log_c *lc = log->context; | ||
253 | |||
254 | lc->in_sync_hint = 0; | ||
255 | r = dm_consult_userspace(lc->uuid, DM_ULOG_RESUME, | ||
256 | NULL, 0, | ||
257 | NULL, NULL); | ||
258 | |||
259 | return r; | ||
260 | } | ||
261 | |||
262 | static uint32_t userspace_get_region_size(struct dm_dirty_log *log) | ||
263 | { | ||
264 | struct log_c *lc = log->context; | ||
265 | |||
266 | return lc->region_size; | ||
267 | } | ||
268 | |||
269 | /* | ||
270 | * userspace_is_clean | ||
271 | * | ||
272 | * Check whether a region is clean. If there is any sort of | ||
273 | * failure when consulting the server, we return not clean. | ||
274 | * | ||
275 | * Returns: 1 if clean, 0 otherwise | ||
276 | */ | ||
277 | static int userspace_is_clean(struct dm_dirty_log *log, region_t region) | ||
278 | { | ||
279 | int r; | ||
280 | uint64_t region64 = (uint64_t)region; | ||
281 | int64_t is_clean; | ||
282 | size_t rdata_size; | ||
283 | struct log_c *lc = log->context; | ||
284 | |||
285 | rdata_size = sizeof(is_clean); | ||
286 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN, | ||
287 | (char *)&region64, sizeof(region64), | ||
288 | (char *)&is_clean, &rdata_size); | ||
289 | |||
290 | return (r) ? 0 : (int)is_clean; | ||
291 | } | ||
292 | |||
293 | /* | ||
294 | * userspace_in_sync | ||
295 | * | ||
296 | * Check if the region is in-sync. If there is any sort | ||
297 | * of failure when consulting the server, we assume that | ||
298 | * the region is not in sync. | ||
299 | * | ||
300 | * If 'can_block' is set, return immediately | ||
301 | * | ||
302 | * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK | ||
303 | */ | ||
304 | static int userspace_in_sync(struct dm_dirty_log *log, region_t region, | ||
305 | int can_block) | ||
306 | { | ||
307 | int r; | ||
308 | uint64_t region64 = region; | ||
309 | int64_t in_sync; | ||
310 | size_t rdata_size; | ||
311 | struct log_c *lc = log->context; | ||
312 | |||
313 | /* | ||
314 | * We can never respond directly - even if in_sync_hint is | ||
315 | * set. This is because another machine could see a device | ||
316 | * failure and mark the region out-of-sync. If we don't go | ||
317 | * to userspace to ask, we might think the region is in-sync | ||
318 | * and allow a read to pick up data that is stale. (This is | ||
319 | * very unlikely if a device actually fails; but it is very | ||
320 | * likely if a connection to one device from one machine fails.) | ||
321 | * | ||
322 | * There still might be a problem if the mirror caches the region | ||
323 | * state as in-sync... but then this call would not be made. So, | ||
324 | * that is a mirror problem. | ||
325 | */ | ||
326 | if (!can_block) | ||
327 | return -EWOULDBLOCK; | ||
328 | |||
329 | rdata_size = sizeof(in_sync); | ||
330 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC, | ||
331 | (char *)®ion64, sizeof(region64), | ||
332 | (char *)&in_sync, &rdata_size); | ||
333 | return (r) ? 0 : (int)in_sync; | ||
334 | } | ||
335 | |||
336 | /* | ||
337 | * userspace_flush | ||
338 | * | ||
339 | * This function is ok to block. | ||
340 | * The flush happens in two stages. First, it sends all | ||
341 | * clear/mark requests that are on the list. Then it | ||
342 | * tells the server to commit them. This gives the | ||
343 | * server a chance to optimise the commit, instead of | ||
344 | * doing it for every request. | ||
345 | * | ||
346 | * Additionally, we could implement another thread that | ||
347 | * sends the requests up to the server - reducing the | ||
348 | * load on flush. Then the flush would have less in | ||
349 | * the list and be responsible for the finishing commit. | ||
350 | * | ||
351 | * Returns: 0 on success, < 0 on failure | ||
352 | */ | ||
353 | static int userspace_flush(struct dm_dirty_log *log) | ||
354 | { | ||
355 | int r = 0; | ||
356 | unsigned long flags; | ||
357 | struct log_c *lc = log->context; | ||
358 | LIST_HEAD(flush_list); | ||
359 | struct flush_entry *fe, *tmp_fe; | ||
360 | |||
361 | spin_lock_irqsave(&lc->flush_lock, flags); | ||
362 | list_splice_init(&lc->flush_list, &flush_list); | ||
363 | spin_unlock_irqrestore(&lc->flush_lock, flags); | ||
364 | |||
365 | if (list_empty(&flush_list)) | ||
366 | return 0; | ||
367 | |||
368 | /* | ||
369 | * FIXME: Count up requests, group request types, | ||
370 | * allocate memory to stick all requests in and | ||
371 | * send to server in one go. Failing the allocation, | ||
372 | * do it one by one. | ||
373 | */ | ||
374 | |||
375 | list_for_each_entry(fe, &flush_list, list) { | ||
376 | r = userspace_do_request(lc, lc->uuid, fe->type, | ||
377 | (char *)&fe->region, | ||
378 | sizeof(fe->region), | ||
379 | NULL, NULL); | ||
380 | if (r) | ||
381 | goto fail; | ||
382 | } | ||
383 | |||
384 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, | ||
385 | NULL, 0, NULL, NULL); | ||
386 | |||
387 | fail: | ||
388 | /* | ||
389 | * We can safely remove these entries, even if failure. | ||
390 | * Calling code will receive an error and will know that | ||
391 | * the log facility has failed. | ||
392 | */ | ||
393 | list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) { | ||
394 | list_del(&fe->list); | ||
395 | mempool_free(fe, flush_entry_pool); | ||
396 | } | ||
397 | |||
398 | if (r) | ||
399 | dm_table_event(lc->ti->table); | ||
400 | |||
401 | return r; | ||
402 | } | ||
403 | |||
404 | /* | ||
405 | * userspace_mark_region | ||
406 | * | ||
407 | * This function should avoid blocking unless absolutely required. | ||
408 | * (Memory allocation is valid for blocking.) | ||
409 | */ | ||
410 | static void userspace_mark_region(struct dm_dirty_log *log, region_t region) | ||
411 | { | ||
412 | unsigned long flags; | ||
413 | struct log_c *lc = log->context; | ||
414 | struct flush_entry *fe; | ||
415 | |||
416 | /* Wait for an allocation, but _never_ fail */ | ||
417 | fe = mempool_alloc(flush_entry_pool, GFP_NOIO); | ||
418 | BUG_ON(!fe); | ||
419 | |||
420 | spin_lock_irqsave(&lc->flush_lock, flags); | ||
421 | fe->type = DM_ULOG_MARK_REGION; | ||
422 | fe->region = region; | ||
423 | list_add(&fe->list, &lc->flush_list); | ||
424 | spin_unlock_irqrestore(&lc->flush_lock, flags); | ||
425 | |||
426 | return; | ||
427 | } | ||
428 | |||
429 | /* | ||
430 | * userspace_clear_region | ||
431 | * | ||
432 | * This function must not block. | ||
433 | * So, the alloc can't block. In the worst case, it is ok to | ||
434 | * fail. It would simply mean we can't clear the region. | ||
435 | * Does nothing to current sync context, but does mean | ||
436 | * the region will be re-sync'ed on a reload of the mirror | ||
437 | * even though it is in-sync. | ||
438 | */ | ||
439 | static void userspace_clear_region(struct dm_dirty_log *log, region_t region) | ||
440 | { | ||
441 | unsigned long flags; | ||
442 | struct log_c *lc = log->context; | ||
443 | struct flush_entry *fe; | ||
444 | |||
445 | /* | ||
446 | * If we fail to allocate, we skip the clearing of | ||
447 | * the region. This doesn't hurt us in any way, except | ||
448 | * to cause the region to be resync'ed when the | ||
449 | * device is activated next time. | ||
450 | */ | ||
451 | fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC); | ||
452 | if (!fe) { | ||
453 | DMERR("Failed to allocate memory to clear region."); | ||
454 | return; | ||
455 | } | ||
456 | |||
457 | spin_lock_irqsave(&lc->flush_lock, flags); | ||
458 | fe->type = DM_ULOG_CLEAR_REGION; | ||
459 | fe->region = region; | ||
460 | list_add(&fe->list, &lc->flush_list); | ||
461 | spin_unlock_irqrestore(&lc->flush_lock, flags); | ||
462 | |||
463 | return; | ||
464 | } | ||
465 | |||
466 | /* | ||
467 | * userspace_get_resync_work | ||
468 | * | ||
469 | * Get a region that needs recovery. It is valid to return | ||
470 | * an error for this function. | ||
471 | * | ||
472 | * Returns: 1 if region filled, 0 if no work, <0 on error | ||
473 | */ | ||
474 | static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region) | ||
475 | { | ||
476 | int r; | ||
477 | size_t rdata_size; | ||
478 | struct log_c *lc = log->context; | ||
479 | struct { | ||
480 | int64_t i; /* 64-bit for mix arch compatibility */ | ||
481 | region_t r; | ||
482 | } pkg; | ||
483 | |||
484 | if (lc->in_sync_hint >= lc->region_count) | ||
485 | return 0; | ||
486 | |||
487 | rdata_size = sizeof(pkg); | ||
488 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK, | ||
489 | NULL, 0, | ||
490 | (char *)&pkg, &rdata_size); | ||
491 | |||
492 | *region = pkg.r; | ||
493 | return (r) ? r : (int)pkg.i; | ||
494 | } | ||
495 | |||
496 | /* | ||
497 | * userspace_set_region_sync | ||
498 | * | ||
499 | * Set the sync status of a given region. This function | ||
500 | * must not fail. | ||
501 | */ | ||
502 | static void userspace_set_region_sync(struct dm_dirty_log *log, | ||
503 | region_t region, int in_sync) | ||
504 | { | ||
505 | int r; | ||
506 | struct log_c *lc = log->context; | ||
507 | struct { | ||
508 | region_t r; | ||
509 | int64_t i; | ||
510 | } pkg; | ||
511 | |||
512 | pkg.r = region; | ||
513 | pkg.i = (int64_t)in_sync; | ||
514 | |||
515 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, | ||
516 | (char *)&pkg, sizeof(pkg), | ||
517 | NULL, NULL); | ||
518 | |||
519 | /* | ||
520 | * It would be nice to be able to report failures. | ||
521 | * However, it is easy emough to detect and resolve. | ||
522 | */ | ||
523 | return; | ||
524 | } | ||
525 | |||
526 | /* | ||
527 | * userspace_get_sync_count | ||
528 | * | ||
529 | * If there is any sort of failure when consulting the server, | ||
530 | * we assume that the sync count is zero. | ||
531 | * | ||
532 | * Returns: sync count on success, 0 on failure | ||
533 | */ | ||
534 | static region_t userspace_get_sync_count(struct dm_dirty_log *log) | ||
535 | { | ||
536 | int r; | ||
537 | size_t rdata_size; | ||
538 | uint64_t sync_count; | ||
539 | struct log_c *lc = log->context; | ||
540 | |||
541 | rdata_size = sizeof(sync_count); | ||
542 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT, | ||
543 | NULL, 0, | ||
544 | (char *)&sync_count, &rdata_size); | ||
545 | |||
546 | if (r) | ||
547 | return 0; | ||
548 | |||
549 | if (sync_count >= lc->region_count) | ||
550 | lc->in_sync_hint = lc->region_count; | ||
551 | |||
552 | return (region_t)sync_count; | ||
553 | } | ||
554 | |||
555 | /* | ||
556 | * userspace_status | ||
557 | * | ||
558 | * Returns: amount of space consumed | ||
559 | */ | ||
560 | static int userspace_status(struct dm_dirty_log *log, status_type_t status_type, | ||
561 | char *result, unsigned maxlen) | ||
562 | { | ||
563 | int r = 0; | ||
564 | size_t sz = (size_t)maxlen; | ||
565 | struct log_c *lc = log->context; | ||
566 | |||
567 | switch (status_type) { | ||
568 | case STATUSTYPE_INFO: | ||
569 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO, | ||
570 | NULL, 0, | ||
571 | result, &sz); | ||
572 | |||
573 | if (r) { | ||
574 | sz = 0; | ||
575 | DMEMIT("%s 1 COM_FAILURE", log->type->name); | ||
576 | } | ||
577 | break; | ||
578 | case STATUSTYPE_TABLE: | ||
579 | sz = 0; | ||
580 | DMEMIT("%s %u %s %s", log->type->name, lc->usr_argc + 1, | ||
581 | lc->uuid, lc->usr_argv_str); | ||
582 | break; | ||
583 | } | ||
584 | return (r) ? 0 : (int)sz; | ||
585 | } | ||
586 | |||
587 | /* | ||
588 | * userspace_is_remote_recovering | ||
589 | * | ||
590 | * Returns: 1 if region recovering, 0 otherwise | ||
591 | */ | ||
592 | static int userspace_is_remote_recovering(struct dm_dirty_log *log, | ||
593 | region_t region) | ||
594 | { | ||
595 | int r; | ||
596 | uint64_t region64 = region; | ||
597 | struct log_c *lc = log->context; | ||
598 | static unsigned long long limit; | ||
599 | struct { | ||
600 | int64_t is_recovering; | ||
601 | uint64_t in_sync_hint; | ||
602 | } pkg; | ||
603 | size_t rdata_size = sizeof(pkg); | ||
604 | |||
605 | /* | ||
606 | * Once the mirror has been reported to be in-sync, | ||
607 | * it will never again ask for recovery work. So, | ||
608 | * we can safely say there is not a remote machine | ||
609 | * recovering if the device is in-sync. (in_sync_hint | ||
610 | * must be reset at resume time.) | ||
611 | */ | ||
612 | if (region < lc->in_sync_hint) | ||
613 | return 0; | ||
614 | else if (jiffies < limit) | ||
615 | return 1; | ||
616 | |||
617 | limit = jiffies + (HZ / 4); | ||
618 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING, | ||
619 | (char *)®ion64, sizeof(region64), | ||
620 | (char *)&pkg, &rdata_size); | ||
621 | if (r) | ||
622 | return 1; | ||
623 | |||
624 | lc->in_sync_hint = pkg.in_sync_hint; | ||
625 | |||
626 | return (int)pkg.is_recovering; | ||
627 | } | ||
628 | |||
629 | static struct dm_dirty_log_type _userspace_type = { | ||
630 | .name = "userspace", | ||
631 | .module = THIS_MODULE, | ||
632 | .ctr = userspace_ctr, | ||
633 | .dtr = userspace_dtr, | ||
634 | .presuspend = userspace_presuspend, | ||
635 | .postsuspend = userspace_postsuspend, | ||
636 | .resume = userspace_resume, | ||
637 | .get_region_size = userspace_get_region_size, | ||
638 | .is_clean = userspace_is_clean, | ||
639 | .in_sync = userspace_in_sync, | ||
640 | .flush = userspace_flush, | ||
641 | .mark_region = userspace_mark_region, | ||
642 | .clear_region = userspace_clear_region, | ||
643 | .get_resync_work = userspace_get_resync_work, | ||
644 | .set_region_sync = userspace_set_region_sync, | ||
645 | .get_sync_count = userspace_get_sync_count, | ||
646 | .status = userspace_status, | ||
647 | .is_remote_recovering = userspace_is_remote_recovering, | ||
648 | }; | ||
649 | |||
650 | static int __init userspace_dirty_log_init(void) | ||
651 | { | ||
652 | int r = 0; | ||
653 | |||
654 | flush_entry_pool = mempool_create(100, flush_entry_alloc, | ||
655 | flush_entry_free, NULL); | ||
656 | |||
657 | if (!flush_entry_pool) { | ||
658 | DMWARN("Unable to create flush_entry_pool: No memory."); | ||
659 | return -ENOMEM; | ||
660 | } | ||
661 | |||
662 | r = dm_ulog_tfr_init(); | ||
663 | if (r) { | ||
664 | DMWARN("Unable to initialize userspace log communications"); | ||
665 | mempool_destroy(flush_entry_pool); | ||
666 | return r; | ||
667 | } | ||
668 | |||
669 | r = dm_dirty_log_type_register(&_userspace_type); | ||
670 | if (r) { | ||
671 | DMWARN("Couldn't register userspace dirty log type"); | ||
672 | dm_ulog_tfr_exit(); | ||
673 | mempool_destroy(flush_entry_pool); | ||
674 | return r; | ||
675 | } | ||
676 | |||
677 | DMINFO("version 1.0.0 loaded"); | ||
678 | return 0; | ||
679 | } | ||
680 | |||
681 | static void __exit userspace_dirty_log_exit(void) | ||
682 | { | ||
683 | dm_dirty_log_type_unregister(&_userspace_type); | ||
684 | dm_ulog_tfr_exit(); | ||
685 | mempool_destroy(flush_entry_pool); | ||
686 | |||
687 | DMINFO("version 1.0.0 unloaded"); | ||
688 | return; | ||
689 | } | ||
690 | |||
691 | module_init(userspace_dirty_log_init); | ||
692 | module_exit(userspace_dirty_log_exit); | ||
693 | |||
694 | MODULE_DESCRIPTION(DM_NAME " userspace dirty log link"); | ||
695 | MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>"); | ||
696 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c new file mode 100644 index 000000000000..0ca1ee768a1f --- /dev/null +++ b/drivers/md/dm-log-userspace-transfer.c | |||
@@ -0,0 +1,276 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2006-2009 Red Hat, Inc. | ||
3 | * | ||
4 | * This file is released under the LGPL. | ||
5 | */ | ||
6 | |||
7 | #include <linux/kernel.h> | ||
8 | #include <linux/module.h> | ||
9 | #include <net/sock.h> | ||
10 | #include <linux/workqueue.h> | ||
11 | #include <linux/connector.h> | ||
12 | #include <linux/device-mapper.h> | ||
13 | #include <linux/dm-log-userspace.h> | ||
14 | |||
15 | #include "dm-log-userspace-transfer.h" | ||
16 | |||
17 | static uint32_t dm_ulog_seq; | ||
18 | |||
19 | /* | ||
20 | * Netlink/Connector is an unreliable protocol. How long should | ||
21 | * we wait for a response before assuming it was lost and retrying? | ||
22 | * (If we do receive a response after this time, it will be discarded | ||
23 | * and the response to the resent request will be waited for.) | ||
24 | */ | ||
25 | #define DM_ULOG_RETRY_TIMEOUT (15 * HZ) | ||
26 | |||
27 | /* | ||
28 | * Pre-allocated space for speed | ||
29 | */ | ||
30 | #define DM_ULOG_PREALLOCED_SIZE 512 | ||
31 | static struct cn_msg *prealloced_cn_msg; | ||
32 | static struct dm_ulog_request *prealloced_ulog_tfr; | ||
33 | |||
34 | static struct cb_id ulog_cn_id = { | ||
35 | .idx = CN_IDX_DM, | ||
36 | .val = CN_VAL_DM_USERSPACE_LOG | ||
37 | }; | ||
38 | |||
39 | static DEFINE_MUTEX(dm_ulog_lock); | ||
40 | |||
41 | struct receiving_pkg { | ||
42 | struct list_head list; | ||
43 | struct completion complete; | ||
44 | |||
45 | uint32_t seq; | ||
46 | |||
47 | int error; | ||
48 | size_t *data_size; | ||
49 | char *data; | ||
50 | }; | ||
51 | |||
52 | static DEFINE_SPINLOCK(receiving_list_lock); | ||
53 | static struct list_head receiving_list; | ||
54 | |||
55 | static int dm_ulog_sendto_server(struct dm_ulog_request *tfr) | ||
56 | { | ||
57 | int r; | ||
58 | struct cn_msg *msg = prealloced_cn_msg; | ||
59 | |||
60 | memset(msg, 0, sizeof(struct cn_msg)); | ||
61 | |||
62 | msg->id.idx = ulog_cn_id.idx; | ||
63 | msg->id.val = ulog_cn_id.val; | ||
64 | msg->ack = 0; | ||
65 | msg->seq = tfr->seq; | ||
66 | msg->len = sizeof(struct dm_ulog_request) + tfr->data_size; | ||
67 | |||
68 | r = cn_netlink_send(msg, 0, gfp_any()); | ||
69 | |||
70 | return r; | ||
71 | } | ||
72 | |||
73 | /* | ||
74 | * Parameters for this function can be either msg or tfr, but not | ||
75 | * both. This function fills in the reply for a waiting request. | ||
76 | * If just msg is given, then the reply is simply an ACK from userspace | ||
77 | * that the request was received. | ||
78 | * | ||
79 | * Returns: 0 on success, -ENOENT on failure | ||
80 | */ | ||
81 | static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr) | ||
82 | { | ||
83 | uint32_t rtn_seq = (msg) ? msg->seq : (tfr) ? tfr->seq : 0; | ||
84 | struct receiving_pkg *pkg; | ||
85 | |||
86 | /* | ||
87 | * The 'receiving_pkg' entries in this list are statically | ||
88 | * allocated on the stack in 'dm_consult_userspace'. | ||
89 | * Each process that is waiting for a reply from the user | ||
90 | * space server will have an entry in this list. | ||
91 | * | ||
92 | * We are safe to do it this way because the stack space | ||
93 | * is unique to each process, but still addressable by | ||
94 | * other processes. | ||
95 | */ | ||
96 | list_for_each_entry(pkg, &receiving_list, list) { | ||
97 | if (rtn_seq != pkg->seq) | ||
98 | continue; | ||
99 | |||
100 | if (msg) { | ||
101 | pkg->error = -msg->ack; | ||
102 | /* | ||
103 | * If we are trying again, we will need to know our | ||
104 | * storage capacity. Otherwise, along with the | ||
105 | * error code, we make explicit that we have no data. | ||
106 | */ | ||
107 | if (pkg->error != -EAGAIN) | ||
108 | *(pkg->data_size) = 0; | ||
109 | } else if (tfr->data_size > *(pkg->data_size)) { | ||
110 | DMERR("Insufficient space to receive package [%u] " | ||
111 | "(%u vs %lu)", tfr->request_type, | ||
112 | tfr->data_size, *(pkg->data_size)); | ||
113 | |||
114 | *(pkg->data_size) = 0; | ||
115 | pkg->error = -ENOSPC; | ||
116 | } else { | ||
117 | pkg->error = tfr->error; | ||
118 | memcpy(pkg->data, tfr->data, tfr->data_size); | ||
119 | *(pkg->data_size) = tfr->data_size; | ||
120 | } | ||
121 | complete(&pkg->complete); | ||
122 | return 0; | ||
123 | } | ||
124 | |||
125 | return -ENOENT; | ||
126 | } | ||
127 | |||
128 | /* | ||
129 | * This is the connector callback that delivers data | ||
130 | * that was sent from userspace. | ||
131 | */ | ||
132 | static void cn_ulog_callback(void *data) | ||
133 | { | ||
134 | struct cn_msg *msg = (struct cn_msg *)data; | ||
135 | struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1); | ||
136 | |||
137 | spin_lock(&receiving_list_lock); | ||
138 | if (msg->len == 0) | ||
139 | fill_pkg(msg, NULL); | ||
140 | else if (msg->len < sizeof(*tfr)) | ||
141 | DMERR("Incomplete message received (expected %u, got %u): [%u]", | ||
142 | (unsigned)sizeof(*tfr), msg->len, msg->seq); | ||
143 | else | ||
144 | fill_pkg(NULL, tfr); | ||
145 | spin_unlock(&receiving_list_lock); | ||
146 | } | ||
147 | |||
148 | /** | ||
149 | * dm_consult_userspace | ||
150 | * @uuid: log's uuid (must be DM_UUID_LEN in size) | ||
151 | * @request_type: found in include/linux/dm-log-userspace.h | ||
152 | * @data: data to tx to the server | ||
153 | * @data_size: size of data in bytes | ||
154 | * @rdata: place to put return data from server | ||
155 | * @rdata_size: value-result (amount of space given/amount of space used) | ||
156 | * | ||
157 | * rdata_size is undefined on failure. | ||
158 | * | ||
159 | * Memory used to communicate with userspace is zero'ed | ||
160 | * before populating to ensure that no unwanted bits leak | ||
161 | * from kernel space to user-space. All userspace log communications | ||
162 | * between kernel and user space go through this function. | ||
163 | * | ||
164 | * Returns: 0 on success, -EXXX on failure | ||
165 | **/ | ||
166 | int dm_consult_userspace(const char *uuid, int request_type, | ||
167 | char *data, size_t data_size, | ||
168 | char *rdata, size_t *rdata_size) | ||
169 | { | ||
170 | int r = 0; | ||
171 | size_t dummy = 0; | ||
172 | int overhead_size = | ||
173 | sizeof(struct dm_ulog_request *) + sizeof(struct cn_msg); | ||
174 | struct dm_ulog_request *tfr = prealloced_ulog_tfr; | ||
175 | struct receiving_pkg pkg; | ||
176 | |||
177 | if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) { | ||
178 | DMINFO("Size of tfr exceeds preallocated size"); | ||
179 | return -EINVAL; | ||
180 | } | ||
181 | |||
182 | if (!rdata_size) | ||
183 | rdata_size = &dummy; | ||
184 | resend: | ||
185 | /* | ||
186 | * We serialize the sending of requests so we can | ||
187 | * use the preallocated space. | ||
188 | */ | ||
189 | mutex_lock(&dm_ulog_lock); | ||
190 | |||
191 | memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - overhead_size); | ||
192 | memcpy(tfr->uuid, uuid, DM_UUID_LEN); | ||
193 | tfr->seq = dm_ulog_seq++; | ||
194 | |||
195 | /* | ||
196 | * Must be valid request type (all other bits set to | ||
197 | * zero). This reserves other bits for possible future | ||
198 | * use. | ||
199 | */ | ||
200 | tfr->request_type = request_type & DM_ULOG_REQUEST_MASK; | ||
201 | |||
202 | tfr->data_size = data_size; | ||
203 | if (data && data_size) | ||
204 | memcpy(tfr->data, data, data_size); | ||
205 | |||
206 | memset(&pkg, 0, sizeof(pkg)); | ||
207 | init_completion(&pkg.complete); | ||
208 | pkg.seq = tfr->seq; | ||
209 | pkg.data_size = rdata_size; | ||
210 | pkg.data = rdata; | ||
211 | spin_lock(&receiving_list_lock); | ||
212 | list_add(&(pkg.list), &receiving_list); | ||
213 | spin_unlock(&receiving_list_lock); | ||
214 | |||
215 | r = dm_ulog_sendto_server(tfr); | ||
216 | |||
217 | mutex_unlock(&dm_ulog_lock); | ||
218 | |||
219 | if (r) { | ||
220 | DMERR("Unable to send log request [%u] to userspace: %d", | ||
221 | request_type, r); | ||
222 | spin_lock(&receiving_list_lock); | ||
223 | list_del_init(&(pkg.list)); | ||
224 | spin_unlock(&receiving_list_lock); | ||
225 | |||
226 | goto out; | ||
227 | } | ||
228 | |||
229 | r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT); | ||
230 | spin_lock(&receiving_list_lock); | ||
231 | list_del_init(&(pkg.list)); | ||
232 | spin_unlock(&receiving_list_lock); | ||
233 | if (!r) { | ||
234 | DMWARN("[%s] Request timed out: [%u/%u] - retrying", | ||
235 | (strlen(uuid) > 8) ? | ||
236 | (uuid + (strlen(uuid) - 8)) : (uuid), | ||
237 | request_type, pkg.seq); | ||
238 | goto resend; | ||
239 | } | ||
240 | |||
241 | r = pkg.error; | ||
242 | if (r == -EAGAIN) | ||
243 | goto resend; | ||
244 | |||
245 | out: | ||
246 | return r; | ||
247 | } | ||
248 | |||
249 | int dm_ulog_tfr_init(void) | ||
250 | { | ||
251 | int r; | ||
252 | void *prealloced; | ||
253 | |||
254 | INIT_LIST_HEAD(&receiving_list); | ||
255 | |||
256 | prealloced = kmalloc(DM_ULOG_PREALLOCED_SIZE, GFP_KERNEL); | ||
257 | if (!prealloced) | ||
258 | return -ENOMEM; | ||
259 | |||
260 | prealloced_cn_msg = prealloced; | ||
261 | prealloced_ulog_tfr = prealloced + sizeof(struct cn_msg); | ||
262 | |||
263 | r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback); | ||
264 | if (r) { | ||
265 | cn_del_callback(&ulog_cn_id); | ||
266 | return r; | ||
267 | } | ||
268 | |||
269 | return 0; | ||
270 | } | ||
271 | |||
272 | void dm_ulog_tfr_exit(void) | ||
273 | { | ||
274 | cn_del_callback(&ulog_cn_id); | ||
275 | kfree(prealloced_cn_msg); | ||
276 | } | ||
diff --git a/drivers/md/dm-log-userspace-transfer.h b/drivers/md/dm-log-userspace-transfer.h new file mode 100644 index 000000000000..c26d8e4e2710 --- /dev/null +++ b/drivers/md/dm-log-userspace-transfer.h | |||
@@ -0,0 +1,18 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2006-2009 Red Hat, Inc. | ||
3 | * | ||
4 | * This file is released under the LGPL. | ||
5 | */ | ||
6 | |||
7 | #ifndef __DM_LOG_USERSPACE_TRANSFER_H__ | ||
8 | #define __DM_LOG_USERSPACE_TRANSFER_H__ | ||
9 | |||
10 | #define DM_MSG_PREFIX "dm-log-userspace" | ||
11 | |||
12 | int dm_ulog_tfr_init(void); | ||
13 | void dm_ulog_tfr_exit(void); | ||
14 | int dm_consult_userspace(const char *uuid, int request_type, | ||
15 | char *data, size_t data_size, | ||
16 | char *rdata, size_t *rdata_size); | ||
17 | |||
18 | #endif /* __DM_LOG_USERSPACE_TRANSFER_H__ */ | ||
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index be233bc4d917..9443896ede07 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c | |||
@@ -412,10 +412,12 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, | |||
412 | /* | 412 | /* |
413 | * Buffer holds both header and bitset. | 413 | * Buffer holds both header and bitset. |
414 | */ | 414 | */ |
415 | buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + | 415 | buf_size = |
416 | bitset_size, ti->limits.hardsect_size); | 416 | dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + bitset_size, |
417 | bdev_logical_block_size(lc->header_location. | ||
418 | bdev)); | ||
417 | 419 | ||
418 | if (buf_size > dev->bdev->bd_inode->i_size) { | 420 | if (buf_size > i_size_read(dev->bdev->bd_inode)) { |
419 | DMWARN("log device %s too small: need %llu bytes", | 421 | DMWARN("log device %s too small: need %llu bytes", |
420 | dev->name, (unsigned long long)buf_size); | 422 | dev->name, (unsigned long long)buf_size); |
421 | kfree(lc); | 423 | kfree(lc); |
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 6a386ab4f7eb..c70604a20897 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c | |||
@@ -8,7 +8,6 @@ | |||
8 | #include <linux/device-mapper.h> | 8 | #include <linux/device-mapper.h> |
9 | 9 | ||
10 | #include "dm-path-selector.h" | 10 | #include "dm-path-selector.h" |
11 | #include "dm-bio-record.h" | ||
12 | #include "dm-uevent.h" | 11 | #include "dm-uevent.h" |
13 | 12 | ||
14 | #include <linux/ctype.h> | 13 | #include <linux/ctype.h> |
@@ -35,6 +34,7 @@ struct pgpath { | |||
35 | 34 | ||
36 | struct dm_path path; | 35 | struct dm_path path; |
37 | struct work_struct deactivate_path; | 36 | struct work_struct deactivate_path; |
37 | struct work_struct activate_path; | ||
38 | }; | 38 | }; |
39 | 39 | ||
40 | #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) | 40 | #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) |
@@ -64,8 +64,6 @@ struct multipath { | |||
64 | spinlock_t lock; | 64 | spinlock_t lock; |
65 | 65 | ||
66 | const char *hw_handler_name; | 66 | const char *hw_handler_name; |
67 | struct work_struct activate_path; | ||
68 | struct pgpath *pgpath_to_activate; | ||
69 | unsigned nr_priority_groups; | 67 | unsigned nr_priority_groups; |
70 | struct list_head priority_groups; | 68 | struct list_head priority_groups; |
71 | unsigned pg_init_required; /* pg_init needs calling? */ | 69 | unsigned pg_init_required; /* pg_init needs calling? */ |
@@ -84,7 +82,7 @@ struct multipath { | |||
84 | unsigned pg_init_count; /* Number of times pg_init called */ | 82 | unsigned pg_init_count; /* Number of times pg_init called */ |
85 | 83 | ||
86 | struct work_struct process_queued_ios; | 84 | struct work_struct process_queued_ios; |
87 | struct bio_list queued_ios; | 85 | struct list_head queued_ios; |
88 | unsigned queue_size; | 86 | unsigned queue_size; |
89 | 87 | ||
90 | struct work_struct trigger_event; | 88 | struct work_struct trigger_event; |
@@ -101,7 +99,7 @@ struct multipath { | |||
101 | */ | 99 | */ |
102 | struct dm_mpath_io { | 100 | struct dm_mpath_io { |
103 | struct pgpath *pgpath; | 101 | struct pgpath *pgpath; |
104 | struct dm_bio_details details; | 102 | size_t nr_bytes; |
105 | }; | 103 | }; |
106 | 104 | ||
107 | typedef int (*action_fn) (struct pgpath *pgpath); | 105 | typedef int (*action_fn) (struct pgpath *pgpath); |
@@ -128,6 +126,7 @@ static struct pgpath *alloc_pgpath(void) | |||
128 | if (pgpath) { | 126 | if (pgpath) { |
129 | pgpath->is_active = 1; | 127 | pgpath->is_active = 1; |
130 | INIT_WORK(&pgpath->deactivate_path, deactivate_path); | 128 | INIT_WORK(&pgpath->deactivate_path, deactivate_path); |
129 | INIT_WORK(&pgpath->activate_path, activate_path); | ||
131 | } | 130 | } |
132 | 131 | ||
133 | return pgpath; | 132 | return pgpath; |
@@ -160,7 +159,6 @@ static struct priority_group *alloc_priority_group(void) | |||
160 | 159 | ||
161 | static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) | 160 | static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) |
162 | { | 161 | { |
163 | unsigned long flags; | ||
164 | struct pgpath *pgpath, *tmp; | 162 | struct pgpath *pgpath, *tmp; |
165 | struct multipath *m = ti->private; | 163 | struct multipath *m = ti->private; |
166 | 164 | ||
@@ -169,10 +167,6 @@ static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) | |||
169 | if (m->hw_handler_name) | 167 | if (m->hw_handler_name) |
170 | scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev)); | 168 | scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev)); |
171 | dm_put_device(ti, pgpath->path.dev); | 169 | dm_put_device(ti, pgpath->path.dev); |
172 | spin_lock_irqsave(&m->lock, flags); | ||
173 | if (m->pgpath_to_activate == pgpath) | ||
174 | m->pgpath_to_activate = NULL; | ||
175 | spin_unlock_irqrestore(&m->lock, flags); | ||
176 | free_pgpath(pgpath); | 170 | free_pgpath(pgpath); |
177 | } | 171 | } |
178 | } | 172 | } |
@@ -198,11 +192,11 @@ static struct multipath *alloc_multipath(struct dm_target *ti) | |||
198 | m = kzalloc(sizeof(*m), GFP_KERNEL); | 192 | m = kzalloc(sizeof(*m), GFP_KERNEL); |
199 | if (m) { | 193 | if (m) { |
200 | INIT_LIST_HEAD(&m->priority_groups); | 194 | INIT_LIST_HEAD(&m->priority_groups); |
195 | INIT_LIST_HEAD(&m->queued_ios); | ||
201 | spin_lock_init(&m->lock); | 196 | spin_lock_init(&m->lock); |
202 | m->queue_io = 1; | 197 | m->queue_io = 1; |
203 | INIT_WORK(&m->process_queued_ios, process_queued_ios); | 198 | INIT_WORK(&m->process_queued_ios, process_queued_ios); |
204 | INIT_WORK(&m->trigger_event, trigger_event); | 199 | INIT_WORK(&m->trigger_event, trigger_event); |
205 | INIT_WORK(&m->activate_path, activate_path); | ||
206 | m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); | 200 | m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); |
207 | if (!m->mpio_pool) { | 201 | if (!m->mpio_pool) { |
208 | kfree(m); | 202 | kfree(m); |
@@ -250,11 +244,12 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath) | |||
250 | m->pg_init_count = 0; | 244 | m->pg_init_count = 0; |
251 | } | 245 | } |
252 | 246 | ||
253 | static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) | 247 | static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg, |
248 | size_t nr_bytes) | ||
254 | { | 249 | { |
255 | struct dm_path *path; | 250 | struct dm_path *path; |
256 | 251 | ||
257 | path = pg->ps.type->select_path(&pg->ps, &m->repeat_count); | 252 | path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes); |
258 | if (!path) | 253 | if (!path) |
259 | return -ENXIO; | 254 | return -ENXIO; |
260 | 255 | ||
@@ -266,7 +261,7 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) | |||
266 | return 0; | 261 | return 0; |
267 | } | 262 | } |
268 | 263 | ||
269 | static void __choose_pgpath(struct multipath *m) | 264 | static void __choose_pgpath(struct multipath *m, size_t nr_bytes) |
270 | { | 265 | { |
271 | struct priority_group *pg; | 266 | struct priority_group *pg; |
272 | unsigned bypassed = 1; | 267 | unsigned bypassed = 1; |
@@ -278,12 +273,12 @@ static void __choose_pgpath(struct multipath *m) | |||
278 | if (m->next_pg) { | 273 | if (m->next_pg) { |
279 | pg = m->next_pg; | 274 | pg = m->next_pg; |
280 | m->next_pg = NULL; | 275 | m->next_pg = NULL; |
281 | if (!__choose_path_in_pg(m, pg)) | 276 | if (!__choose_path_in_pg(m, pg, nr_bytes)) |
282 | return; | 277 | return; |
283 | } | 278 | } |
284 | 279 | ||
285 | /* Don't change PG until it has no remaining paths */ | 280 | /* Don't change PG until it has no remaining paths */ |
286 | if (m->current_pg && !__choose_path_in_pg(m, m->current_pg)) | 281 | if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes)) |
287 | return; | 282 | return; |
288 | 283 | ||
289 | /* | 284 | /* |
@@ -295,7 +290,7 @@ static void __choose_pgpath(struct multipath *m) | |||
295 | list_for_each_entry(pg, &m->priority_groups, list) { | 290 | list_for_each_entry(pg, &m->priority_groups, list) { |
296 | if (pg->bypassed == bypassed) | 291 | if (pg->bypassed == bypassed) |
297 | continue; | 292 | continue; |
298 | if (!__choose_path_in_pg(m, pg)) | 293 | if (!__choose_path_in_pg(m, pg, nr_bytes)) |
299 | return; | 294 | return; |
300 | } | 295 | } |
301 | } while (bypassed--); | 296 | } while (bypassed--); |
@@ -322,19 +317,21 @@ static int __must_push_back(struct multipath *m) | |||
322 | dm_noflush_suspending(m->ti)); | 317 | dm_noflush_suspending(m->ti)); |
323 | } | 318 | } |
324 | 319 | ||
325 | static int map_io(struct multipath *m, struct bio *bio, | 320 | static int map_io(struct multipath *m, struct request *clone, |
326 | struct dm_mpath_io *mpio, unsigned was_queued) | 321 | struct dm_mpath_io *mpio, unsigned was_queued) |
327 | { | 322 | { |
328 | int r = DM_MAPIO_REMAPPED; | 323 | int r = DM_MAPIO_REMAPPED; |
324 | size_t nr_bytes = blk_rq_bytes(clone); | ||
329 | unsigned long flags; | 325 | unsigned long flags; |
330 | struct pgpath *pgpath; | 326 | struct pgpath *pgpath; |
327 | struct block_device *bdev; | ||
331 | 328 | ||
332 | spin_lock_irqsave(&m->lock, flags); | 329 | spin_lock_irqsave(&m->lock, flags); |
333 | 330 | ||
334 | /* Do we need to select a new pgpath? */ | 331 | /* Do we need to select a new pgpath? */ |
335 | if (!m->current_pgpath || | 332 | if (!m->current_pgpath || |
336 | (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) | 333 | (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) |
337 | __choose_pgpath(m); | 334 | __choose_pgpath(m, nr_bytes); |
338 | 335 | ||
339 | pgpath = m->current_pgpath; | 336 | pgpath = m->current_pgpath; |
340 | 337 | ||
@@ -344,21 +341,28 @@ static int map_io(struct multipath *m, struct bio *bio, | |||
344 | if ((pgpath && m->queue_io) || | 341 | if ((pgpath && m->queue_io) || |
345 | (!pgpath && m->queue_if_no_path)) { | 342 | (!pgpath && m->queue_if_no_path)) { |
346 | /* Queue for the daemon to resubmit */ | 343 | /* Queue for the daemon to resubmit */ |
347 | bio_list_add(&m->queued_ios, bio); | 344 | list_add_tail(&clone->queuelist, &m->queued_ios); |
348 | m->queue_size++; | 345 | m->queue_size++; |
349 | if ((m->pg_init_required && !m->pg_init_in_progress) || | 346 | if ((m->pg_init_required && !m->pg_init_in_progress) || |
350 | !m->queue_io) | 347 | !m->queue_io) |
351 | queue_work(kmultipathd, &m->process_queued_ios); | 348 | queue_work(kmultipathd, &m->process_queued_ios); |
352 | pgpath = NULL; | 349 | pgpath = NULL; |
353 | r = DM_MAPIO_SUBMITTED; | 350 | r = DM_MAPIO_SUBMITTED; |
354 | } else if (pgpath) | 351 | } else if (pgpath) { |
355 | bio->bi_bdev = pgpath->path.dev->bdev; | 352 | bdev = pgpath->path.dev->bdev; |
356 | else if (__must_push_back(m)) | 353 | clone->q = bdev_get_queue(bdev); |
354 | clone->rq_disk = bdev->bd_disk; | ||
355 | } else if (__must_push_back(m)) | ||
357 | r = DM_MAPIO_REQUEUE; | 356 | r = DM_MAPIO_REQUEUE; |
358 | else | 357 | else |
359 | r = -EIO; /* Failed */ | 358 | r = -EIO; /* Failed */ |
360 | 359 | ||
361 | mpio->pgpath = pgpath; | 360 | mpio->pgpath = pgpath; |
361 | mpio->nr_bytes = nr_bytes; | ||
362 | |||
363 | if (r == DM_MAPIO_REMAPPED && pgpath->pg->ps.type->start_io) | ||
364 | pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path, | ||
365 | nr_bytes); | ||
362 | 366 | ||
363 | spin_unlock_irqrestore(&m->lock, flags); | 367 | spin_unlock_irqrestore(&m->lock, flags); |
364 | 368 | ||
@@ -396,30 +400,31 @@ static void dispatch_queued_ios(struct multipath *m) | |||
396 | { | 400 | { |
397 | int r; | 401 | int r; |
398 | unsigned long flags; | 402 | unsigned long flags; |
399 | struct bio *bio = NULL, *next; | ||
400 | struct dm_mpath_io *mpio; | 403 | struct dm_mpath_io *mpio; |
401 | union map_info *info; | 404 | union map_info *info; |
405 | struct request *clone, *n; | ||
406 | LIST_HEAD(cl); | ||
402 | 407 | ||
403 | spin_lock_irqsave(&m->lock, flags); | 408 | spin_lock_irqsave(&m->lock, flags); |
404 | bio = bio_list_get(&m->queued_ios); | 409 | list_splice_init(&m->queued_ios, &cl); |
405 | spin_unlock_irqrestore(&m->lock, flags); | 410 | spin_unlock_irqrestore(&m->lock, flags); |
406 | 411 | ||
407 | while (bio) { | 412 | list_for_each_entry_safe(clone, n, &cl, queuelist) { |
408 | next = bio->bi_next; | 413 | list_del_init(&clone->queuelist); |
409 | bio->bi_next = NULL; | ||
410 | 414 | ||
411 | info = dm_get_mapinfo(bio); | 415 | info = dm_get_rq_mapinfo(clone); |
412 | mpio = info->ptr; | 416 | mpio = info->ptr; |
413 | 417 | ||
414 | r = map_io(m, bio, mpio, 1); | 418 | r = map_io(m, clone, mpio, 1); |
415 | if (r < 0) | 419 | if (r < 0) { |
416 | bio_endio(bio, r); | 420 | mempool_free(mpio, m->mpio_pool); |
417 | else if (r == DM_MAPIO_REMAPPED) | 421 | dm_kill_unmapped_request(clone, r); |
418 | generic_make_request(bio); | 422 | } else if (r == DM_MAPIO_REMAPPED) |
419 | else if (r == DM_MAPIO_REQUEUE) | 423 | dm_dispatch_request(clone); |
420 | bio_endio(bio, -EIO); | 424 | else if (r == DM_MAPIO_REQUEUE) { |
421 | 425 | mempool_free(mpio, m->mpio_pool); | |
422 | bio = next; | 426 | dm_requeue_unmapped_request(clone); |
427 | } | ||
423 | } | 428 | } |
424 | } | 429 | } |
425 | 430 | ||
@@ -427,8 +432,8 @@ static void process_queued_ios(struct work_struct *work) | |||
427 | { | 432 | { |
428 | struct multipath *m = | 433 | struct multipath *m = |
429 | container_of(work, struct multipath, process_queued_ios); | 434 | container_of(work, struct multipath, process_queued_ios); |
430 | struct pgpath *pgpath = NULL; | 435 | struct pgpath *pgpath = NULL, *tmp; |
431 | unsigned init_required = 0, must_queue = 1; | 436 | unsigned must_queue = 1; |
432 | unsigned long flags; | 437 | unsigned long flags; |
433 | 438 | ||
434 | spin_lock_irqsave(&m->lock, flags); | 439 | spin_lock_irqsave(&m->lock, flags); |
@@ -437,7 +442,7 @@ static void process_queued_ios(struct work_struct *work) | |||
437 | goto out; | 442 | goto out; |
438 | 443 | ||
439 | if (!m->current_pgpath) | 444 | if (!m->current_pgpath) |
440 | __choose_pgpath(m); | 445 | __choose_pgpath(m, 0); |
441 | 446 | ||
442 | pgpath = m->current_pgpath; | 447 | pgpath = m->current_pgpath; |
443 | 448 | ||
@@ -446,19 +451,15 @@ static void process_queued_ios(struct work_struct *work) | |||
446 | must_queue = 0; | 451 | must_queue = 0; |
447 | 452 | ||
448 | if (m->pg_init_required && !m->pg_init_in_progress && pgpath) { | 453 | if (m->pg_init_required && !m->pg_init_in_progress && pgpath) { |
449 | m->pgpath_to_activate = pgpath; | ||
450 | m->pg_init_count++; | 454 | m->pg_init_count++; |
451 | m->pg_init_required = 0; | 455 | m->pg_init_required = 0; |
452 | m->pg_init_in_progress = 1; | 456 | list_for_each_entry(tmp, &pgpath->pg->pgpaths, list) { |
453 | init_required = 1; | 457 | if (queue_work(kmpath_handlerd, &tmp->activate_path)) |
458 | m->pg_init_in_progress++; | ||
459 | } | ||
454 | } | 460 | } |
455 | |||
456 | out: | 461 | out: |
457 | spin_unlock_irqrestore(&m->lock, flags); | 462 | spin_unlock_irqrestore(&m->lock, flags); |
458 | |||
459 | if (init_required) | ||
460 | queue_work(kmpath_handlerd, &m->activate_path); | ||
461 | |||
462 | if (!must_queue) | 463 | if (!must_queue) |
463 | dispatch_queued_ios(m); | 464 | dispatch_queued_ios(m); |
464 | } | 465 | } |
@@ -553,6 +554,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg, | |||
553 | return -EINVAL; | 554 | return -EINVAL; |
554 | } | 555 | } |
555 | 556 | ||
557 | if (ps_argc > as->argc) { | ||
558 | dm_put_path_selector(pst); | ||
559 | ti->error = "not enough arguments for path selector"; | ||
560 | return -EINVAL; | ||
561 | } | ||
562 | |||
556 | r = pst->create(&pg->ps, ps_argc, as->argv); | 563 | r = pst->create(&pg->ps, ps_argc, as->argv); |
557 | if (r) { | 564 | if (r) { |
558 | dm_put_path_selector(pst); | 565 | dm_put_path_selector(pst); |
@@ -591,9 +598,20 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, | |||
591 | } | 598 | } |
592 | 599 | ||
593 | if (m->hw_handler_name) { | 600 | if (m->hw_handler_name) { |
594 | r = scsi_dh_attach(bdev_get_queue(p->path.dev->bdev), | 601 | struct request_queue *q = bdev_get_queue(p->path.dev->bdev); |
595 | m->hw_handler_name); | 602 | |
603 | r = scsi_dh_attach(q, m->hw_handler_name); | ||
604 | if (r == -EBUSY) { | ||
605 | /* | ||
606 | * Already attached to different hw_handler, | ||
607 | * try to reattach with correct one. | ||
608 | */ | ||
609 | scsi_dh_detach(q); | ||
610 | r = scsi_dh_attach(q, m->hw_handler_name); | ||
611 | } | ||
612 | |||
596 | if (r < 0) { | 613 | if (r < 0) { |
614 | ti->error = "error attaching hardware handler"; | ||
597 | dm_put_device(ti, p->path.dev); | 615 | dm_put_device(ti, p->path.dev); |
598 | goto bad; | 616 | goto bad; |
599 | } | 617 | } |
@@ -699,6 +717,11 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m) | |||
699 | if (!hw_argc) | 717 | if (!hw_argc) |
700 | return 0; | 718 | return 0; |
701 | 719 | ||
720 | if (hw_argc > as->argc) { | ||
721 | ti->error = "not enough arguments for hardware handler"; | ||
722 | return -EINVAL; | ||
723 | } | ||
724 | |||
702 | m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL); | 725 | m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL); |
703 | request_module("scsi_dh_%s", m->hw_handler_name); | 726 | request_module("scsi_dh_%s", m->hw_handler_name); |
704 | if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { | 727 | if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { |
@@ -823,6 +846,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, | |||
823 | goto bad; | 846 | goto bad; |
824 | } | 847 | } |
825 | 848 | ||
849 | ti->num_flush_requests = 1; | ||
850 | |||
826 | return 0; | 851 | return 0; |
827 | 852 | ||
828 | bad: | 853 | bad: |
@@ -836,25 +861,29 @@ static void multipath_dtr(struct dm_target *ti) | |||
836 | 861 | ||
837 | flush_workqueue(kmpath_handlerd); | 862 | flush_workqueue(kmpath_handlerd); |
838 | flush_workqueue(kmultipathd); | 863 | flush_workqueue(kmultipathd); |
864 | flush_scheduled_work(); | ||
839 | free_multipath(m); | 865 | free_multipath(m); |
840 | } | 866 | } |
841 | 867 | ||
842 | /* | 868 | /* |
843 | * Map bios, recording original fields for later in case we have to resubmit | 869 | * Map cloned requests |
844 | */ | 870 | */ |
845 | static int multipath_map(struct dm_target *ti, struct bio *bio, | 871 | static int multipath_map(struct dm_target *ti, struct request *clone, |
846 | union map_info *map_context) | 872 | union map_info *map_context) |
847 | { | 873 | { |
848 | int r; | 874 | int r; |
849 | struct dm_mpath_io *mpio; | 875 | struct dm_mpath_io *mpio; |
850 | struct multipath *m = (struct multipath *) ti->private; | 876 | struct multipath *m = (struct multipath *) ti->private; |
851 | 877 | ||
852 | mpio = mempool_alloc(m->mpio_pool, GFP_NOIO); | 878 | mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); |
853 | dm_bio_record(&mpio->details, bio); | 879 | if (!mpio) |
880 | /* ENOMEM, requeue */ | ||
881 | return DM_MAPIO_REQUEUE; | ||
882 | memset(mpio, 0, sizeof(*mpio)); | ||
854 | 883 | ||
855 | map_context->ptr = mpio; | 884 | map_context->ptr = mpio; |
856 | bio->bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT); | 885 | clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; |
857 | r = map_io(m, bio, mpio, 0); | 886 | r = map_io(m, clone, mpio, 0); |
858 | if (r < 0 || r == DM_MAPIO_REQUEUE) | 887 | if (r < 0 || r == DM_MAPIO_REQUEUE) |
859 | mempool_free(mpio, m->mpio_pool); | 888 | mempool_free(mpio, m->mpio_pool); |
860 | 889 | ||
@@ -924,9 +953,13 @@ static int reinstate_path(struct pgpath *pgpath) | |||
924 | 953 | ||
925 | pgpath->is_active = 1; | 954 | pgpath->is_active = 1; |
926 | 955 | ||
927 | m->current_pgpath = NULL; | 956 | if (!m->nr_valid_paths++ && m->queue_size) { |
928 | if (!m->nr_valid_paths++ && m->queue_size) | 957 | m->current_pgpath = NULL; |
929 | queue_work(kmultipathd, &m->process_queued_ios); | 958 | queue_work(kmultipathd, &m->process_queued_ios); |
959 | } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { | ||
960 | if (queue_work(kmpath_handlerd, &pgpath->activate_path)) | ||
961 | m->pg_init_in_progress++; | ||
962 | } | ||
930 | 963 | ||
931 | dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti, | 964 | dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti, |
932 | pgpath->path.dev->name, m->nr_valid_paths); | 965 | pgpath->path.dev->name, m->nr_valid_paths); |
@@ -1102,87 +1135,70 @@ static void pg_init_done(struct dm_path *path, int errors) | |||
1102 | 1135 | ||
1103 | spin_lock_irqsave(&m->lock, flags); | 1136 | spin_lock_irqsave(&m->lock, flags); |
1104 | if (errors) { | 1137 | if (errors) { |
1105 | DMERR("Could not failover device. Error %d.", errors); | 1138 | if (pgpath == m->current_pgpath) { |
1106 | m->current_pgpath = NULL; | 1139 | DMERR("Could not failover device. Error %d.", errors); |
1107 | m->current_pg = NULL; | 1140 | m->current_pgpath = NULL; |
1141 | m->current_pg = NULL; | ||
1142 | } | ||
1108 | } else if (!m->pg_init_required) { | 1143 | } else if (!m->pg_init_required) { |
1109 | m->queue_io = 0; | 1144 | m->queue_io = 0; |
1110 | pg->bypassed = 0; | 1145 | pg->bypassed = 0; |
1111 | } | 1146 | } |
1112 | 1147 | ||
1113 | m->pg_init_in_progress = 0; | 1148 | m->pg_init_in_progress--; |
1114 | queue_work(kmultipathd, &m->process_queued_ios); | 1149 | if (!m->pg_init_in_progress) |
1150 | queue_work(kmultipathd, &m->process_queued_ios); | ||
1115 | spin_unlock_irqrestore(&m->lock, flags); | 1151 | spin_unlock_irqrestore(&m->lock, flags); |
1116 | } | 1152 | } |
1117 | 1153 | ||
1118 | static void activate_path(struct work_struct *work) | 1154 | static void activate_path(struct work_struct *work) |
1119 | { | 1155 | { |
1120 | int ret; | 1156 | int ret; |
1121 | struct multipath *m = | 1157 | struct pgpath *pgpath = |
1122 | container_of(work, struct multipath, activate_path); | 1158 | container_of(work, struct pgpath, activate_path); |
1123 | struct dm_path *path; | ||
1124 | unsigned long flags; | ||
1125 | 1159 | ||
1126 | spin_lock_irqsave(&m->lock, flags); | 1160 | ret = scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev)); |
1127 | path = &m->pgpath_to_activate->path; | 1161 | pg_init_done(&pgpath->path, ret); |
1128 | m->pgpath_to_activate = NULL; | ||
1129 | spin_unlock_irqrestore(&m->lock, flags); | ||
1130 | if (!path) | ||
1131 | return; | ||
1132 | ret = scsi_dh_activate(bdev_get_queue(path->dev->bdev)); | ||
1133 | pg_init_done(path, ret); | ||
1134 | } | 1162 | } |
1135 | 1163 | ||
1136 | /* | 1164 | /* |
1137 | * end_io handling | 1165 | * end_io handling |
1138 | */ | 1166 | */ |
1139 | static int do_end_io(struct multipath *m, struct bio *bio, | 1167 | static int do_end_io(struct multipath *m, struct request *clone, |
1140 | int error, struct dm_mpath_io *mpio) | 1168 | int error, struct dm_mpath_io *mpio) |
1141 | { | 1169 | { |
1170 | /* | ||
1171 | * We don't queue any clone request inside the multipath target | ||
1172 | * during end I/O handling, since those clone requests don't have | ||
1173 | * bio clones. If we queue them inside the multipath target, | ||
1174 | * we need to make bio clones, that requires memory allocation. | ||
1175 | * (See drivers/md/dm.c:end_clone_bio() about why the clone requests | ||
1176 | * don't have bio clones.) | ||
1177 | * Instead of queueing the clone request here, we queue the original | ||
1178 | * request into dm core, which will remake a clone request and | ||
1179 | * clone bios for it and resubmit it later. | ||
1180 | */ | ||
1181 | int r = DM_ENDIO_REQUEUE; | ||
1142 | unsigned long flags; | 1182 | unsigned long flags; |
1143 | 1183 | ||
1144 | if (!error) | 1184 | if (!error && !clone->errors) |
1145 | return 0; /* I/O complete */ | 1185 | return 0; /* I/O complete */ |
1146 | 1186 | ||
1147 | if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) | ||
1148 | return error; | ||
1149 | |||
1150 | if (error == -EOPNOTSUPP) | 1187 | if (error == -EOPNOTSUPP) |
1151 | return error; | 1188 | return error; |
1152 | 1189 | ||
1153 | spin_lock_irqsave(&m->lock, flags); | ||
1154 | if (!m->nr_valid_paths) { | ||
1155 | if (__must_push_back(m)) { | ||
1156 | spin_unlock_irqrestore(&m->lock, flags); | ||
1157 | return DM_ENDIO_REQUEUE; | ||
1158 | } else if (!m->queue_if_no_path) { | ||
1159 | spin_unlock_irqrestore(&m->lock, flags); | ||
1160 | return -EIO; | ||
1161 | } else { | ||
1162 | spin_unlock_irqrestore(&m->lock, flags); | ||
1163 | goto requeue; | ||
1164 | } | ||
1165 | } | ||
1166 | spin_unlock_irqrestore(&m->lock, flags); | ||
1167 | |||
1168 | if (mpio->pgpath) | 1190 | if (mpio->pgpath) |
1169 | fail_path(mpio->pgpath); | 1191 | fail_path(mpio->pgpath); |
1170 | 1192 | ||
1171 | requeue: | ||
1172 | dm_bio_restore(&mpio->details, bio); | ||
1173 | |||
1174 | /* queue for the daemon to resubmit or fail */ | ||
1175 | spin_lock_irqsave(&m->lock, flags); | 1193 | spin_lock_irqsave(&m->lock, flags); |
1176 | bio_list_add(&m->queued_ios, bio); | 1194 | if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m)) |
1177 | m->queue_size++; | 1195 | r = -EIO; |
1178 | if (!m->queue_io) | ||
1179 | queue_work(kmultipathd, &m->process_queued_ios); | ||
1180 | spin_unlock_irqrestore(&m->lock, flags); | 1196 | spin_unlock_irqrestore(&m->lock, flags); |
1181 | 1197 | ||
1182 | return DM_ENDIO_INCOMPLETE; /* io not complete */ | 1198 | return r; |
1183 | } | 1199 | } |
1184 | 1200 | ||
1185 | static int multipath_end_io(struct dm_target *ti, struct bio *bio, | 1201 | static int multipath_end_io(struct dm_target *ti, struct request *clone, |
1186 | int error, union map_info *map_context) | 1202 | int error, union map_info *map_context) |
1187 | { | 1203 | { |
1188 | struct multipath *m = ti->private; | 1204 | struct multipath *m = ti->private; |
@@ -1191,14 +1207,13 @@ static int multipath_end_io(struct dm_target *ti, struct bio *bio, | |||
1191 | struct path_selector *ps; | 1207 | struct path_selector *ps; |
1192 | int r; | 1208 | int r; |
1193 | 1209 | ||
1194 | r = do_end_io(m, bio, error, mpio); | 1210 | r = do_end_io(m, clone, error, mpio); |
1195 | if (pgpath) { | 1211 | if (pgpath) { |
1196 | ps = &pgpath->pg->ps; | 1212 | ps = &pgpath->pg->ps; |
1197 | if (ps->type->end_io) | 1213 | if (ps->type->end_io) |
1198 | ps->type->end_io(ps, &pgpath->path); | 1214 | ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); |
1199 | } | 1215 | } |
1200 | if (r != DM_ENDIO_INCOMPLETE) | 1216 | mempool_free(mpio, m->mpio_pool); |
1201 | mempool_free(mpio, m->mpio_pool); | ||
1202 | 1217 | ||
1203 | return r; | 1218 | return r; |
1204 | } | 1219 | } |
@@ -1411,7 +1426,7 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, | |||
1411 | spin_lock_irqsave(&m->lock, flags); | 1426 | spin_lock_irqsave(&m->lock, flags); |
1412 | 1427 | ||
1413 | if (!m->current_pgpath) | 1428 | if (!m->current_pgpath) |
1414 | __choose_pgpath(m); | 1429 | __choose_pgpath(m, 0); |
1415 | 1430 | ||
1416 | if (m->current_pgpath) { | 1431 | if (m->current_pgpath) { |
1417 | bdev = m->current_pgpath->path.dev->bdev; | 1432 | bdev = m->current_pgpath->path.dev->bdev; |
@@ -1428,22 +1443,113 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, | |||
1428 | return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); | 1443 | return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); |
1429 | } | 1444 | } |
1430 | 1445 | ||
1446 | static int multipath_iterate_devices(struct dm_target *ti, | ||
1447 | iterate_devices_callout_fn fn, void *data) | ||
1448 | { | ||
1449 | struct multipath *m = ti->private; | ||
1450 | struct priority_group *pg; | ||
1451 | struct pgpath *p; | ||
1452 | int ret = 0; | ||
1453 | |||
1454 | list_for_each_entry(pg, &m->priority_groups, list) { | ||
1455 | list_for_each_entry(p, &pg->pgpaths, list) { | ||
1456 | ret = fn(ti, p->path.dev, ti->begin, data); | ||
1457 | if (ret) | ||
1458 | goto out; | ||
1459 | } | ||
1460 | } | ||
1461 | |||
1462 | out: | ||
1463 | return ret; | ||
1464 | } | ||
1465 | |||
1466 | static int __pgpath_busy(struct pgpath *pgpath) | ||
1467 | { | ||
1468 | struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); | ||
1469 | |||
1470 | return dm_underlying_device_busy(q); | ||
1471 | } | ||
1472 | |||
1473 | /* | ||
1474 | * We return "busy", only when we can map I/Os but underlying devices | ||
1475 | * are busy (so even if we map I/Os now, the I/Os will wait on | ||
1476 | * the underlying queue). | ||
1477 | * In other words, if we want to kill I/Os or queue them inside us | ||
1478 | * due to map unavailability, we don't return "busy". Otherwise, | ||
1479 | * dm core won't give us the I/Os and we can't do what we want. | ||
1480 | */ | ||
1481 | static int multipath_busy(struct dm_target *ti) | ||
1482 | { | ||
1483 | int busy = 0, has_active = 0; | ||
1484 | struct multipath *m = ti->private; | ||
1485 | struct priority_group *pg; | ||
1486 | struct pgpath *pgpath; | ||
1487 | unsigned long flags; | ||
1488 | |||
1489 | spin_lock_irqsave(&m->lock, flags); | ||
1490 | |||
1491 | /* Guess which priority_group will be used at next mapping time */ | ||
1492 | if (unlikely(!m->current_pgpath && m->next_pg)) | ||
1493 | pg = m->next_pg; | ||
1494 | else if (likely(m->current_pg)) | ||
1495 | pg = m->current_pg; | ||
1496 | else | ||
1497 | /* | ||
1498 | * We don't know which pg will be used at next mapping time. | ||
1499 | * We don't call __choose_pgpath() here to avoid to trigger | ||
1500 | * pg_init just by busy checking. | ||
1501 | * So we don't know whether underlying devices we will be using | ||
1502 | * at next mapping time are busy or not. Just try mapping. | ||
1503 | */ | ||
1504 | goto out; | ||
1505 | |||
1506 | /* | ||
1507 | * If there is one non-busy active path at least, the path selector | ||
1508 | * will be able to select it. So we consider such a pg as not busy. | ||
1509 | */ | ||
1510 | busy = 1; | ||
1511 | list_for_each_entry(pgpath, &pg->pgpaths, list) | ||
1512 | if (pgpath->is_active) { | ||
1513 | has_active = 1; | ||
1514 | |||
1515 | if (!__pgpath_busy(pgpath)) { | ||
1516 | busy = 0; | ||
1517 | break; | ||
1518 | } | ||
1519 | } | ||
1520 | |||
1521 | if (!has_active) | ||
1522 | /* | ||
1523 | * No active path in this pg, so this pg won't be used and | ||
1524 | * the current_pg will be changed at next mapping time. | ||
1525 | * We need to try mapping to determine it. | ||
1526 | */ | ||
1527 | busy = 0; | ||
1528 | |||
1529 | out: | ||
1530 | spin_unlock_irqrestore(&m->lock, flags); | ||
1531 | |||
1532 | return busy; | ||
1533 | } | ||
1534 | |||
1431 | /*----------------------------------------------------------------- | 1535 | /*----------------------------------------------------------------- |
1432 | * Module setup | 1536 | * Module setup |
1433 | *---------------------------------------------------------------*/ | 1537 | *---------------------------------------------------------------*/ |
1434 | static struct target_type multipath_target = { | 1538 | static struct target_type multipath_target = { |
1435 | .name = "multipath", | 1539 | .name = "multipath", |
1436 | .version = {1, 0, 5}, | 1540 | .version = {1, 1, 0}, |
1437 | .module = THIS_MODULE, | 1541 | .module = THIS_MODULE, |
1438 | .ctr = multipath_ctr, | 1542 | .ctr = multipath_ctr, |
1439 | .dtr = multipath_dtr, | 1543 | .dtr = multipath_dtr, |
1440 | .map = multipath_map, | 1544 | .map_rq = multipath_map, |
1441 | .end_io = multipath_end_io, | 1545 | .rq_end_io = multipath_end_io, |
1442 | .presuspend = multipath_presuspend, | 1546 | .presuspend = multipath_presuspend, |
1443 | .resume = multipath_resume, | 1547 | .resume = multipath_resume, |
1444 | .status = multipath_status, | 1548 | .status = multipath_status, |
1445 | .message = multipath_message, | 1549 | .message = multipath_message, |
1446 | .ioctl = multipath_ioctl, | 1550 | .ioctl = multipath_ioctl, |
1551 | .iterate_devices = multipath_iterate_devices, | ||
1552 | .busy = multipath_busy, | ||
1447 | }; | 1553 | }; |
1448 | 1554 | ||
1449 | static int __init dm_multipath_init(void) | 1555 | static int __init dm_multipath_init(void) |
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h index 27357b85d73d..e7d1fa8b0459 100644 --- a/drivers/md/dm-path-selector.h +++ b/drivers/md/dm-path-selector.h | |||
@@ -56,7 +56,8 @@ struct path_selector_type { | |||
56 | * the path fails. | 56 | * the path fails. |
57 | */ | 57 | */ |
58 | struct dm_path *(*select_path) (struct path_selector *ps, | 58 | struct dm_path *(*select_path) (struct path_selector *ps, |
59 | unsigned *repeat_count); | 59 | unsigned *repeat_count, |
60 | size_t nr_bytes); | ||
60 | 61 | ||
61 | /* | 62 | /* |
62 | * Notify the selector that a path has failed. | 63 | * Notify the selector that a path has failed. |
@@ -75,7 +76,10 @@ struct path_selector_type { | |||
75 | int (*status) (struct path_selector *ps, struct dm_path *path, | 76 | int (*status) (struct path_selector *ps, struct dm_path *path, |
76 | status_type_t type, char *result, unsigned int maxlen); | 77 | status_type_t type, char *result, unsigned int maxlen); |
77 | 78 | ||
78 | int (*end_io) (struct path_selector *ps, struct dm_path *path); | 79 | int (*start_io) (struct path_selector *ps, struct dm_path *path, |
80 | size_t nr_bytes); | ||
81 | int (*end_io) (struct path_selector *ps, struct dm_path *path, | ||
82 | size_t nr_bytes); | ||
79 | }; | 83 | }; |
80 | 84 | ||
81 | /* Register a path selector */ | 85 | /* Register a path selector */ |
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c new file mode 100644 index 000000000000..f92b6cea9d9c --- /dev/null +++ b/drivers/md/dm-queue-length.c | |||
@@ -0,0 +1,263 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2004-2005 IBM Corp. All Rights Reserved. | ||
3 | * Copyright (C) 2006-2009 NEC Corporation. | ||
4 | * | ||
5 | * dm-queue-length.c | ||
6 | * | ||
7 | * Module Author: Stefan Bader, IBM | ||
8 | * Modified by: Kiyoshi Ueda, NEC | ||
9 | * | ||
10 | * This file is released under the GPL. | ||
11 | * | ||
12 | * queue-length path selector - choose a path with the least number of | ||
13 | * in-flight I/Os. | ||
14 | */ | ||
15 | |||
16 | #include "dm.h" | ||
17 | #include "dm-path-selector.h" | ||
18 | |||
19 | #include <linux/slab.h> | ||
20 | #include <linux/ctype.h> | ||
21 | #include <linux/errno.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <asm/atomic.h> | ||
24 | |||
25 | #define DM_MSG_PREFIX "multipath queue-length" | ||
26 | #define QL_MIN_IO 128 | ||
27 | #define QL_VERSION "0.1.0" | ||
28 | |||
29 | struct selector { | ||
30 | struct list_head valid_paths; | ||
31 | struct list_head failed_paths; | ||
32 | }; | ||
33 | |||
34 | struct path_info { | ||
35 | struct list_head list; | ||
36 | struct dm_path *path; | ||
37 | unsigned repeat_count; | ||
38 | atomic_t qlen; /* the number of in-flight I/Os */ | ||
39 | }; | ||
40 | |||
41 | static struct selector *alloc_selector(void) | ||
42 | { | ||
43 | struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
44 | |||
45 | if (s) { | ||
46 | INIT_LIST_HEAD(&s->valid_paths); | ||
47 | INIT_LIST_HEAD(&s->failed_paths); | ||
48 | } | ||
49 | |||
50 | return s; | ||
51 | } | ||
52 | |||
53 | static int ql_create(struct path_selector *ps, unsigned argc, char **argv) | ||
54 | { | ||
55 | struct selector *s = alloc_selector(); | ||
56 | |||
57 | if (!s) | ||
58 | return -ENOMEM; | ||
59 | |||
60 | ps->context = s; | ||
61 | return 0; | ||
62 | } | ||
63 | |||
64 | static void ql_free_paths(struct list_head *paths) | ||
65 | { | ||
66 | struct path_info *pi, *next; | ||
67 | |||
68 | list_for_each_entry_safe(pi, next, paths, list) { | ||
69 | list_del(&pi->list); | ||
70 | kfree(pi); | ||
71 | } | ||
72 | } | ||
73 | |||
74 | static void ql_destroy(struct path_selector *ps) | ||
75 | { | ||
76 | struct selector *s = ps->context; | ||
77 | |||
78 | ql_free_paths(&s->valid_paths); | ||
79 | ql_free_paths(&s->failed_paths); | ||
80 | kfree(s); | ||
81 | ps->context = NULL; | ||
82 | } | ||
83 | |||
84 | static int ql_status(struct path_selector *ps, struct dm_path *path, | ||
85 | status_type_t type, char *result, unsigned maxlen) | ||
86 | { | ||
87 | unsigned sz = 0; | ||
88 | struct path_info *pi; | ||
89 | |||
90 | /* When called with NULL path, return selector status/args. */ | ||
91 | if (!path) | ||
92 | DMEMIT("0 "); | ||
93 | else { | ||
94 | pi = path->pscontext; | ||
95 | |||
96 | switch (type) { | ||
97 | case STATUSTYPE_INFO: | ||
98 | DMEMIT("%d ", atomic_read(&pi->qlen)); | ||
99 | break; | ||
100 | case STATUSTYPE_TABLE: | ||
101 | DMEMIT("%u ", pi->repeat_count); | ||
102 | break; | ||
103 | } | ||
104 | } | ||
105 | |||
106 | return sz; | ||
107 | } | ||
108 | |||
109 | static int ql_add_path(struct path_selector *ps, struct dm_path *path, | ||
110 | int argc, char **argv, char **error) | ||
111 | { | ||
112 | struct selector *s = ps->context; | ||
113 | struct path_info *pi; | ||
114 | unsigned repeat_count = QL_MIN_IO; | ||
115 | |||
116 | /* | ||
117 | * Arguments: [<repeat_count>] | ||
118 | * <repeat_count>: The number of I/Os before switching path. | ||
119 | * If not given, default (QL_MIN_IO) is used. | ||
120 | */ | ||
121 | if (argc > 1) { | ||
122 | *error = "queue-length ps: incorrect number of arguments"; | ||
123 | return -EINVAL; | ||
124 | } | ||
125 | |||
126 | if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { | ||
127 | *error = "queue-length ps: invalid repeat count"; | ||
128 | return -EINVAL; | ||
129 | } | ||
130 | |||
131 | /* Allocate the path information structure */ | ||
132 | pi = kmalloc(sizeof(*pi), GFP_KERNEL); | ||
133 | if (!pi) { | ||
134 | *error = "queue-length ps: Error allocating path information"; | ||
135 | return -ENOMEM; | ||
136 | } | ||
137 | |||
138 | pi->path = path; | ||
139 | pi->repeat_count = repeat_count; | ||
140 | atomic_set(&pi->qlen, 0); | ||
141 | |||
142 | path->pscontext = pi; | ||
143 | |||
144 | list_add_tail(&pi->list, &s->valid_paths); | ||
145 | |||
146 | return 0; | ||
147 | } | ||
148 | |||
149 | static void ql_fail_path(struct path_selector *ps, struct dm_path *path) | ||
150 | { | ||
151 | struct selector *s = ps->context; | ||
152 | struct path_info *pi = path->pscontext; | ||
153 | |||
154 | list_move(&pi->list, &s->failed_paths); | ||
155 | } | ||
156 | |||
157 | static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path) | ||
158 | { | ||
159 | struct selector *s = ps->context; | ||
160 | struct path_info *pi = path->pscontext; | ||
161 | |||
162 | list_move_tail(&pi->list, &s->valid_paths); | ||
163 | |||
164 | return 0; | ||
165 | } | ||
166 | |||
167 | /* | ||
168 | * Select a path having the minimum number of in-flight I/Os | ||
169 | */ | ||
170 | static struct dm_path *ql_select_path(struct path_selector *ps, | ||
171 | unsigned *repeat_count, size_t nr_bytes) | ||
172 | { | ||
173 | struct selector *s = ps->context; | ||
174 | struct path_info *pi = NULL, *best = NULL; | ||
175 | |||
176 | if (list_empty(&s->valid_paths)) | ||
177 | return NULL; | ||
178 | |||
179 | /* Change preferred (first in list) path to evenly balance. */ | ||
180 | list_move_tail(s->valid_paths.next, &s->valid_paths); | ||
181 | |||
182 | list_for_each_entry(pi, &s->valid_paths, list) { | ||
183 | if (!best || | ||
184 | (atomic_read(&pi->qlen) < atomic_read(&best->qlen))) | ||
185 | best = pi; | ||
186 | |||
187 | if (!atomic_read(&best->qlen)) | ||
188 | break; | ||
189 | } | ||
190 | |||
191 | if (!best) | ||
192 | return NULL; | ||
193 | |||
194 | *repeat_count = best->repeat_count; | ||
195 | |||
196 | return best->path; | ||
197 | } | ||
198 | |||
199 | static int ql_start_io(struct path_selector *ps, struct dm_path *path, | ||
200 | size_t nr_bytes) | ||
201 | { | ||
202 | struct path_info *pi = path->pscontext; | ||
203 | |||
204 | atomic_inc(&pi->qlen); | ||
205 | |||
206 | return 0; | ||
207 | } | ||
208 | |||
209 | static int ql_end_io(struct path_selector *ps, struct dm_path *path, | ||
210 | size_t nr_bytes) | ||
211 | { | ||
212 | struct path_info *pi = path->pscontext; | ||
213 | |||
214 | atomic_dec(&pi->qlen); | ||
215 | |||
216 | return 0; | ||
217 | } | ||
218 | |||
219 | static struct path_selector_type ql_ps = { | ||
220 | .name = "queue-length", | ||
221 | .module = THIS_MODULE, | ||
222 | .table_args = 1, | ||
223 | .info_args = 1, | ||
224 | .create = ql_create, | ||
225 | .destroy = ql_destroy, | ||
226 | .status = ql_status, | ||
227 | .add_path = ql_add_path, | ||
228 | .fail_path = ql_fail_path, | ||
229 | .reinstate_path = ql_reinstate_path, | ||
230 | .select_path = ql_select_path, | ||
231 | .start_io = ql_start_io, | ||
232 | .end_io = ql_end_io, | ||
233 | }; | ||
234 | |||
235 | static int __init dm_ql_init(void) | ||
236 | { | ||
237 | int r = dm_register_path_selector(&ql_ps); | ||
238 | |||
239 | if (r < 0) | ||
240 | DMERR("register failed %d", r); | ||
241 | |||
242 | DMINFO("version " QL_VERSION " loaded"); | ||
243 | |||
244 | return r; | ||
245 | } | ||
246 | |||
247 | static void __exit dm_ql_exit(void) | ||
248 | { | ||
249 | int r = dm_unregister_path_selector(&ql_ps); | ||
250 | |||
251 | if (r < 0) | ||
252 | DMERR("unregister failed %d", r); | ||
253 | } | ||
254 | |||
255 | module_init(dm_ql_init); | ||
256 | module_exit(dm_ql_exit); | ||
257 | |||
258 | MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>"); | ||
259 | MODULE_DESCRIPTION( | ||
260 | "(C) Copyright IBM Corp. 2004,2005 All Rights Reserved.\n" | ||
261 | DM_NAME " path selector to balance the number of in-flight I/Os" | ||
262 | ); | ||
263 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 076fbb4e967a..ce8868c768cc 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c | |||
@@ -1283,9 +1283,23 @@ static int mirror_status(struct dm_target *ti, status_type_t type, | |||
1283 | return 0; | 1283 | return 0; |
1284 | } | 1284 | } |
1285 | 1285 | ||
1286 | static int mirror_iterate_devices(struct dm_target *ti, | ||
1287 | iterate_devices_callout_fn fn, void *data) | ||
1288 | { | ||
1289 | struct mirror_set *ms = ti->private; | ||
1290 | int ret = 0; | ||
1291 | unsigned i; | ||
1292 | |||
1293 | for (i = 0; !ret && i < ms->nr_mirrors; i++) | ||
1294 | ret = fn(ti, ms->mirror[i].dev, | ||
1295 | ms->mirror[i].offset, data); | ||
1296 | |||
1297 | return ret; | ||
1298 | } | ||
1299 | |||
1286 | static struct target_type mirror_target = { | 1300 | static struct target_type mirror_target = { |
1287 | .name = "mirror", | 1301 | .name = "mirror", |
1288 | .version = {1, 0, 20}, | 1302 | .version = {1, 12, 0}, |
1289 | .module = THIS_MODULE, | 1303 | .module = THIS_MODULE, |
1290 | .ctr = mirror_ctr, | 1304 | .ctr = mirror_ctr, |
1291 | .dtr = mirror_dtr, | 1305 | .dtr = mirror_dtr, |
@@ -1295,6 +1309,7 @@ static struct target_type mirror_target = { | |||
1295 | .postsuspend = mirror_postsuspend, | 1309 | .postsuspend = mirror_postsuspend, |
1296 | .resume = mirror_resume, | 1310 | .resume = mirror_resume, |
1297 | .status = mirror_status, | 1311 | .status = mirror_status, |
1312 | .iterate_devices = mirror_iterate_devices, | ||
1298 | }; | 1313 | }; |
1299 | 1314 | ||
1300 | static int __init dm_mirror_init(void) | 1315 | static int __init dm_mirror_init(void) |
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 7b899be0b087..36dbe29f2fd6 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c | |||
@@ -283,7 +283,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region) | |||
283 | 283 | ||
284 | nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); | 284 | nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); |
285 | if (unlikely(!nreg)) | 285 | if (unlikely(!nreg)) |
286 | nreg = kmalloc(sizeof(*nreg), GFP_NOIO); | 286 | nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL); |
287 | 287 | ||
288 | nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? | 288 | nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? |
289 | DM_RH_CLEAN : DM_RH_NOSYNC; | 289 | DM_RH_CLEAN : DM_RH_NOSYNC; |
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index cdfbf65b28cb..24752f449bef 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c | |||
@@ -161,7 +161,7 @@ static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p) | |||
161 | } | 161 | } |
162 | 162 | ||
163 | static struct dm_path *rr_select_path(struct path_selector *ps, | 163 | static struct dm_path *rr_select_path(struct path_selector *ps, |
164 | unsigned *repeat_count) | 164 | unsigned *repeat_count, size_t nr_bytes) |
165 | { | 165 | { |
166 | struct selector *s = (struct selector *) ps->context; | 166 | struct selector *s = (struct selector *) ps->context; |
167 | struct path_info *pi = NULL; | 167 | struct path_info *pi = NULL; |
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c new file mode 100644 index 000000000000..cfa668f46c40 --- /dev/null +++ b/drivers/md/dm-service-time.c | |||
@@ -0,0 +1,339 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * Module Author: Kiyoshi Ueda | ||
5 | * | ||
6 | * This file is released under the GPL. | ||
7 | * | ||
8 | * Throughput oriented path selector. | ||
9 | */ | ||
10 | |||
11 | #include "dm.h" | ||
12 | #include "dm-path-selector.h" | ||
13 | |||
#define DM_MSG_PREFIX "multipath service-time"

/* Default repeat_count used by st_add_path() when none is given. */
#define ST_MIN_IO		1

/* Largest relative_throughput value accepted by st_add_path(). */
#define ST_MAX_RELATIVE_THROUGHPUT	100

/*
 * Right shift applied to the sizes in st_compare_load() when they are
 * too large to be multiplied safely.  2^7 = 128 is greater than
 * ST_MAX_RELATIVE_THROUGHPUT, so a shifted size multiplied by any valid
 * relative_throughput still fits in a size_t.
 */
#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT	7

/* Sizes at or above this value trigger the shift in st_compare_load(). */
#define ST_MAX_INFLIGHT_SIZE	((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT)

#define ST_VERSION	"0.2.0"
20 | |||
21 | struct selector { | ||
22 | struct list_head valid_paths; | ||
23 | struct list_head failed_paths; | ||
24 | }; | ||
25 | |||
26 | struct path_info { | ||
27 | struct list_head list; | ||
28 | struct dm_path *path; | ||
29 | unsigned repeat_count; | ||
30 | unsigned relative_throughput; | ||
31 | atomic_t in_flight_size; /* Total size of in-flight I/Os */ | ||
32 | }; | ||
33 | |||
34 | static struct selector *alloc_selector(void) | ||
35 | { | ||
36 | struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
37 | |||
38 | if (s) { | ||
39 | INIT_LIST_HEAD(&s->valid_paths); | ||
40 | INIT_LIST_HEAD(&s->failed_paths); | ||
41 | } | ||
42 | |||
43 | return s; | ||
44 | } | ||
45 | |||
46 | static int st_create(struct path_selector *ps, unsigned argc, char **argv) | ||
47 | { | ||
48 | struct selector *s = alloc_selector(); | ||
49 | |||
50 | if (!s) | ||
51 | return -ENOMEM; | ||
52 | |||
53 | ps->context = s; | ||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | static void free_paths(struct list_head *paths) | ||
58 | { | ||
59 | struct path_info *pi, *next; | ||
60 | |||
61 | list_for_each_entry_safe(pi, next, paths, list) { | ||
62 | list_del(&pi->list); | ||
63 | kfree(pi); | ||
64 | } | ||
65 | } | ||
66 | |||
67 | static void st_destroy(struct path_selector *ps) | ||
68 | { | ||
69 | struct selector *s = ps->context; | ||
70 | |||
71 | free_paths(&s->valid_paths); | ||
72 | free_paths(&s->failed_paths); | ||
73 | kfree(s); | ||
74 | ps->context = NULL; | ||
75 | } | ||
76 | |||
77 | static int st_status(struct path_selector *ps, struct dm_path *path, | ||
78 | status_type_t type, char *result, unsigned maxlen) | ||
79 | { | ||
80 | unsigned sz = 0; | ||
81 | struct path_info *pi; | ||
82 | |||
83 | if (!path) | ||
84 | DMEMIT("0 "); | ||
85 | else { | ||
86 | pi = path->pscontext; | ||
87 | |||
88 | switch (type) { | ||
89 | case STATUSTYPE_INFO: | ||
90 | DMEMIT("%d %u ", atomic_read(&pi->in_flight_size), | ||
91 | pi->relative_throughput); | ||
92 | break; | ||
93 | case STATUSTYPE_TABLE: | ||
94 | DMEMIT("%u %u ", pi->repeat_count, | ||
95 | pi->relative_throughput); | ||
96 | break; | ||
97 | } | ||
98 | } | ||
99 | |||
100 | return sz; | ||
101 | } | ||
102 | |||
103 | static int st_add_path(struct path_selector *ps, struct dm_path *path, | ||
104 | int argc, char **argv, char **error) | ||
105 | { | ||
106 | struct selector *s = ps->context; | ||
107 | struct path_info *pi; | ||
108 | unsigned repeat_count = ST_MIN_IO; | ||
109 | unsigned relative_throughput = 1; | ||
110 | |||
111 | /* | ||
112 | * Arguments: [<repeat_count> [<relative_throughput>]] | ||
113 | * <repeat_count>: The number of I/Os before switching path. | ||
114 | * If not given, default (ST_MIN_IO) is used. | ||
115 | * <relative_throughput>: The relative throughput value of | ||
116 | * the path among all paths in the path-group. | ||
117 | * The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT> | ||
118 | * If not given, minimum value '1' is used. | ||
119 | * If '0' is given, the path isn't selected while | ||
120 | * other paths having a positive value are | ||
121 | * available. | ||
122 | */ | ||
123 | if (argc > 2) { | ||
124 | *error = "service-time ps: incorrect number of arguments"; | ||
125 | return -EINVAL; | ||
126 | } | ||
127 | |||
128 | if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) { | ||
129 | *error = "service-time ps: invalid repeat count"; | ||
130 | return -EINVAL; | ||
131 | } | ||
132 | |||
133 | if ((argc == 2) && | ||
134 | (sscanf(argv[1], "%u", &relative_throughput) != 1 || | ||
135 | relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { | ||
136 | *error = "service-time ps: invalid relative_throughput value"; | ||
137 | return -EINVAL; | ||
138 | } | ||
139 | |||
140 | /* allocate the path */ | ||
141 | pi = kmalloc(sizeof(*pi), GFP_KERNEL); | ||
142 | if (!pi) { | ||
143 | *error = "service-time ps: Error allocating path context"; | ||
144 | return -ENOMEM; | ||
145 | } | ||
146 | |||
147 | pi->path = path; | ||
148 | pi->repeat_count = repeat_count; | ||
149 | pi->relative_throughput = relative_throughput; | ||
150 | atomic_set(&pi->in_flight_size, 0); | ||
151 | |||
152 | path->pscontext = pi; | ||
153 | |||
154 | list_add_tail(&pi->list, &s->valid_paths); | ||
155 | |||
156 | return 0; | ||
157 | } | ||
158 | |||
159 | static void st_fail_path(struct path_selector *ps, struct dm_path *path) | ||
160 | { | ||
161 | struct selector *s = ps->context; | ||
162 | struct path_info *pi = path->pscontext; | ||
163 | |||
164 | list_move(&pi->list, &s->failed_paths); | ||
165 | } | ||
166 | |||
167 | static int st_reinstate_path(struct path_selector *ps, struct dm_path *path) | ||
168 | { | ||
169 | struct selector *s = ps->context; | ||
170 | struct path_info *pi = path->pscontext; | ||
171 | |||
172 | list_move_tail(&pi->list, &s->valid_paths); | ||
173 | |||
174 | return 0; | ||
175 | } | ||
176 | |||
177 | /* | ||
178 | * Compare the estimated service time of 2 paths, pi1 and pi2, | ||
179 | * for the incoming I/O. | ||
180 | * | ||
181 | * Returns: | ||
182 | * < 0 : pi1 is better | ||
183 | * 0 : no difference between pi1 and pi2 | ||
184 | * > 0 : pi2 is better | ||
185 | * | ||
186 | * Description: | ||
187 | * Basically, the service time is estimated by: | ||
188 | * ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput' | ||
189 | * To reduce the calculation, some optimizations are made. | ||
190 | * (See comments inline) | ||
191 | */ | ||
192 | static int st_compare_load(struct path_info *pi1, struct path_info *pi2, | ||
193 | size_t incoming) | ||
194 | { | ||
195 | size_t sz1, sz2, st1, st2; | ||
196 | |||
197 | sz1 = atomic_read(&pi1->in_flight_size); | ||
198 | sz2 = atomic_read(&pi2->in_flight_size); | ||
199 | |||
200 | /* | ||
201 | * Case 1: Both have same throughput value. Choose less loaded path. | ||
202 | */ | ||
203 | if (pi1->relative_throughput == pi2->relative_throughput) | ||
204 | return sz1 - sz2; | ||
205 | |||
206 | /* | ||
207 | * Case 2a: Both have same load. Choose higher throughput path. | ||
208 | * Case 2b: One path has no throughput value. Choose the other one. | ||
209 | */ | ||
210 | if (sz1 == sz2 || | ||
211 | !pi1->relative_throughput || !pi2->relative_throughput) | ||
212 | return pi2->relative_throughput - pi1->relative_throughput; | ||
213 | |||
214 | /* | ||
215 | * Case 3: Calculate service time. Choose faster path. | ||
216 | * Service time using pi1: | ||
217 | * st1 = (sz1 + incoming) / pi1->relative_throughput | ||
218 | * Service time using pi2: | ||
219 | * st2 = (sz2 + incoming) / pi2->relative_throughput | ||
220 | * | ||
221 | * To avoid the division, transform the expression to use | ||
222 | * multiplication. | ||
223 | * Because ->relative_throughput > 0 here, if st1 < st2, | ||
224 | * the expressions below are the same meaning: | ||
225 | * (sz1 + incoming) / pi1->relative_throughput < | ||
226 | * (sz2 + incoming) / pi2->relative_throughput | ||
227 | * (sz1 + incoming) * pi2->relative_throughput < | ||
228 | * (sz2 + incoming) * pi1->relative_throughput | ||
229 | * So use the later one. | ||
230 | */ | ||
231 | sz1 += incoming; | ||
232 | sz2 += incoming; | ||
233 | if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE || | ||
234 | sz2 >= ST_MAX_INFLIGHT_SIZE)) { | ||
235 | /* | ||
236 | * Size may be too big for multiplying pi->relative_throughput | ||
237 | * and overflow. | ||
238 | * To avoid the overflow and mis-selection, shift down both. | ||
239 | */ | ||
240 | sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; | ||
241 | sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; | ||
242 | } | ||
243 | st1 = sz1 * pi2->relative_throughput; | ||
244 | st2 = sz2 * pi1->relative_throughput; | ||
245 | if (st1 != st2) | ||
246 | return st1 - st2; | ||
247 | |||
248 | /* | ||
249 | * Case 4: Service time is equal. Choose higher throughput path. | ||
250 | */ | ||
251 | return pi2->relative_throughput - pi1->relative_throughput; | ||
252 | } | ||
253 | |||
254 | static struct dm_path *st_select_path(struct path_selector *ps, | ||
255 | unsigned *repeat_count, size_t nr_bytes) | ||
256 | { | ||
257 | struct selector *s = ps->context; | ||
258 | struct path_info *pi = NULL, *best = NULL; | ||
259 | |||
260 | if (list_empty(&s->valid_paths)) | ||
261 | return NULL; | ||
262 | |||
263 | /* Change preferred (first in list) path to evenly balance. */ | ||
264 | list_move_tail(s->valid_paths.next, &s->valid_paths); | ||
265 | |||
266 | list_for_each_entry(pi, &s->valid_paths, list) | ||
267 | if (!best || (st_compare_load(pi, best, nr_bytes) < 0)) | ||
268 | best = pi; | ||
269 | |||
270 | if (!best) | ||
271 | return NULL; | ||
272 | |||
273 | *repeat_count = best->repeat_count; | ||
274 | |||
275 | return best->path; | ||
276 | } | ||
277 | |||
278 | static int st_start_io(struct path_selector *ps, struct dm_path *path, | ||
279 | size_t nr_bytes) | ||
280 | { | ||
281 | struct path_info *pi = path->pscontext; | ||
282 | |||
283 | atomic_add(nr_bytes, &pi->in_flight_size); | ||
284 | |||
285 | return 0; | ||
286 | } | ||
287 | |||
288 | static int st_end_io(struct path_selector *ps, struct dm_path *path, | ||
289 | size_t nr_bytes) | ||
290 | { | ||
291 | struct path_info *pi = path->pscontext; | ||
292 | |||
293 | atomic_sub(nr_bytes, &pi->in_flight_size); | ||
294 | |||
295 | return 0; | ||
296 | } | ||
297 | |||
298 | static struct path_selector_type st_ps = { | ||
299 | .name = "service-time", | ||
300 | .module = THIS_MODULE, | ||
301 | .table_args = 2, | ||
302 | .info_args = 2, | ||
303 | .create = st_create, | ||
304 | .destroy = st_destroy, | ||
305 | .status = st_status, | ||
306 | .add_path = st_add_path, | ||
307 | .fail_path = st_fail_path, | ||
308 | .reinstate_path = st_reinstate_path, | ||
309 | .select_path = st_select_path, | ||
310 | .start_io = st_start_io, | ||
311 | .end_io = st_end_io, | ||
312 | }; | ||
313 | |||
314 | static int __init dm_st_init(void) | ||
315 | { | ||
316 | int r = dm_register_path_selector(&st_ps); | ||
317 | |||
318 | if (r < 0) | ||
319 | DMERR("register failed %d", r); | ||
320 | |||
321 | DMINFO("version " ST_VERSION " loaded"); | ||
322 | |||
323 | return r; | ||
324 | } | ||
325 | |||
326 | static void __exit dm_st_exit(void) | ||
327 | { | ||
328 | int r = dm_unregister_path_selector(&st_ps); | ||
329 | |||
330 | if (r < 0) | ||
331 | DMERR("unregister failed %d", r); | ||
332 | } | ||
333 | |||
334 | module_init(dm_st_init); | ||
335 | module_exit(dm_st_exit); | ||
336 | |||
337 | MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector"); | ||
338 | MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>"); | ||
339 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index e75c6dd76a9a..6e3fe4f14934 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c | |||
@@ -282,7 +282,7 @@ static int read_header(struct pstore *ps, int *new_snapshot) | |||
282 | */ | 282 | */ |
283 | if (!ps->store->chunk_size) { | 283 | if (!ps->store->chunk_size) { |
284 | ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, | 284 | ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, |
285 | bdev_hardsect_size(ps->store->cow->bdev) >> 9); | 285 | bdev_logical_block_size(ps->store->cow->bdev) >> 9); |
286 | ps->store->chunk_mask = ps->store->chunk_size - 1; | 286 | ps->store->chunk_mask = ps->store->chunk_size - 1; |
287 | ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; | 287 | ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; |
288 | chunk_size_supplied = 0; | 288 | chunk_size_supplied = 0; |
@@ -636,7 +636,7 @@ static void persistent_commit_exception(struct dm_exception_store *store, | |||
636 | /* | 636 | /* |
637 | * Commit exceptions to disk. | 637 | * Commit exceptions to disk. |
638 | */ | 638 | */ |
639 | if (ps->valid && area_io(ps, WRITE)) | 639 | if (ps->valid && area_io(ps, WRITE_BARRIER)) |
640 | ps->valid = 0; | 640 | ps->valid = 0; |
641 | 641 | ||
642 | /* | 642 | /* |
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index d73f17fc7778..d573165cd2b7 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c | |||
@@ -678,6 +678,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
678 | 678 | ||
679 | ti->private = s; | 679 | ti->private = s; |
680 | ti->split_io = s->store->chunk_size; | 680 | ti->split_io = s->store->chunk_size; |
681 | ti->num_flush_requests = 1; | ||
681 | 682 | ||
682 | return 0; | 683 | return 0; |
683 | 684 | ||
@@ -1030,6 +1031,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, | |||
1030 | chunk_t chunk; | 1031 | chunk_t chunk; |
1031 | struct dm_snap_pending_exception *pe = NULL; | 1032 | struct dm_snap_pending_exception *pe = NULL; |
1032 | 1033 | ||
1034 | if (unlikely(bio_empty_barrier(bio))) { | ||
1035 | bio->bi_bdev = s->store->cow->bdev; | ||
1036 | return DM_MAPIO_REMAPPED; | ||
1037 | } | ||
1038 | |||
1033 | chunk = sector_to_chunk(s->store, bio->bi_sector); | 1039 | chunk = sector_to_chunk(s->store, bio->bi_sector); |
1034 | 1040 | ||
1035 | /* Full snapshots are not usable */ | 1041 | /* Full snapshots are not usable */ |
@@ -1338,6 +1344,8 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1338 | } | 1344 | } |
1339 | 1345 | ||
1340 | ti->private = dev; | 1346 | ti->private = dev; |
1347 | ti->num_flush_requests = 1; | ||
1348 | |||
1341 | return 0; | 1349 | return 0; |
1342 | } | 1350 | } |
1343 | 1351 | ||
@@ -1353,6 +1361,9 @@ static int origin_map(struct dm_target *ti, struct bio *bio, | |||
1353 | struct dm_dev *dev = ti->private; | 1361 | struct dm_dev *dev = ti->private; |
1354 | bio->bi_bdev = dev->bdev; | 1362 | bio->bi_bdev = dev->bdev; |
1355 | 1363 | ||
1364 | if (unlikely(bio_empty_barrier(bio))) | ||
1365 | return DM_MAPIO_REMAPPED; | ||
1366 | |||
1356 | /* Only tell snapshots if this is a write */ | 1367 | /* Only tell snapshots if this is a write */ |
1357 | return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; | 1368 | return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; |
1358 | } | 1369 | } |
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 41569bc60abc..b240e85ae39a 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c | |||
@@ -167,6 +167,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
167 | sc->stripes = stripes; | 167 | sc->stripes = stripes; |
168 | sc->stripe_width = width; | 168 | sc->stripe_width = width; |
169 | ti->split_io = chunk_size; | 169 | ti->split_io = chunk_size; |
170 | ti->num_flush_requests = stripes; | ||
170 | 171 | ||
171 | sc->chunk_mask = ((sector_t) chunk_size) - 1; | 172 | sc->chunk_mask = ((sector_t) chunk_size) - 1; |
172 | for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) | 173 | for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) |
@@ -211,10 +212,18 @@ static int stripe_map(struct dm_target *ti, struct bio *bio, | |||
211 | union map_info *map_context) | 212 | union map_info *map_context) |
212 | { | 213 | { |
213 | struct stripe_c *sc = (struct stripe_c *) ti->private; | 214 | struct stripe_c *sc = (struct stripe_c *) ti->private; |
215 | sector_t offset, chunk; | ||
216 | uint32_t stripe; | ||
214 | 217 | ||
215 | sector_t offset = bio->bi_sector - ti->begin; | 218 | if (unlikely(bio_empty_barrier(bio))) { |
216 | sector_t chunk = offset >> sc->chunk_shift; | 219 | BUG_ON(map_context->flush_request >= sc->stripes); |
217 | uint32_t stripe = sector_div(chunk, sc->stripes); | 220 | bio->bi_bdev = sc->stripe[map_context->flush_request].dev->bdev; |
221 | return DM_MAPIO_REMAPPED; | ||
222 | } | ||
223 | |||
224 | offset = bio->bi_sector - ti->begin; | ||
225 | chunk = offset >> sc->chunk_shift; | ||
226 | stripe = sector_div(chunk, sc->stripes); | ||
218 | 227 | ||
219 | bio->bi_bdev = sc->stripe[stripe].dev->bdev; | 228 | bio->bi_bdev = sc->stripe[stripe].dev->bdev; |
220 | bio->bi_sector = sc->stripe[stripe].physical_start + | 229 | bio->bi_sector = sc->stripe[stripe].physical_start + |
@@ -304,15 +313,31 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, | |||
304 | return error; | 313 | return error; |
305 | } | 314 | } |
306 | 315 | ||
316 | static int stripe_iterate_devices(struct dm_target *ti, | ||
317 | iterate_devices_callout_fn fn, void *data) | ||
318 | { | ||
319 | struct stripe_c *sc = ti->private; | ||
320 | int ret = 0; | ||
321 | unsigned i = 0; | ||
322 | |||
323 | do | ||
324 | ret = fn(ti, sc->stripe[i].dev, | ||
325 | sc->stripe[i].physical_start, data); | ||
326 | while (!ret && ++i < sc->stripes); | ||
327 | |||
328 | return ret; | ||
329 | } | ||
330 | |||
307 | static struct target_type stripe_target = { | 331 | static struct target_type stripe_target = { |
308 | .name = "striped", | 332 | .name = "striped", |
309 | .version = {1, 1, 0}, | 333 | .version = {1, 2, 0}, |
310 | .module = THIS_MODULE, | 334 | .module = THIS_MODULE, |
311 | .ctr = stripe_ctr, | 335 | .ctr = stripe_ctr, |
312 | .dtr = stripe_dtr, | 336 | .dtr = stripe_dtr, |
313 | .map = stripe_map, | 337 | .map = stripe_map, |
314 | .end_io = stripe_end_io, | 338 | .end_io = stripe_end_io, |
315 | .status = stripe_status, | 339 | .status = stripe_status, |
340 | .iterate_devices = stripe_iterate_devices, | ||
316 | }; | 341 | }; |
317 | 342 | ||
318 | int __init dm_stripe_init(void) | 343 | int __init dm_stripe_init(void) |
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index a2a45e6c7c8b..4b045903a4e2 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c | |||
@@ -57,12 +57,21 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf) | |||
57 | return strlen(buf); | 57 | return strlen(buf); |
58 | } | 58 | } |
59 | 59 | ||
60 | static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) | ||
61 | { | ||
62 | sprintf(buf, "%d\n", dm_suspended(md)); | ||
63 | |||
64 | return strlen(buf); | ||
65 | } | ||
66 | |||
60 | static DM_ATTR_RO(name); | 67 | static DM_ATTR_RO(name); |
61 | static DM_ATTR_RO(uuid); | 68 | static DM_ATTR_RO(uuid); |
69 | static DM_ATTR_RO(suspended); | ||
62 | 70 | ||
63 | static struct attribute *dm_attrs[] = { | 71 | static struct attribute *dm_attrs[] = { |
64 | &dm_attr_name.attr, | 72 | &dm_attr_name.attr, |
65 | &dm_attr_uuid.attr, | 73 | &dm_attr_uuid.attr, |
74 | &dm_attr_suspended.attr, | ||
66 | NULL, | 75 | NULL, |
67 | }; | 76 | }; |
68 | 77 | ||
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 429b50b975d5..4899ebe767c8 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
@@ -41,6 +41,7 @@ | |||
41 | struct dm_table { | 41 | struct dm_table { |
42 | struct mapped_device *md; | 42 | struct mapped_device *md; |
43 | atomic_t holders; | 43 | atomic_t holders; |
44 | unsigned type; | ||
44 | 45 | ||
45 | /* btree table */ | 46 | /* btree table */ |
46 | unsigned int depth; | 47 | unsigned int depth; |
@@ -62,15 +63,11 @@ struct dm_table { | |||
62 | /* a list of devices used by this table */ | 63 | /* a list of devices used by this table */ |
63 | struct list_head devices; | 64 | struct list_head devices; |
64 | 65 | ||
65 | /* | ||
66 | * These are optimistic limits taken from all the | ||
67 | * targets, some targets will need smaller limits. | ||
68 | */ | ||
69 | struct io_restrictions limits; | ||
70 | |||
71 | /* events get handed up using this callback */ | 66 | /* events get handed up using this callback */ |
72 | void (*event_fn)(void *); | 67 | void (*event_fn)(void *); |
73 | void *event_context; | 68 | void *event_context; |
69 | |||
70 | struct dm_md_mempools *mempools; | ||
74 | }; | 71 | }; |
75 | 72 | ||
76 | /* | 73 | /* |
@@ -89,42 +86,6 @@ static unsigned int int_log(unsigned int n, unsigned int base) | |||
89 | } | 86 | } |
90 | 87 | ||
91 | /* | 88 | /* |
92 | * Returns the minimum that is _not_ zero, unless both are zero. | ||
93 | */ | ||
94 | #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | ||
95 | |||
96 | /* | ||
97 | * Combine two io_restrictions, always taking the lower value. | ||
98 | */ | ||
99 | static void combine_restrictions_low(struct io_restrictions *lhs, | ||
100 | struct io_restrictions *rhs) | ||
101 | { | ||
102 | lhs->max_sectors = | ||
103 | min_not_zero(lhs->max_sectors, rhs->max_sectors); | ||
104 | |||
105 | lhs->max_phys_segments = | ||
106 | min_not_zero(lhs->max_phys_segments, rhs->max_phys_segments); | ||
107 | |||
108 | lhs->max_hw_segments = | ||
109 | min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments); | ||
110 | |||
111 | lhs->hardsect_size = max(lhs->hardsect_size, rhs->hardsect_size); | ||
112 | |||
113 | lhs->max_segment_size = | ||
114 | min_not_zero(lhs->max_segment_size, rhs->max_segment_size); | ||
115 | |||
116 | lhs->max_hw_sectors = | ||
117 | min_not_zero(lhs->max_hw_sectors, rhs->max_hw_sectors); | ||
118 | |||
119 | lhs->seg_boundary_mask = | ||
120 | min_not_zero(lhs->seg_boundary_mask, rhs->seg_boundary_mask); | ||
121 | |||
122 | lhs->bounce_pfn = min_not_zero(lhs->bounce_pfn, rhs->bounce_pfn); | ||
123 | |||
124 | lhs->no_cluster |= rhs->no_cluster; | ||
125 | } | ||
126 | |||
127 | /* | ||
128 | * Calculate the index of the child node of the n'th node k'th key. | 89 | * Calculate the index of the child node of the n'th node k'th key. |
129 | */ | 90 | */ |
130 | static inline unsigned int get_child(unsigned int n, unsigned int k) | 91 | static inline unsigned int get_child(unsigned int n, unsigned int k) |
@@ -266,6 +227,8 @@ static void free_devices(struct list_head *devices) | |||
266 | list_for_each_safe(tmp, next, devices) { | 227 | list_for_each_safe(tmp, next, devices) { |
267 | struct dm_dev_internal *dd = | 228 | struct dm_dev_internal *dd = |
268 | list_entry(tmp, struct dm_dev_internal, list); | 229 | list_entry(tmp, struct dm_dev_internal, list); |
230 | DMWARN("dm_table_destroy: dm_put_device call missing for %s", | ||
231 | dd->dm_dev.name); | ||
269 | kfree(dd); | 232 | kfree(dd); |
270 | } | 233 | } |
271 | } | 234 | } |
@@ -295,12 +258,10 @@ void dm_table_destroy(struct dm_table *t) | |||
295 | vfree(t->highs); | 258 | vfree(t->highs); |
296 | 259 | ||
297 | /* free the device list */ | 260 | /* free the device list */ |
298 | if (t->devices.next != &t->devices) { | 261 | if (t->devices.next != &t->devices) |
299 | DMWARN("devices still present during destroy: " | ||
300 | "dm_table_remove_device calls missing"); | ||
301 | |||
302 | free_devices(&t->devices); | 262 | free_devices(&t->devices); |
303 | } | 263 | |
264 | dm_free_md_mempools(t->mempools); | ||
304 | 265 | ||
305 | kfree(t); | 266 | kfree(t); |
306 | } | 267 | } |
@@ -384,15 +345,48 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) | |||
384 | /* | 345 | /* |
385 | * If possible, this checks an area of a destination device is valid. | 346 | * If possible, this checks an area of a destination device is valid. |
386 | */ | 347 | */ |
387 | static int check_device_area(struct dm_dev_internal *dd, sector_t start, | 348 | static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev, |
388 | sector_t len) | 349 | sector_t start, void *data) |
389 | { | 350 | { |
390 | sector_t dev_size = dd->dm_dev.bdev->bd_inode->i_size >> SECTOR_SHIFT; | 351 | struct queue_limits *limits = data; |
352 | struct block_device *bdev = dev->bdev; | ||
353 | sector_t dev_size = | ||
354 | i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; | ||
355 | unsigned short logical_block_size_sectors = | ||
356 | limits->logical_block_size >> SECTOR_SHIFT; | ||
357 | char b[BDEVNAME_SIZE]; | ||
391 | 358 | ||
392 | if (!dev_size) | 359 | if (!dev_size) |
393 | return 1; | 360 | return 1; |
394 | 361 | ||
395 | return ((start < dev_size) && (len <= (dev_size - start))); | 362 | if ((start >= dev_size) || (start + ti->len > dev_size)) { |
363 | DMWARN("%s: %s too small for target", | ||
364 | dm_device_name(ti->table->md), bdevname(bdev, b)); | ||
365 | return 0; | ||
366 | } | ||
367 | |||
368 | if (logical_block_size_sectors <= 1) | ||
369 | return 1; | ||
370 | |||
371 | if (start & (logical_block_size_sectors - 1)) { | ||
372 | DMWARN("%s: start=%llu not aligned to h/w " | ||
373 | "logical block size %hu of %s", | ||
374 | dm_device_name(ti->table->md), | ||
375 | (unsigned long long)start, | ||
376 | limits->logical_block_size, bdevname(bdev, b)); | ||
377 | return 0; | ||
378 | } | ||
379 | |||
380 | if (ti->len & (logical_block_size_sectors - 1)) { | ||
381 | DMWARN("%s: len=%llu not aligned to h/w " | ||
382 | "logical block size %hu of %s", | ||
383 | dm_device_name(ti->table->md), | ||
384 | (unsigned long long)ti->len, | ||
385 | limits->logical_block_size, bdevname(bdev, b)); | ||
386 | return 0; | ||
387 | } | ||
388 | |||
389 | return 1; | ||
396 | } | 390 | } |
397 | 391 | ||
398 | /* | 392 | /* |
@@ -478,38 +472,32 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, | |||
478 | } | 472 | } |
479 | atomic_inc(&dd->count); | 473 | atomic_inc(&dd->count); |
480 | 474 | ||
481 | if (!check_device_area(dd, start, len)) { | ||
482 | DMWARN("device %s too small for target", path); | ||
483 | dm_put_device(ti, &dd->dm_dev); | ||
484 | return -EINVAL; | ||
485 | } | ||
486 | |||
487 | *result = &dd->dm_dev; | 475 | *result = &dd->dm_dev; |
488 | |||
489 | return 0; | 476 | return 0; |
490 | } | 477 | } |
491 | 478 | ||
492 | void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) | 479 | /* |
480 | * Returns the minimum that is _not_ zero, unless both are zero. | ||
481 | */ | ||
482 | #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | ||
483 | |||
484 | int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, | ||
485 | sector_t start, void *data) | ||
493 | { | 486 | { |
487 | struct queue_limits *limits = data; | ||
488 | struct block_device *bdev = dev->bdev; | ||
494 | struct request_queue *q = bdev_get_queue(bdev); | 489 | struct request_queue *q = bdev_get_queue(bdev); |
495 | struct io_restrictions *rs = &ti->limits; | ||
496 | char b[BDEVNAME_SIZE]; | 490 | char b[BDEVNAME_SIZE]; |
497 | 491 | ||
498 | if (unlikely(!q)) { | 492 | if (unlikely(!q)) { |
499 | DMWARN("%s: Cannot set limits for nonexistent device %s", | 493 | DMWARN("%s: Cannot set limits for nonexistent device %s", |
500 | dm_device_name(ti->table->md), bdevname(bdev, b)); | 494 | dm_device_name(ti->table->md), bdevname(bdev, b)); |
501 | return; | 495 | return 0; |
502 | } | 496 | } |
503 | 497 | ||
504 | /* | 498 | if (blk_stack_limits(limits, &q->limits, start) < 0) |
505 | * Combine the device limits low. | 499 | DMWARN("%s: target device %s is misaligned", |
506 | * | 500 | dm_device_name(ti->table->md), bdevname(bdev, b)); |
507 | * FIXME: if we move an io_restriction struct | ||
508 | * into q this would just be a call to | ||
509 | * combine_restrictions_low() | ||
510 | */ | ||
511 | rs->max_sectors = | ||
512 | min_not_zero(rs->max_sectors, q->max_sectors); | ||
513 | 501 | ||
514 | /* | 502 | /* |
515 | * Check if merge fn is supported. | 503 | * Check if merge fn is supported. |
@@ -518,47 +506,21 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) | |||
518 | */ | 506 | */ |
519 | 507 | ||
520 | if (q->merge_bvec_fn && !ti->type->merge) | 508 | if (q->merge_bvec_fn && !ti->type->merge) |
521 | rs->max_sectors = | 509 | limits->max_sectors = |
522 | min_not_zero(rs->max_sectors, | 510 | min_not_zero(limits->max_sectors, |
523 | (unsigned int) (PAGE_SIZE >> 9)); | 511 | (unsigned int) (PAGE_SIZE >> 9)); |
524 | 512 | return 0; | |
525 | rs->max_phys_segments = | ||
526 | min_not_zero(rs->max_phys_segments, | ||
527 | q->max_phys_segments); | ||
528 | |||
529 | rs->max_hw_segments = | ||
530 | min_not_zero(rs->max_hw_segments, q->max_hw_segments); | ||
531 | |||
532 | rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size); | ||
533 | |||
534 | rs->max_segment_size = | ||
535 | min_not_zero(rs->max_segment_size, q->max_segment_size); | ||
536 | |||
537 | rs->max_hw_sectors = | ||
538 | min_not_zero(rs->max_hw_sectors, q->max_hw_sectors); | ||
539 | |||
540 | rs->seg_boundary_mask = | ||
541 | min_not_zero(rs->seg_boundary_mask, | ||
542 | q->seg_boundary_mask); | ||
543 | |||
544 | rs->bounce_pfn = min_not_zero(rs->bounce_pfn, q->bounce_pfn); | ||
545 | |||
546 | rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); | ||
547 | } | 513 | } |
548 | EXPORT_SYMBOL_GPL(dm_set_device_limits); | 514 | EXPORT_SYMBOL_GPL(dm_set_device_limits); |
549 | 515 | ||
550 | int dm_get_device(struct dm_target *ti, const char *path, sector_t start, | 516 | int dm_get_device(struct dm_target *ti, const char *path, sector_t start, |
551 | sector_t len, fmode_t mode, struct dm_dev **result) | 517 | sector_t len, fmode_t mode, struct dm_dev **result) |
552 | { | 518 | { |
553 | int r = __table_get_device(ti->table, ti, path, | 519 | return __table_get_device(ti->table, ti, path, |
554 | start, len, mode, result); | 520 | start, len, mode, result); |
555 | |||
556 | if (!r) | ||
557 | dm_set_device_limits(ti, (*result)->bdev); | ||
558 | |||
559 | return r; | ||
560 | } | 521 | } |
561 | 522 | ||
523 | |||
562 | /* | 524 | /* |
563 | * Decrement a devices use count and remove it if necessary. | 525 | * Decrement a devices use count and remove it if necessary. |
564 | */ | 526 | */ |
@@ -673,24 +635,78 @@ int dm_split_args(int *argc, char ***argvp, char *input) | |||
673 | return 0; | 635 | return 0; |
674 | } | 636 | } |
675 | 637 | ||
676 | static void check_for_valid_limits(struct io_restrictions *rs) | 638 | /* |
639 | * Impose necessary and sufficient conditions on a devices's table such | ||
640 | * that any incoming bio which respects its logical_block_size can be | ||
641 | * processed successfully. If it falls across the boundary between | ||
642 | * two or more targets, the size of each piece it gets split into must | ||
643 | * be compatible with the logical_block_size of the target processing it. | ||
644 | */ | ||
645 | static int validate_hardware_logical_block_alignment(struct dm_table *table, | ||
646 | struct queue_limits *limits) | ||
677 | { | 647 | { |
678 | if (!rs->max_sectors) | 648 | /* |
679 | rs->max_sectors = SAFE_MAX_SECTORS; | 649 | * This function uses arithmetic modulo the logical_block_size |
680 | if (!rs->max_hw_sectors) | 650 | * (in units of 512-byte sectors). |
681 | rs->max_hw_sectors = SAFE_MAX_SECTORS; | 651 | */ |
682 | if (!rs->max_phys_segments) | 652 | unsigned short device_logical_block_size_sects = |
683 | rs->max_phys_segments = MAX_PHYS_SEGMENTS; | 653 | limits->logical_block_size >> SECTOR_SHIFT; |
684 | if (!rs->max_hw_segments) | 654 | |
685 | rs->max_hw_segments = MAX_HW_SEGMENTS; | 655 | /* |
686 | if (!rs->hardsect_size) | 656 | * Offset of the start of the next table entry, mod logical_block_size. |
687 | rs->hardsect_size = 1 << SECTOR_SHIFT; | 657 | */ |
688 | if (!rs->max_segment_size) | 658 | unsigned short next_target_start = 0; |
689 | rs->max_segment_size = MAX_SEGMENT_SIZE; | 659 | |
690 | if (!rs->seg_boundary_mask) | 660 | /* |
691 | rs->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; | 661 | * Given an aligned bio that extends beyond the end of a |
692 | if (!rs->bounce_pfn) | 662 | * target, how many sectors must the next target handle? |
693 | rs->bounce_pfn = -1; | 663 | */ |
664 | unsigned short remaining = 0; | ||
665 | |||
666 | struct dm_target *uninitialized_var(ti); | ||
667 | struct queue_limits ti_limits; | ||
668 | unsigned i = 0; | ||
669 | |||
670 | /* | ||
671 | * Check each entry in the table in turn. | ||
672 | */ | ||
673 | while (i < dm_table_get_num_targets(table)) { | ||
674 | ti = dm_table_get_target(table, i++); | ||
675 | |||
676 | blk_set_default_limits(&ti_limits); | ||
677 | |||
678 | /* combine all target devices' limits */ | ||
679 | if (ti->type->iterate_devices) | ||
680 | ti->type->iterate_devices(ti, dm_set_device_limits, | ||
681 | &ti_limits); | ||
682 | |||
683 | /* | ||
684 | * If the remaining sectors fall entirely within this | ||
685 | * table entry are they compatible with its logical_block_size? | ||
686 | */ | ||
687 | if (remaining < ti->len && | ||
688 | remaining & ((ti_limits.logical_block_size >> | ||
689 | SECTOR_SHIFT) - 1)) | ||
690 | break; /* Error */ | ||
691 | |||
692 | next_target_start = | ||
693 | (unsigned short) ((next_target_start + ti->len) & | ||
694 | (device_logical_block_size_sects - 1)); | ||
695 | remaining = next_target_start ? | ||
696 | device_logical_block_size_sects - next_target_start : 0; | ||
697 | } | ||
698 | |||
699 | if (remaining) { | ||
700 | DMWARN("%s: table line %u (start sect %llu len %llu) " | ||
701 | "not aligned to h/w logical block size %hu", | ||
702 | dm_device_name(table->md), i, | ||
703 | (unsigned long long) ti->begin, | ||
704 | (unsigned long long) ti->len, | ||
705 | limits->logical_block_size); | ||
706 | return -EINVAL; | ||
707 | } | ||
708 | |||
709 | return 0; | ||
694 | } | 710 | } |
695 | 711 | ||
696 | int dm_table_add_target(struct dm_table *t, const char *type, | 712 | int dm_table_add_target(struct dm_table *t, const char *type, |
@@ -745,9 +761,6 @@ int dm_table_add_target(struct dm_table *t, const char *type, | |||
745 | 761 | ||
746 | t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; | 762 | t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; |
747 | 763 | ||
748 | /* FIXME: the plan is to combine high here and then have | ||
749 | * the merge fn apply the target level restrictions. */ | ||
750 | combine_restrictions_low(&t->limits, &tgt->limits); | ||
751 | return 0; | 764 | return 0; |
752 | 765 | ||
753 | bad: | 766 | bad: |
@@ -756,6 +769,104 @@ int dm_table_add_target(struct dm_table *t, const char *type, | |||
756 | return r; | 769 | return r; |
757 | } | 770 | } |
758 | 771 | ||
772 | int dm_table_set_type(struct dm_table *t) | ||
773 | { | ||
774 | unsigned i; | ||
775 | unsigned bio_based = 0, request_based = 0; | ||
776 | struct dm_target *tgt; | ||
777 | struct dm_dev_internal *dd; | ||
778 | struct list_head *devices; | ||
779 | |||
780 | for (i = 0; i < t->num_targets; i++) { | ||
781 | tgt = t->targets + i; | ||
782 | if (dm_target_request_based(tgt)) | ||
783 | request_based = 1; | ||
784 | else | ||
785 | bio_based = 1; | ||
786 | |||
787 | if (bio_based && request_based) { | ||
788 | DMWARN("Inconsistent table: different target types" | ||
789 | " can't be mixed up"); | ||
790 | return -EINVAL; | ||
791 | } | ||
792 | } | ||
793 | |||
794 | if (bio_based) { | ||
795 | /* We must use this table as bio-based */ | ||
796 | t->type = DM_TYPE_BIO_BASED; | ||
797 | return 0; | ||
798 | } | ||
799 | |||
800 | BUG_ON(!request_based); /* No targets in this table */ | ||
801 | |||
802 | /* Non-request-stackable devices can't be used for request-based dm */ | ||
803 | devices = dm_table_get_devices(t); | ||
804 | list_for_each_entry(dd, devices, list) { | ||
805 | if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev.bdev))) { | ||
806 | DMWARN("table load rejected: including" | ||
807 | " non-request-stackable devices"); | ||
808 | return -EINVAL; | ||
809 | } | ||
810 | } | ||
811 | |||
812 | /* | ||
813 | * Request-based dm supports only tables that have a single target now. | ||
814 | * To support multiple targets, request splitting support is needed, | ||
815 | * and that needs lots of changes in the block-layer. | ||
816 | * (e.g. request completion process for partial completion.) | ||
817 | */ | ||
818 | if (t->num_targets > 1) { | ||
819 | DMWARN("Request-based dm doesn't support multiple targets yet"); | ||
820 | return -EINVAL; | ||
821 | } | ||
822 | |||
823 | t->type = DM_TYPE_REQUEST_BASED; | ||
824 | |||
825 | return 0; | ||
826 | } | ||
827 | |||
828 | unsigned dm_table_get_type(struct dm_table *t) | ||
829 | { | ||
830 | return t->type; | ||
831 | } | ||
832 | |||
833 | bool dm_table_bio_based(struct dm_table *t) | ||
834 | { | ||
835 | return dm_table_get_type(t) == DM_TYPE_BIO_BASED; | ||
836 | } | ||
837 | |||
838 | bool dm_table_request_based(struct dm_table *t) | ||
839 | { | ||
840 | return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED; | ||
841 | } | ||
842 | |||
843 | int dm_table_alloc_md_mempools(struct dm_table *t) | ||
844 | { | ||
845 | unsigned type = dm_table_get_type(t); | ||
846 | |||
847 | if (unlikely(type == DM_TYPE_NONE)) { | ||
848 | DMWARN("no table type is set, can't allocate mempools"); | ||
849 | return -EINVAL; | ||
850 | } | ||
851 | |||
852 | t->mempools = dm_alloc_md_mempools(type); | ||
853 | if (!t->mempools) | ||
854 | return -ENOMEM; | ||
855 | |||
856 | return 0; | ||
857 | } | ||
858 | |||
859 | void dm_table_free_md_mempools(struct dm_table *t) | ||
860 | { | ||
861 | dm_free_md_mempools(t->mempools); | ||
862 | t->mempools = NULL; | ||
863 | } | ||
864 | |||
865 | struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t) | ||
866 | { | ||
867 | return t->mempools; | ||
868 | } | ||
869 | |||
759 | static int setup_indexes(struct dm_table *t) | 870 | static int setup_indexes(struct dm_table *t) |
760 | { | 871 | { |
761 | int i; | 872 | int i; |
@@ -790,8 +901,6 @@ int dm_table_complete(struct dm_table *t) | |||
790 | int r = 0; | 901 | int r = 0; |
791 | unsigned int leaf_nodes; | 902 | unsigned int leaf_nodes; |
792 | 903 | ||
793 | check_for_valid_limits(&t->limits); | ||
794 | |||
795 | /* how many indexes will the btree have ? */ | 904 | /* how many indexes will the btree have ? */ |
796 | leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); | 905 | leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); |
797 | t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); | 906 | t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); |
@@ -867,6 +976,57 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) | |||
867 | } | 976 | } |
868 | 977 | ||
869 | /* | 978 | /* |
979 | * Establish the new table's queue_limits and validate them. | ||
980 | */ | ||
981 | int dm_calculate_queue_limits(struct dm_table *table, | ||
982 | struct queue_limits *limits) | ||
983 | { | ||
984 | struct dm_target *uninitialized_var(ti); | ||
985 | struct queue_limits ti_limits; | ||
986 | unsigned i = 0; | ||
987 | |||
988 | blk_set_default_limits(limits); | ||
989 | |||
990 | while (i < dm_table_get_num_targets(table)) { | ||
991 | blk_set_default_limits(&ti_limits); | ||
992 | |||
993 | ti = dm_table_get_target(table, i++); | ||
994 | |||
995 | if (!ti->type->iterate_devices) | ||
996 | goto combine_limits; | ||
997 | |||
998 | /* | ||
999 | * Combine queue limits of all the devices this target uses. | ||
1000 | */ | ||
1001 | ti->type->iterate_devices(ti, dm_set_device_limits, | ||
1002 | &ti_limits); | ||
1003 | |||
1004 | /* | ||
1005 | * Check each device area is consistent with the target's | ||
1006 | * overall queue limits. | ||
1007 | */ | ||
1008 | if (!ti->type->iterate_devices(ti, device_area_is_valid, | ||
1009 | &ti_limits)) | ||
1010 | return -EINVAL; | ||
1011 | |||
1012 | combine_limits: | ||
1013 | /* | ||
1014 | * Merge this target's queue limits into the overall limits | ||
1015 | * for the table. | ||
1016 | */ | ||
1017 | if (blk_stack_limits(limits, &ti_limits, 0) < 0) | ||
1018 | DMWARN("%s: target device " | ||
1019 | "(start sect %llu len %llu) " | ||
1020 | "is misaligned", | ||
1021 | dm_device_name(table->md), | ||
1022 | (unsigned long long) ti->begin, | ||
1023 | (unsigned long long) ti->len); | ||
1024 | } | ||
1025 | |||
1026 | return validate_hardware_logical_block_alignment(table, limits); | ||
1027 | } | ||
1028 | |||
1029 | /* | ||
870 | * Set the integrity profile for this device if all devices used have | 1030 | * Set the integrity profile for this device if all devices used have |
871 | * matching profiles. | 1031 | * matching profiles. |
872 | */ | 1032 | */ |
@@ -905,27 +1065,42 @@ no_integrity: | |||
905 | return; | 1065 | return; |
906 | } | 1066 | } |
907 | 1067 | ||
908 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) | 1068 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, |
1069 | struct queue_limits *limits) | ||
909 | { | 1070 | { |
910 | /* | 1071 | /* |
911 | * Make sure we obey the optimistic sub devices | 1072 | * Each target device in the table has a data area that should normally |
912 | * restrictions. | 1073 | * be aligned such that the DM device's alignment_offset is 0. |
1074 | * FIXME: Propagate alignment_offsets up the stack and warn of | ||
1075 | * sub-optimal or inconsistent settings. | ||
1076 | */ | ||
1077 | limits->alignment_offset = 0; | ||
1078 | limits->misaligned = 0; | ||
1079 | |||
1080 | /* | ||
1081 | * Copy table's limits to the DM device's request_queue | ||
913 | */ | 1082 | */ |
914 | blk_queue_max_sectors(q, t->limits.max_sectors); | 1083 | q->limits = *limits; |
915 | q->max_phys_segments = t->limits.max_phys_segments; | 1084 | |
916 | q->max_hw_segments = t->limits.max_hw_segments; | 1085 | if (limits->no_cluster) |
917 | q->hardsect_size = t->limits.hardsect_size; | ||
918 | q->max_segment_size = t->limits.max_segment_size; | ||
919 | q->max_hw_sectors = t->limits.max_hw_sectors; | ||
920 | q->seg_boundary_mask = t->limits.seg_boundary_mask; | ||
921 | q->bounce_pfn = t->limits.bounce_pfn; | ||
922 | |||
923 | if (t->limits.no_cluster) | ||
924 | queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); | 1086 | queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); |
925 | else | 1087 | else |
926 | queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); | 1088 | queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); |
927 | 1089 | ||
928 | dm_table_set_integrity(t); | 1090 | dm_table_set_integrity(t); |
1091 | |||
1092 | /* | ||
1093 | * QUEUE_FLAG_STACKABLE must be set after all queue settings are | ||
1094 | * visible to other CPUs because, once the flag is set, incoming bios | ||
1095 | * are processed by request-based dm, which refers to the queue | ||
1096 | * settings. | ||
1097 | * Until the flag set, bios are passed to bio-based dm and queued to | ||
1098 | * md->deferred where queue settings are not needed yet. | ||
1099 | * Those bios are passed to request-based dm at the resume time. | ||
1100 | */ | ||
1101 | smp_mb(); | ||
1102 | if (dm_table_request_based(t)) | ||
1103 | queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q); | ||
929 | } | 1104 | } |
930 | 1105 | ||
931 | unsigned int dm_table_get_num_targets(struct dm_table *t) | 1106 | unsigned int dm_table_get_num_targets(struct dm_table *t) |
@@ -1021,6 +1196,20 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) | |||
1021 | return r; | 1196 | return r; |
1022 | } | 1197 | } |
1023 | 1198 | ||
1199 | int dm_table_any_busy_target(struct dm_table *t) | ||
1200 | { | ||
1201 | unsigned i; | ||
1202 | struct dm_target *ti; | ||
1203 | |||
1204 | for (i = 0; i < t->num_targets; i++) { | ||
1205 | ti = t->targets + i; | ||
1206 | if (ti->type->busy && ti->type->busy(ti)) | ||
1207 | return 1; | ||
1208 | } | ||
1209 | |||
1210 | return 0; | ||
1211 | } | ||
1212 | |||
1024 | void dm_table_unplug_all(struct dm_table *t) | 1213 | void dm_table_unplug_all(struct dm_table *t) |
1025 | { | 1214 | { |
1026 | struct dm_dev_internal *dd; | 1215 | struct dm_dev_internal *dd; |
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 424f7b048c30..3c6d4ee8921d 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
@@ -19,11 +19,18 @@ | |||
19 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
20 | #include <linux/idr.h> | 20 | #include <linux/idr.h> |
21 | #include <linux/hdreg.h> | 21 | #include <linux/hdreg.h> |
22 | #include <linux/blktrace_api.h> | 22 | |
23 | #include <trace/block.h> | 23 | #include <trace/events/block.h> |
24 | 24 | ||
25 | #define DM_MSG_PREFIX "core" | 25 | #define DM_MSG_PREFIX "core" |
26 | 26 | ||
27 | /* | ||
28 | * Cookies are numeric values sent with CHANGE and REMOVE | ||
29 | * uevents while resuming, removing or renaming the device. | ||
30 | */ | ||
31 | #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" | ||
32 | #define DM_COOKIE_LENGTH 24 | ||
33 | |||
27 | static const char *_name = DM_NAME; | 34 | static const char *_name = DM_NAME; |
28 | 35 | ||
29 | static unsigned int major = 0; | 36 | static unsigned int major = 0; |
@@ -53,8 +60,6 @@ struct dm_target_io { | |||
53 | union map_info info; | 60 | union map_info info; |
54 | }; | 61 | }; |
55 | 62 | ||
56 | DEFINE_TRACE(block_bio_complete); | ||
57 | |||
58 | /* | 63 | /* |
59 | * For request-based dm. | 64 | * For request-based dm. |
60 | * One of these is allocated per request. | 65 | * One of these is allocated per request. |
@@ -73,7 +78,7 @@ struct dm_rq_target_io { | |||
73 | */ | 78 | */ |
74 | struct dm_rq_clone_bio_info { | 79 | struct dm_rq_clone_bio_info { |
75 | struct bio *orig; | 80 | struct bio *orig; |
76 | struct request *rq; | 81 | struct dm_rq_target_io *tio; |
77 | }; | 82 | }; |
78 | 83 | ||
79 | union map_info *dm_get_mapinfo(struct bio *bio) | 84 | union map_info *dm_get_mapinfo(struct bio *bio) |
@@ -83,6 +88,14 @@ union map_info *dm_get_mapinfo(struct bio *bio) | |||
83 | return NULL; | 88 | return NULL; |
84 | } | 89 | } |
85 | 90 | ||
91 | union map_info *dm_get_rq_mapinfo(struct request *rq) | ||
92 | { | ||
93 | if (rq && rq->end_io_data) | ||
94 | return &((struct dm_rq_target_io *)rq->end_io_data)->info; | ||
95 | return NULL; | ||
96 | } | ||
97 | EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); | ||
98 | |||
86 | #define MINOR_ALLOCED ((void *)-1) | 99 | #define MINOR_ALLOCED ((void *)-1) |
87 | 100 | ||
88 | /* | 101 | /* |
@@ -159,13 +172,31 @@ struct mapped_device { | |||
159 | * freeze/thaw support require holding onto a super block | 172 | * freeze/thaw support require holding onto a super block |
160 | */ | 173 | */ |
161 | struct super_block *frozen_sb; | 174 | struct super_block *frozen_sb; |
162 | struct block_device *suspended_bdev; | 175 | struct block_device *bdev; |
163 | 176 | ||
164 | /* forced geometry settings */ | 177 | /* forced geometry settings */ |
165 | struct hd_geometry geometry; | 178 | struct hd_geometry geometry; |
166 | 179 | ||
180 | /* marker of flush suspend for request-based dm */ | ||
181 | struct request suspend_rq; | ||
182 | |||
183 | /* For saving the address of __make_request for request based dm */ | ||
184 | make_request_fn *saved_make_request_fn; | ||
185 | |||
167 | /* sysfs handle */ | 186 | /* sysfs handle */ |
168 | struct kobject kobj; | 187 | struct kobject kobj; |
188 | |||
189 | /* zero-length barrier that will be cloned and submitted to targets */ | ||
190 | struct bio barrier_bio; | ||
191 | }; | ||
192 | |||
193 | /* | ||
194 | * For mempools pre-allocation at the table loading time. | ||
195 | */ | ||
196 | struct dm_md_mempools { | ||
197 | mempool_t *io_pool; | ||
198 | mempool_t *tio_pool; | ||
199 | struct bio_set *bs; | ||
169 | }; | 200 | }; |
170 | 201 | ||
171 | #define MIN_IOS 256 | 202 | #define MIN_IOS 256 |
@@ -393,14 +424,29 @@ static void free_io(struct mapped_device *md, struct dm_io *io) | |||
393 | mempool_free(io, md->io_pool); | 424 | mempool_free(io, md->io_pool); |
394 | } | 425 | } |
395 | 426 | ||
396 | static struct dm_target_io *alloc_tio(struct mapped_device *md) | 427 | static void free_tio(struct mapped_device *md, struct dm_target_io *tio) |
428 | { | ||
429 | mempool_free(tio, md->tio_pool); | ||
430 | } | ||
431 | |||
432 | static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md) | ||
397 | { | 433 | { |
398 | return mempool_alloc(md->tio_pool, GFP_NOIO); | 434 | return mempool_alloc(md->tio_pool, GFP_ATOMIC); |
399 | } | 435 | } |
400 | 436 | ||
401 | static void free_tio(struct mapped_device *md, struct dm_target_io *tio) | 437 | static void free_rq_tio(struct dm_rq_target_io *tio) |
402 | { | 438 | { |
403 | mempool_free(tio, md->tio_pool); | 439 | mempool_free(tio, tio->md->tio_pool); |
440 | } | ||
441 | |||
442 | static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md) | ||
443 | { | ||
444 | return mempool_alloc(md->io_pool, GFP_ATOMIC); | ||
445 | } | ||
446 | |||
447 | static void free_bio_info(struct dm_rq_clone_bio_info *info) | ||
448 | { | ||
449 | mempool_free(info, info->tio->md->io_pool); | ||
404 | } | 450 | } |
405 | 451 | ||
406 | static void start_io_acct(struct dm_io *io) | 452 | static void start_io_acct(struct dm_io *io) |
@@ -466,12 +512,13 @@ static void queue_io(struct mapped_device *md, struct bio *bio) | |||
466 | struct dm_table *dm_get_table(struct mapped_device *md) | 512 | struct dm_table *dm_get_table(struct mapped_device *md) |
467 | { | 513 | { |
468 | struct dm_table *t; | 514 | struct dm_table *t; |
515 | unsigned long flags; | ||
469 | 516 | ||
470 | read_lock(&md->map_lock); | 517 | read_lock_irqsave(&md->map_lock, flags); |
471 | t = md->map; | 518 | t = md->map; |
472 | if (t) | 519 | if (t) |
473 | dm_table_get(t); | 520 | dm_table_get(t); |
474 | read_unlock(&md->map_lock); | 521 | read_unlock_irqrestore(&md->map_lock, flags); |
475 | 522 | ||
476 | return t; | 523 | return t; |
477 | } | 524 | } |
@@ -538,9 +585,11 @@ static void dec_pending(struct dm_io *io, int error) | |||
538 | * Target requested pushing back the I/O. | 585 | * Target requested pushing back the I/O. |
539 | */ | 586 | */ |
540 | spin_lock_irqsave(&md->deferred_lock, flags); | 587 | spin_lock_irqsave(&md->deferred_lock, flags); |
541 | if (__noflush_suspending(md)) | 588 | if (__noflush_suspending(md)) { |
542 | bio_list_add_head(&md->deferred, io->bio); | 589 | if (!bio_barrier(io->bio)) |
543 | else | 590 | bio_list_add_head(&md->deferred, |
591 | io->bio); | ||
592 | } else | ||
544 | /* noflush suspend was interrupted. */ | 593 | /* noflush suspend was interrupted. */ |
545 | io->error = -EIO; | 594 | io->error = -EIO; |
546 | spin_unlock_irqrestore(&md->deferred_lock, flags); | 595 | spin_unlock_irqrestore(&md->deferred_lock, flags); |
@@ -555,7 +604,8 @@ static void dec_pending(struct dm_io *io, int error) | |||
555 | * a per-device variable for error reporting. | 604 | * a per-device variable for error reporting. |
556 | * Note that you can't touch the bio after end_io_acct | 605 | * Note that you can't touch the bio after end_io_acct |
557 | */ | 606 | */ |
558 | md->barrier_error = io_error; | 607 | if (!md->barrier_error && io_error != -EOPNOTSUPP) |
608 | md->barrier_error = io_error; | ||
559 | end_io_acct(io); | 609 | end_io_acct(io); |
560 | } else { | 610 | } else { |
561 | end_io_acct(io); | 611 | end_io_acct(io); |
@@ -609,6 +659,262 @@ static void clone_endio(struct bio *bio, int error) | |||
609 | dec_pending(io, error); | 659 | dec_pending(io, error); |
610 | } | 660 | } |
611 | 661 | ||
662 | /* | ||
663 | * Partial completion handling for request-based dm | ||
664 | */ | ||
665 | static void end_clone_bio(struct bio *clone, int error) | ||
666 | { | ||
667 | struct dm_rq_clone_bio_info *info = clone->bi_private; | ||
668 | struct dm_rq_target_io *tio = info->tio; | ||
669 | struct bio *bio = info->orig; | ||
670 | unsigned int nr_bytes = info->orig->bi_size; | ||
671 | |||
672 | bio_put(clone); | ||
673 | |||
674 | if (tio->error) | ||
675 | /* | ||
676 | * An error has already been detected on the request. | ||
677 | * Once error occurred, just let clone->end_io() handle | ||
678 | * the remainder. | ||
679 | */ | ||
680 | return; | ||
681 | else if (error) { | ||
682 | /* | ||
683 | * Don't notice the error to the upper layer yet. | ||
684 | * The error handling decision is made by the target driver, | ||
685 | * when the request is completed. | ||
686 | */ | ||
687 | tio->error = error; | ||
688 | return; | ||
689 | } | ||
690 | |||
691 | /* | ||
692 | * I/O for the bio successfully completed. | ||
693 | * Notice the data completion to the upper layer. | ||
694 | */ | ||
695 | |||
696 | /* | ||
697 | * bios are processed from the head of the list. | ||
698 | * So the completing bio should always be rq->bio. | ||
699 | * If it's not, something wrong is happening. | ||
700 | */ | ||
701 | if (tio->orig->bio != bio) | ||
702 | DMERR("bio completion is going in the middle of the request"); | ||
703 | |||
704 | /* | ||
705 | * Update the original request. | ||
706 | * Do not use blk_end_request() here, because it may complete | ||
707 | * the original request before the clone, and break the ordering. | ||
708 | */ | ||
709 | blk_update_request(tio->orig, 0, nr_bytes); | ||
710 | } | ||
711 | |||
712 | /* | ||
713 | * Don't touch any member of the md after calling this function because | ||
714 | * the md may be freed in dm_put() at the end of this function. | ||
715 | * Or do dm_get() before calling this function and dm_put() later. | ||
716 | */ | ||
717 | static void rq_completed(struct mapped_device *md, int run_queue) | ||
718 | { | ||
719 | int wakeup_waiters = 0; | ||
720 | struct request_queue *q = md->queue; | ||
721 | unsigned long flags; | ||
722 | |||
723 | spin_lock_irqsave(q->queue_lock, flags); | ||
724 | if (!queue_in_flight(q)) | ||
725 | wakeup_waiters = 1; | ||
726 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
727 | |||
728 | /* nudge anyone waiting on suspend queue */ | ||
729 | if (wakeup_waiters) | ||
730 | wake_up(&md->wait); | ||
731 | |||
732 | if (run_queue) | ||
733 | blk_run_queue(q); | ||
734 | |||
735 | /* | ||
736 | * dm_put() must be at the end of this function. See the comment above | ||
737 | */ | ||
738 | dm_put(md); | ||
739 | } | ||
740 | |||
741 | static void dm_unprep_request(struct request *rq) | ||
742 | { | ||
743 | struct request *clone = rq->special; | ||
744 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
745 | |||
746 | rq->special = NULL; | ||
747 | rq->cmd_flags &= ~REQ_DONTPREP; | ||
748 | |||
749 | blk_rq_unprep_clone(clone); | ||
750 | free_rq_tio(tio); | ||
751 | } | ||
752 | |||
753 | /* | ||
754 | * Requeue the original request of a clone. | ||
755 | */ | ||
756 | void dm_requeue_unmapped_request(struct request *clone) | ||
757 | { | ||
758 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
759 | struct mapped_device *md = tio->md; | ||
760 | struct request *rq = tio->orig; | ||
761 | struct request_queue *q = rq->q; | ||
762 | unsigned long flags; | ||
763 | |||
764 | dm_unprep_request(rq); | ||
765 | |||
766 | spin_lock_irqsave(q->queue_lock, flags); | ||
767 | if (elv_queue_empty(q)) | ||
768 | blk_plug_device(q); | ||
769 | blk_requeue_request(q, rq); | ||
770 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
771 | |||
772 | rq_completed(md, 0); | ||
773 | } | ||
774 | EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); | ||
775 | |||
776 | static void __stop_queue(struct request_queue *q) | ||
777 | { | ||
778 | blk_stop_queue(q); | ||
779 | } | ||
780 | |||
781 | static void stop_queue(struct request_queue *q) | ||
782 | { | ||
783 | unsigned long flags; | ||
784 | |||
785 | spin_lock_irqsave(q->queue_lock, flags); | ||
786 | __stop_queue(q); | ||
787 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
788 | } | ||
789 | |||
790 | static void __start_queue(struct request_queue *q) | ||
791 | { | ||
792 | if (blk_queue_stopped(q)) | ||
793 | blk_start_queue(q); | ||
794 | } | ||
795 | |||
796 | static void start_queue(struct request_queue *q) | ||
797 | { | ||
798 | unsigned long flags; | ||
799 | |||
800 | spin_lock_irqsave(q->queue_lock, flags); | ||
801 | __start_queue(q); | ||
802 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
803 | } | ||
804 | |||
805 | /* | ||
806 | * Complete the clone and the original request. | ||
807 | * Must be called without queue lock. | ||
808 | */ | ||
809 | static void dm_end_request(struct request *clone, int error) | ||
810 | { | ||
811 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
812 | struct mapped_device *md = tio->md; | ||
813 | struct request *rq = tio->orig; | ||
814 | |||
815 | if (blk_pc_request(rq)) { | ||
816 | rq->errors = clone->errors; | ||
817 | rq->resid_len = clone->resid_len; | ||
818 | |||
819 | if (rq->sense) | ||
820 | /* | ||
821 | * We are using the sense buffer of the original | ||
822 | * request. | ||
823 | * So setting the length of the sense data is enough. | ||
824 | */ | ||
825 | rq->sense_len = clone->sense_len; | ||
826 | } | ||
827 | |||
828 | BUG_ON(clone->bio); | ||
829 | free_rq_tio(tio); | ||
830 | |||
831 | blk_end_request_all(rq, error); | ||
832 | |||
833 | rq_completed(md, 1); | ||
834 | } | ||
835 | |||
836 | /* | ||
837 | * Request completion handler for request-based dm | ||
838 | */ | ||
839 | static void dm_softirq_done(struct request *rq) | ||
840 | { | ||
841 | struct request *clone = rq->completion_data; | ||
842 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
843 | dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; | ||
844 | int error = tio->error; | ||
845 | |||
846 | if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io) | ||
847 | error = rq_end_io(tio->ti, clone, error, &tio->info); | ||
848 | |||
849 | if (error <= 0) | ||
850 | /* The target wants to complete the I/O */ | ||
851 | dm_end_request(clone, error); | ||
852 | else if (error == DM_ENDIO_INCOMPLETE) | ||
853 | /* The target will handle the I/O */ | ||
854 | return; | ||
855 | else if (error == DM_ENDIO_REQUEUE) | ||
856 | /* The target wants to requeue the I/O */ | ||
857 | dm_requeue_unmapped_request(clone); | ||
858 | else { | ||
859 | DMWARN("unimplemented target endio return value: %d", error); | ||
860 | BUG(); | ||
861 | } | ||
862 | } | ||
863 | |||
864 | /* | ||
865 | * Complete the clone and the original request with the error status | ||
866 | * through softirq context. | ||
867 | */ | ||
868 | static void dm_complete_request(struct request *clone, int error) | ||
869 | { | ||
870 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
871 | struct request *rq = tio->orig; | ||
872 | |||
873 | tio->error = error; | ||
874 | rq->completion_data = clone; | ||
875 | blk_complete_request(rq); | ||
876 | } | ||
877 | |||
878 | /* | ||
879 | * Complete the not-mapped clone and the original request with the error status | ||
880 | * through softirq context. | ||
881 | * Target's rq_end_io() function isn't called. | ||
882 | * This may be used when the target's map_rq() function fails. | ||
883 | */ | ||
884 | void dm_kill_unmapped_request(struct request *clone, int error) | ||
885 | { | ||
886 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
887 | struct request *rq = tio->orig; | ||
888 | |||
889 | rq->cmd_flags |= REQ_FAILED; | ||
890 | dm_complete_request(clone, error); | ||
891 | } | ||
892 | EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); | ||
893 | |||
894 | /* | ||
895 | * Called with the queue lock held | ||
896 | */ | ||
897 | static void end_clone_request(struct request *clone, int error) | ||
898 | { | ||
899 | /* | ||
900 | * For just cleaning up the information of the queue in which | ||
901 | * the clone was dispatched. | ||
902 | * The clone is *NOT* freed actually here because it is alloced from | ||
903 | * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. | ||
904 | */ | ||
905 | __blk_put_request(clone->q, clone); | ||
906 | |||
907 | /* | ||
908 | * Actual request completion is done in a softirq context which doesn't | ||
909 | * hold the queue lock. Otherwise, deadlock could occur because: | ||
910 | * - another request may be submitted by the upper level driver | ||
911 | * of the stacking during the completion | ||
912 | * - the submission which requires queue lock may be done | ||
913 | * against this queue | ||
914 | */ | ||
915 | dm_complete_request(clone, error); | ||
916 | } | ||
917 | |||
612 | static sector_t max_io_len(struct mapped_device *md, | 918 | static sector_t max_io_len(struct mapped_device *md, |
613 | sector_t sector, struct dm_target *ti) | 919 | sector_t sector, struct dm_target *ti) |
614 | { | 920 | { |
@@ -636,11 +942,6 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, | |||
636 | sector_t sector; | 942 | sector_t sector; |
637 | struct mapped_device *md; | 943 | struct mapped_device *md; |
638 | 944 | ||
639 | /* | ||
640 | * Sanity checks. | ||
641 | */ | ||
642 | BUG_ON(!clone->bi_size); | ||
643 | |||
644 | clone->bi_end_io = clone_endio; | 945 | clone->bi_end_io = clone_endio; |
645 | clone->bi_private = tio; | 946 | clone->bi_private = tio; |
646 | 947 | ||
@@ -656,8 +957,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, | |||
656 | /* the bio has been remapped so dispatch it */ | 957 | /* the bio has been remapped so dispatch it */ |
657 | 958 | ||
658 | trace_block_remap(bdev_get_queue(clone->bi_bdev), clone, | 959 | trace_block_remap(bdev_get_queue(clone->bi_bdev), clone, |
659 | tio->io->bio->bi_bdev->bd_dev, | 960 | tio->io->bio->bi_bdev->bd_dev, sector); |
660 | clone->bi_sector, sector); | ||
661 | 961 | ||
662 | generic_make_request(clone); | 962 | generic_make_request(clone); |
663 | } else if (r < 0 || r == DM_MAPIO_REQUEUE) { | 963 | } else if (r < 0 || r == DM_MAPIO_REQUEUE) { |
@@ -755,6 +1055,48 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector, | |||
755 | return clone; | 1055 | return clone; |
756 | } | 1056 | } |
757 | 1057 | ||
1058 | static struct dm_target_io *alloc_tio(struct clone_info *ci, | ||
1059 | struct dm_target *ti) | ||
1060 | { | ||
1061 | struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); | ||
1062 | |||
1063 | tio->io = ci->io; | ||
1064 | tio->ti = ti; | ||
1065 | memset(&tio->info, 0, sizeof(tio->info)); | ||
1066 | |||
1067 | return tio; | ||
1068 | } | ||
1069 | |||
1070 | static void __flush_target(struct clone_info *ci, struct dm_target *ti, | ||
1071 | unsigned flush_nr) | ||
1072 | { | ||
1073 | struct dm_target_io *tio = alloc_tio(ci, ti); | ||
1074 | struct bio *clone; | ||
1075 | |||
1076 | tio->info.flush_request = flush_nr; | ||
1077 | |||
1078 | clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); | ||
1079 | __bio_clone(clone, ci->bio); | ||
1080 | clone->bi_destructor = dm_bio_destructor; | ||
1081 | |||
1082 | __map_bio(ti, clone, tio); | ||
1083 | } | ||
1084 | |||
1085 | static int __clone_and_map_empty_barrier(struct clone_info *ci) | ||
1086 | { | ||
1087 | unsigned target_nr = 0, flush_nr; | ||
1088 | struct dm_target *ti; | ||
1089 | |||
1090 | while ((ti = dm_table_get_target(ci->map, target_nr++))) | ||
1091 | for (flush_nr = 0; flush_nr < ti->num_flush_requests; | ||
1092 | flush_nr++) | ||
1093 | __flush_target(ci, ti, flush_nr); | ||
1094 | |||
1095 | ci->sector_count = 0; | ||
1096 | |||
1097 | return 0; | ||
1098 | } | ||
1099 | |||
758 | static int __clone_and_map(struct clone_info *ci) | 1100 | static int __clone_and_map(struct clone_info *ci) |
759 | { | 1101 | { |
760 | struct bio *clone, *bio = ci->bio; | 1102 | struct bio *clone, *bio = ci->bio; |
@@ -762,6 +1104,9 @@ static int __clone_and_map(struct clone_info *ci) | |||
762 | sector_t len = 0, max; | 1104 | sector_t len = 0, max; |
763 | struct dm_target_io *tio; | 1105 | struct dm_target_io *tio; |
764 | 1106 | ||
1107 | if (unlikely(bio_empty_barrier(bio))) | ||
1108 | return __clone_and_map_empty_barrier(ci); | ||
1109 | |||
765 | ti = dm_table_find_target(ci->map, ci->sector); | 1110 | ti = dm_table_find_target(ci->map, ci->sector); |
766 | if (!dm_target_is_valid(ti)) | 1111 | if (!dm_target_is_valid(ti)) |
767 | return -EIO; | 1112 | return -EIO; |
@@ -771,10 +1116,7 @@ static int __clone_and_map(struct clone_info *ci) | |||
771 | /* | 1116 | /* |
772 | * Allocate a target io object. | 1117 | * Allocate a target io object. |
773 | */ | 1118 | */ |
774 | tio = alloc_tio(ci->md); | 1119 | tio = alloc_tio(ci, ti); |
775 | tio->io = ci->io; | ||
776 | tio->ti = ti; | ||
777 | memset(&tio->info, 0, sizeof(tio->info)); | ||
778 | 1120 | ||
779 | if (ci->sector_count <= max) { | 1121 | if (ci->sector_count <= max) { |
780 | /* | 1122 | /* |
@@ -830,10 +1172,7 @@ static int __clone_and_map(struct clone_info *ci) | |||
830 | 1172 | ||
831 | max = max_io_len(ci->md, ci->sector, ti); | 1173 | max = max_io_len(ci->md, ci->sector, ti); |
832 | 1174 | ||
833 | tio = alloc_tio(ci->md); | 1175 | tio = alloc_tio(ci, ti); |
834 | tio->io = ci->io; | ||
835 | tio->ti = ti; | ||
836 | memset(&tio->info, 0, sizeof(tio->info)); | ||
837 | } | 1176 | } |
838 | 1177 | ||
839 | len = min(remaining, max); | 1178 | len = min(remaining, max); |
@@ -868,7 +1207,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) | |||
868 | if (!bio_barrier(bio)) | 1207 | if (!bio_barrier(bio)) |
869 | bio_io_error(bio); | 1208 | bio_io_error(bio); |
870 | else | 1209 | else |
871 | md->barrier_error = -EIO; | 1210 | if (!md->barrier_error) |
1211 | md->barrier_error = -EIO; | ||
872 | return; | 1212 | return; |
873 | } | 1213 | } |
874 | 1214 | ||
@@ -881,6 +1221,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) | |||
881 | ci.io->md = md; | 1221 | ci.io->md = md; |
882 | ci.sector = bio->bi_sector; | 1222 | ci.sector = bio->bi_sector; |
883 | ci.sector_count = bio_sectors(bio); | 1223 | ci.sector_count = bio_sectors(bio); |
1224 | if (unlikely(bio_empty_barrier(bio))) | ||
1225 | ci.sector_count = 1; | ||
884 | ci.idx = bio->bi_idx; | 1226 | ci.idx = bio->bi_idx; |
885 | 1227 | ||
886 | start_io_acct(ci.io); | 1228 | start_io_acct(ci.io); |
@@ -928,6 +1270,16 @@ static int dm_merge_bvec(struct request_queue *q, | |||
928 | */ | 1270 | */ |
929 | if (max_size && ti->type->merge) | 1271 | if (max_size && ti->type->merge) |
930 | max_size = ti->type->merge(ti, bvm, biovec, max_size); | 1272 | max_size = ti->type->merge(ti, bvm, biovec, max_size); |
1273 | /* | ||
1274 | * If the target doesn't support merge method and some of the devices | ||
1275 | * provided their merge_bvec method (we know this by looking at | ||
1276 | * queue_max_hw_sectors), then we can't allow bios with multiple vector | ||
1277 | * entries. So always set max_size to 0, and the code below allows | ||
1278 | * just one page. | ||
1279 | */ | ||
1280 | else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) | ||
1281 | |||
1282 | max_size = 0; | ||
931 | 1283 | ||
932 | out_table: | 1284 | out_table: |
933 | dm_table_put(map); | 1285 | dm_table_put(map); |
@@ -946,7 +1298,7 @@ out: | |||
946 | * The request function that just remaps the bio built up by | 1298 | * The request function that just remaps the bio built up by |
947 | * dm_merge_bvec. | 1299 | * dm_merge_bvec. |
948 | */ | 1300 | */ |
949 | static int dm_request(struct request_queue *q, struct bio *bio) | 1301 | static int _dm_request(struct request_queue *q, struct bio *bio) |
950 | { | 1302 | { |
951 | int rw = bio_data_dir(bio); | 1303 | int rw = bio_data_dir(bio); |
952 | struct mapped_device *md = q->queuedata; | 1304 | struct mapped_device *md = q->queuedata; |
@@ -983,12 +1335,274 @@ static int dm_request(struct request_queue *q, struct bio *bio) | |||
983 | return 0; | 1335 | return 0; |
984 | } | 1336 | } |
985 | 1337 | ||
1338 | static int dm_make_request(struct request_queue *q, struct bio *bio) | ||
1339 | { | ||
1340 | struct mapped_device *md = q->queuedata; | ||
1341 | |||
1342 | if (unlikely(bio_barrier(bio))) { | ||
1343 | bio_endio(bio, -EOPNOTSUPP); | ||
1344 | return 0; | ||
1345 | } | ||
1346 | |||
1347 | return md->saved_make_request_fn(q, bio); /* call __make_request() */ | ||
1348 | } | ||
1349 | |||
1350 | static int dm_request_based(struct mapped_device *md) | ||
1351 | { | ||
1352 | return blk_queue_stackable(md->queue); | ||
1353 | } | ||
1354 | |||
1355 | static int dm_request(struct request_queue *q, struct bio *bio) | ||
1356 | { | ||
1357 | struct mapped_device *md = q->queuedata; | ||
1358 | |||
1359 | if (dm_request_based(md)) | ||
1360 | return dm_make_request(q, bio); | ||
1361 | |||
1362 | return _dm_request(q, bio); | ||
1363 | } | ||
1364 | |||
1365 | void dm_dispatch_request(struct request *rq) | ||
1366 | { | ||
1367 | int r; | ||
1368 | |||
1369 | if (blk_queue_io_stat(rq->q)) | ||
1370 | rq->cmd_flags |= REQ_IO_STAT; | ||
1371 | |||
1372 | rq->start_time = jiffies; | ||
1373 | r = blk_insert_cloned_request(rq->q, rq); | ||
1374 | if (r) | ||
1375 | dm_complete_request(rq, r); | ||
1376 | } | ||
1377 | EXPORT_SYMBOL_GPL(dm_dispatch_request); | ||
1378 | |||
1379 | static void dm_rq_bio_destructor(struct bio *bio) | ||
1380 | { | ||
1381 | struct dm_rq_clone_bio_info *info = bio->bi_private; | ||
1382 | struct mapped_device *md = info->tio->md; | ||
1383 | |||
1384 | free_bio_info(info); | ||
1385 | bio_free(bio, md->bs); | ||
1386 | } | ||
1387 | |||
1388 | static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, | ||
1389 | void *data) | ||
1390 | { | ||
1391 | struct dm_rq_target_io *tio = data; | ||
1392 | struct mapped_device *md = tio->md; | ||
1393 | struct dm_rq_clone_bio_info *info = alloc_bio_info(md); | ||
1394 | |||
1395 | if (!info) | ||
1396 | return -ENOMEM; | ||
1397 | |||
1398 | info->orig = bio_orig; | ||
1399 | info->tio = tio; | ||
1400 | bio->bi_end_io = end_clone_bio; | ||
1401 | bio->bi_private = info; | ||
1402 | bio->bi_destructor = dm_rq_bio_destructor; | ||
1403 | |||
1404 | return 0; | ||
1405 | } | ||
1406 | |||
1407 | static int setup_clone(struct request *clone, struct request *rq, | ||
1408 | struct dm_rq_target_io *tio) | ||
1409 | { | ||
1410 | int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, | ||
1411 | dm_rq_bio_constructor, tio); | ||
1412 | |||
1413 | if (r) | ||
1414 | return r; | ||
1415 | |||
1416 | clone->cmd = rq->cmd; | ||
1417 | clone->cmd_len = rq->cmd_len; | ||
1418 | clone->sense = rq->sense; | ||
1419 | clone->buffer = rq->buffer; | ||
1420 | clone->end_io = end_clone_request; | ||
1421 | clone->end_io_data = tio; | ||
1422 | |||
1423 | return 0; | ||
1424 | } | ||
1425 | |||
1426 | static int dm_rq_flush_suspending(struct mapped_device *md) | ||
1427 | { | ||
1428 | return !md->suspend_rq.special; | ||
1429 | } | ||
1430 | |||
1431 | /* | ||
1432 | * Called with the queue lock held. | ||
1433 | */ | ||
1434 | static int dm_prep_fn(struct request_queue *q, struct request *rq) | ||
1435 | { | ||
1436 | struct mapped_device *md = q->queuedata; | ||
1437 | struct dm_rq_target_io *tio; | ||
1438 | struct request *clone; | ||
1439 | |||
1440 | if (unlikely(rq == &md->suspend_rq)) { | ||
1441 | if (dm_rq_flush_suspending(md)) | ||
1442 | return BLKPREP_OK; | ||
1443 | else | ||
1444 | /* The flush suspend was interrupted */ | ||
1445 | return BLKPREP_KILL; | ||
1446 | } | ||
1447 | |||
1448 | if (unlikely(rq->special)) { | ||
1449 | DMWARN("Already has something in rq->special."); | ||
1450 | return BLKPREP_KILL; | ||
1451 | } | ||
1452 | |||
1453 | tio = alloc_rq_tio(md); /* Only one for each original request */ | ||
1454 | if (!tio) | ||
1455 | /* -ENOMEM */ | ||
1456 | return BLKPREP_DEFER; | ||
1457 | |||
1458 | tio->md = md; | ||
1459 | tio->ti = NULL; | ||
1460 | tio->orig = rq; | ||
1461 | tio->error = 0; | ||
1462 | memset(&tio->info, 0, sizeof(tio->info)); | ||
1463 | |||
1464 | clone = &tio->clone; | ||
1465 | if (setup_clone(clone, rq, tio)) { | ||
1466 | /* -ENOMEM */ | ||
1467 | free_rq_tio(tio); | ||
1468 | return BLKPREP_DEFER; | ||
1469 | } | ||
1470 | |||
1471 | rq->special = clone; | ||
1472 | rq->cmd_flags |= REQ_DONTPREP; | ||
1473 | |||
1474 | return BLKPREP_OK; | ||
1475 | } | ||
1476 | |||
1477 | static void map_request(struct dm_target *ti, struct request *rq, | ||
1478 | struct mapped_device *md) | ||
1479 | { | ||
1480 | int r; | ||
1481 | struct request *clone = rq->special; | ||
1482 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
1483 | |||
1484 | /* | ||
1485 | * Hold the md reference here for the in-flight I/O. | ||
1486 | * We can't rely on the reference count by device opener, | ||
1487 | * because the device may be closed during the request completion | ||
1488 | * when all bios are completed. | ||
1489 | * See the comment in rq_completed() too. | ||
1490 | */ | ||
1491 | dm_get(md); | ||
1492 | |||
1493 | tio->ti = ti; | ||
1494 | r = ti->type->map_rq(ti, clone, &tio->info); | ||
1495 | switch (r) { | ||
1496 | case DM_MAPIO_SUBMITTED: | ||
1497 | /* The target has taken the I/O to submit by itself later */ | ||
1498 | break; | ||
1499 | case DM_MAPIO_REMAPPED: | ||
1500 | /* The target has remapped the I/O so dispatch it */ | ||
1501 | dm_dispatch_request(clone); | ||
1502 | break; | ||
1503 | case DM_MAPIO_REQUEUE: | ||
1504 | /* The target wants to requeue the I/O */ | ||
1505 | dm_requeue_unmapped_request(clone); | ||
1506 | break; | ||
1507 | default: | ||
1508 | if (r > 0) { | ||
1509 | DMWARN("unimplemented target map return value: %d", r); | ||
1510 | BUG(); | ||
1511 | } | ||
1512 | |||
1513 | /* The target wants to complete the I/O */ | ||
1514 | dm_kill_unmapped_request(clone, r); | ||
1515 | break; | ||
1516 | } | ||
1517 | } | ||
1518 | |||
1519 | /* | ||
1520 | * q->request_fn for request-based dm. | ||
1521 | * Called with the queue lock held. | ||
1522 | */ | ||
1523 | static void dm_request_fn(struct request_queue *q) | ||
1524 | { | ||
1525 | struct mapped_device *md = q->queuedata; | ||
1526 | struct dm_table *map = dm_get_table(md); | ||
1527 | struct dm_target *ti; | ||
1528 | struct request *rq; | ||
1529 | |||
1530 | /* | ||
1531 | * For noflush suspend, check blk_queue_stopped() to immediately | ||
1532 | * quit I/O dispatching. | ||
1533 | */ | ||
1534 | while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { | ||
1535 | rq = blk_peek_request(q); | ||
1536 | if (!rq) | ||
1537 | goto plug_and_out; | ||
1538 | |||
1539 | if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend marker */ | ||
1540 | if (queue_in_flight(q)) | ||
1541 | /* Not quiet yet. Wait more */ | ||
1542 | goto plug_and_out; | ||
1543 | |||
1544 | /* This device should be quiet now */ | ||
1545 | __stop_queue(q); | ||
1546 | blk_start_request(rq); | ||
1547 | __blk_end_request_all(rq, 0); | ||
1548 | wake_up(&md->wait); | ||
1549 | goto out; | ||
1550 | } | ||
1551 | |||
1552 | ti = dm_table_find_target(map, blk_rq_pos(rq)); | ||
1553 | if (ti->type->busy && ti->type->busy(ti)) | ||
1554 | goto plug_and_out; | ||
1555 | |||
1556 | blk_start_request(rq); | ||
1557 | spin_unlock(q->queue_lock); | ||
1558 | map_request(ti, rq, md); | ||
1559 | spin_lock_irq(q->queue_lock); | ||
1560 | } | ||
1561 | |||
1562 | goto out; | ||
1563 | |||
1564 | plug_and_out: | ||
1565 | if (!elv_queue_empty(q)) | ||
1566 | /* Some requests still remain, retry later */ | ||
1567 | blk_plug_device(q); | ||
1568 | |||
1569 | out: | ||
1570 | dm_table_put(map); | ||
1571 | |||
1572 | return; | ||
1573 | } | ||
1574 | |||
1575 | int dm_underlying_device_busy(struct request_queue *q) | ||
1576 | { | ||
1577 | return blk_lld_busy(q); | ||
1578 | } | ||
1579 | EXPORT_SYMBOL_GPL(dm_underlying_device_busy); | ||
1580 | |||
1581 | static int dm_lld_busy(struct request_queue *q) | ||
1582 | { | ||
1583 | int r; | ||
1584 | struct mapped_device *md = q->queuedata; | ||
1585 | struct dm_table *map = dm_get_table(md); | ||
1586 | |||
1587 | if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) | ||
1588 | r = 1; | ||
1589 | else | ||
1590 | r = dm_table_any_busy_target(map); | ||
1591 | |||
1592 | dm_table_put(map); | ||
1593 | |||
1594 | return r; | ||
1595 | } | ||
1596 | |||
986 | static void dm_unplug_all(struct request_queue *q) | 1597 | static void dm_unplug_all(struct request_queue *q) |
987 | { | 1598 | { |
988 | struct mapped_device *md = q->queuedata; | 1599 | struct mapped_device *md = q->queuedata; |
989 | struct dm_table *map = dm_get_table(md); | 1600 | struct dm_table *map = dm_get_table(md); |
990 | 1601 | ||
991 | if (map) { | 1602 | if (map) { |
1603 | if (dm_request_based(md)) | ||
1604 | generic_unplug_device(q); | ||
1605 | |||
992 | dm_table_unplug_all(map); | 1606 | dm_table_unplug_all(map); |
993 | dm_table_put(map); | 1607 | dm_table_put(map); |
994 | } | 1608 | } |
@@ -1003,7 +1617,16 @@ static int dm_any_congested(void *congested_data, int bdi_bits) | |||
1003 | if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { | 1617 | if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { |
1004 | map = dm_get_table(md); | 1618 | map = dm_get_table(md); |
1005 | if (map) { | 1619 | if (map) { |
1006 | r = dm_table_any_congested(map, bdi_bits); | 1620 | /* |
1621 | * Request-based dm cares only about its own queue when | ||
1622 | * queried about the congestion status of the request_queue | ||
1623 | */ | ||
1624 | if (dm_request_based(md)) | ||
1625 | r = md->queue->backing_dev_info.state & | ||
1626 | bdi_bits; | ||
1627 | else | ||
1628 | r = dm_table_any_congested(map, bdi_bits); | ||
1629 | |||
1007 | dm_table_put(map); | 1630 | dm_table_put(map); |
1008 | } | 1631 | } |
1009 | } | 1632 | } |
@@ -1126,30 +1749,32 @@ static struct mapped_device *alloc_dev(int minor) | |||
1126 | INIT_LIST_HEAD(&md->uevent_list); | 1749 | INIT_LIST_HEAD(&md->uevent_list); |
1127 | spin_lock_init(&md->uevent_lock); | 1750 | spin_lock_init(&md->uevent_lock); |
1128 | 1751 | ||
1129 | md->queue = blk_alloc_queue(GFP_KERNEL); | 1752 | md->queue = blk_init_queue(dm_request_fn, NULL); |
1130 | if (!md->queue) | 1753 | if (!md->queue) |
1131 | goto bad_queue; | 1754 | goto bad_queue; |
1132 | 1755 | ||
1756 | /* | ||
1757 | * Request-based dm devices cannot be stacked on top of bio-based dm | ||
1758 | * devices. The type of this dm device has not been decided yet, | ||
1759 | * although we initialized the queue using blk_init_queue(). | ||
1760 | * The type is decided at the first table loading time. | ||
1761 | * To prevent problematic device stacking, clear the queue flag | ||
1762 | * for request stacking support until then. | ||
1763 | * | ||
1764 | * This queue is new, so no concurrency on the queue_flags. | ||
1765 | */ | ||
1766 | queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); | ||
1767 | md->saved_make_request_fn = md->queue->make_request_fn; | ||
1133 | md->queue->queuedata = md; | 1768 | md->queue->queuedata = md; |
1134 | md->queue->backing_dev_info.congested_fn = dm_any_congested; | 1769 | md->queue->backing_dev_info.congested_fn = dm_any_congested; |
1135 | md->queue->backing_dev_info.congested_data = md; | 1770 | md->queue->backing_dev_info.congested_data = md; |
1136 | blk_queue_make_request(md->queue, dm_request); | 1771 | blk_queue_make_request(md->queue, dm_request); |
1137 | blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL); | ||
1138 | blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); | 1772 | blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); |
1139 | md->queue->unplug_fn = dm_unplug_all; | 1773 | md->queue->unplug_fn = dm_unplug_all; |
1140 | blk_queue_merge_bvec(md->queue, dm_merge_bvec); | 1774 | blk_queue_merge_bvec(md->queue, dm_merge_bvec); |
1141 | 1775 | blk_queue_softirq_done(md->queue, dm_softirq_done); | |
1142 | md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); | 1776 | blk_queue_prep_rq(md->queue, dm_prep_fn); |
1143 | if (!md->io_pool) | 1777 | blk_queue_lld_busy(md->queue, dm_lld_busy); |
1144 | goto bad_io_pool; | ||
1145 | |||
1146 | md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache); | ||
1147 | if (!md->tio_pool) | ||
1148 | goto bad_tio_pool; | ||
1149 | |||
1150 | md->bs = bioset_create(16, 0); | ||
1151 | if (!md->bs) | ||
1152 | goto bad_no_bioset; | ||
1153 | 1778 | ||
1154 | md->disk = alloc_disk(1); | 1779 | md->disk = alloc_disk(1); |
1155 | if (!md->disk) | 1780 | if (!md->disk) |
@@ -1173,6 +1798,10 @@ static struct mapped_device *alloc_dev(int minor) | |||
1173 | if (!md->wq) | 1798 | if (!md->wq) |
1174 | goto bad_thread; | 1799 | goto bad_thread; |
1175 | 1800 | ||
1801 | md->bdev = bdget_disk(md->disk, 0); | ||
1802 | if (!md->bdev) | ||
1803 | goto bad_bdev; | ||
1804 | |||
1176 | /* Populate the mapping, nobody knows we exist yet */ | 1805 | /* Populate the mapping, nobody knows we exist yet */ |
1177 | spin_lock(&_minor_lock); | 1806 | spin_lock(&_minor_lock); |
1178 | old_md = idr_replace(&_minor_idr, md, minor); | 1807 | old_md = idr_replace(&_minor_idr, md, minor); |
@@ -1182,15 +1811,11 @@ static struct mapped_device *alloc_dev(int minor) | |||
1182 | 1811 | ||
1183 | return md; | 1812 | return md; |
1184 | 1813 | ||
1814 | bad_bdev: | ||
1815 | destroy_workqueue(md->wq); | ||
1185 | bad_thread: | 1816 | bad_thread: |
1186 | put_disk(md->disk); | 1817 | put_disk(md->disk); |
1187 | bad_disk: | 1818 | bad_disk: |
1188 | bioset_free(md->bs); | ||
1189 | bad_no_bioset: | ||
1190 | mempool_destroy(md->tio_pool); | ||
1191 | bad_tio_pool: | ||
1192 | mempool_destroy(md->io_pool); | ||
1193 | bad_io_pool: | ||
1194 | blk_cleanup_queue(md->queue); | 1819 | blk_cleanup_queue(md->queue); |
1195 | bad_queue: | 1820 | bad_queue: |
1196 | free_minor(minor); | 1821 | free_minor(minor); |
@@ -1207,14 +1832,15 @@ static void free_dev(struct mapped_device *md) | |||
1207 | { | 1832 | { |
1208 | int minor = MINOR(disk_devt(md->disk)); | 1833 | int minor = MINOR(disk_devt(md->disk)); |
1209 | 1834 | ||
1210 | if (md->suspended_bdev) { | 1835 | unlock_fs(md); |
1211 | unlock_fs(md); | 1836 | bdput(md->bdev); |
1212 | bdput(md->suspended_bdev); | ||
1213 | } | ||
1214 | destroy_workqueue(md->wq); | 1837 | destroy_workqueue(md->wq); |
1215 | mempool_destroy(md->tio_pool); | 1838 | if (md->tio_pool) |
1216 | mempool_destroy(md->io_pool); | 1839 | mempool_destroy(md->tio_pool); |
1217 | bioset_free(md->bs); | 1840 | if (md->io_pool) |
1841 | mempool_destroy(md->io_pool); | ||
1842 | if (md->bs) | ||
1843 | bioset_free(md->bs); | ||
1218 | blk_integrity_unregister(md->disk); | 1844 | blk_integrity_unregister(md->disk); |
1219 | del_gendisk(md->disk); | 1845 | del_gendisk(md->disk); |
1220 | free_minor(minor); | 1846 | free_minor(minor); |
@@ -1229,6 +1855,29 @@ static void free_dev(struct mapped_device *md) | |||
1229 | kfree(md); | 1855 | kfree(md); |
1230 | } | 1856 | } |
1231 | 1857 | ||
1858 | static void __bind_mempools(struct mapped_device *md, struct dm_table *t) | ||
1859 | { | ||
1860 | struct dm_md_mempools *p; | ||
1861 | |||
1862 | if (md->io_pool && md->tio_pool && md->bs) | ||
1863 | /* the md already has necessary mempools */ | ||
1864 | goto out; | ||
1865 | |||
1866 | p = dm_table_get_md_mempools(t); | ||
1867 | BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); | ||
1868 | |||
1869 | md->io_pool = p->io_pool; | ||
1870 | p->io_pool = NULL; | ||
1871 | md->tio_pool = p->tio_pool; | ||
1872 | p->tio_pool = NULL; | ||
1873 | md->bs = p->bs; | ||
1874 | p->bs = NULL; | ||
1875 | |||
1876 | out: | ||
1877 | /* mempool bind completed, so the table no longer needs any mempools */ | ||
1878 | dm_table_free_md_mempools(t); | ||
1879 | } | ||
1880 | |||
1232 | /* | 1881 | /* |
1233 | * Bind a table to the device. | 1882 | * Bind a table to the device. |
1234 | */ | 1883 | */ |
@@ -1252,15 +1901,17 @@ static void __set_size(struct mapped_device *md, sector_t size) | |||
1252 | { | 1901 | { |
1253 | set_capacity(md->disk, size); | 1902 | set_capacity(md->disk, size); |
1254 | 1903 | ||
1255 | mutex_lock(&md->suspended_bdev->bd_inode->i_mutex); | 1904 | mutex_lock(&md->bdev->bd_inode->i_mutex); |
1256 | i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); | 1905 | i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); |
1257 | mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex); | 1906 | mutex_unlock(&md->bdev->bd_inode->i_mutex); |
1258 | } | 1907 | } |
1259 | 1908 | ||
1260 | static int __bind(struct mapped_device *md, struct dm_table *t) | 1909 | static int __bind(struct mapped_device *md, struct dm_table *t, |
1910 | struct queue_limits *limits) | ||
1261 | { | 1911 | { |
1262 | struct request_queue *q = md->queue; | 1912 | struct request_queue *q = md->queue; |
1263 | sector_t size; | 1913 | sector_t size; |
1914 | unsigned long flags; | ||
1264 | 1915 | ||
1265 | size = dm_table_get_size(t); | 1916 | size = dm_table_get_size(t); |
1266 | 1917 | ||
@@ -1270,8 +1921,7 @@ static int __bind(struct mapped_device *md, struct dm_table *t) | |||
1270 | if (size != get_capacity(md->disk)) | 1921 | if (size != get_capacity(md->disk)) |
1271 | memset(&md->geometry, 0, sizeof(md->geometry)); | 1922 | memset(&md->geometry, 0, sizeof(md->geometry)); |
1272 | 1923 | ||
1273 | if (md->suspended_bdev) | 1924 | __set_size(md, size); |
1274 | __set_size(md, size); | ||
1275 | 1925 | ||
1276 | if (!size) { | 1926 | if (!size) { |
1277 | dm_table_destroy(t); | 1927 | dm_table_destroy(t); |
@@ -1280,10 +1930,22 @@ static int __bind(struct mapped_device *md, struct dm_table *t) | |||
1280 | 1930 | ||
1281 | dm_table_event_callback(t, event_callback, md); | 1931 | dm_table_event_callback(t, event_callback, md); |
1282 | 1932 | ||
1283 | write_lock(&md->map_lock); | 1933 | /* |
1934 | * The queue hasn't been stopped yet, if the old table type wasn't | ||
1935 | * for request-based during suspension. So stop it to prevent | ||
1936 | * I/O mapping before resume. | ||
1937 | * This must be done before setting the queue restrictions, | ||
1938 | * because request-based dm may be run just after the setting. | ||
1939 | */ | ||
1940 | if (dm_table_request_based(t) && !blk_queue_stopped(q)) | ||
1941 | stop_queue(q); | ||
1942 | |||
1943 | __bind_mempools(md, t); | ||
1944 | |||
1945 | write_lock_irqsave(&md->map_lock, flags); | ||
1284 | md->map = t; | 1946 | md->map = t; |
1285 | dm_table_set_restrictions(t, q); | 1947 | dm_table_set_restrictions(t, q, limits); |
1286 | write_unlock(&md->map_lock); | 1948 | write_unlock_irqrestore(&md->map_lock, flags); |
1287 | 1949 | ||
1288 | return 0; | 1950 | return 0; |
1289 | } | 1951 | } |
@@ -1291,14 +1953,15 @@ static int __bind(struct mapped_device *md, struct dm_table *t) | |||
1291 | static void __unbind(struct mapped_device *md) | 1953 | static void __unbind(struct mapped_device *md) |
1292 | { | 1954 | { |
1293 | struct dm_table *map = md->map; | 1955 | struct dm_table *map = md->map; |
1956 | unsigned long flags; | ||
1294 | 1957 | ||
1295 | if (!map) | 1958 | if (!map) |
1296 | return; | 1959 | return; |
1297 | 1960 | ||
1298 | dm_table_event_callback(map, NULL, NULL); | 1961 | dm_table_event_callback(map, NULL, NULL); |
1299 | write_lock(&md->map_lock); | 1962 | write_lock_irqsave(&md->map_lock, flags); |
1300 | md->map = NULL; | 1963 | md->map = NULL; |
1301 | write_unlock(&md->map_lock); | 1964 | write_unlock_irqrestore(&md->map_lock, flags); |
1302 | dm_table_destroy(map); | 1965 | dm_table_destroy(map); |
1303 | } | 1966 | } |
1304 | 1967 | ||
@@ -1402,6 +2065,8 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
1402 | { | 2065 | { |
1403 | int r = 0; | 2066 | int r = 0; |
1404 | DECLARE_WAITQUEUE(wait, current); | 2067 | DECLARE_WAITQUEUE(wait, current); |
2068 | struct request_queue *q = md->queue; | ||
2069 | unsigned long flags; | ||
1405 | 2070 | ||
1406 | dm_unplug_all(md->queue); | 2071 | dm_unplug_all(md->queue); |
1407 | 2072 | ||
@@ -1411,7 +2076,14 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
1411 | set_current_state(interruptible); | 2076 | set_current_state(interruptible); |
1412 | 2077 | ||
1413 | smp_mb(); | 2078 | smp_mb(); |
1414 | if (!atomic_read(&md->pending)) | 2079 | if (dm_request_based(md)) { |
2080 | spin_lock_irqsave(q->queue_lock, flags); | ||
2081 | if (!queue_in_flight(q) && blk_queue_stopped(q)) { | ||
2082 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2083 | break; | ||
2084 | } | ||
2085 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2086 | } else if (!atomic_read(&md->pending)) | ||
1415 | break; | 2087 | break; |
1416 | 2088 | ||
1417 | if (interruptible == TASK_INTERRUPTIBLE && | 2089 | if (interruptible == TASK_INTERRUPTIBLE && |
@@ -1429,34 +2101,36 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
1429 | return r; | 2101 | return r; |
1430 | } | 2102 | } |
1431 | 2103 | ||
1432 | static int dm_flush(struct mapped_device *md) | 2104 | static void dm_flush(struct mapped_device *md) |
1433 | { | 2105 | { |
1434 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); | 2106 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); |
1435 | return 0; | 2107 | |
2108 | bio_init(&md->barrier_bio); | ||
2109 | md->barrier_bio.bi_bdev = md->bdev; | ||
2110 | md->barrier_bio.bi_rw = WRITE_BARRIER; | ||
2111 | __split_and_process_bio(md, &md->barrier_bio); | ||
2112 | |||
2113 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); | ||
1436 | } | 2114 | } |
1437 | 2115 | ||
1438 | static void process_barrier(struct mapped_device *md, struct bio *bio) | 2116 | static void process_barrier(struct mapped_device *md, struct bio *bio) |
1439 | { | 2117 | { |
1440 | int error = dm_flush(md); | 2118 | md->barrier_error = 0; |
1441 | 2119 | ||
1442 | if (unlikely(error)) { | 2120 | dm_flush(md); |
1443 | bio_endio(bio, error); | ||
1444 | return; | ||
1445 | } | ||
1446 | if (bio_empty_barrier(bio)) { | ||
1447 | bio_endio(bio, 0); | ||
1448 | return; | ||
1449 | } | ||
1450 | |||
1451 | __split_and_process_bio(md, bio); | ||
1452 | 2121 | ||
1453 | error = dm_flush(md); | 2122 | if (!bio_empty_barrier(bio)) { |
1454 | 2123 | __split_and_process_bio(md, bio); | |
1455 | if (!error && md->barrier_error) | 2124 | dm_flush(md); |
1456 | error = md->barrier_error; | 2125 | } |
1457 | 2126 | ||
1458 | if (md->barrier_error != DM_ENDIO_REQUEUE) | 2127 | if (md->barrier_error != DM_ENDIO_REQUEUE) |
1459 | bio_endio(bio, error); | 2128 | bio_endio(bio, md->barrier_error); |
2129 | else { | ||
2130 | spin_lock_irq(&md->deferred_lock); | ||
2131 | bio_list_add_head(&md->deferred, bio); | ||
2132 | spin_unlock_irq(&md->deferred_lock); | ||
2133 | } | ||
1460 | } | 2134 | } |
1461 | 2135 | ||
1462 | /* | 2136 | /* |
@@ -1482,10 +2156,14 @@ static void dm_wq_work(struct work_struct *work) | |||
1482 | 2156 | ||
1483 | up_write(&md->io_lock); | 2157 | up_write(&md->io_lock); |
1484 | 2158 | ||
1485 | if (bio_barrier(c)) | 2159 | if (dm_request_based(md)) |
1486 | process_barrier(md, c); | 2160 | generic_make_request(c); |
1487 | else | 2161 | else { |
1488 | __split_and_process_bio(md, c); | 2162 | if (bio_barrier(c)) |
2163 | process_barrier(md, c); | ||
2164 | else | ||
2165 | __split_and_process_bio(md, c); | ||
2166 | } | ||
1489 | 2167 | ||
1490 | down_write(&md->io_lock); | 2168 | down_write(&md->io_lock); |
1491 | } | 2169 | } |
@@ -1505,6 +2183,7 @@ static void dm_queue_flush(struct mapped_device *md) | |||
1505 | */ | 2183 | */ |
1506 | int dm_swap_table(struct mapped_device *md, struct dm_table *table) | 2184 | int dm_swap_table(struct mapped_device *md, struct dm_table *table) |
1507 | { | 2185 | { |
2186 | struct queue_limits limits; | ||
1508 | int r = -EINVAL; | 2187 | int r = -EINVAL; |
1509 | 2188 | ||
1510 | mutex_lock(&md->suspend_lock); | 2189 | mutex_lock(&md->suspend_lock); |
@@ -1513,19 +2192,96 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table) | |||
1513 | if (!dm_suspended(md)) | 2192 | if (!dm_suspended(md)) |
1514 | goto out; | 2193 | goto out; |
1515 | 2194 | ||
1516 | /* without bdev, the device size cannot be changed */ | 2195 | r = dm_calculate_queue_limits(table, &limits); |
1517 | if (!md->suspended_bdev) | 2196 | if (r) |
1518 | if (get_capacity(md->disk) != dm_table_get_size(table)) | 2197 | goto out; |
1519 | goto out; | 2198 | |
2199 | /* cannot change the device type, once a table is bound */ | ||
2200 | if (md->map && | ||
2201 | (dm_table_get_type(md->map) != dm_table_get_type(table))) { | ||
2202 | DMWARN("can't change the device type after a table is bound"); | ||
2203 | goto out; | ||
2204 | } | ||
2205 | |||
2206 | /* | ||
2207 | * It is enough that blk_queue_ordered() is called only once when | ||
2208 | * the first bio-based table is bound. | ||
2209 | * | ||
2210 | * This setting should be moved to alloc_dev() when request-based dm | ||
2211 | * supports barrier. | ||
2212 | */ | ||
2213 | if (!md->map && dm_table_bio_based(table)) | ||
2214 | blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL); | ||
1520 | 2215 | ||
1521 | __unbind(md); | 2216 | __unbind(md); |
1522 | r = __bind(md, table); | 2217 | r = __bind(md, table, &limits); |
1523 | 2218 | ||
1524 | out: | 2219 | out: |
1525 | mutex_unlock(&md->suspend_lock); | 2220 | mutex_unlock(&md->suspend_lock); |
1526 | return r; | 2221 | return r; |
1527 | } | 2222 | } |
1528 | 2223 | ||
2224 | static void dm_rq_invalidate_suspend_marker(struct mapped_device *md) | ||
2225 | { | ||
2226 | md->suspend_rq.special = (void *)0x1; | ||
2227 | } | ||
2228 | |||
2229 | static void dm_rq_abort_suspend(struct mapped_device *md, int noflush) | ||
2230 | { | ||
2231 | struct request_queue *q = md->queue; | ||
2232 | unsigned long flags; | ||
2233 | |||
2234 | spin_lock_irqsave(q->queue_lock, flags); | ||
2235 | if (!noflush) | ||
2236 | dm_rq_invalidate_suspend_marker(md); | ||
2237 | __start_queue(q); | ||
2238 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2239 | } | ||
2240 | |||
2241 | static void dm_rq_start_suspend(struct mapped_device *md, int noflush) | ||
2242 | { | ||
2243 | struct request *rq = &md->suspend_rq; | ||
2244 | struct request_queue *q = md->queue; | ||
2245 | |||
2246 | if (noflush) | ||
2247 | stop_queue(q); | ||
2248 | else { | ||
2249 | blk_rq_init(q, rq); | ||
2250 | blk_insert_request(q, rq, 0, NULL); | ||
2251 | } | ||
2252 | } | ||
2253 | |||
2254 | static int dm_rq_suspend_available(struct mapped_device *md, int noflush) | ||
2255 | { | ||
2256 | int r = 1; | ||
2257 | struct request *rq = &md->suspend_rq; | ||
2258 | struct request_queue *q = md->queue; | ||
2259 | unsigned long flags; | ||
2260 | |||
2261 | if (noflush) | ||
2262 | return r; | ||
2263 | |||
2264 | /* The marker must be protected by queue lock if it is in use */ | ||
2265 | spin_lock_irqsave(q->queue_lock, flags); | ||
2266 | if (unlikely(rq->ref_count)) { | ||
2267 | /* | ||
2268 | * This can happen, when the previous flush suspend was | ||
2269 | * interrupted, the marker is still in the queue and | ||
2270 | * this flush suspend has been invoked, because we don't | ||
2271 | * remove the marker at the time of suspend interruption. | ||
2272 | * We have only one marker per mapped_device, so we can't | ||
2273 | * start another flush suspend while it is in use. | ||
2274 | */ | ||
2275 | BUG_ON(!rq->special); /* The marker should be invalidated */ | ||
2276 | DMWARN("Invalidating the previous flush suspend is still in" | ||
2277 | " progress. Please retry later."); | ||
2278 | r = 0; | ||
2279 | } | ||
2280 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2281 | |||
2282 | return r; | ||
2283 | } | ||
2284 | |||
1529 | /* | 2285 | /* |
1530 | * Functions to lock and unlock any filesystem running on the | 2286 | * Functions to lock and unlock any filesystem running on the |
1531 | * device. | 2287 | * device. |
@@ -1536,7 +2292,7 @@ static int lock_fs(struct mapped_device *md) | |||
1536 | 2292 | ||
1537 | WARN_ON(md->frozen_sb); | 2293 | WARN_ON(md->frozen_sb); |
1538 | 2294 | ||
1539 | md->frozen_sb = freeze_bdev(md->suspended_bdev); | 2295 | md->frozen_sb = freeze_bdev(md->bdev); |
1540 | if (IS_ERR(md->frozen_sb)) { | 2296 | if (IS_ERR(md->frozen_sb)) { |
1541 | r = PTR_ERR(md->frozen_sb); | 2297 | r = PTR_ERR(md->frozen_sb); |
1542 | md->frozen_sb = NULL; | 2298 | md->frozen_sb = NULL; |
@@ -1545,9 +2301,6 @@ static int lock_fs(struct mapped_device *md) | |||
1545 | 2301 | ||
1546 | set_bit(DMF_FROZEN, &md->flags); | 2302 | set_bit(DMF_FROZEN, &md->flags); |
1547 | 2303 | ||
1548 | /* don't bdput right now, we don't want the bdev | ||
1549 | * to go away while it is locked. | ||
1550 | */ | ||
1551 | return 0; | 2304 | return 0; |
1552 | } | 2305 | } |
1553 | 2306 | ||
@@ -1556,7 +2309,7 @@ static void unlock_fs(struct mapped_device *md) | |||
1556 | if (!test_bit(DMF_FROZEN, &md->flags)) | 2309 | if (!test_bit(DMF_FROZEN, &md->flags)) |
1557 | return; | 2310 | return; |
1558 | 2311 | ||
1559 | thaw_bdev(md->suspended_bdev, md->frozen_sb); | 2312 | thaw_bdev(md->bdev, md->frozen_sb); |
1560 | md->frozen_sb = NULL; | 2313 | md->frozen_sb = NULL; |
1561 | clear_bit(DMF_FROZEN, &md->flags); | 2314 | clear_bit(DMF_FROZEN, &md->flags); |
1562 | } | 2315 | } |
@@ -1568,6 +2321,53 @@ static void unlock_fs(struct mapped_device *md) | |||
1568 | * dm_bind_table, dm_suspend must be called to flush any in | 2321 | * dm_bind_table, dm_suspend must be called to flush any in |
1569 | * flight bios and ensure that any further io gets deferred. | 2322 | * flight bios and ensure that any further io gets deferred. |
1570 | */ | 2323 | */ |
2324 | /* | ||
2325 | * Suspend mechanism in request-based dm. | ||
2326 | * | ||
2327 | * After the suspend starts, further incoming requests are kept in | ||
2328 | * the request_queue and deferred. | ||
2329 | * Remaining requests in the request_queue at the start of suspend are flushed | ||
2330 | * if it is flush suspend. | ||
2331 | * The suspend completes when the following conditions have been satisfied, | ||
2332 | * so wait for it: | ||
2333 | * 1. q->in_flight is 0 (which means no in_flight request) | ||
2334 | * 2. queue has been stopped (which means no request dispatching) | ||
2335 | * | ||
2336 | * | ||
2337 | * Noflush suspend | ||
2338 | * --------------- | ||
2339 | * Noflush suspend doesn't need to dispatch remaining requests. | ||
2340 | * So stop the queue immediately. Then, wait for all in_flight requests | ||
2341 | * to be completed or requeued. | ||
2342 | * | ||
2343 | * To abort noflush suspend, start the queue. | ||
2344 | * | ||
2345 | * | ||
2346 | * Flush suspend | ||
2347 | * ------------- | ||
2348 | * Flush suspend needs to dispatch remaining requests. So stop the queue | ||
2349 | * after the remaining requests are completed. (Requeued requests must also be | ||
2350 | * re-dispatched and completed. Until then, we can't stop the queue.) | ||
2351 | * | ||
2352 | * While the remaining requests are being flushed, further incoming requests are also | ||
2353 | * inserted to the same queue. To distinguish which requests are to be | ||
2354 | * flushed, we insert a marker request to the queue at the time of starting | ||
2355 | * flush suspend, like a barrier. | ||
2356 | * The dispatching is blocked when the marker is found on the top of the queue. | ||
2357 | * And the queue is stopped when all in_flight requests are completed, since | ||
2358 | * that means the remaining requests are completely flushed. | ||
2359 | * Then, the marker is removed from the queue. | ||
2360 | * | ||
2361 | * To abort flush suspend, we also need to take care of the marker, not only | ||
2362 | * starting the queue. | ||
2363 | * We don't remove the marker forcibly from the queue since it's against | ||
2364 | * the block-layer manner. Instead, we put an invalidated mark on the marker. | ||
2365 | * When the invalidated marker is found on the top of the queue, it is | ||
2366 | * immediately removed from the queue, so it doesn't block dispatching. | ||
2367 | * Because we have only one marker per mapped_device, we can't start another | ||
2368 | * flush suspend until the invalidated marker is removed from the queue. | ||
2369 | * So fail and return with -EBUSY in such a case. | ||
2370 | */ | ||
1571 | int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | 2371 | int dm_suspend(struct mapped_device *md, unsigned suspend_flags) |
1572 | { | 2372 | { |
1573 | struct dm_table *map = NULL; | 2373 | struct dm_table *map = NULL; |
@@ -1582,6 +2382,11 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
1582 | goto out_unlock; | 2382 | goto out_unlock; |
1583 | } | 2383 | } |
1584 | 2384 | ||
2385 | if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) { | ||
2386 | r = -EBUSY; | ||
2387 | goto out_unlock; | ||
2388 | } | ||
2389 | |||
1585 | map = dm_get_table(md); | 2390 | map = dm_get_table(md); |
1586 | 2391 | ||
1587 | /* | 2392 | /* |
@@ -1594,24 +2399,14 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
1594 | /* This does not get reverted if there's an error later. */ | 2399 | /* This does not get reverted if there's an error later. */ |
1595 | dm_table_presuspend_targets(map); | 2400 | dm_table_presuspend_targets(map); |
1596 | 2401 | ||
1597 | /* bdget() can stall if the pending I/Os are not flushed */ | 2402 | /* |
1598 | if (!noflush) { | 2403 | * Flush I/O to the device. noflush supersedes do_lockfs, |
1599 | md->suspended_bdev = bdget_disk(md->disk, 0); | 2404 | * because lock_fs() needs to flush I/Os. |
1600 | if (!md->suspended_bdev) { | 2405 | */ |
1601 | DMWARN("bdget failed in dm_suspend"); | 2406 | if (!noflush && do_lockfs) { |
1602 | r = -ENOMEM; | 2407 | r = lock_fs(md); |
2408 | if (r) | ||
1603 | goto out; | 2409 | goto out; |
1604 | } | ||
1605 | |||
1606 | /* | ||
1607 | * Flush I/O to the device. noflush supersedes do_lockfs, | ||
1608 | * because lock_fs() needs to flush I/Os. | ||
1609 | */ | ||
1610 | if (do_lockfs) { | ||
1611 | r = lock_fs(md); | ||
1612 | if (r) | ||
1613 | goto out; | ||
1614 | } | ||
1615 | } | 2410 | } |
1616 | 2411 | ||
1617 | /* | 2412 | /* |
@@ -1637,6 +2432,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
1637 | 2432 | ||
1638 | flush_workqueue(md->wq); | 2433 | flush_workqueue(md->wq); |
1639 | 2434 | ||
2435 | if (dm_request_based(md)) | ||
2436 | dm_rq_start_suspend(md, noflush); | ||
2437 | |||
1640 | /* | 2438 | /* |
1641 | * At this point no more requests are entering target request routines. | 2439 | * At this point no more requests are entering target request routines. |
1642 | * We call dm_wait_for_completion to wait for all existing requests | 2440 | * We call dm_wait_for_completion to wait for all existing requests |
@@ -1653,6 +2451,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
1653 | if (r < 0) { | 2451 | if (r < 0) { |
1654 | dm_queue_flush(md); | 2452 | dm_queue_flush(md); |
1655 | 2453 | ||
2454 | if (dm_request_based(md)) | ||
2455 | dm_rq_abort_suspend(md, noflush); | ||
2456 | |||
1656 | unlock_fs(md); | 2457 | unlock_fs(md); |
1657 | goto out; /* pushback list is already flushed, so skip flush */ | 2458 | goto out; /* pushback list is already flushed, so skip flush */ |
1658 | } | 2459 | } |
@@ -1668,11 +2469,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
1668 | set_bit(DMF_SUSPENDED, &md->flags); | 2469 | set_bit(DMF_SUSPENDED, &md->flags); |
1669 | 2470 | ||
1670 | out: | 2471 | out: |
1671 | if (r && md->suspended_bdev) { | ||
1672 | bdput(md->suspended_bdev); | ||
1673 | md->suspended_bdev = NULL; | ||
1674 | } | ||
1675 | |||
1676 | dm_table_put(map); | 2472 | dm_table_put(map); |
1677 | 2473 | ||
1678 | out_unlock: | 2474 | out_unlock: |
@@ -1699,21 +2495,20 @@ int dm_resume(struct mapped_device *md) | |||
1699 | 2495 | ||
1700 | dm_queue_flush(md); | 2496 | dm_queue_flush(md); |
1701 | 2497 | ||
1702 | unlock_fs(md); | 2498 | /* |
2499 | * Flushing deferred I/Os must be done after targets are resumed | ||
2500 | * so that mapping of targets can work correctly. | ||
2501 | * Request-based dm is queueing the deferred I/Os in its request_queue. | ||
2502 | */ | ||
2503 | if (dm_request_based(md)) | ||
2504 | start_queue(md->queue); | ||
1703 | 2505 | ||
1704 | if (md->suspended_bdev) { | 2506 | unlock_fs(md); |
1705 | bdput(md->suspended_bdev); | ||
1706 | md->suspended_bdev = NULL; | ||
1707 | } | ||
1708 | 2507 | ||
1709 | clear_bit(DMF_SUSPENDED, &md->flags); | 2508 | clear_bit(DMF_SUSPENDED, &md->flags); |
1710 | 2509 | ||
1711 | dm_table_unplug_all(map); | 2510 | dm_table_unplug_all(map); |
1712 | |||
1713 | dm_kobject_uevent(md); | ||
1714 | |||
1715 | r = 0; | 2511 | r = 0; |
1716 | |||
1717 | out: | 2512 | out: |
1718 | dm_table_put(map); | 2513 | dm_table_put(map); |
1719 | mutex_unlock(&md->suspend_lock); | 2514 | mutex_unlock(&md->suspend_lock); |
@@ -1724,9 +2519,19 @@ out: | |||
1724 | /*----------------------------------------------------------------- | 2519 | /*----------------------------------------------------------------- |
1725 | * Event notification. | 2520 | * Event notification. |
1726 | *---------------------------------------------------------------*/ | 2521 | *---------------------------------------------------------------*/ |
1727 | void dm_kobject_uevent(struct mapped_device *md) | 2522 | void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, |
1728 | { | 2523 | unsigned cookie) |
1729 | kobject_uevent(&disk_to_dev(md->disk)->kobj, KOBJ_CHANGE); | 2524 | { |
2525 | char udev_cookie[DM_COOKIE_LENGTH]; | ||
2526 | char *envp[] = { udev_cookie, NULL }; | ||
2527 | |||
2528 | if (!cookie) | ||
2529 | kobject_uevent(&disk_to_dev(md->disk)->kobj, action); | ||
2530 | else { | ||
2531 | snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", | ||
2532 | DM_COOKIE_ENV_VAR_NAME, cookie); | ||
2533 | kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp); | ||
2534 | } | ||
1730 | } | 2535 | } |
1731 | 2536 | ||
1732 | uint32_t dm_next_uevent_seq(struct mapped_device *md) | 2537 | uint32_t dm_next_uevent_seq(struct mapped_device *md) |
@@ -1780,6 +2585,10 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj) | |||
1780 | if (&md->kobj != kobj) | 2585 | if (&md->kobj != kobj) |
1781 | return NULL; | 2586 | return NULL; |
1782 | 2587 | ||
2588 | if (test_bit(DMF_FREEING, &md->flags) || | ||
2589 | test_bit(DMF_DELETING, &md->flags)) | ||
2590 | return NULL; | ||
2591 | |||
1783 | dm_get(md); | 2592 | dm_get(md); |
1784 | return md; | 2593 | return md; |
1785 | } | 2594 | } |
@@ -1800,6 +2609,61 @@ int dm_noflush_suspending(struct dm_target *ti) | |||
1800 | } | 2609 | } |
1801 | EXPORT_SYMBOL_GPL(dm_noflush_suspending); | 2610 | EXPORT_SYMBOL_GPL(dm_noflush_suspending); |
1802 | 2611 | ||
2612 | struct dm_md_mempools *dm_alloc_md_mempools(unsigned type) | ||
2613 | { | ||
2614 | struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL); | ||
2615 | |||
2616 | if (!pools) | ||
2617 | return NULL; | ||
2618 | |||
2619 | pools->io_pool = (type == DM_TYPE_BIO_BASED) ? | ||
2620 | mempool_create_slab_pool(MIN_IOS, _io_cache) : | ||
2621 | mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache); | ||
2622 | if (!pools->io_pool) | ||
2623 | goto free_pools_and_out; | ||
2624 | |||
2625 | pools->tio_pool = (type == DM_TYPE_BIO_BASED) ? | ||
2626 | mempool_create_slab_pool(MIN_IOS, _tio_cache) : | ||
2627 | mempool_create_slab_pool(MIN_IOS, _rq_tio_cache); | ||
2628 | if (!pools->tio_pool) | ||
2629 | goto free_io_pool_and_out; | ||
2630 | |||
2631 | pools->bs = (type == DM_TYPE_BIO_BASED) ? | ||
2632 | bioset_create(16, 0) : bioset_create(MIN_IOS, 0); | ||
2633 | if (!pools->bs) | ||
2634 | goto free_tio_pool_and_out; | ||
2635 | |||
2636 | return pools; | ||
2637 | |||
2638 | free_tio_pool_and_out: | ||
2639 | mempool_destroy(pools->tio_pool); | ||
2640 | |||
2641 | free_io_pool_and_out: | ||
2642 | mempool_destroy(pools->io_pool); | ||
2643 | |||
2644 | free_pools_and_out: | ||
2645 | kfree(pools); | ||
2646 | |||
2647 | return NULL; | ||
2648 | } | ||
2649 | |||
2650 | void dm_free_md_mempools(struct dm_md_mempools *pools) | ||
2651 | { | ||
2652 | if (!pools) | ||
2653 | return; | ||
2654 | |||
2655 | if (pools->io_pool) | ||
2656 | mempool_destroy(pools->io_pool); | ||
2657 | |||
2658 | if (pools->tio_pool) | ||
2659 | mempool_destroy(pools->tio_pool); | ||
2660 | |||
2661 | if (pools->bs) | ||
2662 | bioset_free(pools->bs); | ||
2663 | |||
2664 | kfree(pools); | ||
2665 | } | ||
2666 | |||
1803 | static struct block_device_operations dm_blk_dops = { | 2667 | static struct block_device_operations dm_blk_dops = { |
1804 | .open = dm_blk_open, | 2668 | .open = dm_blk_open, |
1805 | .release = dm_blk_close, | 2669 | .release = dm_blk_close, |
diff --git a/drivers/md/dm.h b/drivers/md/dm.h index a31506d93e91..23278ae80f08 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h | |||
@@ -23,6 +23,13 @@ | |||
23 | #define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) | 23 | #define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) |
24 | 24 | ||
25 | /* | 25 | /* |
26 | * Type of table and mapped_device's mempool | ||
27 | */ | ||
28 | #define DM_TYPE_NONE 0 | ||
29 | #define DM_TYPE_BIO_BASED 1 | ||
30 | #define DM_TYPE_REQUEST_BASED 2 | ||
31 | |||
32 | /* | ||
26 | * List of devices that a metadevice uses and should open/close. | 33 | * List of devices that a metadevice uses and should open/close. |
27 | */ | 34 | */ |
28 | struct dm_dev_internal { | 35 | struct dm_dev_internal { |
@@ -32,6 +39,7 @@ struct dm_dev_internal { | |||
32 | }; | 39 | }; |
33 | 40 | ||
34 | struct dm_table; | 41 | struct dm_table; |
42 | struct dm_md_mempools; | ||
35 | 43 | ||
36 | /*----------------------------------------------------------------- | 44 | /*----------------------------------------------------------------- |
37 | * Internal table functions. | 45 | * Internal table functions. |
@@ -41,18 +49,34 @@ void dm_table_event_callback(struct dm_table *t, | |||
41 | void (*fn)(void *), void *context); | 49 | void (*fn)(void *), void *context); |
42 | struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); | 50 | struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); |
43 | struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); | 51 | struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); |
44 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q); | 52 | int dm_calculate_queue_limits(struct dm_table *table, |
53 | struct queue_limits *limits); | ||
54 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, | ||
55 | struct queue_limits *limits); | ||
45 | struct list_head *dm_table_get_devices(struct dm_table *t); | 56 | struct list_head *dm_table_get_devices(struct dm_table *t); |
46 | void dm_table_presuspend_targets(struct dm_table *t); | 57 | void dm_table_presuspend_targets(struct dm_table *t); |
47 | void dm_table_postsuspend_targets(struct dm_table *t); | 58 | void dm_table_postsuspend_targets(struct dm_table *t); |
48 | int dm_table_resume_targets(struct dm_table *t); | 59 | int dm_table_resume_targets(struct dm_table *t); |
49 | int dm_table_any_congested(struct dm_table *t, int bdi_bits); | 60 | int dm_table_any_congested(struct dm_table *t, int bdi_bits); |
61 | int dm_table_any_busy_target(struct dm_table *t); | ||
62 | int dm_table_set_type(struct dm_table *t); | ||
63 | unsigned dm_table_get_type(struct dm_table *t); | ||
64 | bool dm_table_bio_based(struct dm_table *t); | ||
65 | bool dm_table_request_based(struct dm_table *t); | ||
66 | int dm_table_alloc_md_mempools(struct dm_table *t); | ||
67 | void dm_table_free_md_mempools(struct dm_table *t); | ||
68 | struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); | ||
50 | 69 | ||
51 | /* | 70 | /* |
52 | * To check the return value from dm_table_find_target(). | 71 | * To check the return value from dm_table_find_target(). |
53 | */ | 72 | */ |
54 | #define dm_target_is_valid(t) ((t)->table) | 73 | #define dm_target_is_valid(t) ((t)->table) |
55 | 74 | ||
75 | /* | ||
76 | * To check whether the target type is request-based or not (bio-based). | ||
77 | */ | ||
78 | #define dm_target_request_based(t) ((t)->type->map_rq != NULL) | ||
79 | |||
56 | /*----------------------------------------------------------------- | 80 | /*----------------------------------------------------------------- |
57 | * A registry of target types. | 81 | * A registry of target types. |
58 | *---------------------------------------------------------------*/ | 82 | *---------------------------------------------------------------*/ |
@@ -92,9 +116,16 @@ void dm_stripe_exit(void); | |||
92 | int dm_open_count(struct mapped_device *md); | 116 | int dm_open_count(struct mapped_device *md); |
93 | int dm_lock_for_deletion(struct mapped_device *md); | 117 | int dm_lock_for_deletion(struct mapped_device *md); |
94 | 118 | ||
95 | void dm_kobject_uevent(struct mapped_device *md); | 119 | void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, |
120 | unsigned cookie); | ||
96 | 121 | ||
97 | int dm_kcopyd_init(void); | 122 | int dm_kcopyd_init(void); |
98 | void dm_kcopyd_exit(void); | 123 | void dm_kcopyd_exit(void); |
99 | 124 | ||
125 | /* | ||
126 | * Mempool operations | ||
127 | */ | ||
128 | struct dm_md_mempools *dm_alloc_md_mempools(unsigned type); | ||
129 | void dm_free_md_mempools(struct dm_md_mempools *pools); | ||
130 | |||
100 | #endif | 131 | #endif |
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 8695809b24b0..87d88dbb667f 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c | |||
@@ -255,14 +255,14 @@ static void status(struct seq_file *seq, mddev_t *mddev) | |||
255 | } | 255 | } |
256 | 256 | ||
257 | 257 | ||
258 | static int reconfig(mddev_t *mddev, int layout, int chunk_size) | 258 | static int reshape(mddev_t *mddev) |
259 | { | 259 | { |
260 | int mode = layout & ModeMask; | 260 | int mode = mddev->new_layout & ModeMask; |
261 | int count = layout >> ModeShift; | 261 | int count = mddev->new_layout >> ModeShift; |
262 | conf_t *conf = mddev->private; | 262 | conf_t *conf = mddev->private; |
263 | 263 | ||
264 | if (chunk_size != -1) | 264 | if (mddev->new_layout < 0) |
265 | return -EINVAL; | 265 | return 0; |
266 | 266 | ||
267 | /* new layout */ | 267 | /* new layout */ |
268 | if (mode == ClearFaults) | 268 | if (mode == ClearFaults) |
@@ -279,6 +279,7 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size) | |||
279 | atomic_set(&conf->counters[mode], count); | 279 | atomic_set(&conf->counters[mode], count); |
280 | } else | 280 | } else |
281 | return -EINVAL; | 281 | return -EINVAL; |
282 | mddev->new_layout = -1; | ||
282 | mddev->layout = -1; /* makes sure further changes come through */ | 283 | mddev->layout = -1; /* makes sure further changes come through */ |
283 | return 0; | 284 | return 0; |
284 | } | 285 | } |
@@ -298,8 +299,12 @@ static int run(mddev_t *mddev) | |||
298 | { | 299 | { |
299 | mdk_rdev_t *rdev; | 300 | mdk_rdev_t *rdev; |
300 | int i; | 301 | int i; |
302 | conf_t *conf; | ||
303 | |||
304 | if (md_check_no_bitmap(mddev)) | ||
305 | return -EINVAL; | ||
301 | 306 | ||
302 | conf_t *conf = kmalloc(sizeof(*conf), GFP_KERNEL); | 307 | conf = kmalloc(sizeof(*conf), GFP_KERNEL); |
303 | if (!conf) | 308 | if (!conf) |
304 | return -ENOMEM; | 309 | return -ENOMEM; |
305 | 310 | ||
@@ -315,7 +320,7 @@ static int run(mddev_t *mddev) | |||
315 | md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); | 320 | md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); |
316 | mddev->private = conf; | 321 | mddev->private = conf; |
317 | 322 | ||
318 | reconfig(mddev, mddev->layout, -1); | 323 | reshape(mddev); |
319 | 324 | ||
320 | return 0; | 325 | return 0; |
321 | } | 326 | } |
@@ -338,7 +343,7 @@ static struct mdk_personality faulty_personality = | |||
338 | .run = run, | 343 | .run = run, |
339 | .stop = stop, | 344 | .stop = stop, |
340 | .status = status, | 345 | .status = status, |
341 | .reconfig = reconfig, | 346 | .check_reshape = reshape, |
342 | .size = faulty_size, | 347 | .size = faulty_size, |
343 | }; | 348 | }; |
344 | 349 | ||
diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 7a36e38393a1..15c8b7b25a9b 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c | |||
@@ -27,19 +27,27 @@ | |||
27 | */ | 27 | */ |
28 | static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) | 28 | static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) |
29 | { | 29 | { |
30 | dev_info_t *hash; | 30 | int lo, mid, hi; |
31 | linear_conf_t *conf = mddev_to_conf(mddev); | 31 | linear_conf_t *conf; |
32 | sector_t idx = sector >> conf->sector_shift; | 32 | |
33 | lo = 0; | ||
34 | hi = mddev->raid_disks - 1; | ||
35 | conf = rcu_dereference(mddev->private); | ||
33 | 36 | ||
34 | /* | 37 | /* |
35 | * sector_div(a,b) returns the remainer and sets a to a/b | 38 | * Binary Search |
36 | */ | 39 | */ |
37 | (void)sector_div(idx, conf->spacing); | ||
38 | hash = conf->hash_table[idx]; | ||
39 | 40 | ||
40 | while (sector >= hash->num_sectors + hash->start_sector) | 41 | while (hi > lo) { |
41 | hash++; | 42 | |
42 | return hash; | 43 | mid = (hi + lo) / 2; |
44 | if (sector < conf->disks[mid].end_sector) | ||
45 | hi = mid; | ||
46 | else | ||
47 | lo = mid + 1; | ||
48 | } | ||
49 | |||
50 | return conf->disks + lo; | ||
43 | } | 51 | } |
44 | 52 | ||
45 | /** | 53 | /** |
@@ -59,8 +67,10 @@ static int linear_mergeable_bvec(struct request_queue *q, | |||
59 | unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; | 67 | unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; |
60 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); | 68 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); |
61 | 69 | ||
70 | rcu_read_lock(); | ||
62 | dev0 = which_dev(mddev, sector); | 71 | dev0 = which_dev(mddev, sector); |
63 | maxsectors = dev0->num_sectors - (sector - dev0->start_sector); | 72 | maxsectors = dev0->end_sector - sector; |
73 | rcu_read_unlock(); | ||
64 | 74 | ||
65 | if (maxsectors < bio_sectors) | 75 | if (maxsectors < bio_sectors) |
66 | maxsectors = 0; | 76 | maxsectors = 0; |
@@ -79,46 +89,57 @@ static int linear_mergeable_bvec(struct request_queue *q, | |||
79 | static void linear_unplug(struct request_queue *q) | 89 | static void linear_unplug(struct request_queue *q) |
80 | { | 90 | { |
81 | mddev_t *mddev = q->queuedata; | 91 | mddev_t *mddev = q->queuedata; |
82 | linear_conf_t *conf = mddev_to_conf(mddev); | 92 | linear_conf_t *conf; |
83 | int i; | 93 | int i; |
84 | 94 | ||
95 | rcu_read_lock(); | ||
96 | conf = rcu_dereference(mddev->private); | ||
97 | |||
85 | for (i=0; i < mddev->raid_disks; i++) { | 98 | for (i=0; i < mddev->raid_disks; i++) { |
86 | struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev); | 99 | struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev); |
87 | blk_unplug(r_queue); | 100 | blk_unplug(r_queue); |
88 | } | 101 | } |
102 | rcu_read_unlock(); | ||
89 | } | 103 | } |
90 | 104 | ||
91 | static int linear_congested(void *data, int bits) | 105 | static int linear_congested(void *data, int bits) |
92 | { | 106 | { |
93 | mddev_t *mddev = data; | 107 | mddev_t *mddev = data; |
94 | linear_conf_t *conf = mddev_to_conf(mddev); | 108 | linear_conf_t *conf; |
95 | int i, ret = 0; | 109 | int i, ret = 0; |
96 | 110 | ||
111 | rcu_read_lock(); | ||
112 | conf = rcu_dereference(mddev->private); | ||
113 | |||
97 | for (i = 0; i < mddev->raid_disks && !ret ; i++) { | 114 | for (i = 0; i < mddev->raid_disks && !ret ; i++) { |
98 | struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); | 115 | struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); |
99 | ret |= bdi_congested(&q->backing_dev_info, bits); | 116 | ret |= bdi_congested(&q->backing_dev_info, bits); |
100 | } | 117 | } |
118 | |||
119 | rcu_read_unlock(); | ||
101 | return ret; | 120 | return ret; |
102 | } | 121 | } |
103 | 122 | ||
104 | static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks) | 123 | static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks) |
105 | { | 124 | { |
106 | linear_conf_t *conf = mddev_to_conf(mddev); | 125 | linear_conf_t *conf; |
126 | sector_t array_sectors; | ||
107 | 127 | ||
128 | rcu_read_lock(); | ||
129 | conf = rcu_dereference(mddev->private); | ||
108 | WARN_ONCE(sectors || raid_disks, | 130 | WARN_ONCE(sectors || raid_disks, |
109 | "%s does not support generic reshape\n", __func__); | 131 | "%s does not support generic reshape\n", __func__); |
132 | array_sectors = conf->array_sectors; | ||
133 | rcu_read_unlock(); | ||
110 | 134 | ||
111 | return conf->array_sectors; | 135 | return array_sectors; |
112 | } | 136 | } |
113 | 137 | ||
114 | static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | 138 | static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) |
115 | { | 139 | { |
116 | linear_conf_t *conf; | 140 | linear_conf_t *conf; |
117 | dev_info_t **table; | ||
118 | mdk_rdev_t *rdev; | 141 | mdk_rdev_t *rdev; |
119 | int i, nb_zone, cnt; | 142 | int i, cnt; |
120 | sector_t min_sectors; | ||
121 | sector_t curr_sector; | ||
122 | 143 | ||
123 | conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t), | 144 | conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t), |
124 | GFP_KERNEL); | 145 | GFP_KERNEL); |
@@ -131,6 +152,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | |||
131 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 152 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
132 | int j = rdev->raid_disk; | 153 | int j = rdev->raid_disk; |
133 | dev_info_t *disk = conf->disks + j; | 154 | dev_info_t *disk = conf->disks + j; |
155 | sector_t sectors; | ||
134 | 156 | ||
135 | if (j < 0 || j >= raid_disks || disk->rdev) { | 157 | if (j < 0 || j >= raid_disks || disk->rdev) { |
136 | printk("linear: disk numbering problem. Aborting!\n"); | 158 | printk("linear: disk numbering problem. Aborting!\n"); |
@@ -138,6 +160,11 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | |||
138 | } | 160 | } |
139 | 161 | ||
140 | disk->rdev = rdev; | 162 | disk->rdev = rdev; |
163 | if (mddev->chunk_sectors) { | ||
164 | sectors = rdev->sectors; | ||
165 | sector_div(sectors, mddev->chunk_sectors); | ||
166 | rdev->sectors = sectors * mddev->chunk_sectors; | ||
167 | } | ||
141 | 168 | ||
142 | blk_queue_stack_limits(mddev->queue, | 169 | blk_queue_stack_limits(mddev->queue, |
143 | rdev->bdev->bd_disk->queue); | 170 | rdev->bdev->bd_disk->queue); |
@@ -146,105 +173,27 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | |||
146 | * a one page request is never in violation. | 173 | * a one page request is never in violation. |
147 | */ | 174 | */ |
148 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 175 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && |
149 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 176 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) |
150 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 177 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
151 | 178 | ||
152 | disk->num_sectors = rdev->sectors; | ||
153 | conf->array_sectors += rdev->sectors; | 179 | conf->array_sectors += rdev->sectors; |
154 | |||
155 | cnt++; | 180 | cnt++; |
181 | |||
156 | } | 182 | } |
157 | if (cnt != raid_disks) { | 183 | if (cnt != raid_disks) { |
158 | printk("linear: not enough drives present. Aborting!\n"); | 184 | printk("linear: not enough drives present. Aborting!\n"); |
159 | goto out; | 185 | goto out; |
160 | } | 186 | } |
161 | 187 | ||
162 | min_sectors = conf->array_sectors; | ||
163 | sector_div(min_sectors, PAGE_SIZE/sizeof(struct dev_info *)); | ||
164 | if (min_sectors == 0) | ||
165 | min_sectors = 1; | ||
166 | |||
167 | /* min_sectors is the minimum spacing that will fit the hash | ||
168 | * table in one PAGE. This may be much smaller than needed. | ||
169 | * We find the smallest non-terminal set of consecutive devices | ||
170 | * that is larger than min_sectors and use the size of that as | ||
171 | * the actual spacing | ||
172 | */ | ||
173 | conf->spacing = conf->array_sectors; | ||
174 | for (i=0; i < cnt-1 ; i++) { | ||
175 | sector_t tmp = 0; | ||
176 | int j; | ||
177 | for (j = i; j < cnt - 1 && tmp < min_sectors; j++) | ||
178 | tmp += conf->disks[j].num_sectors; | ||
179 | if (tmp >= min_sectors && tmp < conf->spacing) | ||
180 | conf->spacing = tmp; | ||
181 | } | ||
182 | |||
183 | /* spacing may be too large for sector_div to work with, | ||
184 | * so we might need to pre-shift | ||
185 | */ | ||
186 | conf->sector_shift = 0; | ||
187 | if (sizeof(sector_t) > sizeof(u32)) { | ||
188 | sector_t space = conf->spacing; | ||
189 | while (space > (sector_t)(~(u32)0)) { | ||
190 | space >>= 1; | ||
191 | conf->sector_shift++; | ||
192 | } | ||
193 | } | ||
194 | /* | 188 | /* |
195 | * This code was restructured to work around a gcc-2.95.3 internal | 189 | * Here we calculate the device offsets. |
196 | * compiler error. Alter it with care. | ||
197 | */ | 190 | */ |
198 | { | 191 | conf->disks[0].end_sector = conf->disks[0].rdev->sectors; |
199 | sector_t sz; | ||
200 | unsigned round; | ||
201 | unsigned long base; | ||
202 | |||
203 | sz = conf->array_sectors >> conf->sector_shift; | ||
204 | sz += 1; /* force round-up */ | ||
205 | base = conf->spacing >> conf->sector_shift; | ||
206 | round = sector_div(sz, base); | ||
207 | nb_zone = sz + (round ? 1 : 0); | ||
208 | } | ||
209 | BUG_ON(nb_zone > PAGE_SIZE / sizeof(struct dev_info *)); | ||
210 | |||
211 | conf->hash_table = kmalloc (sizeof (struct dev_info *) * nb_zone, | ||
212 | GFP_KERNEL); | ||
213 | if (!conf->hash_table) | ||
214 | goto out; | ||
215 | 192 | ||
216 | /* | ||
217 | * Here we generate the linear hash table | ||
218 | * First calculate the device offsets. | ||
219 | */ | ||
220 | conf->disks[0].start_sector = 0; | ||
221 | for (i = 1; i < raid_disks; i++) | 193 | for (i = 1; i < raid_disks; i++) |
222 | conf->disks[i].start_sector = | 194 | conf->disks[i].end_sector = |
223 | conf->disks[i-1].start_sector + | 195 | conf->disks[i-1].end_sector + |
224 | conf->disks[i-1].num_sectors; | 196 | conf->disks[i].rdev->sectors; |
225 | |||
226 | table = conf->hash_table; | ||
227 | i = 0; | ||
228 | for (curr_sector = 0; | ||
229 | curr_sector < conf->array_sectors; | ||
230 | curr_sector += conf->spacing) { | ||
231 | |||
232 | while (i < raid_disks-1 && | ||
233 | curr_sector >= conf->disks[i+1].start_sector) | ||
234 | i++; | ||
235 | |||
236 | *table ++ = conf->disks + i; | ||
237 | } | ||
238 | |||
239 | if (conf->sector_shift) { | ||
240 | conf->spacing >>= conf->sector_shift; | ||
241 | /* round spacing up so that when we divide by it, | ||
242 | * we err on the side of "too-low", which is safest. | ||
243 | */ | ||
244 | conf->spacing++; | ||
245 | } | ||
246 | |||
247 | BUG_ON(table - conf->hash_table > nb_zone); | ||
248 | 197 | ||
249 | return conf; | 198 | return conf; |
250 | 199 | ||
@@ -257,6 +206,8 @@ static int linear_run (mddev_t *mddev) | |||
257 | { | 206 | { |
258 | linear_conf_t *conf; | 207 | linear_conf_t *conf; |
259 | 208 | ||
209 | if (md_check_no_bitmap(mddev)) | ||
210 | return -EINVAL; | ||
260 | mddev->queue->queue_lock = &mddev->queue->__queue_lock; | 211 | mddev->queue->queue_lock = &mddev->queue->__queue_lock; |
261 | conf = linear_conf(mddev, mddev->raid_disks); | 212 | conf = linear_conf(mddev, mddev->raid_disks); |
262 | 213 | ||
@@ -272,6 +223,12 @@ static int linear_run (mddev_t *mddev) | |||
272 | return 0; | 223 | return 0; |
273 | } | 224 | } |
274 | 225 | ||
226 | static void free_conf(struct rcu_head *head) | ||
227 | { | ||
228 | linear_conf_t *conf = container_of(head, linear_conf_t, rcu); | ||
229 | kfree(conf); | ||
230 | } | ||
231 | |||
275 | static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) | 232 | static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) |
276 | { | 233 | { |
277 | /* Adding a drive to a linear array allows the array to grow. | 234 | /* Adding a drive to a linear array allows the array to grow. |
@@ -282,7 +239,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) | |||
282 | * The current one is never freed until the array is stopped. | 239 | * The current one is never freed until the array is stopped. |
283 | * This avoids races. | 240 | * This avoids races. |
284 | */ | 241 | */ |
285 | linear_conf_t *newconf; | 242 | linear_conf_t *newconf, *oldconf; |
286 | 243 | ||
287 | if (rdev->saved_raid_disk != mddev->raid_disks) | 244 | if (rdev->saved_raid_disk != mddev->raid_disks) |
288 | return -EINVAL; | 245 | return -EINVAL; |
@@ -294,25 +251,29 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) | |||
294 | if (!newconf) | 251 | if (!newconf) |
295 | return -ENOMEM; | 252 | return -ENOMEM; |
296 | 253 | ||
297 | newconf->prev = mddev_to_conf(mddev); | 254 | oldconf = rcu_dereference(mddev->private); |
298 | mddev->private = newconf; | ||
299 | mddev->raid_disks++; | 255 | mddev->raid_disks++; |
256 | rcu_assign_pointer(mddev->private, newconf); | ||
300 | md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); | 257 | md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); |
301 | set_capacity(mddev->gendisk, mddev->array_sectors); | 258 | set_capacity(mddev->gendisk, mddev->array_sectors); |
259 | call_rcu(&oldconf->rcu, free_conf); | ||
302 | return 0; | 260 | return 0; |
303 | } | 261 | } |
304 | 262 | ||
305 | static int linear_stop (mddev_t *mddev) | 263 | static int linear_stop (mddev_t *mddev) |
306 | { | 264 | { |
307 | linear_conf_t *conf = mddev_to_conf(mddev); | 265 | linear_conf_t *conf = mddev->private; |
308 | 266 | ||
267 | /* | ||
268 | * We do not require rcu protection here since | ||
269 | * we hold reconfig_mutex for both linear_add and | ||
270 | * linear_stop, so they cannot race. | ||
271 | * We should make sure any old 'conf's are properly | ||
272 | * freed though. | ||
273 | */ | ||
274 | rcu_barrier(); | ||
309 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | 275 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ |
310 | do { | 276 | kfree(conf); |
311 | linear_conf_t *t = conf->prev; | ||
312 | kfree(conf->hash_table); | ||
313 | kfree(conf); | ||
314 | conf = t; | ||
315 | } while (conf); | ||
316 | 277 | ||
317 | return 0; | 278 | return 0; |
318 | } | 279 | } |
@@ -322,6 +283,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) | |||
322 | const int rw = bio_data_dir(bio); | 283 | const int rw = bio_data_dir(bio); |
323 | mddev_t *mddev = q->queuedata; | 284 | mddev_t *mddev = q->queuedata; |
324 | dev_info_t *tmp_dev; | 285 | dev_info_t *tmp_dev; |
286 | sector_t start_sector; | ||
325 | int cpu; | 287 | int cpu; |
326 | 288 | ||
327 | if (unlikely(bio_barrier(bio))) { | 289 | if (unlikely(bio_barrier(bio))) { |
@@ -335,33 +297,36 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) | |||
335 | bio_sectors(bio)); | 297 | bio_sectors(bio)); |
336 | part_stat_unlock(); | 298 | part_stat_unlock(); |
337 | 299 | ||
300 | rcu_read_lock(); | ||
338 | tmp_dev = which_dev(mddev, bio->bi_sector); | 301 | tmp_dev = which_dev(mddev, bio->bi_sector); |
339 | 302 | start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; | |
340 | if (unlikely(bio->bi_sector >= (tmp_dev->num_sectors + | 303 | |
341 | tmp_dev->start_sector) | 304 | |
342 | || (bio->bi_sector < | 305 | if (unlikely(bio->bi_sector >= (tmp_dev->end_sector) |
343 | tmp_dev->start_sector))) { | 306 | || (bio->bi_sector < start_sector))) { |
344 | char b[BDEVNAME_SIZE]; | 307 | char b[BDEVNAME_SIZE]; |
345 | 308 | ||
346 | printk("linear_make_request: Sector %llu out of bounds on " | 309 | printk("linear_make_request: Sector %llu out of bounds on " |
347 | "dev %s: %llu sectors, offset %llu\n", | 310 | "dev %s: %llu sectors, offset %llu\n", |
348 | (unsigned long long)bio->bi_sector, | 311 | (unsigned long long)bio->bi_sector, |
349 | bdevname(tmp_dev->rdev->bdev, b), | 312 | bdevname(tmp_dev->rdev->bdev, b), |
350 | (unsigned long long)tmp_dev->num_sectors, | 313 | (unsigned long long)tmp_dev->rdev->sectors, |
351 | (unsigned long long)tmp_dev->start_sector); | 314 | (unsigned long long)start_sector); |
315 | rcu_read_unlock(); | ||
352 | bio_io_error(bio); | 316 | bio_io_error(bio); |
353 | return 0; | 317 | return 0; |
354 | } | 318 | } |
355 | if (unlikely(bio->bi_sector + (bio->bi_size >> 9) > | 319 | if (unlikely(bio->bi_sector + (bio->bi_size >> 9) > |
356 | tmp_dev->start_sector + tmp_dev->num_sectors)) { | 320 | tmp_dev->end_sector)) { |
357 | /* This bio crosses a device boundary, so we have to | 321 | /* This bio crosses a device boundary, so we have to |
358 | * split it. | 322 | * split it. |
359 | */ | 323 | */ |
360 | struct bio_pair *bp; | 324 | struct bio_pair *bp; |
325 | sector_t end_sector = tmp_dev->end_sector; | ||
326 | |||
327 | rcu_read_unlock(); | ||
361 | 328 | ||
362 | bp = bio_split(bio, | 329 | bp = bio_split(bio, end_sector - bio->bi_sector); |
363 | tmp_dev->start_sector + tmp_dev->num_sectors | ||
364 | - bio->bi_sector); | ||
365 | 330 | ||
366 | if (linear_make_request(q, &bp->bio1)) | 331 | if (linear_make_request(q, &bp->bio1)) |
367 | generic_make_request(&bp->bio1); | 332 | generic_make_request(&bp->bio1); |
@@ -372,8 +337,9 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) | |||
372 | } | 337 | } |
373 | 338 | ||
374 | bio->bi_bdev = tmp_dev->rdev->bdev; | 339 | bio->bi_bdev = tmp_dev->rdev->bdev; |
375 | bio->bi_sector = bio->bi_sector - tmp_dev->start_sector | 340 | bio->bi_sector = bio->bi_sector - start_sector |
376 | + tmp_dev->rdev->data_offset; | 341 | + tmp_dev->rdev->data_offset; |
342 | rcu_read_unlock(); | ||
377 | 343 | ||
378 | return 1; | 344 | return 1; |
379 | } | 345 | } |
@@ -381,7 +347,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) | |||
381 | static void linear_status (struct seq_file *seq, mddev_t *mddev) | 347 | static void linear_status (struct seq_file *seq, mddev_t *mddev) |
382 | { | 348 | { |
383 | 349 | ||
384 | seq_printf(seq, " %dk rounding", mddev->chunk_size/1024); | 350 | seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); |
385 | } | 351 | } |
386 | 352 | ||
387 | 353 | ||
diff --git a/drivers/md/linear.h b/drivers/md/linear.h index bf8179587f95..0ce29b61605a 100644 --- a/drivers/md/linear.h +++ b/drivers/md/linear.h | |||
@@ -3,27 +3,19 @@ | |||
3 | 3 | ||
4 | struct dev_info { | 4 | struct dev_info { |
5 | mdk_rdev_t *rdev; | 5 | mdk_rdev_t *rdev; |
6 | sector_t num_sectors; | 6 | sector_t end_sector; |
7 | sector_t start_sector; | ||
8 | }; | 7 | }; |
9 | 8 | ||
10 | typedef struct dev_info dev_info_t; | 9 | typedef struct dev_info dev_info_t; |
11 | 10 | ||
12 | struct linear_private_data | 11 | struct linear_private_data |
13 | { | 12 | { |
14 | struct linear_private_data *prev; /* earlier version */ | ||
15 | dev_info_t **hash_table; | ||
16 | sector_t spacing; | ||
17 | sector_t array_sectors; | 13 | sector_t array_sectors; |
18 | int sector_shift; /* shift before dividing | ||
19 | * by spacing | ||
20 | */ | ||
21 | dev_info_t disks[0]; | 14 | dev_info_t disks[0]; |
15 | struct rcu_head rcu; | ||
22 | }; | 16 | }; |
23 | 17 | ||
24 | 18 | ||
25 | typedef struct linear_private_data linear_conf_t; | 19 | typedef struct linear_private_data linear_conf_t; |
26 | 20 | ||
27 | #define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private) | ||
28 | |||
29 | #endif | 21 | #endif |
diff --git a/drivers/md/md.c b/drivers/md/md.c index 641b211fe3fe..09be637d52cb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -440,15 +440,6 @@ static inline sector_t calc_dev_sboffset(struct block_device *bdev) | |||
440 | return MD_NEW_SIZE_SECTORS(num_sectors); | 440 | return MD_NEW_SIZE_SECTORS(num_sectors); |
441 | } | 441 | } |
442 | 442 | ||
443 | static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size) | ||
444 | { | ||
445 | sector_t num_sectors = rdev->sb_start; | ||
446 | |||
447 | if (chunk_size) | ||
448 | num_sectors &= ~((sector_t)chunk_size/512 - 1); | ||
449 | return num_sectors; | ||
450 | } | ||
451 | |||
452 | static int alloc_disk_sb(mdk_rdev_t * rdev) | 443 | static int alloc_disk_sb(mdk_rdev_t * rdev) |
453 | { | 444 | { |
454 | if (rdev->sb_page) | 445 | if (rdev->sb_page) |
@@ -745,6 +736,24 @@ struct super_type { | |||
745 | }; | 736 | }; |
746 | 737 | ||
747 | /* | 738 | /* |
739 | * Check that the given mddev has no bitmap. | ||
740 | * | ||
741 | * This function is called from the run method of all personalities that do not | ||
742 | * support bitmaps. It prints an error message and returns non-zero if mddev | ||
743 | * has a bitmap. Otherwise, it returns 0. | ||
744 | * | ||
745 | */ | ||
746 | int md_check_no_bitmap(mddev_t *mddev) | ||
747 | { | ||
748 | if (!mddev->bitmap_file && !mddev->bitmap_offset) | ||
749 | return 0; | ||
750 | printk(KERN_ERR "%s: bitmaps are not supported for %s\n", | ||
751 | mdname(mddev), mddev->pers->name); | ||
752 | return 1; | ||
753 | } | ||
754 | EXPORT_SYMBOL(md_check_no_bitmap); | ||
755 | |||
756 | /* | ||
748 | * load_super for 0.90.0 | 757 | * load_super for 0.90.0 |
749 | */ | 758 | */ |
750 | static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | 759 | static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) |
@@ -797,17 +806,6 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
797 | rdev->data_offset = 0; | 806 | rdev->data_offset = 0; |
798 | rdev->sb_size = MD_SB_BYTES; | 807 | rdev->sb_size = MD_SB_BYTES; |
799 | 808 | ||
800 | if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) { | ||
801 | if (sb->level != 1 && sb->level != 4 | ||
802 | && sb->level != 5 && sb->level != 6 | ||
803 | && sb->level != 10) { | ||
804 | /* FIXME use a better test */ | ||
805 | printk(KERN_WARNING | ||
806 | "md: bitmaps not supported for this level.\n"); | ||
807 | goto abort; | ||
808 | } | ||
809 | } | ||
810 | |||
811 | if (sb->level == LEVEL_MULTIPATH) | 809 | if (sb->level == LEVEL_MULTIPATH) |
812 | rdev->desc_nr = -1; | 810 | rdev->desc_nr = -1; |
813 | else | 811 | else |
@@ -836,7 +834,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
836 | else | 834 | else |
837 | ret = 0; | 835 | ret = 0; |
838 | } | 836 | } |
839 | rdev->sectors = calc_num_sectors(rdev, sb->chunk_size); | 837 | rdev->sectors = rdev->sb_start; |
840 | 838 | ||
841 | if (rdev->sectors < sb->size * 2 && sb->level > 1) | 839 | if (rdev->sectors < sb->size * 2 && sb->level > 1) |
842 | /* "this cannot possibly happen" ... */ | 840 | /* "this cannot possibly happen" ... */ |
@@ -866,7 +864,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
866 | mddev->minor_version = sb->minor_version; | 864 | mddev->minor_version = sb->minor_version; |
867 | mddev->patch_version = sb->patch_version; | 865 | mddev->patch_version = sb->patch_version; |
868 | mddev->external = 0; | 866 | mddev->external = 0; |
869 | mddev->chunk_size = sb->chunk_size; | 867 | mddev->chunk_sectors = sb->chunk_size >> 9; |
870 | mddev->ctime = sb->ctime; | 868 | mddev->ctime = sb->ctime; |
871 | mddev->utime = sb->utime; | 869 | mddev->utime = sb->utime; |
872 | mddev->level = sb->level; | 870 | mddev->level = sb->level; |
@@ -883,13 +881,13 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
883 | mddev->delta_disks = sb->delta_disks; | 881 | mddev->delta_disks = sb->delta_disks; |
884 | mddev->new_level = sb->new_level; | 882 | mddev->new_level = sb->new_level; |
885 | mddev->new_layout = sb->new_layout; | 883 | mddev->new_layout = sb->new_layout; |
886 | mddev->new_chunk = sb->new_chunk; | 884 | mddev->new_chunk_sectors = sb->new_chunk >> 9; |
887 | } else { | 885 | } else { |
888 | mddev->reshape_position = MaxSector; | 886 | mddev->reshape_position = MaxSector; |
889 | mddev->delta_disks = 0; | 887 | mddev->delta_disks = 0; |
890 | mddev->new_level = mddev->level; | 888 | mddev->new_level = mddev->level; |
891 | mddev->new_layout = mddev->layout; | 889 | mddev->new_layout = mddev->layout; |
892 | mddev->new_chunk = mddev->chunk_size; | 890 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
893 | } | 891 | } |
894 | 892 | ||
895 | if (sb->state & (1<<MD_SB_CLEAN)) | 893 | if (sb->state & (1<<MD_SB_CLEAN)) |
@@ -1004,7 +1002,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1004 | sb->new_level = mddev->new_level; | 1002 | sb->new_level = mddev->new_level; |
1005 | sb->delta_disks = mddev->delta_disks; | 1003 | sb->delta_disks = mddev->delta_disks; |
1006 | sb->new_layout = mddev->new_layout; | 1004 | sb->new_layout = mddev->new_layout; |
1007 | sb->new_chunk = mddev->new_chunk; | 1005 | sb->new_chunk = mddev->new_chunk_sectors << 9; |
1008 | } | 1006 | } |
1009 | mddev->minor_version = sb->minor_version; | 1007 | mddev->minor_version = sb->minor_version; |
1010 | if (mddev->in_sync) | 1008 | if (mddev->in_sync) |
@@ -1018,7 +1016,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1018 | sb->recovery_cp = 0; | 1016 | sb->recovery_cp = 0; |
1019 | 1017 | ||
1020 | sb->layout = mddev->layout; | 1018 | sb->layout = mddev->layout; |
1021 | sb->chunk_size = mddev->chunk_size; | 1019 | sb->chunk_size = mddev->chunk_sectors << 9; |
1022 | 1020 | ||
1023 | if (mddev->bitmap && mddev->bitmap_file == NULL) | 1021 | if (mddev->bitmap && mddev->bitmap_file == NULL) |
1024 | sb->state |= (1<<MD_SB_BITMAP_PRESENT); | 1022 | sb->state |= (1<<MD_SB_BITMAP_PRESENT); |
@@ -1185,24 +1183,13 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
1185 | bdevname(rdev->bdev,b)); | 1183 | bdevname(rdev->bdev,b)); |
1186 | return -EINVAL; | 1184 | return -EINVAL; |
1187 | } | 1185 | } |
1188 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) { | ||
1189 | if (sb->level != cpu_to_le32(1) && | ||
1190 | sb->level != cpu_to_le32(4) && | ||
1191 | sb->level != cpu_to_le32(5) && | ||
1192 | sb->level != cpu_to_le32(6) && | ||
1193 | sb->level != cpu_to_le32(10)) { | ||
1194 | printk(KERN_WARNING | ||
1195 | "md: bitmaps not supported for this level.\n"); | ||
1196 | return -EINVAL; | ||
1197 | } | ||
1198 | } | ||
1199 | 1186 | ||
1200 | rdev->preferred_minor = 0xffff; | 1187 | rdev->preferred_minor = 0xffff; |
1201 | rdev->data_offset = le64_to_cpu(sb->data_offset); | 1188 | rdev->data_offset = le64_to_cpu(sb->data_offset); |
1202 | atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); | 1189 | atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); |
1203 | 1190 | ||
1204 | rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; | 1191 | rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; |
1205 | bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; | 1192 | bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; |
1206 | if (rdev->sb_size & bmask) | 1193 | if (rdev->sb_size & bmask) |
1207 | rdev->sb_size = (rdev->sb_size | bmask) + 1; | 1194 | rdev->sb_size = (rdev->sb_size | bmask) + 1; |
1208 | 1195 | ||
@@ -1248,9 +1235,6 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
1248 | if (rdev->sectors < le64_to_cpu(sb->data_size)) | 1235 | if (rdev->sectors < le64_to_cpu(sb->data_size)) |
1249 | return -EINVAL; | 1236 | return -EINVAL; |
1250 | rdev->sectors = le64_to_cpu(sb->data_size); | 1237 | rdev->sectors = le64_to_cpu(sb->data_size); |
1251 | if (le32_to_cpu(sb->chunksize)) | ||
1252 | rdev->sectors &= ~((sector_t)le32_to_cpu(sb->chunksize) - 1); | ||
1253 | |||
1254 | if (le64_to_cpu(sb->size) > rdev->sectors) | 1238 | if (le64_to_cpu(sb->size) > rdev->sectors) |
1255 | return -EINVAL; | 1239 | return -EINVAL; |
1256 | return ret; | 1240 | return ret; |
@@ -1271,7 +1255,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1271 | mddev->major_version = 1; | 1255 | mddev->major_version = 1; |
1272 | mddev->patch_version = 0; | 1256 | mddev->patch_version = 0; |
1273 | mddev->external = 0; | 1257 | mddev->external = 0; |
1274 | mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; | 1258 | mddev->chunk_sectors = le32_to_cpu(sb->chunksize); |
1275 | mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); | 1259 | mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); |
1276 | mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); | 1260 | mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); |
1277 | mddev->level = le32_to_cpu(sb->level); | 1261 | mddev->level = le32_to_cpu(sb->level); |
@@ -1297,13 +1281,13 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1297 | mddev->delta_disks = le32_to_cpu(sb->delta_disks); | 1281 | mddev->delta_disks = le32_to_cpu(sb->delta_disks); |
1298 | mddev->new_level = le32_to_cpu(sb->new_level); | 1282 | mddev->new_level = le32_to_cpu(sb->new_level); |
1299 | mddev->new_layout = le32_to_cpu(sb->new_layout); | 1283 | mddev->new_layout = le32_to_cpu(sb->new_layout); |
1300 | mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9; | 1284 | mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); |
1301 | } else { | 1285 | } else { |
1302 | mddev->reshape_position = MaxSector; | 1286 | mddev->reshape_position = MaxSector; |
1303 | mddev->delta_disks = 0; | 1287 | mddev->delta_disks = 0; |
1304 | mddev->new_level = mddev->level; | 1288 | mddev->new_level = mddev->level; |
1305 | mddev->new_layout = mddev->layout; | 1289 | mddev->new_layout = mddev->layout; |
1306 | mddev->new_chunk = mddev->chunk_size; | 1290 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
1307 | } | 1291 | } |
1308 | 1292 | ||
1309 | } else if (mddev->pers == NULL) { | 1293 | } else if (mddev->pers == NULL) { |
@@ -1375,7 +1359,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1375 | 1359 | ||
1376 | sb->raid_disks = cpu_to_le32(mddev->raid_disks); | 1360 | sb->raid_disks = cpu_to_le32(mddev->raid_disks); |
1377 | sb->size = cpu_to_le64(mddev->dev_sectors); | 1361 | sb->size = cpu_to_le64(mddev->dev_sectors); |
1378 | sb->chunksize = cpu_to_le32(mddev->chunk_size >> 9); | 1362 | sb->chunksize = cpu_to_le32(mddev->chunk_sectors); |
1379 | sb->level = cpu_to_le32(mddev->level); | 1363 | sb->level = cpu_to_le32(mddev->level); |
1380 | sb->layout = cpu_to_le32(mddev->layout); | 1364 | sb->layout = cpu_to_le32(mddev->layout); |
1381 | 1365 | ||
@@ -1402,7 +1386,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1402 | sb->new_layout = cpu_to_le32(mddev->new_layout); | 1386 | sb->new_layout = cpu_to_le32(mddev->new_layout); |
1403 | sb->delta_disks = cpu_to_le32(mddev->delta_disks); | 1387 | sb->delta_disks = cpu_to_le32(mddev->delta_disks); |
1404 | sb->new_level = cpu_to_le32(mddev->new_level); | 1388 | sb->new_level = cpu_to_le32(mddev->new_level); |
1405 | sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); | 1389 | sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); |
1406 | } | 1390 | } |
1407 | 1391 | ||
1408 | max_dev = 0; | 1392 | max_dev = 0; |
@@ -1897,6 +1881,7 @@ static void md_update_sb(mddev_t * mddev, int force_change) | |||
1897 | int sync_req; | 1881 | int sync_req; |
1898 | int nospares = 0; | 1882 | int nospares = 0; |
1899 | 1883 | ||
1884 | mddev->utime = get_seconds(); | ||
1900 | if (mddev->external) | 1885 | if (mddev->external) |
1901 | return; | 1886 | return; |
1902 | repeat: | 1887 | repeat: |
@@ -1926,7 +1911,6 @@ repeat: | |||
1926 | nospares = 0; | 1911 | nospares = 0; |
1927 | 1912 | ||
1928 | sync_req = mddev->in_sync; | 1913 | sync_req = mddev->in_sync; |
1929 | mddev->utime = get_seconds(); | ||
1930 | 1914 | ||
1931 | /* If this is just a dirty<->clean transition, and the array is clean | 1915 | /* If this is just a dirty<->clean transition, and the array is clean |
1932 | * and 'events' is odd, we can roll back to the previous clean state */ | 1916 | * and 'events' is odd, we can roll back to the previous clean state */ |
@@ -2597,15 +2581,6 @@ static void analyze_sbs(mddev_t * mddev) | |||
2597 | clear_bit(In_sync, &rdev->flags); | 2581 | clear_bit(In_sync, &rdev->flags); |
2598 | } | 2582 | } |
2599 | } | 2583 | } |
2600 | |||
2601 | |||
2602 | |||
2603 | if (mddev->recovery_cp != MaxSector && | ||
2604 | mddev->level >= 1) | ||
2605 | printk(KERN_ERR "md: %s: raid array is not clean" | ||
2606 | " -- starting background reconstruction\n", | ||
2607 | mdname(mddev)); | ||
2608 | |||
2609 | } | 2584 | } |
2610 | 2585 | ||
2611 | static void md_safemode_timeout(unsigned long data); | 2586 | static void md_safemode_timeout(unsigned long data); |
@@ -2746,7 +2721,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
2746 | if (IS_ERR(priv)) { | 2721 | if (IS_ERR(priv)) { |
2747 | mddev->new_level = mddev->level; | 2722 | mddev->new_level = mddev->level; |
2748 | mddev->new_layout = mddev->layout; | 2723 | mddev->new_layout = mddev->layout; |
2749 | mddev->new_chunk = mddev->chunk_size; | 2724 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
2750 | mddev->raid_disks -= mddev->delta_disks; | 2725 | mddev->raid_disks -= mddev->delta_disks; |
2751 | mddev->delta_disks = 0; | 2726 | mddev->delta_disks = 0; |
2752 | module_put(pers->owner); | 2727 | module_put(pers->owner); |
@@ -2764,7 +2739,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
2764 | strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); | 2739 | strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); |
2765 | mddev->level = mddev->new_level; | 2740 | mddev->level = mddev->new_level; |
2766 | mddev->layout = mddev->new_layout; | 2741 | mddev->layout = mddev->new_layout; |
2767 | mddev->chunk_size = mddev->new_chunk; | 2742 | mddev->chunk_sectors = mddev->new_chunk_sectors; |
2768 | mddev->delta_disks = 0; | 2743 | mddev->delta_disks = 0; |
2769 | pers->run(mddev); | 2744 | pers->run(mddev); |
2770 | mddev_resume(mddev); | 2745 | mddev_resume(mddev); |
@@ -2800,11 +2775,14 @@ layout_store(mddev_t *mddev, const char *buf, size_t len) | |||
2800 | 2775 | ||
2801 | if (mddev->pers) { | 2776 | if (mddev->pers) { |
2802 | int err; | 2777 | int err; |
2803 | if (mddev->pers->reconfig == NULL) | 2778 | if (mddev->pers->check_reshape == NULL) |
2804 | return -EBUSY; | 2779 | return -EBUSY; |
2805 | err = mddev->pers->reconfig(mddev, n, -1); | 2780 | mddev->new_layout = n; |
2806 | if (err) | 2781 | err = mddev->pers->check_reshape(mddev); |
2782 | if (err) { | ||
2783 | mddev->new_layout = mddev->layout; | ||
2807 | return err; | 2784 | return err; |
2785 | } | ||
2808 | } else { | 2786 | } else { |
2809 | mddev->new_layout = n; | 2787 | mddev->new_layout = n; |
2810 | if (mddev->reshape_position == MaxSector) | 2788 | if (mddev->reshape_position == MaxSector) |
@@ -2857,10 +2835,11 @@ static ssize_t | |||
2857 | chunk_size_show(mddev_t *mddev, char *page) | 2835 | chunk_size_show(mddev_t *mddev, char *page) |
2858 | { | 2836 | { |
2859 | if (mddev->reshape_position != MaxSector && | 2837 | if (mddev->reshape_position != MaxSector && |
2860 | mddev->chunk_size != mddev->new_chunk) | 2838 | mddev->chunk_sectors != mddev->new_chunk_sectors) |
2861 | return sprintf(page, "%d (%d)\n", mddev->new_chunk, | 2839 | return sprintf(page, "%d (%d)\n", |
2862 | mddev->chunk_size); | 2840 | mddev->new_chunk_sectors << 9, |
2863 | return sprintf(page, "%d\n", mddev->chunk_size); | 2841 | mddev->chunk_sectors << 9); |
2842 | return sprintf(page, "%d\n", mddev->chunk_sectors << 9); | ||
2864 | } | 2843 | } |
2865 | 2844 | ||
2866 | static ssize_t | 2845 | static ssize_t |
@@ -2874,15 +2853,18 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len) | |||
2874 | 2853 | ||
2875 | if (mddev->pers) { | 2854 | if (mddev->pers) { |
2876 | int err; | 2855 | int err; |
2877 | if (mddev->pers->reconfig == NULL) | 2856 | if (mddev->pers->check_reshape == NULL) |
2878 | return -EBUSY; | 2857 | return -EBUSY; |
2879 | err = mddev->pers->reconfig(mddev, -1, n); | 2858 | mddev->new_chunk_sectors = n >> 9; |
2880 | if (err) | 2859 | err = mddev->pers->check_reshape(mddev); |
2860 | if (err) { | ||
2861 | mddev->new_chunk_sectors = mddev->chunk_sectors; | ||
2881 | return err; | 2862 | return err; |
2863 | } | ||
2882 | } else { | 2864 | } else { |
2883 | mddev->new_chunk = n; | 2865 | mddev->new_chunk_sectors = n >> 9; |
2884 | if (mddev->reshape_position == MaxSector) | 2866 | if (mddev->reshape_position == MaxSector) |
2885 | mddev->chunk_size = n; | 2867 | mddev->chunk_sectors = n >> 9; |
2886 | } | 2868 | } |
2887 | return len; | 2869 | return len; |
2888 | } | 2870 | } |
@@ -3527,8 +3509,9 @@ min_sync_store(mddev_t *mddev, const char *buf, size_t len) | |||
3527 | return -EBUSY; | 3509 | return -EBUSY; |
3528 | 3510 | ||
3529 | /* Must be a multiple of chunk_size */ | 3511 | /* Must be a multiple of chunk_size */ |
3530 | if (mddev->chunk_size) { | 3512 | if (mddev->chunk_sectors) { |
3531 | if (min & (sector_t)((mddev->chunk_size>>9)-1)) | 3513 | sector_t temp = min; |
3514 | if (sector_div(temp, mddev->chunk_sectors)) | ||
3532 | return -EINVAL; | 3515 | return -EINVAL; |
3533 | } | 3516 | } |
3534 | mddev->resync_min = min; | 3517 | mddev->resync_min = min; |
@@ -3564,8 +3547,9 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len) | |||
3564 | return -EBUSY; | 3547 | return -EBUSY; |
3565 | 3548 | ||
3566 | /* Must be a multiple of chunk_size */ | 3549 | /* Must be a multiple of chunk_size */ |
3567 | if (mddev->chunk_size) { | 3550 | if (mddev->chunk_sectors) { |
3568 | if (max & (sector_t)((mddev->chunk_size>>9)-1)) | 3551 | sector_t temp = max; |
3552 | if (sector_div(temp, mddev->chunk_sectors)) | ||
3569 | return -EINVAL; | 3553 | return -EINVAL; |
3570 | } | 3554 | } |
3571 | mddev->resync_max = max; | 3555 | mddev->resync_max = max; |
@@ -3656,7 +3640,7 @@ reshape_position_store(mddev_t *mddev, const char *buf, size_t len) | |||
3656 | mddev->delta_disks = 0; | 3640 | mddev->delta_disks = 0; |
3657 | mddev->new_level = mddev->level; | 3641 | mddev->new_level = mddev->level; |
3658 | mddev->new_layout = mddev->layout; | 3642 | mddev->new_layout = mddev->layout; |
3659 | mddev->new_chunk = mddev->chunk_size; | 3643 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
3660 | return len; | 3644 | return len; |
3661 | } | 3645 | } |
3662 | 3646 | ||
@@ -3976,11 +3960,9 @@ static int start_dirty_degraded; | |||
3976 | static int do_md_run(mddev_t * mddev) | 3960 | static int do_md_run(mddev_t * mddev) |
3977 | { | 3961 | { |
3978 | int err; | 3962 | int err; |
3979 | int chunk_size; | ||
3980 | mdk_rdev_t *rdev; | 3963 | mdk_rdev_t *rdev; |
3981 | struct gendisk *disk; | 3964 | struct gendisk *disk; |
3982 | struct mdk_personality *pers; | 3965 | struct mdk_personality *pers; |
3983 | char b[BDEVNAME_SIZE]; | ||
3984 | 3966 | ||
3985 | if (list_empty(&mddev->disks)) | 3967 | if (list_empty(&mddev->disks)) |
3986 | /* cannot run an array with no devices.. */ | 3968 | /* cannot run an array with no devices.. */ |
@@ -3998,38 +3980,6 @@ static int do_md_run(mddev_t * mddev) | |||
3998 | analyze_sbs(mddev); | 3980 | analyze_sbs(mddev); |
3999 | } | 3981 | } |
4000 | 3982 | ||
4001 | chunk_size = mddev->chunk_size; | ||
4002 | |||
4003 | if (chunk_size) { | ||
4004 | if (chunk_size > MAX_CHUNK_SIZE) { | ||
4005 | printk(KERN_ERR "too big chunk_size: %d > %d\n", | ||
4006 | chunk_size, MAX_CHUNK_SIZE); | ||
4007 | return -EINVAL; | ||
4008 | } | ||
4009 | /* | ||
4010 | * chunk-size has to be a power of 2 | ||
4011 | */ | ||
4012 | if ( (1 << ffz(~chunk_size)) != chunk_size) { | ||
4013 | printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); | ||
4014 | return -EINVAL; | ||
4015 | } | ||
4016 | |||
4017 | /* devices must have minimum size of one chunk */ | ||
4018 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
4019 | if (test_bit(Faulty, &rdev->flags)) | ||
4020 | continue; | ||
4021 | if (rdev->sectors < chunk_size / 512) { | ||
4022 | printk(KERN_WARNING | ||
4023 | "md: Dev %s smaller than chunk_size:" | ||
4024 | " %llu < %d\n", | ||
4025 | bdevname(rdev->bdev,b), | ||
4026 | (unsigned long long)rdev->sectors, | ||
4027 | chunk_size / 512); | ||
4028 | return -EINVAL; | ||
4029 | } | ||
4030 | } | ||
4031 | } | ||
4032 | |||
4033 | if (mddev->level != LEVEL_NONE) | 3983 | if (mddev->level != LEVEL_NONE) |
4034 | request_module("md-level-%d", mddev->level); | 3984 | request_module("md-level-%d", mddev->level); |
4035 | else if (mddev->clevel[0]) | 3985 | else if (mddev->clevel[0]) |
@@ -4405,7 +4355,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
4405 | mddev->flags = 0; | 4355 | mddev->flags = 0; |
4406 | mddev->ro = 0; | 4356 | mddev->ro = 0; |
4407 | mddev->metadata_type[0] = 0; | 4357 | mddev->metadata_type[0] = 0; |
4408 | mddev->chunk_size = 0; | 4358 | mddev->chunk_sectors = 0; |
4409 | mddev->ctime = mddev->utime = 0; | 4359 | mddev->ctime = mddev->utime = 0; |
4410 | mddev->layout = 0; | 4360 | mddev->layout = 0; |
4411 | mddev->max_disks = 0; | 4361 | mddev->max_disks = 0; |
@@ -4413,7 +4363,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
4413 | mddev->delta_disks = 0; | 4363 | mddev->delta_disks = 0; |
4414 | mddev->new_level = LEVEL_NONE; | 4364 | mddev->new_level = LEVEL_NONE; |
4415 | mddev->new_layout = 0; | 4365 | mddev->new_layout = 0; |
4416 | mddev->new_chunk = 0; | 4366 | mddev->new_chunk_sectors = 0; |
4417 | mddev->curr_resync = 0; | 4367 | mddev->curr_resync = 0; |
4418 | mddev->resync_mismatches = 0; | 4368 | mddev->resync_mismatches = 0; |
4419 | mddev->suspend_lo = mddev->suspend_hi = 0; | 4369 | mddev->suspend_lo = mddev->suspend_hi = 0; |
@@ -4618,7 +4568,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg) | |||
4618 | info.spare_disks = spare; | 4568 | info.spare_disks = spare; |
4619 | 4569 | ||
4620 | info.layout = mddev->layout; | 4570 | info.layout = mddev->layout; |
4621 | info.chunk_size = mddev->chunk_size; | 4571 | info.chunk_size = mddev->chunk_sectors << 9; |
4622 | 4572 | ||
4623 | if (copy_to_user(arg, &info, sizeof(info))) | 4573 | if (copy_to_user(arg, &info, sizeof(info))) |
4624 | return -EFAULT; | 4574 | return -EFAULT; |
@@ -4843,7 +4793,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
4843 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; | 4793 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; |
4844 | } else | 4794 | } else |
4845 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); | 4795 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); |
4846 | rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); | 4796 | rdev->sectors = rdev->sb_start; |
4847 | 4797 | ||
4848 | err = bind_rdev_to_array(rdev, mddev); | 4798 | err = bind_rdev_to_array(rdev, mddev); |
4849 | if (err) { | 4799 | if (err) { |
@@ -4913,7 +4863,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) | |||
4913 | else | 4863 | else |
4914 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; | 4864 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; |
4915 | 4865 | ||
4916 | rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); | 4866 | rdev->sectors = rdev->sb_start; |
4917 | 4867 | ||
4918 | if (test_bit(Faulty, &rdev->flags)) { | 4868 | if (test_bit(Faulty, &rdev->flags)) { |
4919 | printk(KERN_WARNING | 4869 | printk(KERN_WARNING |
@@ -5062,7 +5012,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) | |||
5062 | mddev->external = 0; | 5012 | mddev->external = 0; |
5063 | 5013 | ||
5064 | mddev->layout = info->layout; | 5014 | mddev->layout = info->layout; |
5065 | mddev->chunk_size = info->chunk_size; | 5015 | mddev->chunk_sectors = info->chunk_size >> 9; |
5066 | 5016 | ||
5067 | mddev->max_disks = MD_SB_DISKS; | 5017 | mddev->max_disks = MD_SB_DISKS; |
5068 | 5018 | ||
@@ -5081,7 +5031,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) | |||
5081 | get_random_bytes(mddev->uuid, 16); | 5031 | get_random_bytes(mddev->uuid, 16); |
5082 | 5032 | ||
5083 | mddev->new_level = mddev->level; | 5033 | mddev->new_level = mddev->level; |
5084 | mddev->new_chunk = mddev->chunk_size; | 5034 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
5085 | mddev->new_layout = mddev->layout; | 5035 | mddev->new_layout = mddev->layout; |
5086 | mddev->delta_disks = 0; | 5036 | mddev->delta_disks = 0; |
5087 | 5037 | ||
@@ -5191,7 +5141,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | |||
5191 | mddev->level != info->level || | 5141 | mddev->level != info->level || |
5192 | /* mddev->layout != info->layout || */ | 5142 | /* mddev->layout != info->layout || */ |
5193 | !mddev->persistent != info->not_persistent|| | 5143 | !mddev->persistent != info->not_persistent|| |
5194 | mddev->chunk_size != info->chunk_size || | 5144 | mddev->chunk_sectors != info->chunk_size >> 9 || |
5195 | /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ | 5145 | /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ |
5196 | ((state^info->state) & 0xfffffe00) | 5146 | ((state^info->state) & 0xfffffe00) |
5197 | ) | 5147 | ) |
@@ -5215,10 +5165,15 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | |||
5215 | * we don't need to do anything at the md level, the | 5165 | * we don't need to do anything at the md level, the |
5216 | * personality will take care of it all. | 5166 | * personality will take care of it all. |
5217 | */ | 5167 | */ |
5218 | if (mddev->pers->reconfig == NULL) | 5168 | if (mddev->pers->check_reshape == NULL) |
5219 | return -EINVAL; | 5169 | return -EINVAL; |
5220 | else | 5170 | else { |
5221 | return mddev->pers->reconfig(mddev, info->layout, -1); | 5171 | mddev->new_layout = info->layout; |
5172 | rv = mddev->pers->check_reshape(mddev); | ||
5173 | if (rv) | ||
5174 | mddev->new_layout = mddev->layout; | ||
5175 | return rv; | ||
5176 | } | ||
5222 | } | 5177 | } |
5223 | if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) | 5178 | if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) |
5224 | rv = update_size(mddev, (sector_t)info->size * 2); | 5179 | rv = update_size(mddev, (sector_t)info->size * 2); |
@@ -6717,7 +6672,8 @@ void md_check_recovery(mddev_t *mddev) | |||
6717 | */ | 6672 | */ |
6718 | 6673 | ||
6719 | if (mddev->reshape_position != MaxSector) { | 6674 | if (mddev->reshape_position != MaxSector) { |
6720 | if (mddev->pers->check_reshape(mddev) != 0) | 6675 | if (mddev->pers->check_reshape == NULL || |
6676 | mddev->pers->check_reshape(mddev) != 0) | ||
6721 | /* Cannot proceed */ | 6677 | /* Cannot proceed */ |
6722 | goto unlock; | 6678 | goto unlock; |
6723 | set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | 6679 | set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); |
diff --git a/drivers/md/md.h b/drivers/md/md.h index 8227ab909d44..9430a110db93 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
@@ -30,13 +30,6 @@ typedef struct mddev_s mddev_t; | |||
30 | typedef struct mdk_rdev_s mdk_rdev_t; | 30 | typedef struct mdk_rdev_s mdk_rdev_t; |
31 | 31 | ||
32 | /* | 32 | /* |
33 | * options passed in raidrun: | ||
34 | */ | ||
35 | |||
36 | /* Currently this must fit in an 'int' */ | ||
37 | #define MAX_CHUNK_SIZE (1<<30) | ||
38 | |||
39 | /* | ||
40 | * MD's 'extended' device | 33 | * MD's 'extended' device |
41 | */ | 34 | */ |
42 | struct mdk_rdev_s | 35 | struct mdk_rdev_s |
@@ -145,7 +138,7 @@ struct mddev_s | |||
145 | int external; /* metadata is | 138 | int external; /* metadata is |
146 | * managed externally */ | 139 | * managed externally */ |
147 | char metadata_type[17]; /* externally set*/ | 140 | char metadata_type[17]; /* externally set*/ |
148 | int chunk_size; | 141 | int chunk_sectors; |
149 | time_t ctime, utime; | 142 | time_t ctime, utime; |
150 | int level, layout; | 143 | int level, layout; |
151 | char clevel[16]; | 144 | char clevel[16]; |
@@ -166,7 +159,8 @@ struct mddev_s | |||
166 | * If reshape_position is MaxSector, then no reshape is happening (yet). | 159 | * If reshape_position is MaxSector, then no reshape is happening (yet). |
167 | */ | 160 | */ |
168 | sector_t reshape_position; | 161 | sector_t reshape_position; |
169 | int delta_disks, new_level, new_layout, new_chunk; | 162 | int delta_disks, new_level, new_layout; |
163 | int new_chunk_sectors; | ||
170 | 164 | ||
171 | struct mdk_thread_s *thread; /* management thread */ | 165 | struct mdk_thread_s *thread; /* management thread */ |
172 | struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ | 166 | struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ |
@@ -325,7 +319,6 @@ struct mdk_personality | |||
325 | int (*check_reshape) (mddev_t *mddev); | 319 | int (*check_reshape) (mddev_t *mddev); |
326 | int (*start_reshape) (mddev_t *mddev); | 320 | int (*start_reshape) (mddev_t *mddev); |
327 | void (*finish_reshape) (mddev_t *mddev); | 321 | void (*finish_reshape) (mddev_t *mddev); |
328 | int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); | ||
329 | /* quiesce moves between quiescence states | 322 | /* quiesce moves between quiescence states |
330 | * 0 - fully active | 323 | * 0 - fully active |
331 | * 1 - no new requests allowed | 324 | * 1 - no new requests allowed |
@@ -437,5 +430,6 @@ extern void md_new_event(mddev_t *mddev); | |||
437 | extern int md_allow_write(mddev_t *mddev); | 430 | extern int md_allow_write(mddev_t *mddev); |
438 | extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); | 431 | extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); |
439 | extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); | 432 | extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); |
433 | extern int md_check_no_bitmap(mddev_t *mddev); | ||
440 | 434 | ||
441 | #endif /* _MD_MD_H */ | 435 | #endif /* _MD_MD_H */ |
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 41ced0cbe823..cbe368fa6598 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c | |||
@@ -58,7 +58,7 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh) | |||
58 | { | 58 | { |
59 | unsigned long flags; | 59 | unsigned long flags; |
60 | mddev_t *mddev = mp_bh->mddev; | 60 | mddev_t *mddev = mp_bh->mddev; |
61 | multipath_conf_t *conf = mddev_to_conf(mddev); | 61 | multipath_conf_t *conf = mddev->private; |
62 | 62 | ||
63 | spin_lock_irqsave(&conf->device_lock, flags); | 63 | spin_lock_irqsave(&conf->device_lock, flags); |
64 | list_add(&mp_bh->retry_list, &conf->retry_list); | 64 | list_add(&mp_bh->retry_list, &conf->retry_list); |
@@ -75,7 +75,7 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh) | |||
75 | static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) | 75 | static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) |
76 | { | 76 | { |
77 | struct bio *bio = mp_bh->master_bio; | 77 | struct bio *bio = mp_bh->master_bio; |
78 | multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); | 78 | multipath_conf_t *conf = mp_bh->mddev->private; |
79 | 79 | ||
80 | bio_endio(bio, err); | 80 | bio_endio(bio, err); |
81 | mempool_free(mp_bh, conf->pool); | 81 | mempool_free(mp_bh, conf->pool); |
@@ -85,7 +85,7 @@ static void multipath_end_request(struct bio *bio, int error) | |||
85 | { | 85 | { |
86 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 86 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
87 | struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); | 87 | struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); |
88 | multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); | 88 | multipath_conf_t *conf = mp_bh->mddev->private; |
89 | mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; | 89 | mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; |
90 | 90 | ||
91 | if (uptodate) | 91 | if (uptodate) |
@@ -107,7 +107,7 @@ static void multipath_end_request(struct bio *bio, int error) | |||
107 | 107 | ||
108 | static void unplug_slaves(mddev_t *mddev) | 108 | static void unplug_slaves(mddev_t *mddev) |
109 | { | 109 | { |
110 | multipath_conf_t *conf = mddev_to_conf(mddev); | 110 | multipath_conf_t *conf = mddev->private; |
111 | int i; | 111 | int i; |
112 | 112 | ||
113 | rcu_read_lock(); | 113 | rcu_read_lock(); |
@@ -138,7 +138,7 @@ static void multipath_unplug(struct request_queue *q) | |||
138 | static int multipath_make_request (struct request_queue *q, struct bio * bio) | 138 | static int multipath_make_request (struct request_queue *q, struct bio * bio) |
139 | { | 139 | { |
140 | mddev_t *mddev = q->queuedata; | 140 | mddev_t *mddev = q->queuedata; |
141 | multipath_conf_t *conf = mddev_to_conf(mddev); | 141 | multipath_conf_t *conf = mddev->private; |
142 | struct multipath_bh * mp_bh; | 142 | struct multipath_bh * mp_bh; |
143 | struct multipath_info *multipath; | 143 | struct multipath_info *multipath; |
144 | const int rw = bio_data_dir(bio); | 144 | const int rw = bio_data_dir(bio); |
@@ -180,7 +180,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio) | |||
180 | 180 | ||
181 | static void multipath_status (struct seq_file *seq, mddev_t *mddev) | 181 | static void multipath_status (struct seq_file *seq, mddev_t *mddev) |
182 | { | 182 | { |
183 | multipath_conf_t *conf = mddev_to_conf(mddev); | 183 | multipath_conf_t *conf = mddev->private; |
184 | int i; | 184 | int i; |
185 | 185 | ||
186 | seq_printf (seq, " [%d/%d] [", conf->raid_disks, | 186 | seq_printf (seq, " [%d/%d] [", conf->raid_disks, |
@@ -195,7 +195,7 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev) | |||
195 | static int multipath_congested(void *data, int bits) | 195 | static int multipath_congested(void *data, int bits) |
196 | { | 196 | { |
197 | mddev_t *mddev = data; | 197 | mddev_t *mddev = data; |
198 | multipath_conf_t *conf = mddev_to_conf(mddev); | 198 | multipath_conf_t *conf = mddev->private; |
199 | int i, ret = 0; | 199 | int i, ret = 0; |
200 | 200 | ||
201 | rcu_read_lock(); | 201 | rcu_read_lock(); |
@@ -220,7 +220,7 @@ static int multipath_congested(void *data, int bits) | |||
220 | */ | 220 | */ |
221 | static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) | 221 | static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) |
222 | { | 222 | { |
223 | multipath_conf_t *conf = mddev_to_conf(mddev); | 223 | multipath_conf_t *conf = mddev->private; |
224 | 224 | ||
225 | if (conf->working_disks <= 1) { | 225 | if (conf->working_disks <= 1) { |
226 | /* | 226 | /* |
@@ -303,7 +303,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
303 | * merge_bvec_fn will be involved in multipath.) | 303 | * merge_bvec_fn will be involved in multipath.) |
304 | */ | 304 | */ |
305 | if (q->merge_bvec_fn && | 305 | if (q->merge_bvec_fn && |
306 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 306 | queue_max_sectors(q) > (PAGE_SIZE>>9)) |
307 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 307 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
308 | 308 | ||
309 | conf->working_disks++; | 309 | conf->working_disks++; |
@@ -367,7 +367,7 @@ static void multipathd (mddev_t *mddev) | |||
367 | struct multipath_bh *mp_bh; | 367 | struct multipath_bh *mp_bh; |
368 | struct bio *bio; | 368 | struct bio *bio; |
369 | unsigned long flags; | 369 | unsigned long flags; |
370 | multipath_conf_t *conf = mddev_to_conf(mddev); | 370 | multipath_conf_t *conf = mddev->private; |
371 | struct list_head *head = &conf->retry_list; | 371 | struct list_head *head = &conf->retry_list; |
372 | 372 | ||
373 | md_check_recovery(mddev); | 373 | md_check_recovery(mddev); |
@@ -421,6 +421,9 @@ static int multipath_run (mddev_t *mddev) | |||
421 | struct multipath_info *disk; | 421 | struct multipath_info *disk; |
422 | mdk_rdev_t *rdev; | 422 | mdk_rdev_t *rdev; |
423 | 423 | ||
424 | if (md_check_no_bitmap(mddev)) | ||
425 | return -EINVAL; | ||
426 | |||
424 | if (mddev->level != LEVEL_MULTIPATH) { | 427 | if (mddev->level != LEVEL_MULTIPATH) { |
425 | printk("multipath: %s: raid level not set to multipath IO (%d)\n", | 428 | printk("multipath: %s: raid level not set to multipath IO (%d)\n", |
426 | mdname(mddev), mddev->level); | 429 | mdname(mddev), mddev->level); |
@@ -467,7 +470,7 @@ static int multipath_run (mddev_t *mddev) | |||
467 | * violating it, not that we ever expect a device with | 470 | * violating it, not that we ever expect a device with |
468 | * a merge_bvec_fn to be involved in multipath */ | 471 | * a merge_bvec_fn to be involved in multipath */ |
469 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 472 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && |
470 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 473 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) |
471 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 474 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
472 | 475 | ||
473 | if (!test_bit(Faulty, &rdev->flags)) | 476 | if (!test_bit(Faulty, &rdev->flags)) |
@@ -531,7 +534,7 @@ out: | |||
531 | 534 | ||
532 | static int multipath_stop (mddev_t *mddev) | 535 | static int multipath_stop (mddev_t *mddev) |
533 | { | 536 | { |
534 | multipath_conf_t *conf = mddev_to_conf(mddev); | 537 | multipath_conf_t *conf = mddev->private; |
535 | 538 | ||
536 | md_unregister_thread(mddev->thread); | 539 | md_unregister_thread(mddev->thread); |
537 | mddev->thread = NULL; | 540 | mddev->thread = NULL; |
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h index 6fa70b400cda..d1c2a8d78395 100644 --- a/drivers/md/multipath.h +++ b/drivers/md/multipath.h | |||
@@ -19,12 +19,6 @@ struct multipath_private_data { | |||
19 | typedef struct multipath_private_data multipath_conf_t; | 19 | typedef struct multipath_private_data multipath_conf_t; |
20 | 20 | ||
21 | /* | 21 | /* |
22 | * this is the only point in the RAID code where we violate | ||
23 | * C type safety. mddev->private is an 'opaque' pointer. | ||
24 | */ | ||
25 | #define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private) | ||
26 | |||
27 | /* | ||
28 | * this is our 'private' 'collective' MULTIPATH buffer head. | 22 | * this is our 'private' 'collective' MULTIPATH buffer head. |
29 | * it contains information about what kind of IO operations were started | 23 | * it contains information about what kind of IO operations were started |
30 | * for this MULTIPATH operation, and about their status: | 24 | * for this MULTIPATH operation, and about their status: |
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c08d7559be55..ab4a489d8695 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
@@ -26,8 +26,8 @@ | |||
26 | static void raid0_unplug(struct request_queue *q) | 26 | static void raid0_unplug(struct request_queue *q) |
27 | { | 27 | { |
28 | mddev_t *mddev = q->queuedata; | 28 | mddev_t *mddev = q->queuedata; |
29 | raid0_conf_t *conf = mddev_to_conf(mddev); | 29 | raid0_conf_t *conf = mddev->private; |
30 | mdk_rdev_t **devlist = conf->strip_zone[0].dev; | 30 | mdk_rdev_t **devlist = conf->devlist; |
31 | int i; | 31 | int i; |
32 | 32 | ||
33 | for (i=0; i<mddev->raid_disks; i++) { | 33 | for (i=0; i<mddev->raid_disks; i++) { |
@@ -40,8 +40,8 @@ static void raid0_unplug(struct request_queue *q) | |||
40 | static int raid0_congested(void *data, int bits) | 40 | static int raid0_congested(void *data, int bits) |
41 | { | 41 | { |
42 | mddev_t *mddev = data; | 42 | mddev_t *mddev = data; |
43 | raid0_conf_t *conf = mddev_to_conf(mddev); | 43 | raid0_conf_t *conf = mddev->private; |
44 | mdk_rdev_t **devlist = conf->strip_zone[0].dev; | 44 | mdk_rdev_t **devlist = conf->devlist; |
45 | int i, ret = 0; | 45 | int i, ret = 0; |
46 | 46 | ||
47 | for (i = 0; i < mddev->raid_disks && !ret ; i++) { | 47 | for (i = 0; i < mddev->raid_disks && !ret ; i++) { |
@@ -52,27 +52,60 @@ static int raid0_congested(void *data, int bits) | |||
52 | return ret; | 52 | return ret; |
53 | } | 53 | } |
54 | 54 | ||
55 | /* | ||
56 | * inform the user of the raid configuration | ||
57 | */ | ||
58 | static void dump_zones(mddev_t *mddev) | ||
59 | { | ||
60 | int j, k, h; | ||
61 | sector_t zone_size = 0; | ||
62 | sector_t zone_start = 0; | ||
63 | char b[BDEVNAME_SIZE]; | ||
64 | raid0_conf_t *conf = mddev->private; | ||
65 | printk(KERN_INFO "******* %s configuration *********\n", | ||
66 | mdname(mddev)); | ||
67 | h = 0; | ||
68 | for (j = 0; j < conf->nr_strip_zones; j++) { | ||
69 | printk(KERN_INFO "zone%d=[", j); | ||
70 | for (k = 0; k < conf->strip_zone[j].nb_dev; k++) | ||
71 | printk("%s/", | ||
72 | bdevname(conf->devlist[j*mddev->raid_disks | ||
73 | + k]->bdev, b)); | ||
74 | printk("]\n"); | ||
75 | |||
76 | zone_size = conf->strip_zone[j].zone_end - zone_start; | ||
77 | printk(KERN_INFO " zone offset=%llukb " | ||
78 | "device offset=%llukb size=%llukb\n", | ||
79 | (unsigned long long)zone_start>>1, | ||
80 | (unsigned long long)conf->strip_zone[j].dev_start>>1, | ||
81 | (unsigned long long)zone_size>>1); | ||
82 | zone_start = conf->strip_zone[j].zone_end; | ||
83 | } | ||
84 | printk(KERN_INFO "**********************************\n\n"); | ||
85 | } | ||
55 | 86 | ||
56 | static int create_strip_zones (mddev_t *mddev) | 87 | static int create_strip_zones(mddev_t *mddev) |
57 | { | 88 | { |
58 | int i, c, j; | 89 | int i, c, j, err; |
59 | sector_t current_start, curr_zone_start; | 90 | sector_t curr_zone_end, sectors; |
60 | sector_t min_spacing; | 91 | mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev, **dev; |
61 | raid0_conf_t *conf = mddev_to_conf(mddev); | ||
62 | mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev; | ||
63 | struct strip_zone *zone; | 92 | struct strip_zone *zone; |
64 | int cnt; | 93 | int cnt; |
65 | char b[BDEVNAME_SIZE]; | 94 | char b[BDEVNAME_SIZE]; |
66 | 95 | raid0_conf_t *conf = kzalloc(sizeof(*conf), GFP_KERNEL); | |
67 | /* | 96 | |
68 | * The number of 'same size groups' | 97 | if (!conf) |
69 | */ | 98 | return -ENOMEM; |
70 | conf->nr_strip_zones = 0; | ||
71 | |||
72 | list_for_each_entry(rdev1, &mddev->disks, same_set) { | 99 | list_for_each_entry(rdev1, &mddev->disks, same_set) { |
73 | printk(KERN_INFO "raid0: looking at %s\n", | 100 | printk(KERN_INFO "raid0: looking at %s\n", |
74 | bdevname(rdev1->bdev,b)); | 101 | bdevname(rdev1->bdev,b)); |
75 | c = 0; | 102 | c = 0; |
103 | |||
104 | /* round size to chunk_size */ | ||
105 | sectors = rdev1->sectors; | ||
106 | sector_div(sectors, mddev->chunk_sectors); | ||
107 | rdev1->sectors = sectors * mddev->chunk_sectors; | ||
108 | |||
76 | list_for_each_entry(rdev2, &mddev->disks, same_set) { | 109 | list_for_each_entry(rdev2, &mddev->disks, same_set) { |
77 | printk(KERN_INFO "raid0: comparing %s(%llu)", | 110 | printk(KERN_INFO "raid0: comparing %s(%llu)", |
78 | bdevname(rdev1->bdev,b), | 111 | bdevname(rdev1->bdev,b), |
@@ -103,16 +136,16 @@ static int create_strip_zones (mddev_t *mddev) | |||
103 | } | 136 | } |
104 | } | 137 | } |
105 | printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones); | 138 | printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones); |
106 | 139 | err = -ENOMEM; | |
107 | conf->strip_zone = kzalloc(sizeof(struct strip_zone)* | 140 | conf->strip_zone = kzalloc(sizeof(struct strip_zone)* |
108 | conf->nr_strip_zones, GFP_KERNEL); | 141 | conf->nr_strip_zones, GFP_KERNEL); |
109 | if (!conf->strip_zone) | 142 | if (!conf->strip_zone) |
110 | return 1; | 143 | goto abort; |
111 | conf->devlist = kzalloc(sizeof(mdk_rdev_t*)* | 144 | conf->devlist = kzalloc(sizeof(mdk_rdev_t*)* |
112 | conf->nr_strip_zones*mddev->raid_disks, | 145 | conf->nr_strip_zones*mddev->raid_disks, |
113 | GFP_KERNEL); | 146 | GFP_KERNEL); |
114 | if (!conf->devlist) | 147 | if (!conf->devlist) |
115 | return 1; | 148 | goto abort; |
116 | 149 | ||
117 | /* The first zone must contain all devices, so here we check that | 150 | /* The first zone must contain all devices, so here we check that |
118 | * there is a proper alignment of slots to devices and find them all | 151 | * there is a proper alignment of slots to devices and find them all |
@@ -120,7 +153,8 @@ static int create_strip_zones (mddev_t *mddev) | |||
120 | zone = &conf->strip_zone[0]; | 153 | zone = &conf->strip_zone[0]; |
121 | cnt = 0; | 154 | cnt = 0; |
122 | smallest = NULL; | 155 | smallest = NULL; |
123 | zone->dev = conf->devlist; | 156 | dev = conf->devlist; |
157 | err = -EINVAL; | ||
124 | list_for_each_entry(rdev1, &mddev->disks, same_set) { | 158 | list_for_each_entry(rdev1, &mddev->disks, same_set) { |
125 | int j = rdev1->raid_disk; | 159 | int j = rdev1->raid_disk; |
126 | 160 | ||
@@ -129,12 +163,12 @@ static int create_strip_zones (mddev_t *mddev) | |||
129 | "aborting!\n", j); | 163 | "aborting!\n", j); |
130 | goto abort; | 164 | goto abort; |
131 | } | 165 | } |
132 | if (zone->dev[j]) { | 166 | if (dev[j]) { |
133 | printk(KERN_ERR "raid0: multiple devices for %d - " | 167 | printk(KERN_ERR "raid0: multiple devices for %d - " |
134 | "aborting!\n", j); | 168 | "aborting!\n", j); |
135 | goto abort; | 169 | goto abort; |
136 | } | 170 | } |
137 | zone->dev[j] = rdev1; | 171 | dev[j] = rdev1; |
138 | 172 | ||
139 | blk_queue_stack_limits(mddev->queue, | 173 | blk_queue_stack_limits(mddev->queue, |
140 | rdev1->bdev->bd_disk->queue); | 174 | rdev1->bdev->bd_disk->queue); |
@@ -144,7 +178,7 @@ static int create_strip_zones (mddev_t *mddev) | |||
144 | */ | 178 | */ |
145 | 179 | ||
146 | if (rdev1->bdev->bd_disk->queue->merge_bvec_fn && | 180 | if (rdev1->bdev->bd_disk->queue->merge_bvec_fn && |
147 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 181 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) |
148 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 182 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
149 | 183 | ||
150 | if (!smallest || (rdev1->sectors < smallest->sectors)) | 184 | if (!smallest || (rdev1->sectors < smallest->sectors)) |
@@ -157,34 +191,32 @@ static int create_strip_zones (mddev_t *mddev) | |||
157 | goto abort; | 191 | goto abort; |
158 | } | 192 | } |
159 | zone->nb_dev = cnt; | 193 | zone->nb_dev = cnt; |
160 | zone->sectors = smallest->sectors * cnt; | 194 | zone->zone_end = smallest->sectors * cnt; |
161 | zone->zone_start = 0; | ||
162 | 195 | ||
163 | current_start = smallest->sectors; | 196 | curr_zone_end = zone->zone_end; |
164 | curr_zone_start = zone->sectors; | ||
165 | 197 | ||
166 | /* now do the other zones */ | 198 | /* now do the other zones */ |
167 | for (i = 1; i < conf->nr_strip_zones; i++) | 199 | for (i = 1; i < conf->nr_strip_zones; i++) |
168 | { | 200 | { |
169 | zone = conf->strip_zone + i; | 201 | zone = conf->strip_zone + i; |
170 | zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks; | 202 | dev = conf->devlist + i * mddev->raid_disks; |
171 | 203 | ||
172 | printk(KERN_INFO "raid0: zone %d\n", i); | 204 | printk(KERN_INFO "raid0: zone %d\n", i); |
173 | zone->dev_start = current_start; | 205 | zone->dev_start = smallest->sectors; |
174 | smallest = NULL; | 206 | smallest = NULL; |
175 | c = 0; | 207 | c = 0; |
176 | 208 | ||
177 | for (j=0; j<cnt; j++) { | 209 | for (j=0; j<cnt; j++) { |
178 | char b[BDEVNAME_SIZE]; | 210 | char b[BDEVNAME_SIZE]; |
179 | rdev = conf->strip_zone[0].dev[j]; | 211 | rdev = conf->devlist[j]; |
180 | printk(KERN_INFO "raid0: checking %s ...", | 212 | printk(KERN_INFO "raid0: checking %s ...", |
181 | bdevname(rdev->bdev, b)); | 213 | bdevname(rdev->bdev, b)); |
182 | if (rdev->sectors <= current_start) { | 214 | if (rdev->sectors <= zone->dev_start) { |
183 | printk(KERN_INFO " nope.\n"); | 215 | printk(KERN_INFO " nope.\n"); |
184 | continue; | 216 | continue; |
185 | } | 217 | } |
186 | printk(KERN_INFO " contained as device %d\n", c); | 218 | printk(KERN_INFO " contained as device %d\n", c); |
187 | zone->dev[c] = rdev; | 219 | dev[c] = rdev; |
188 | c++; | 220 | c++; |
189 | if (!smallest || rdev->sectors < smallest->sectors) { | 221 | if (!smallest || rdev->sectors < smallest->sectors) { |
190 | smallest = rdev; | 222 | smallest = rdev; |
@@ -194,47 +226,39 @@ static int create_strip_zones (mddev_t *mddev) | |||
194 | } | 226 | } |
195 | 227 | ||
196 | zone->nb_dev = c; | 228 | zone->nb_dev = c; |
197 | zone->sectors = (smallest->sectors - current_start) * c; | 229 | sectors = (smallest->sectors - zone->dev_start) * c; |
198 | printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", | 230 | printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", |
199 | zone->nb_dev, (unsigned long long)zone->sectors); | 231 | zone->nb_dev, (unsigned long long)sectors); |
200 | 232 | ||
201 | zone->zone_start = curr_zone_start; | 233 | curr_zone_end += sectors; |
202 | curr_zone_start += zone->sectors; | 234 | zone->zone_end = curr_zone_end; |
203 | 235 | ||
204 | current_start = smallest->sectors; | ||
205 | printk(KERN_INFO "raid0: current zone start: %llu\n", | 236 | printk(KERN_INFO "raid0: current zone start: %llu\n", |
206 | (unsigned long long)current_start); | 237 | (unsigned long long)smallest->sectors); |
207 | } | ||
208 | |||
209 | /* Now find appropriate hash spacing. | ||
210 | * We want a number which causes most hash entries to cover | ||
211 | * at most two strips, but the hash table must be at most | ||
212 | * 1 PAGE. We choose the smallest strip, or contiguous collection | ||
213 | * of strips, that has big enough size. We never consider the last | ||
214 | * strip though as it's size has no bearing on the efficacy of the hash | ||
215 | * table. | ||
216 | */ | ||
217 | conf->spacing = curr_zone_start; | ||
218 | min_spacing = curr_zone_start; | ||
219 | sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*)); | ||
220 | for (i=0; i < conf->nr_strip_zones-1; i++) { | ||
221 | sector_t s = 0; | ||
222 | for (j = i; j < conf->nr_strip_zones - 1 && | ||
223 | s < min_spacing; j++) | ||
224 | s += conf->strip_zone[j].sectors; | ||
225 | if (s >= min_spacing && s < conf->spacing) | ||
226 | conf->spacing = s; | ||
227 | } | 238 | } |
228 | |||
229 | mddev->queue->unplug_fn = raid0_unplug; | 239 | mddev->queue->unplug_fn = raid0_unplug; |
230 | |||
231 | mddev->queue->backing_dev_info.congested_fn = raid0_congested; | 240 | mddev->queue->backing_dev_info.congested_fn = raid0_congested; |
232 | mddev->queue->backing_dev_info.congested_data = mddev; | 241 | mddev->queue->backing_dev_info.congested_data = mddev; |
233 | 242 | ||
243 | /* | ||
244 | * now since we have the hard sector sizes, we can make sure | ||
245 | * chunk size is a multiple of that sector size | ||
246 | */ | ||
247 | if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) { | ||
248 | printk(KERN_ERR "%s chunk_size of %d not valid\n", | ||
249 | mdname(mddev), | ||
250 | mddev->chunk_sectors << 9); | ||
251 | goto abort; | ||
252 | } | ||
234 | printk(KERN_INFO "raid0: done.\n"); | 253 | printk(KERN_INFO "raid0: done.\n"); |
254 | mddev->private = conf; | ||
235 | return 0; | 255 | return 0; |
236 | abort: | 256 | abort: |
237 | return 1; | 257 | kfree(conf->strip_zone); |
258 | kfree(conf->devlist); | ||
259 | kfree(conf); | ||
260 | mddev->private = NULL; | ||
261 | return err; | ||
238 | } | 262 | } |
239 | 263 | ||
240 | /** | 264 | /** |
@@ -252,10 +276,15 @@ static int raid0_mergeable_bvec(struct request_queue *q, | |||
252 | mddev_t *mddev = q->queuedata; | 276 | mddev_t *mddev = q->queuedata; |
253 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); | 277 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); |
254 | int max; | 278 | int max; |
255 | unsigned int chunk_sectors = mddev->chunk_size >> 9; | 279 | unsigned int chunk_sectors = mddev->chunk_sectors; |
256 | unsigned int bio_sectors = bvm->bi_size >> 9; | 280 | unsigned int bio_sectors = bvm->bi_size >> 9; |
257 | 281 | ||
258 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; | 282 | if (is_power_of_2(chunk_sectors)) |
283 | max = (chunk_sectors - ((sector & (chunk_sectors-1)) | ||
284 | + bio_sectors)) << 9; | ||
285 | else | ||
286 | max = (chunk_sectors - (sector_div(sector, chunk_sectors) | ||
287 | + bio_sectors)) << 9; | ||
259 | if (max < 0) max = 0; /* bio_add cannot handle a negative return */ | 288 | if (max < 0) max = 0; /* bio_add cannot handle a negative return */ |
260 | if (max <= biovec->bv_len && bio_sectors == 0) | 289 | if (max <= biovec->bv_len && bio_sectors == 0) |
261 | return biovec->bv_len; | 290 | return biovec->bv_len; |
@@ -277,84 +306,28 @@ static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks) | |||
277 | return array_sectors; | 306 | return array_sectors; |
278 | } | 307 | } |
279 | 308 | ||
280 | static int raid0_run (mddev_t *mddev) | 309 | static int raid0_run(mddev_t *mddev) |
281 | { | 310 | { |
282 | unsigned cur=0, i=0, nb_zone; | 311 | int ret; |
283 | s64 sectors; | ||
284 | raid0_conf_t *conf; | ||
285 | 312 | ||
286 | if (mddev->chunk_size == 0) { | 313 | if (mddev->chunk_sectors == 0) { |
287 | printk(KERN_ERR "md/raid0: non-zero chunk size required.\n"); | 314 | printk(KERN_ERR "md/raid0: chunk size must be set.\n"); |
288 | return -EINVAL; | 315 | return -EINVAL; |
289 | } | 316 | } |
290 | printk(KERN_INFO "%s: setting max_sectors to %d, segment boundary to %d\n", | 317 | if (md_check_no_bitmap(mddev)) |
291 | mdname(mddev), | 318 | return -EINVAL; |
292 | mddev->chunk_size >> 9, | 319 | blk_queue_max_sectors(mddev->queue, mddev->chunk_sectors); |
293 | (mddev->chunk_size>>1)-1); | ||
294 | blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9); | ||
295 | blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1); | ||
296 | mddev->queue->queue_lock = &mddev->queue->__queue_lock; | 320 | mddev->queue->queue_lock = &mddev->queue->__queue_lock; |
297 | 321 | ||
298 | conf = kmalloc(sizeof (raid0_conf_t), GFP_KERNEL); | 322 | ret = create_strip_zones(mddev); |
299 | if (!conf) | 323 | if (ret < 0) |
300 | goto out; | 324 | return ret; |
301 | mddev->private = (void *)conf; | ||
302 | |||
303 | conf->strip_zone = NULL; | ||
304 | conf->devlist = NULL; | ||
305 | if (create_strip_zones (mddev)) | ||
306 | goto out_free_conf; | ||
307 | 325 | ||
308 | /* calculate array device size */ | 326 | /* calculate array device size */ |
309 | md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); | 327 | md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); |
310 | 328 | ||
311 | printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", | 329 | printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", |
312 | (unsigned long long)mddev->array_sectors); | 330 | (unsigned long long)mddev->array_sectors); |
313 | printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n", | ||
314 | (unsigned long long)conf->spacing); | ||
315 | { | ||
316 | sector_t s = raid0_size(mddev, 0, 0); | ||
317 | sector_t space = conf->spacing; | ||
318 | int round; | ||
319 | conf->sector_shift = 0; | ||
320 | if (sizeof(sector_t) > sizeof(u32)) { | ||
321 | /*shift down space and s so that sector_div will work */ | ||
322 | while (space > (sector_t) (~(u32)0)) { | ||
323 | s >>= 1; | ||
324 | space >>= 1; | ||
325 | s += 1; /* force round-up */ | ||
326 | conf->sector_shift++; | ||
327 | } | ||
328 | } | ||
329 | round = sector_div(s, (u32)space) ? 1 : 0; | ||
330 | nb_zone = s + round; | ||
331 | } | ||
332 | printk(KERN_INFO "raid0 : nb_zone is %d.\n", nb_zone); | ||
333 | |||
334 | printk(KERN_INFO "raid0 : Allocating %zu bytes for hash.\n", | ||
335 | nb_zone*sizeof(struct strip_zone*)); | ||
336 | conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL); | ||
337 | if (!conf->hash_table) | ||
338 | goto out_free_conf; | ||
339 | sectors = conf->strip_zone[cur].sectors; | ||
340 | |||
341 | conf->hash_table[0] = conf->strip_zone + cur; | ||
342 | for (i=1; i< nb_zone; i++) { | ||
343 | while (sectors <= conf->spacing) { | ||
344 | cur++; | ||
345 | sectors += conf->strip_zone[cur].sectors; | ||
346 | } | ||
347 | sectors -= conf->spacing; | ||
348 | conf->hash_table[i] = conf->strip_zone + cur; | ||
349 | } | ||
350 | if (conf->sector_shift) { | ||
351 | conf->spacing >>= conf->sector_shift; | ||
352 | /* round spacing up so when we divide by it, we | ||
353 | * err on the side of too-low, which is safest | ||
354 | */ | ||
355 | conf->spacing++; | ||
356 | } | ||
357 | |||
358 | /* calculate the max read-ahead size. | 331 | /* calculate the max read-ahead size. |
359 | * For read-ahead of large files to be effective, we need to | 332 | * For read-ahead of large files to be effective, we need to |
360 | * readahead at least twice a whole stripe. i.e. number of devices | 333 | * readahead at least twice a whole stripe. i.e. number of devices |
@@ -365,48 +338,107 @@ static int raid0_run (mddev_t *mddev) | |||
365 | * chunksize should be used in that case. | 338 | * chunksize should be used in that case. |
366 | */ | 339 | */ |
367 | { | 340 | { |
368 | int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_SIZE; | 341 | int stripe = mddev->raid_disks * |
342 | (mddev->chunk_sectors << 9) / PAGE_SIZE; | ||
369 | if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) | 343 | if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) |
370 | mddev->queue->backing_dev_info.ra_pages = 2* stripe; | 344 | mddev->queue->backing_dev_info.ra_pages = 2* stripe; |
371 | } | 345 | } |
372 | 346 | ||
373 | |||
374 | blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); | 347 | blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); |
348 | dump_zones(mddev); | ||
375 | return 0; | 349 | return 0; |
350 | } | ||
376 | 351 | ||
377 | out_free_conf: | 352 | static int raid0_stop(mddev_t *mddev) |
353 | { | ||
354 | raid0_conf_t *conf = mddev->private; | ||
355 | |||
356 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | ||
378 | kfree(conf->strip_zone); | 357 | kfree(conf->strip_zone); |
379 | kfree(conf->devlist); | 358 | kfree(conf->devlist); |
380 | kfree(conf); | 359 | kfree(conf); |
381 | mddev->private = NULL; | 360 | mddev->private = NULL; |
382 | out: | 361 | return 0; |
383 | return -ENOMEM; | ||
384 | } | 362 | } |
385 | 363 | ||
386 | static int raid0_stop (mddev_t *mddev) | 364 | /* Find the zone which holds a particular offset |
365 | * Update *sectorp to be an offset in that zone | ||
366 | */ | ||
367 | static struct strip_zone *find_zone(struct raid0_private_data *conf, | ||
368 | sector_t *sectorp) | ||
387 | { | 369 | { |
388 | raid0_conf_t *conf = mddev_to_conf(mddev); | 370 | int i; |
371 | struct strip_zone *z = conf->strip_zone; | ||
372 | sector_t sector = *sectorp; | ||
373 | |||
374 | for (i = 0; i < conf->nr_strip_zones; i++) | ||
375 | if (sector < z[i].zone_end) { | ||
376 | if (i) | ||
377 | *sectorp = sector - z[i-1].zone_end; | ||
378 | return z + i; | ||
379 | } | ||
380 | BUG(); | ||
381 | } | ||
389 | 382 | ||
390 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | 383 | /* |
391 | kfree(conf->hash_table); | 384 | * remaps the bio to the target device. we separate two flows. |
392 | conf->hash_table = NULL; | 385 | * power 2 flow and a general flow for the sake of perfromance |
393 | kfree(conf->strip_zone); | 386 | */ |
394 | conf->strip_zone = NULL; | 387 | static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone, |
395 | kfree(conf); | 388 | sector_t sector, sector_t *sector_offset) |
396 | mddev->private = NULL; | 389 | { |
390 | unsigned int sect_in_chunk; | ||
391 | sector_t chunk; | ||
392 | raid0_conf_t *conf = mddev->private; | ||
393 | unsigned int chunk_sects = mddev->chunk_sectors; | ||
394 | |||
395 | if (is_power_of_2(chunk_sects)) { | ||
396 | int chunksect_bits = ffz(~chunk_sects); | ||
397 | /* find the sector offset inside the chunk */ | ||
398 | sect_in_chunk = sector & (chunk_sects - 1); | ||
399 | sector >>= chunksect_bits; | ||
400 | /* chunk in zone */ | ||
401 | chunk = *sector_offset; | ||
402 | /* quotient is the chunk in real device*/ | ||
403 | sector_div(chunk, zone->nb_dev << chunksect_bits); | ||
404 | } else{ | ||
405 | sect_in_chunk = sector_div(sector, chunk_sects); | ||
406 | chunk = *sector_offset; | ||
407 | sector_div(chunk, chunk_sects * zone->nb_dev); | ||
408 | } | ||
409 | /* | ||
410 | * position the bio over the real device | ||
411 | * real sector = chunk in device + starting of zone | ||
412 | * + the position in the chunk | ||
413 | */ | ||
414 | *sector_offset = (chunk * chunk_sects) + sect_in_chunk; | ||
415 | return conf->devlist[(zone - conf->strip_zone)*mddev->raid_disks | ||
416 | + sector_div(sector, zone->nb_dev)]; | ||
417 | } | ||
397 | 418 | ||
398 | return 0; | 419 | /* |
420 | * Is io distribute over 1 or more chunks ? | ||
421 | */ | ||
422 | static inline int is_io_in_chunk_boundary(mddev_t *mddev, | ||
423 | unsigned int chunk_sects, struct bio *bio) | ||
424 | { | ||
425 | if (likely(is_power_of_2(chunk_sects))) { | ||
426 | return chunk_sects >= ((bio->bi_sector & (chunk_sects-1)) | ||
427 | + (bio->bi_size >> 9)); | ||
428 | } else{ | ||
429 | sector_t sector = bio->bi_sector; | ||
430 | return chunk_sects >= (sector_div(sector, chunk_sects) | ||
431 | + (bio->bi_size >> 9)); | ||
432 | } | ||
399 | } | 433 | } |
400 | 434 | ||
401 | static int raid0_make_request (struct request_queue *q, struct bio *bio) | 435 | static int raid0_make_request(struct request_queue *q, struct bio *bio) |
402 | { | 436 | { |
403 | mddev_t *mddev = q->queuedata; | 437 | mddev_t *mddev = q->queuedata; |
404 | unsigned int sect_in_chunk, chunksect_bits, chunk_sects; | 438 | unsigned int chunk_sects; |
405 | raid0_conf_t *conf = mddev_to_conf(mddev); | 439 | sector_t sector_offset; |
406 | struct strip_zone *zone; | 440 | struct strip_zone *zone; |
407 | mdk_rdev_t *tmp_dev; | 441 | mdk_rdev_t *tmp_dev; |
408 | sector_t chunk; | ||
409 | sector_t sector, rsect; | ||
410 | const int rw = bio_data_dir(bio); | 442 | const int rw = bio_data_dir(bio); |
411 | int cpu; | 443 | int cpu; |
412 | 444 | ||
@@ -421,11 +453,9 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) | |||
421 | bio_sectors(bio)); | 453 | bio_sectors(bio)); |
422 | part_stat_unlock(); | 454 | part_stat_unlock(); |
423 | 455 | ||
424 | chunk_sects = mddev->chunk_size >> 9; | 456 | chunk_sects = mddev->chunk_sectors; |
425 | chunksect_bits = ffz(~chunk_sects); | 457 | if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) { |
426 | sector = bio->bi_sector; | 458 | sector_t sector = bio->bi_sector; |
427 | |||
428 | if (unlikely(chunk_sects < (bio->bi_sector & (chunk_sects - 1)) + (bio->bi_size >> 9))) { | ||
429 | struct bio_pair *bp; | 459 | struct bio_pair *bp; |
430 | /* Sanity check -- queue functions should prevent this happening */ | 460 | /* Sanity check -- queue functions should prevent this happening */ |
431 | if (bio->bi_vcnt != 1 || | 461 | if (bio->bi_vcnt != 1 || |
@@ -434,7 +464,12 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) | |||
434 | /* This is a one page bio that upper layers | 464 | /* This is a one page bio that upper layers |
435 | * refuse to split for us, so we need to split it. | 465 | * refuse to split for us, so we need to split it. |
436 | */ | 466 | */ |
437 | bp = bio_split(bio, chunk_sects - (bio->bi_sector & (chunk_sects - 1))); | 467 | if (likely(is_power_of_2(chunk_sects))) |
468 | bp = bio_split(bio, chunk_sects - (sector & | ||
469 | (chunk_sects-1))); | ||
470 | else | ||
471 | bp = bio_split(bio, chunk_sects - | ||
472 | sector_div(sector, chunk_sects)); | ||
438 | if (raid0_make_request(q, &bp->bio1)) | 473 | if (raid0_make_request(q, &bp->bio1)) |
439 | generic_make_request(&bp->bio1); | 474 | generic_make_request(&bp->bio1); |
440 | if (raid0_make_request(q, &bp->bio2)) | 475 | if (raid0_make_request(q, &bp->bio2)) |
@@ -443,34 +478,14 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) | |||
443 | bio_pair_release(bp); | 478 | bio_pair_release(bp); |
444 | return 0; | 479 | return 0; |
445 | } | 480 | } |
446 | |||
447 | |||
448 | { | ||
449 | sector_t x = sector >> conf->sector_shift; | ||
450 | sector_div(x, (u32)conf->spacing); | ||
451 | zone = conf->hash_table[x]; | ||
452 | } | ||
453 | 481 | ||
454 | while (sector >= zone->zone_start + zone->sectors) | 482 | sector_offset = bio->bi_sector; |
455 | zone++; | 483 | zone = find_zone(mddev->private, &sector_offset); |
456 | 484 | tmp_dev = map_sector(mddev, zone, bio->bi_sector, | |
457 | sect_in_chunk = bio->bi_sector & (chunk_sects - 1); | 485 | &sector_offset); |
458 | |||
459 | |||
460 | { | ||
461 | sector_t x = (sector - zone->zone_start) >> chunksect_bits; | ||
462 | |||
463 | sector_div(x, zone->nb_dev); | ||
464 | chunk = x; | ||
465 | |||
466 | x = sector >> chunksect_bits; | ||
467 | tmp_dev = zone->dev[sector_div(x, zone->nb_dev)]; | ||
468 | } | ||
469 | rsect = (chunk << chunksect_bits) + zone->dev_start + sect_in_chunk; | ||
470 | |||
471 | bio->bi_bdev = tmp_dev->bdev; | 486 | bio->bi_bdev = tmp_dev->bdev; |
472 | bio->bi_sector = rsect + tmp_dev->data_offset; | 487 | bio->bi_sector = sector_offset + zone->dev_start + |
473 | 488 | tmp_dev->data_offset; | |
474 | /* | 489 | /* |
475 | * Let the main block layer submit the IO and resolve recursion: | 490 | * Let the main block layer submit the IO and resolve recursion: |
476 | */ | 491 | */ |
@@ -485,31 +500,35 @@ bad_map: | |||
485 | return 0; | 500 | return 0; |
486 | } | 501 | } |
487 | 502 | ||
488 | static void raid0_status (struct seq_file *seq, mddev_t *mddev) | 503 | static void raid0_status(struct seq_file *seq, mddev_t *mddev) |
489 | { | 504 | { |
490 | #undef MD_DEBUG | 505 | #undef MD_DEBUG |
491 | #ifdef MD_DEBUG | 506 | #ifdef MD_DEBUG |
492 | int j, k, h; | 507 | int j, k, h; |
493 | char b[BDEVNAME_SIZE]; | 508 | char b[BDEVNAME_SIZE]; |
494 | raid0_conf_t *conf = mddev_to_conf(mddev); | 509 | raid0_conf_t *conf = mddev->private; |
495 | 510 | ||
511 | sector_t zone_size; | ||
512 | sector_t zone_start = 0; | ||
496 | h = 0; | 513 | h = 0; |
514 | |||
497 | for (j = 0; j < conf->nr_strip_zones; j++) { | 515 | for (j = 0; j < conf->nr_strip_zones; j++) { |
498 | seq_printf(seq, " z%d", j); | 516 | seq_printf(seq, " z%d", j); |
499 | if (conf->hash_table[h] == conf->strip_zone+j) | ||
500 | seq_printf(seq, "(h%d)", h++); | ||
501 | seq_printf(seq, "=["); | 517 | seq_printf(seq, "=["); |
502 | for (k = 0; k < conf->strip_zone[j].nb_dev; k++) | 518 | for (k = 0; k < conf->strip_zone[j].nb_dev; k++) |
503 | seq_printf(seq, "%s/", bdevname( | 519 | seq_printf(seq, "%s/", bdevname( |
504 | conf->strip_zone[j].dev[k]->bdev,b)); | 520 | conf->devlist[j*mddev->raid_disks + k] |
505 | 521 | ->bdev, b)); | |
506 | seq_printf(seq, "] zs=%d ds=%d s=%d\n", | 522 | |
507 | conf->strip_zone[j].zone_start, | 523 | zone_size = conf->strip_zone[j].zone_end - zone_start; |
508 | conf->strip_zone[j].dev_start, | 524 | seq_printf(seq, "] ze=%lld ds=%lld s=%lld\n", |
509 | conf->strip_zone[j].sectors); | 525 | (unsigned long long)zone_start>>1, |
526 | (unsigned long long)conf->strip_zone[j].dev_start>>1, | ||
527 | (unsigned long long)zone_size>>1); | ||
528 | zone_start = conf->strip_zone[j].zone_end; | ||
510 | } | 529 | } |
511 | #endif | 530 | #endif |
512 | seq_printf(seq, " %dk chunks", mddev->chunk_size/1024); | 531 | seq_printf(seq, " %dk chunks", mddev->chunk_sectors / 2); |
513 | return; | 532 | return; |
514 | } | 533 | } |
515 | 534 | ||
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index 824b12eb1d4f..91f8e876ee64 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h | |||
@@ -3,26 +3,18 @@ | |||
3 | 3 | ||
4 | struct strip_zone | 4 | struct strip_zone |
5 | { | 5 | { |
6 | sector_t zone_start; /* Zone offset in md_dev (in sectors) */ | 6 | sector_t zone_end; /* Start of the next zone (in sectors) */ |
7 | sector_t dev_start; /* Zone offset in real dev (in sectors) */ | 7 | sector_t dev_start; /* Zone offset in real dev (in sectors) */ |
8 | sector_t sectors; /* Zone size in sectors */ | ||
9 | int nb_dev; /* # of devices attached to the zone */ | 8 | int nb_dev; /* # of devices attached to the zone */ |
10 | mdk_rdev_t **dev; /* Devices attached to the zone */ | ||
11 | }; | 9 | }; |
12 | 10 | ||
13 | struct raid0_private_data | 11 | struct raid0_private_data |
14 | { | 12 | { |
15 | struct strip_zone **hash_table; /* Table of indexes into strip_zone */ | ||
16 | struct strip_zone *strip_zone; | 13 | struct strip_zone *strip_zone; |
17 | mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ | 14 | mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ |
18 | int nr_strip_zones; | 15 | int nr_strip_zones; |
19 | |||
20 | sector_t spacing; | ||
21 | int sector_shift; /* shift this before divide by spacing */ | ||
22 | }; | 16 | }; |
23 | 17 | ||
24 | typedef struct raid0_private_data raid0_conf_t; | 18 | typedef struct raid0_private_data raid0_conf_t; |
25 | 19 | ||
26 | #define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private) | ||
27 | |||
28 | #endif | 20 | #endif |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 36df9109cde1..89939a7aef57 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -182,7 +182,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio) | |||
182 | 182 | ||
183 | static void free_r1bio(r1bio_t *r1_bio) | 183 | static void free_r1bio(r1bio_t *r1_bio) |
184 | { | 184 | { |
185 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 185 | conf_t *conf = r1_bio->mddev->private; |
186 | 186 | ||
187 | /* | 187 | /* |
188 | * Wake up any possible resync thread that waits for the device | 188 | * Wake up any possible resync thread that waits for the device |
@@ -196,7 +196,7 @@ static void free_r1bio(r1bio_t *r1_bio) | |||
196 | 196 | ||
197 | static void put_buf(r1bio_t *r1_bio) | 197 | static void put_buf(r1bio_t *r1_bio) |
198 | { | 198 | { |
199 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 199 | conf_t *conf = r1_bio->mddev->private; |
200 | int i; | 200 | int i; |
201 | 201 | ||
202 | for (i=0; i<conf->raid_disks; i++) { | 202 | for (i=0; i<conf->raid_disks; i++) { |
@@ -214,7 +214,7 @@ static void reschedule_retry(r1bio_t *r1_bio) | |||
214 | { | 214 | { |
215 | unsigned long flags; | 215 | unsigned long flags; |
216 | mddev_t *mddev = r1_bio->mddev; | 216 | mddev_t *mddev = r1_bio->mddev; |
217 | conf_t *conf = mddev_to_conf(mddev); | 217 | conf_t *conf = mddev->private; |
218 | 218 | ||
219 | spin_lock_irqsave(&conf->device_lock, flags); | 219 | spin_lock_irqsave(&conf->device_lock, flags); |
220 | list_add(&r1_bio->retry_list, &conf->retry_list); | 220 | list_add(&r1_bio->retry_list, &conf->retry_list); |
@@ -253,7 +253,7 @@ static void raid_end_bio_io(r1bio_t *r1_bio) | |||
253 | */ | 253 | */ |
254 | static inline void update_head_pos(int disk, r1bio_t *r1_bio) | 254 | static inline void update_head_pos(int disk, r1bio_t *r1_bio) |
255 | { | 255 | { |
256 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 256 | conf_t *conf = r1_bio->mddev->private; |
257 | 257 | ||
258 | conf->mirrors[disk].head_position = | 258 | conf->mirrors[disk].head_position = |
259 | r1_bio->sector + (r1_bio->sectors); | 259 | r1_bio->sector + (r1_bio->sectors); |
@@ -264,7 +264,7 @@ static void raid1_end_read_request(struct bio *bio, int error) | |||
264 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 264 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
265 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | 265 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); |
266 | int mirror; | 266 | int mirror; |
267 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 267 | conf_t *conf = r1_bio->mddev->private; |
268 | 268 | ||
269 | mirror = r1_bio->read_disk; | 269 | mirror = r1_bio->read_disk; |
270 | /* | 270 | /* |
@@ -309,7 +309,7 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
309 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 309 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
310 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | 310 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); |
311 | int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); | 311 | int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); |
312 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 312 | conf_t *conf = r1_bio->mddev->private; |
313 | struct bio *to_put = NULL; | 313 | struct bio *to_put = NULL; |
314 | 314 | ||
315 | 315 | ||
@@ -541,7 +541,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
541 | 541 | ||
542 | static void unplug_slaves(mddev_t *mddev) | 542 | static void unplug_slaves(mddev_t *mddev) |
543 | { | 543 | { |
544 | conf_t *conf = mddev_to_conf(mddev); | 544 | conf_t *conf = mddev->private; |
545 | int i; | 545 | int i; |
546 | 546 | ||
547 | rcu_read_lock(); | 547 | rcu_read_lock(); |
@@ -573,7 +573,7 @@ static void raid1_unplug(struct request_queue *q) | |||
573 | static int raid1_congested(void *data, int bits) | 573 | static int raid1_congested(void *data, int bits) |
574 | { | 574 | { |
575 | mddev_t *mddev = data; | 575 | mddev_t *mddev = data; |
576 | conf_t *conf = mddev_to_conf(mddev); | 576 | conf_t *conf = mddev->private; |
577 | int i, ret = 0; | 577 | int i, ret = 0; |
578 | 578 | ||
579 | rcu_read_lock(); | 579 | rcu_read_lock(); |
@@ -772,7 +772,7 @@ do_sync_io: | |||
772 | static int make_request(struct request_queue *q, struct bio * bio) | 772 | static int make_request(struct request_queue *q, struct bio * bio) |
773 | { | 773 | { |
774 | mddev_t *mddev = q->queuedata; | 774 | mddev_t *mddev = q->queuedata; |
775 | conf_t *conf = mddev_to_conf(mddev); | 775 | conf_t *conf = mddev->private; |
776 | mirror_info_t *mirror; | 776 | mirror_info_t *mirror; |
777 | r1bio_t *r1_bio; | 777 | r1bio_t *r1_bio; |
778 | struct bio *read_bio; | 778 | struct bio *read_bio; |
@@ -991,7 +991,7 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
991 | 991 | ||
992 | static void status(struct seq_file *seq, mddev_t *mddev) | 992 | static void status(struct seq_file *seq, mddev_t *mddev) |
993 | { | 993 | { |
994 | conf_t *conf = mddev_to_conf(mddev); | 994 | conf_t *conf = mddev->private; |
995 | int i; | 995 | int i; |
996 | 996 | ||
997 | seq_printf(seq, " [%d/%d] [", conf->raid_disks, | 997 | seq_printf(seq, " [%d/%d] [", conf->raid_disks, |
@@ -1010,7 +1010,7 @@ static void status(struct seq_file *seq, mddev_t *mddev) | |||
1010 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) | 1010 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) |
1011 | { | 1011 | { |
1012 | char b[BDEVNAME_SIZE]; | 1012 | char b[BDEVNAME_SIZE]; |
1013 | conf_t *conf = mddev_to_conf(mddev); | 1013 | conf_t *conf = mddev->private; |
1014 | 1014 | ||
1015 | /* | 1015 | /* |
1016 | * If it is not operational, then we have already marked it as dead | 1016 | * If it is not operational, then we have already marked it as dead |
@@ -1130,7 +1130,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1130 | * a one page request is never in violation. | 1130 | * a one page request is never in violation. |
1131 | */ | 1131 | */ |
1132 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 1132 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && |
1133 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 1133 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) |
1134 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 1134 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
1135 | 1135 | ||
1136 | p->head_position = 0; | 1136 | p->head_position = 0; |
@@ -1214,7 +1214,7 @@ static void end_sync_write(struct bio *bio, int error) | |||
1214 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 1214 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
1215 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | 1215 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); |
1216 | mddev_t *mddev = r1_bio->mddev; | 1216 | mddev_t *mddev = r1_bio->mddev; |
1217 | conf_t *conf = mddev_to_conf(mddev); | 1217 | conf_t *conf = mddev->private; |
1218 | int i; | 1218 | int i; |
1219 | int mirror=0; | 1219 | int mirror=0; |
1220 | 1220 | ||
@@ -1248,7 +1248,7 @@ static void end_sync_write(struct bio *bio, int error) | |||
1248 | 1248 | ||
1249 | static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) | 1249 | static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) |
1250 | { | 1250 | { |
1251 | conf_t *conf = mddev_to_conf(mddev); | 1251 | conf_t *conf = mddev->private; |
1252 | int i; | 1252 | int i; |
1253 | int disks = conf->raid_disks; | 1253 | int disks = conf->raid_disks; |
1254 | struct bio *bio, *wbio; | 1254 | struct bio *bio, *wbio; |
@@ -1562,7 +1562,7 @@ static void raid1d(mddev_t *mddev) | |||
1562 | r1bio_t *r1_bio; | 1562 | r1bio_t *r1_bio; |
1563 | struct bio *bio; | 1563 | struct bio *bio; |
1564 | unsigned long flags; | 1564 | unsigned long flags; |
1565 | conf_t *conf = mddev_to_conf(mddev); | 1565 | conf_t *conf = mddev->private; |
1566 | struct list_head *head = &conf->retry_list; | 1566 | struct list_head *head = &conf->retry_list; |
1567 | int unplug=0; | 1567 | int unplug=0; |
1568 | mdk_rdev_t *rdev; | 1568 | mdk_rdev_t *rdev; |
@@ -1585,7 +1585,7 @@ static void raid1d(mddev_t *mddev) | |||
1585 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1585 | spin_unlock_irqrestore(&conf->device_lock, flags); |
1586 | 1586 | ||
1587 | mddev = r1_bio->mddev; | 1587 | mddev = r1_bio->mddev; |
1588 | conf = mddev_to_conf(mddev); | 1588 | conf = mddev->private; |
1589 | if (test_bit(R1BIO_IsSync, &r1_bio->state)) { | 1589 | if (test_bit(R1BIO_IsSync, &r1_bio->state)) { |
1590 | sync_request_write(mddev, r1_bio); | 1590 | sync_request_write(mddev, r1_bio); |
1591 | unplug = 1; | 1591 | unplug = 1; |
@@ -1706,7 +1706,7 @@ static int init_resync(conf_t *conf) | |||
1706 | 1706 | ||
1707 | static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) | 1707 | static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) |
1708 | { | 1708 | { |
1709 | conf_t *conf = mddev_to_conf(mddev); | 1709 | conf_t *conf = mddev->private; |
1710 | r1bio_t *r1_bio; | 1710 | r1bio_t *r1_bio; |
1711 | struct bio *bio; | 1711 | struct bio *bio; |
1712 | sector_t max_sector, nr_sectors; | 1712 | sector_t max_sector, nr_sectors; |
@@ -1996,7 +1996,7 @@ static int run(mddev_t *mddev) | |||
1996 | * a one page request is never in violation. | 1996 | * a one page request is never in violation. |
1997 | */ | 1997 | */ |
1998 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 1998 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && |
1999 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 1999 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) |
2000 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 2000 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
2001 | 2001 | ||
2002 | disk->head_position = 0; | 2002 | disk->head_position = 0; |
@@ -2052,6 +2052,10 @@ static int run(mddev_t *mddev) | |||
2052 | goto out_free_conf; | 2052 | goto out_free_conf; |
2053 | } | 2053 | } |
2054 | 2054 | ||
2055 | if (mddev->recovery_cp != MaxSector) | ||
2056 | printk(KERN_NOTICE "raid1: %s is not clean" | ||
2057 | " -- starting background reconstruction\n", | ||
2058 | mdname(mddev)); | ||
2055 | printk(KERN_INFO | 2059 | printk(KERN_INFO |
2056 | "raid1: raid set %s active with %d out of %d mirrors\n", | 2060 | "raid1: raid set %s active with %d out of %d mirrors\n", |
2057 | mdname(mddev), mddev->raid_disks - mddev->degraded, | 2061 | mdname(mddev), mddev->raid_disks - mddev->degraded, |
@@ -2087,7 +2091,7 @@ out: | |||
2087 | 2091 | ||
2088 | static int stop(mddev_t *mddev) | 2092 | static int stop(mddev_t *mddev) |
2089 | { | 2093 | { |
2090 | conf_t *conf = mddev_to_conf(mddev); | 2094 | conf_t *conf = mddev->private; |
2091 | struct bitmap *bitmap = mddev->bitmap; | 2095 | struct bitmap *bitmap = mddev->bitmap; |
2092 | int behind_wait = 0; | 2096 | int behind_wait = 0; |
2093 | 2097 | ||
@@ -2155,16 +2159,16 @@ static int raid1_reshape(mddev_t *mddev) | |||
2155 | mempool_t *newpool, *oldpool; | 2159 | mempool_t *newpool, *oldpool; |
2156 | struct pool_info *newpoolinfo; | 2160 | struct pool_info *newpoolinfo; |
2157 | mirror_info_t *newmirrors; | 2161 | mirror_info_t *newmirrors; |
2158 | conf_t *conf = mddev_to_conf(mddev); | 2162 | conf_t *conf = mddev->private; |
2159 | int cnt, raid_disks; | 2163 | int cnt, raid_disks; |
2160 | unsigned long flags; | 2164 | unsigned long flags; |
2161 | int d, d2, err; | 2165 | int d, d2, err; |
2162 | 2166 | ||
2163 | /* Cannot change chunk_size, layout, or level */ | 2167 | /* Cannot change chunk_size, layout, or level */ |
2164 | if (mddev->chunk_size != mddev->new_chunk || | 2168 | if (mddev->chunk_sectors != mddev->new_chunk_sectors || |
2165 | mddev->layout != mddev->new_layout || | 2169 | mddev->layout != mddev->new_layout || |
2166 | mddev->level != mddev->new_level) { | 2170 | mddev->level != mddev->new_level) { |
2167 | mddev->new_chunk = mddev->chunk_size; | 2171 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
2168 | mddev->new_layout = mddev->layout; | 2172 | mddev->new_layout = mddev->layout; |
2169 | mddev->new_level = mddev->level; | 2173 | mddev->new_level = mddev->level; |
2170 | return -EINVAL; | 2174 | return -EINVAL; |
@@ -2252,7 +2256,7 @@ static int raid1_reshape(mddev_t *mddev) | |||
2252 | 2256 | ||
2253 | static void raid1_quiesce(mddev_t *mddev, int state) | 2257 | static void raid1_quiesce(mddev_t *mddev, int state) |
2254 | { | 2258 | { |
2255 | conf_t *conf = mddev_to_conf(mddev); | 2259 | conf_t *conf = mddev->private; |
2256 | 2260 | ||
2257 | switch(state) { | 2261 | switch(state) { |
2258 | case 1: | 2262 | case 1: |
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 1620eea3d57c..e87b84deff68 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h | |||
@@ -64,12 +64,6 @@ struct r1_private_data_s { | |||
64 | typedef struct r1_private_data_s conf_t; | 64 | typedef struct r1_private_data_s conf_t; |
65 | 65 | ||
66 | /* | 66 | /* |
67 | * this is the only point in the RAID code where we violate | ||
68 | * C type safety. mddev->private is an 'opaque' pointer. | ||
69 | */ | ||
70 | #define mddev_to_conf(mddev) ((conf_t *) mddev->private) | ||
71 | |||
72 | /* | ||
73 | * this is our 'private' RAID1 bio. | 67 | * this is our 'private' RAID1 bio. |
74 | * | 68 | * |
75 | * it contains information about what kind of IO operations were started | 69 | * it contains information about what kind of IO operations were started |
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 499620afb44b..ae12ceafe10c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -188,7 +188,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio) | |||
188 | 188 | ||
189 | static void free_r10bio(r10bio_t *r10_bio) | 189 | static void free_r10bio(r10bio_t *r10_bio) |
190 | { | 190 | { |
191 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | 191 | conf_t *conf = r10_bio->mddev->private; |
192 | 192 | ||
193 | /* | 193 | /* |
194 | * Wake up any possible resync thread that waits for the device | 194 | * Wake up any possible resync thread that waits for the device |
@@ -202,7 +202,7 @@ static void free_r10bio(r10bio_t *r10_bio) | |||
202 | 202 | ||
203 | static void put_buf(r10bio_t *r10_bio) | 203 | static void put_buf(r10bio_t *r10_bio) |
204 | { | 204 | { |
205 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | 205 | conf_t *conf = r10_bio->mddev->private; |
206 | 206 | ||
207 | mempool_free(r10_bio, conf->r10buf_pool); | 207 | mempool_free(r10_bio, conf->r10buf_pool); |
208 | 208 | ||
@@ -213,7 +213,7 @@ static void reschedule_retry(r10bio_t *r10_bio) | |||
213 | { | 213 | { |
214 | unsigned long flags; | 214 | unsigned long flags; |
215 | mddev_t *mddev = r10_bio->mddev; | 215 | mddev_t *mddev = r10_bio->mddev; |
216 | conf_t *conf = mddev_to_conf(mddev); | 216 | conf_t *conf = mddev->private; |
217 | 217 | ||
218 | spin_lock_irqsave(&conf->device_lock, flags); | 218 | spin_lock_irqsave(&conf->device_lock, flags); |
219 | list_add(&r10_bio->retry_list, &conf->retry_list); | 219 | list_add(&r10_bio->retry_list, &conf->retry_list); |
@@ -245,7 +245,7 @@ static void raid_end_bio_io(r10bio_t *r10_bio) | |||
245 | */ | 245 | */ |
246 | static inline void update_head_pos(int slot, r10bio_t *r10_bio) | 246 | static inline void update_head_pos(int slot, r10bio_t *r10_bio) |
247 | { | 247 | { |
248 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | 248 | conf_t *conf = r10_bio->mddev->private; |
249 | 249 | ||
250 | conf->mirrors[r10_bio->devs[slot].devnum].head_position = | 250 | conf->mirrors[r10_bio->devs[slot].devnum].head_position = |
251 | r10_bio->devs[slot].addr + (r10_bio->sectors); | 251 | r10_bio->devs[slot].addr + (r10_bio->sectors); |
@@ -256,7 +256,7 @@ static void raid10_end_read_request(struct bio *bio, int error) | |||
256 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 256 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
257 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | 257 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); |
258 | int slot, dev; | 258 | int slot, dev; |
259 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | 259 | conf_t *conf = r10_bio->mddev->private; |
260 | 260 | ||
261 | 261 | ||
262 | slot = r10_bio->read_slot; | 262 | slot = r10_bio->read_slot; |
@@ -297,7 +297,7 @@ static void raid10_end_write_request(struct bio *bio, int error) | |||
297 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 297 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
298 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | 298 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); |
299 | int slot, dev; | 299 | int slot, dev; |
300 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | 300 | conf_t *conf = r10_bio->mddev->private; |
301 | 301 | ||
302 | for (slot = 0; slot < conf->copies; slot++) | 302 | for (slot = 0; slot < conf->copies; slot++) |
303 | if (r10_bio->devs[slot].bio == bio) | 303 | if (r10_bio->devs[slot].bio == bio) |
@@ -461,7 +461,7 @@ static int raid10_mergeable_bvec(struct request_queue *q, | |||
461 | mddev_t *mddev = q->queuedata; | 461 | mddev_t *mddev = q->queuedata; |
462 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); | 462 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); |
463 | int max; | 463 | int max; |
464 | unsigned int chunk_sectors = mddev->chunk_size >> 9; | 464 | unsigned int chunk_sectors = mddev->chunk_sectors; |
465 | unsigned int bio_sectors = bvm->bi_size >> 9; | 465 | unsigned int bio_sectors = bvm->bi_size >> 9; |
466 | 466 | ||
467 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; | 467 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; |
@@ -596,7 +596,7 @@ rb_out: | |||
596 | 596 | ||
597 | static void unplug_slaves(mddev_t *mddev) | 597 | static void unplug_slaves(mddev_t *mddev) |
598 | { | 598 | { |
599 | conf_t *conf = mddev_to_conf(mddev); | 599 | conf_t *conf = mddev->private; |
600 | int i; | 600 | int i; |
601 | 601 | ||
602 | rcu_read_lock(); | 602 | rcu_read_lock(); |
@@ -628,7 +628,7 @@ static void raid10_unplug(struct request_queue *q) | |||
628 | static int raid10_congested(void *data, int bits) | 628 | static int raid10_congested(void *data, int bits) |
629 | { | 629 | { |
630 | mddev_t *mddev = data; | 630 | mddev_t *mddev = data; |
631 | conf_t *conf = mddev_to_conf(mddev); | 631 | conf_t *conf = mddev->private; |
632 | int i, ret = 0; | 632 | int i, ret = 0; |
633 | 633 | ||
634 | rcu_read_lock(); | 634 | rcu_read_lock(); |
@@ -788,7 +788,7 @@ static void unfreeze_array(conf_t *conf) | |||
788 | static int make_request(struct request_queue *q, struct bio * bio) | 788 | static int make_request(struct request_queue *q, struct bio * bio) |
789 | { | 789 | { |
790 | mddev_t *mddev = q->queuedata; | 790 | mddev_t *mddev = q->queuedata; |
791 | conf_t *conf = mddev_to_conf(mddev); | 791 | conf_t *conf = mddev->private; |
792 | mirror_info_t *mirror; | 792 | mirror_info_t *mirror; |
793 | r10bio_t *r10_bio; | 793 | r10bio_t *r10_bio; |
794 | struct bio *read_bio; | 794 | struct bio *read_bio; |
@@ -981,11 +981,11 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
981 | 981 | ||
982 | static void status(struct seq_file *seq, mddev_t *mddev) | 982 | static void status(struct seq_file *seq, mddev_t *mddev) |
983 | { | 983 | { |
984 | conf_t *conf = mddev_to_conf(mddev); | 984 | conf_t *conf = mddev->private; |
985 | int i; | 985 | int i; |
986 | 986 | ||
987 | if (conf->near_copies < conf->raid_disks) | 987 | if (conf->near_copies < conf->raid_disks) |
988 | seq_printf(seq, " %dK chunks", mddev->chunk_size/1024); | 988 | seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); |
989 | if (conf->near_copies > 1) | 989 | if (conf->near_copies > 1) |
990 | seq_printf(seq, " %d near-copies", conf->near_copies); | 990 | seq_printf(seq, " %d near-copies", conf->near_copies); |
991 | if (conf->far_copies > 1) { | 991 | if (conf->far_copies > 1) { |
@@ -1006,7 +1006,7 @@ static void status(struct seq_file *seq, mddev_t *mddev) | |||
1006 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) | 1006 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) |
1007 | { | 1007 | { |
1008 | char b[BDEVNAME_SIZE]; | 1008 | char b[BDEVNAME_SIZE]; |
1009 | conf_t *conf = mddev_to_conf(mddev); | 1009 | conf_t *conf = mddev->private; |
1010 | 1010 | ||
1011 | /* | 1011 | /* |
1012 | * If it is not operational, then we have already marked it as dead | 1012 | * If it is not operational, then we have already marked it as dead |
@@ -1158,8 +1158,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1158 | * a one page request is never in violation. | 1158 | * a one page request is never in violation. |
1159 | */ | 1159 | */ |
1160 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 1160 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && |
1161 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 1161 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) |
1162 | mddev->queue->max_sectors = (PAGE_SIZE>>9); | 1162 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
1163 | 1163 | ||
1164 | p->head_position = 0; | 1164 | p->head_position = 0; |
1165 | rdev->raid_disk = mirror; | 1165 | rdev->raid_disk = mirror; |
@@ -1215,7 +1215,7 @@ abort: | |||
1215 | static void end_sync_read(struct bio *bio, int error) | 1215 | static void end_sync_read(struct bio *bio, int error) |
1216 | { | 1216 | { |
1217 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | 1217 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); |
1218 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | 1218 | conf_t *conf = r10_bio->mddev->private; |
1219 | int i,d; | 1219 | int i,d; |
1220 | 1220 | ||
1221 | for (i=0; i<conf->copies; i++) | 1221 | for (i=0; i<conf->copies; i++) |
@@ -1253,7 +1253,7 @@ static void end_sync_write(struct bio *bio, int error) | |||
1253 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 1253 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
1254 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | 1254 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); |
1255 | mddev_t *mddev = r10_bio->mddev; | 1255 | mddev_t *mddev = r10_bio->mddev; |
1256 | conf_t *conf = mddev_to_conf(mddev); | 1256 | conf_t *conf = mddev->private; |
1257 | int i,d; | 1257 | int i,d; |
1258 | 1258 | ||
1259 | for (i = 0; i < conf->copies; i++) | 1259 | for (i = 0; i < conf->copies; i++) |
@@ -1300,7 +1300,7 @@ static void end_sync_write(struct bio *bio, int error) | |||
1300 | */ | 1300 | */ |
1301 | static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) | 1301 | static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) |
1302 | { | 1302 | { |
1303 | conf_t *conf = mddev_to_conf(mddev); | 1303 | conf_t *conf = mddev->private; |
1304 | int i, first; | 1304 | int i, first; |
1305 | struct bio *tbio, *fbio; | 1305 | struct bio *tbio, *fbio; |
1306 | 1306 | ||
@@ -1400,7 +1400,7 @@ done: | |||
1400 | 1400 | ||
1401 | static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) | 1401 | static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) |
1402 | { | 1402 | { |
1403 | conf_t *conf = mddev_to_conf(mddev); | 1403 | conf_t *conf = mddev->private; |
1404 | int i, d; | 1404 | int i, d; |
1405 | struct bio *bio, *wbio; | 1405 | struct bio *bio, *wbio; |
1406 | 1406 | ||
@@ -1549,7 +1549,7 @@ static void raid10d(mddev_t *mddev) | |||
1549 | r10bio_t *r10_bio; | 1549 | r10bio_t *r10_bio; |
1550 | struct bio *bio; | 1550 | struct bio *bio; |
1551 | unsigned long flags; | 1551 | unsigned long flags; |
1552 | conf_t *conf = mddev_to_conf(mddev); | 1552 | conf_t *conf = mddev->private; |
1553 | struct list_head *head = &conf->retry_list; | 1553 | struct list_head *head = &conf->retry_list; |
1554 | int unplug=0; | 1554 | int unplug=0; |
1555 | mdk_rdev_t *rdev; | 1555 | mdk_rdev_t *rdev; |
@@ -1572,7 +1572,7 @@ static void raid10d(mddev_t *mddev) | |||
1572 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1572 | spin_unlock_irqrestore(&conf->device_lock, flags); |
1573 | 1573 | ||
1574 | mddev = r10_bio->mddev; | 1574 | mddev = r10_bio->mddev; |
1575 | conf = mddev_to_conf(mddev); | 1575 | conf = mddev->private; |
1576 | if (test_bit(R10BIO_IsSync, &r10_bio->state)) { | 1576 | if (test_bit(R10BIO_IsSync, &r10_bio->state)) { |
1577 | sync_request_write(mddev, r10_bio); | 1577 | sync_request_write(mddev, r10_bio); |
1578 | unplug = 1; | 1578 | unplug = 1; |
@@ -1680,7 +1680,7 @@ static int init_resync(conf_t *conf) | |||
1680 | 1680 | ||
1681 | static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) | 1681 | static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) |
1682 | { | 1682 | { |
1683 | conf_t *conf = mddev_to_conf(mddev); | 1683 | conf_t *conf = mddev->private; |
1684 | r10bio_t *r10_bio; | 1684 | r10bio_t *r10_bio; |
1685 | struct bio *biolist = NULL, *bio; | 1685 | struct bio *biolist = NULL, *bio; |
1686 | sector_t max_sector, nr_sectors; | 1686 | sector_t max_sector, nr_sectors; |
@@ -2026,7 +2026,7 @@ static sector_t | |||
2026 | raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) | 2026 | raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) |
2027 | { | 2027 | { |
2028 | sector_t size; | 2028 | sector_t size; |
2029 | conf_t *conf = mddev_to_conf(mddev); | 2029 | conf_t *conf = mddev->private; |
2030 | 2030 | ||
2031 | if (!raid_disks) | 2031 | if (!raid_disks) |
2032 | raid_disks = mddev->raid_disks; | 2032 | raid_disks = mddev->raid_disks; |
@@ -2050,9 +2050,10 @@ static int run(mddev_t *mddev) | |||
2050 | int nc, fc, fo; | 2050 | int nc, fc, fo; |
2051 | sector_t stride, size; | 2051 | sector_t stride, size; |
2052 | 2052 | ||
2053 | if (mddev->chunk_size < PAGE_SIZE) { | 2053 | if (mddev->chunk_sectors < (PAGE_SIZE >> 9) || |
2054 | !is_power_of_2(mddev->chunk_sectors)) { | ||
2054 | printk(KERN_ERR "md/raid10: chunk size must be " | 2055 | printk(KERN_ERR "md/raid10: chunk size must be " |
2055 | "at least PAGE_SIZE(%ld).\n", PAGE_SIZE); | 2056 | "at least PAGE_SIZE(%ld) and be a power of 2.\n", PAGE_SIZE); |
2056 | return -EINVAL; | 2057 | return -EINVAL; |
2057 | } | 2058 | } |
2058 | 2059 | ||
@@ -2095,8 +2096,8 @@ static int run(mddev_t *mddev) | |||
2095 | conf->far_copies = fc; | 2096 | conf->far_copies = fc; |
2096 | conf->copies = nc*fc; | 2097 | conf->copies = nc*fc; |
2097 | conf->far_offset = fo; | 2098 | conf->far_offset = fo; |
2098 | conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; | 2099 | conf->chunk_mask = mddev->chunk_sectors - 1; |
2099 | conf->chunk_shift = ffz(~mddev->chunk_size) - 9; | 2100 | conf->chunk_shift = ffz(~mddev->chunk_sectors); |
2100 | size = mddev->dev_sectors >> conf->chunk_shift; | 2101 | size = mddev->dev_sectors >> conf->chunk_shift; |
2101 | sector_div(size, fc); | 2102 | sector_div(size, fc); |
2102 | size = size * conf->raid_disks; | 2103 | size = size * conf->raid_disks; |
@@ -2145,8 +2146,8 @@ static int run(mddev_t *mddev) | |||
2145 | * a one page request is never in violation. | 2146 | * a one page request is never in violation. |
2146 | */ | 2147 | */ |
2147 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 2148 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && |
2148 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 2149 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) |
2149 | mddev->queue->max_sectors = (PAGE_SIZE>>9); | 2150 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
2150 | 2151 | ||
2151 | disk->head_position = 0; | 2152 | disk->head_position = 0; |
2152 | } | 2153 | } |
@@ -2185,6 +2186,10 @@ static int run(mddev_t *mddev) | |||
2185 | goto out_free_conf; | 2186 | goto out_free_conf; |
2186 | } | 2187 | } |
2187 | 2188 | ||
2189 | if (mddev->recovery_cp != MaxSector) | ||
2190 | printk(KERN_NOTICE "raid10: %s is not clean" | ||
2191 | " -- starting background reconstruction\n", | ||
2192 | mdname(mddev)); | ||
2188 | printk(KERN_INFO | 2193 | printk(KERN_INFO |
2189 | "raid10: raid set %s active with %d out of %d devices\n", | 2194 | "raid10: raid set %s active with %d out of %d devices\n", |
2190 | mdname(mddev), mddev->raid_disks - mddev->degraded, | 2195 | mdname(mddev), mddev->raid_disks - mddev->degraded, |
@@ -2204,7 +2209,8 @@ static int run(mddev_t *mddev) | |||
2204 | * maybe... | 2209 | * maybe... |
2205 | */ | 2210 | */ |
2206 | { | 2211 | { |
2207 | int stripe = conf->raid_disks * (mddev->chunk_size / PAGE_SIZE); | 2212 | int stripe = conf->raid_disks * |
2213 | ((mddev->chunk_sectors << 9) / PAGE_SIZE); | ||
2208 | stripe /= conf->near_copies; | 2214 | stripe /= conf->near_copies; |
2209 | if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) | 2215 | if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) |
2210 | mddev->queue->backing_dev_info.ra_pages = 2* stripe; | 2216 | mddev->queue->backing_dev_info.ra_pages = 2* stripe; |
@@ -2227,7 +2233,7 @@ out: | |||
2227 | 2233 | ||
2228 | static int stop(mddev_t *mddev) | 2234 | static int stop(mddev_t *mddev) |
2229 | { | 2235 | { |
2230 | conf_t *conf = mddev_to_conf(mddev); | 2236 | conf_t *conf = mddev->private; |
2231 | 2237 | ||
2232 | raise_barrier(conf, 0); | 2238 | raise_barrier(conf, 0); |
2233 | lower_barrier(conf); | 2239 | lower_barrier(conf); |
@@ -2245,7 +2251,7 @@ static int stop(mddev_t *mddev) | |||
2245 | 2251 | ||
2246 | static void raid10_quiesce(mddev_t *mddev, int state) | 2252 | static void raid10_quiesce(mddev_t *mddev, int state) |
2247 | { | 2253 | { |
2248 | conf_t *conf = mddev_to_conf(mddev); | 2254 | conf_t *conf = mddev->private; |
2249 | 2255 | ||
2250 | switch(state) { | 2256 | switch(state) { |
2251 | case 1: | 2257 | case 1: |
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 244dbe507a54..59cd1efb8d30 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h | |||
@@ -62,12 +62,6 @@ struct r10_private_data_s { | |||
62 | typedef struct r10_private_data_s conf_t; | 62 | typedef struct r10_private_data_s conf_t; |
63 | 63 | ||
64 | /* | 64 | /* |
65 | * this is the only point in the RAID code where we violate | ||
66 | * C type safety. mddev->private is an 'opaque' pointer. | ||
67 | */ | ||
68 | #define mddev_to_conf(mddev) ((conf_t *) mddev->private) | ||
69 | |||
70 | /* | ||
71 | * this is our 'private' RAID10 bio. | 65 | * this is our 'private' RAID10 bio. |
72 | * | 66 | * |
73 | * it contains information about what kind of IO operations were started | 67 | * it contains information about what kind of IO operations were started |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 54ef8d75541d..cac6f4d3a143 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -1617,8 +1617,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1617 | sector_t new_sector; | 1617 | sector_t new_sector; |
1618 | int algorithm = previous ? conf->prev_algo | 1618 | int algorithm = previous ? conf->prev_algo |
1619 | : conf->algorithm; | 1619 | : conf->algorithm; |
1620 | int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) | 1620 | int sectors_per_chunk = previous ? conf->prev_chunk_sectors |
1621 | : (conf->chunk_size >> 9); | 1621 | : conf->chunk_sectors; |
1622 | int raid_disks = previous ? conf->previous_raid_disks | 1622 | int raid_disks = previous ? conf->previous_raid_disks |
1623 | : conf->raid_disks; | 1623 | : conf->raid_disks; |
1624 | int data_disks = raid_disks - conf->max_degraded; | 1624 | int data_disks = raid_disks - conf->max_degraded; |
@@ -1823,8 +1823,8 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) | |||
1823 | int raid_disks = sh->disks; | 1823 | int raid_disks = sh->disks; |
1824 | int data_disks = raid_disks - conf->max_degraded; | 1824 | int data_disks = raid_disks - conf->max_degraded; |
1825 | sector_t new_sector = sh->sector, check; | 1825 | sector_t new_sector = sh->sector, check; |
1826 | int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) | 1826 | int sectors_per_chunk = previous ? conf->prev_chunk_sectors |
1827 | : (conf->chunk_size >> 9); | 1827 | : conf->chunk_sectors; |
1828 | int algorithm = previous ? conf->prev_algo | 1828 | int algorithm = previous ? conf->prev_algo |
1829 | : conf->algorithm; | 1829 | : conf->algorithm; |
1830 | sector_t stripe; | 1830 | sector_t stripe; |
@@ -2098,8 +2098,7 @@ static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, | |||
2098 | struct stripe_head *sh) | 2098 | struct stripe_head *sh) |
2099 | { | 2099 | { |
2100 | int sectors_per_chunk = | 2100 | int sectors_per_chunk = |
2101 | previous ? (conf->prev_chunk >> 9) | 2101 | previous ? conf->prev_chunk_sectors : conf->chunk_sectors; |
2102 | : (conf->chunk_size >> 9); | ||
2103 | int dd_idx; | 2102 | int dd_idx; |
2104 | int chunk_offset = sector_div(stripe, sectors_per_chunk); | 2103 | int chunk_offset = sector_div(stripe, sectors_per_chunk); |
2105 | int disks = previous ? conf->previous_raid_disks : conf->raid_disks; | 2104 | int disks = previous ? conf->previous_raid_disks : conf->raid_disks; |
@@ -3496,7 +3495,7 @@ static void activate_bit_delay(raid5_conf_t *conf) | |||
3496 | 3495 | ||
3497 | static void unplug_slaves(mddev_t *mddev) | 3496 | static void unplug_slaves(mddev_t *mddev) |
3498 | { | 3497 | { |
3499 | raid5_conf_t *conf = mddev_to_conf(mddev); | 3498 | raid5_conf_t *conf = mddev->private; |
3500 | int i; | 3499 | int i; |
3501 | 3500 | ||
3502 | rcu_read_lock(); | 3501 | rcu_read_lock(); |
@@ -3520,7 +3519,7 @@ static void unplug_slaves(mddev_t *mddev) | |||
3520 | static void raid5_unplug_device(struct request_queue *q) | 3519 | static void raid5_unplug_device(struct request_queue *q) |
3521 | { | 3520 | { |
3522 | mddev_t *mddev = q->queuedata; | 3521 | mddev_t *mddev = q->queuedata; |
3523 | raid5_conf_t *conf = mddev_to_conf(mddev); | 3522 | raid5_conf_t *conf = mddev->private; |
3524 | unsigned long flags; | 3523 | unsigned long flags; |
3525 | 3524 | ||
3526 | spin_lock_irqsave(&conf->device_lock, flags); | 3525 | spin_lock_irqsave(&conf->device_lock, flags); |
@@ -3539,7 +3538,7 @@ static void raid5_unplug_device(struct request_queue *q) | |||
3539 | static int raid5_congested(void *data, int bits) | 3538 | static int raid5_congested(void *data, int bits) |
3540 | { | 3539 | { |
3541 | mddev_t *mddev = data; | 3540 | mddev_t *mddev = data; |
3542 | raid5_conf_t *conf = mddev_to_conf(mddev); | 3541 | raid5_conf_t *conf = mddev->private; |
3543 | 3542 | ||
3544 | /* No difference between reads and writes. Just check | 3543 | /* No difference between reads and writes. Just check |
3545 | * how busy the stripe_cache is | 3544 | * how busy the stripe_cache is |
@@ -3564,14 +3563,14 @@ static int raid5_mergeable_bvec(struct request_queue *q, | |||
3564 | mddev_t *mddev = q->queuedata; | 3563 | mddev_t *mddev = q->queuedata; |
3565 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); | 3564 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); |
3566 | int max; | 3565 | int max; |
3567 | unsigned int chunk_sectors = mddev->chunk_size >> 9; | 3566 | unsigned int chunk_sectors = mddev->chunk_sectors; |
3568 | unsigned int bio_sectors = bvm->bi_size >> 9; | 3567 | unsigned int bio_sectors = bvm->bi_size >> 9; |
3569 | 3568 | ||
3570 | if ((bvm->bi_rw & 1) == WRITE) | 3569 | if ((bvm->bi_rw & 1) == WRITE) |
3571 | return biovec->bv_len; /* always allow writes to be mergeable */ | 3570 | return biovec->bv_len; /* always allow writes to be mergeable */ |
3572 | 3571 | ||
3573 | if (mddev->new_chunk < mddev->chunk_size) | 3572 | if (mddev->new_chunk_sectors < mddev->chunk_sectors) |
3574 | chunk_sectors = mddev->new_chunk >> 9; | 3573 | chunk_sectors = mddev->new_chunk_sectors; |
3575 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; | 3574 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; |
3576 | if (max < 0) max = 0; | 3575 | if (max < 0) max = 0; |
3577 | if (max <= biovec->bv_len && bio_sectors == 0) | 3576 | if (max <= biovec->bv_len && bio_sectors == 0) |
@@ -3584,11 +3583,11 @@ static int raid5_mergeable_bvec(struct request_queue *q, | |||
3584 | static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) | 3583 | static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) |
3585 | { | 3584 | { |
3586 | sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); | 3585 | sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); |
3587 | unsigned int chunk_sectors = mddev->chunk_size >> 9; | 3586 | unsigned int chunk_sectors = mddev->chunk_sectors; |
3588 | unsigned int bio_sectors = bio->bi_size >> 9; | 3587 | unsigned int bio_sectors = bio->bi_size >> 9; |
3589 | 3588 | ||
3590 | if (mddev->new_chunk < mddev->chunk_size) | 3589 | if (mddev->new_chunk_sectors < mddev->chunk_sectors) |
3591 | chunk_sectors = mddev->new_chunk >> 9; | 3590 | chunk_sectors = mddev->new_chunk_sectors; |
3592 | return chunk_sectors >= | 3591 | return chunk_sectors >= |
3593 | ((sector & (chunk_sectors - 1)) + bio_sectors); | 3592 | ((sector & (chunk_sectors - 1)) + bio_sectors); |
3594 | } | 3593 | } |
@@ -3652,7 +3651,7 @@ static void raid5_align_endio(struct bio *bi, int error) | |||
3652 | bio_put(bi); | 3651 | bio_put(bi); |
3653 | 3652 | ||
3654 | mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; | 3653 | mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; |
3655 | conf = mddev_to_conf(mddev); | 3654 | conf = mddev->private; |
3656 | rdev = (void*)raid_bi->bi_next; | 3655 | rdev = (void*)raid_bi->bi_next; |
3657 | raid_bi->bi_next = NULL; | 3656 | raid_bi->bi_next = NULL; |
3658 | 3657 | ||
@@ -3675,10 +3674,10 @@ static int bio_fits_rdev(struct bio *bi) | |||
3675 | { | 3674 | { |
3676 | struct request_queue *q = bdev_get_queue(bi->bi_bdev); | 3675 | struct request_queue *q = bdev_get_queue(bi->bi_bdev); |
3677 | 3676 | ||
3678 | if ((bi->bi_size>>9) > q->max_sectors) | 3677 | if ((bi->bi_size>>9) > queue_max_sectors(q)) |
3679 | return 0; | 3678 | return 0; |
3680 | blk_recount_segments(q, bi); | 3679 | blk_recount_segments(q, bi); |
3681 | if (bi->bi_phys_segments > q->max_phys_segments) | 3680 | if (bi->bi_phys_segments > queue_max_phys_segments(q)) |
3682 | return 0; | 3681 | return 0; |
3683 | 3682 | ||
3684 | if (q->merge_bvec_fn) | 3683 | if (q->merge_bvec_fn) |
@@ -3694,7 +3693,7 @@ static int bio_fits_rdev(struct bio *bi) | |||
3694 | static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) | 3693 | static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) |
3695 | { | 3694 | { |
3696 | mddev_t *mddev = q->queuedata; | 3695 | mddev_t *mddev = q->queuedata; |
3697 | raid5_conf_t *conf = mddev_to_conf(mddev); | 3696 | raid5_conf_t *conf = mddev->private; |
3698 | unsigned int dd_idx; | 3697 | unsigned int dd_idx; |
3699 | struct bio* align_bi; | 3698 | struct bio* align_bi; |
3700 | mdk_rdev_t *rdev; | 3699 | mdk_rdev_t *rdev; |
@@ -3811,7 +3810,7 @@ static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf) | |||
3811 | static int make_request(struct request_queue *q, struct bio * bi) | 3810 | static int make_request(struct request_queue *q, struct bio * bi) |
3812 | { | 3811 | { |
3813 | mddev_t *mddev = q->queuedata; | 3812 | mddev_t *mddev = q->queuedata; |
3814 | raid5_conf_t *conf = mddev_to_conf(mddev); | 3813 | raid5_conf_t *conf = mddev->private; |
3815 | int dd_idx; | 3814 | int dd_idx; |
3816 | sector_t new_sector; | 3815 | sector_t new_sector; |
3817 | sector_t logical_sector, last_sector; | 3816 | sector_t logical_sector, last_sector; |
@@ -3908,6 +3907,7 @@ static int make_request(struct request_queue *q, struct bio * bi) | |||
3908 | spin_unlock_irq(&conf->device_lock); | 3907 | spin_unlock_irq(&conf->device_lock); |
3909 | if (must_retry) { | 3908 | if (must_retry) { |
3910 | release_stripe(sh); | 3909 | release_stripe(sh); |
3910 | schedule(); | ||
3911 | goto retry; | 3911 | goto retry; |
3912 | } | 3912 | } |
3913 | } | 3913 | } |
@@ -4003,10 +4003,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
4003 | * If old and new chunk sizes differ, we need to process the | 4003 | * If old and new chunk sizes differ, we need to process the |
4004 | * largest of these | 4004 | * largest of these |
4005 | */ | 4005 | */ |
4006 | if (mddev->new_chunk > mddev->chunk_size) | 4006 | if (mddev->new_chunk_sectors > mddev->chunk_sectors) |
4007 | reshape_sectors = mddev->new_chunk / 512; | 4007 | reshape_sectors = mddev->new_chunk_sectors; |
4008 | else | 4008 | else |
4009 | reshape_sectors = mddev->chunk_size / 512; | 4009 | reshape_sectors = mddev->chunk_sectors; |
4010 | 4010 | ||
4011 | /* we update the metadata when there is more than 3Meg | 4011 | /* we update the metadata when there is more than 3Meg |
4012 | * in the block range (that is rather arbitrary, should | 4012 | * in the block range (that is rather arbitrary, should |
@@ -4129,7 +4129,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
4129 | 1, &dd_idx, NULL); | 4129 | 1, &dd_idx, NULL); |
4130 | last_sector = | 4130 | last_sector = |
4131 | raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) | 4131 | raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) |
4132 | *(new_data_disks) - 1), | 4132 | * new_data_disks - 1), |
4133 | 1, &dd_idx, NULL); | 4133 | 1, &dd_idx, NULL); |
4134 | if (last_sector >= mddev->dev_sectors) | 4134 | if (last_sector >= mddev->dev_sectors) |
4135 | last_sector = mddev->dev_sectors - 1; | 4135 | last_sector = mddev->dev_sectors - 1; |
@@ -4158,7 +4158,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
4158 | wait_event(conf->wait_for_overlap, | 4158 | wait_event(conf->wait_for_overlap, |
4159 | atomic_read(&conf->reshape_stripes) == 0); | 4159 | atomic_read(&conf->reshape_stripes) == 0); |
4160 | mddev->reshape_position = conf->reshape_progress; | 4160 | mddev->reshape_position = conf->reshape_progress; |
4161 | mddev->curr_resync_completed = mddev->curr_resync; | 4161 | mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors; |
4162 | conf->reshape_checkpoint = jiffies; | 4162 | conf->reshape_checkpoint = jiffies; |
4163 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 4163 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
4164 | md_wakeup_thread(mddev->thread); | 4164 | md_wakeup_thread(mddev->thread); |
@@ -4371,7 +4371,7 @@ static void synchronize_stripe_processing(struct list_head *domain) | |||
4371 | static void raid5d(mddev_t *mddev) | 4371 | static void raid5d(mddev_t *mddev) |
4372 | { | 4372 | { |
4373 | struct stripe_head *sh; | 4373 | struct stripe_head *sh; |
4374 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4374 | raid5_conf_t *conf = mddev->private; |
4375 | int handled; | 4375 | int handled; |
4376 | LIST_HEAD(raid_domain); | 4376 | LIST_HEAD(raid_domain); |
4377 | 4377 | ||
@@ -4428,7 +4428,7 @@ static void raid5d(mddev_t *mddev) | |||
4428 | static ssize_t | 4428 | static ssize_t |
4429 | raid5_show_stripe_cache_size(mddev_t *mddev, char *page) | 4429 | raid5_show_stripe_cache_size(mddev_t *mddev, char *page) |
4430 | { | 4430 | { |
4431 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4431 | raid5_conf_t *conf = mddev->private; |
4432 | if (conf) | 4432 | if (conf) |
4433 | return sprintf(page, "%d\n", conf->max_nr_stripes); | 4433 | return sprintf(page, "%d\n", conf->max_nr_stripes); |
4434 | else | 4434 | else |
@@ -4438,7 +4438,7 @@ raid5_show_stripe_cache_size(mddev_t *mddev, char *page) | |||
4438 | static ssize_t | 4438 | static ssize_t |
4439 | raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) | 4439 | raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) |
4440 | { | 4440 | { |
4441 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4441 | raid5_conf_t *conf = mddev->private; |
4442 | unsigned long new; | 4442 | unsigned long new; |
4443 | int err; | 4443 | int err; |
4444 | 4444 | ||
@@ -4476,7 +4476,7 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, | |||
4476 | static ssize_t | 4476 | static ssize_t |
4477 | raid5_show_preread_threshold(mddev_t *mddev, char *page) | 4477 | raid5_show_preread_threshold(mddev_t *mddev, char *page) |
4478 | { | 4478 | { |
4479 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4479 | raid5_conf_t *conf = mddev->private; |
4480 | if (conf) | 4480 | if (conf) |
4481 | return sprintf(page, "%d\n", conf->bypass_threshold); | 4481 | return sprintf(page, "%d\n", conf->bypass_threshold); |
4482 | else | 4482 | else |
@@ -4486,7 +4486,7 @@ raid5_show_preread_threshold(mddev_t *mddev, char *page) | |||
4486 | static ssize_t | 4486 | static ssize_t |
4487 | raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len) | 4487 | raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len) |
4488 | { | 4488 | { |
4489 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4489 | raid5_conf_t *conf = mddev->private; |
4490 | unsigned long new; | 4490 | unsigned long new; |
4491 | if (len >= PAGE_SIZE) | 4491 | if (len >= PAGE_SIZE) |
4492 | return -EINVAL; | 4492 | return -EINVAL; |
@@ -4510,7 +4510,7 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, | |||
4510 | static ssize_t | 4510 | static ssize_t |
4511 | stripe_cache_active_show(mddev_t *mddev, char *page) | 4511 | stripe_cache_active_show(mddev_t *mddev, char *page) |
4512 | { | 4512 | { |
4513 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4513 | raid5_conf_t *conf = mddev->private; |
4514 | if (conf) | 4514 | if (conf) |
4515 | return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); | 4515 | return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); |
4516 | else | 4516 | else |
@@ -4534,7 +4534,7 @@ static struct attribute_group raid5_attrs_group = { | |||
4534 | static sector_t | 4534 | static sector_t |
4535 | raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) | 4535 | raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) |
4536 | { | 4536 | { |
4537 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4537 | raid5_conf_t *conf = mddev->private; |
4538 | 4538 | ||
4539 | if (!sectors) | 4539 | if (!sectors) |
4540 | sectors = mddev->dev_sectors; | 4540 | sectors = mddev->dev_sectors; |
@@ -4546,8 +4546,8 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) | |||
4546 | raid_disks = conf->previous_raid_disks; | 4546 | raid_disks = conf->previous_raid_disks; |
4547 | } | 4547 | } |
4548 | 4548 | ||
4549 | sectors &= ~((sector_t)mddev->chunk_size/512 - 1); | 4549 | sectors &= ~((sector_t)mddev->chunk_sectors - 1); |
4550 | sectors &= ~((sector_t)mddev->new_chunk/512 - 1); | 4550 | sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); |
4551 | return sectors * (raid_disks - conf->max_degraded); | 4551 | return sectors * (raid_disks - conf->max_degraded); |
4552 | } | 4552 | } |
4553 | 4553 | ||
@@ -4691,9 +4691,11 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4691 | return ERR_PTR(-EINVAL); | 4691 | return ERR_PTR(-EINVAL); |
4692 | } | 4692 | } |
4693 | 4693 | ||
4694 | if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE) { | 4694 | if (!mddev->new_chunk_sectors || |
4695 | (mddev->new_chunk_sectors << 9) % PAGE_SIZE || | ||
4696 | !is_power_of_2(mddev->new_chunk_sectors)) { | ||
4695 | printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", | 4697 | printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", |
4696 | mddev->new_chunk, mdname(mddev)); | 4698 | mddev->new_chunk_sectors << 9, mdname(mddev)); |
4697 | return ERR_PTR(-EINVAL); | 4699 | return ERR_PTR(-EINVAL); |
4698 | } | 4700 | } |
4699 | 4701 | ||
@@ -4756,7 +4758,8 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4756 | conf->fullsync = 1; | 4758 | conf->fullsync = 1; |
4757 | } | 4759 | } |
4758 | 4760 | ||
4759 | conf->chunk_size = mddev->new_chunk; | 4761 | conf->chunk_sectors = mddev->new_chunk_sectors; |
4762 | conf->level = mddev->new_level; | ||
4760 | if (conf->level == 6) | 4763 | if (conf->level == 6) |
4761 | conf->max_degraded = 2; | 4764 | conf->max_degraded = 2; |
4762 | else | 4765 | else |
@@ -4765,7 +4768,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4765 | conf->max_nr_stripes = NR_STRIPES; | 4768 | conf->max_nr_stripes = NR_STRIPES; |
4766 | conf->reshape_progress = mddev->reshape_position; | 4769 | conf->reshape_progress = mddev->reshape_position; |
4767 | if (conf->reshape_progress != MaxSector) { | 4770 | if (conf->reshape_progress != MaxSector) { |
4768 | conf->prev_chunk = mddev->chunk_size; | 4771 | conf->prev_chunk_sectors = mddev->chunk_sectors; |
4769 | conf->prev_algo = mddev->layout; | 4772 | conf->prev_algo = mddev->layout; |
4770 | } | 4773 | } |
4771 | 4774 | ||
@@ -4803,6 +4806,10 @@ static int run(mddev_t *mddev) | |||
4803 | int working_disks = 0; | 4806 | int working_disks = 0; |
4804 | mdk_rdev_t *rdev; | 4807 | mdk_rdev_t *rdev; |
4805 | 4808 | ||
4809 | if (mddev->recovery_cp != MaxSector) | ||
4810 | printk(KERN_NOTICE "raid5: %s is not clean" | ||
4811 | " -- starting background reconstruction\n", | ||
4812 | mdname(mddev)); | ||
4806 | if (mddev->reshape_position != MaxSector) { | 4813 | if (mddev->reshape_position != MaxSector) { |
4807 | /* Check that we can continue the reshape. | 4814 | /* Check that we can continue the reshape. |
4808 | * Currently only disks can change, it must | 4815 | * Currently only disks can change, it must |
@@ -4825,7 +4832,7 @@ static int run(mddev_t *mddev) | |||
4825 | * geometry. | 4832 | * geometry. |
4826 | */ | 4833 | */ |
4827 | here_new = mddev->reshape_position; | 4834 | here_new = mddev->reshape_position; |
4828 | if (sector_div(here_new, (mddev->new_chunk>>9)* | 4835 | if (sector_div(here_new, mddev->new_chunk_sectors * |
4829 | (mddev->raid_disks - max_degraded))) { | 4836 | (mddev->raid_disks - max_degraded))) { |
4830 | printk(KERN_ERR "raid5: reshape_position not " | 4837 | printk(KERN_ERR "raid5: reshape_position not " |
4831 | "on a stripe boundary\n"); | 4838 | "on a stripe boundary\n"); |
@@ -4833,7 +4840,7 @@ static int run(mddev_t *mddev) | |||
4833 | } | 4840 | } |
4834 | /* here_new is the stripe we will write to */ | 4841 | /* here_new is the stripe we will write to */ |
4835 | here_old = mddev->reshape_position; | 4842 | here_old = mddev->reshape_position; |
4836 | sector_div(here_old, (mddev->chunk_size>>9)* | 4843 | sector_div(here_old, mddev->chunk_sectors * |
4837 | (old_disks-max_degraded)); | 4844 | (old_disks-max_degraded)); |
4838 | /* here_old is the first stripe that we might need to read | 4845 | /* here_old is the first stripe that we might need to read |
4839 | * from */ | 4846 | * from */ |
@@ -4848,7 +4855,7 @@ static int run(mddev_t *mddev) | |||
4848 | } else { | 4855 | } else { |
4849 | BUG_ON(mddev->level != mddev->new_level); | 4856 | BUG_ON(mddev->level != mddev->new_level); |
4850 | BUG_ON(mddev->layout != mddev->new_layout); | 4857 | BUG_ON(mddev->layout != mddev->new_layout); |
4851 | BUG_ON(mddev->chunk_size != mddev->new_chunk); | 4858 | BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); |
4852 | BUG_ON(mddev->delta_disks != 0); | 4859 | BUG_ON(mddev->delta_disks != 0); |
4853 | } | 4860 | } |
4854 | 4861 | ||
@@ -4882,7 +4889,7 @@ static int run(mddev_t *mddev) | |||
4882 | } | 4889 | } |
4883 | 4890 | ||
4884 | /* device size must be a multiple of chunk size */ | 4891 | /* device size must be a multiple of chunk size */ |
4885 | mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1); | 4892 | mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); |
4886 | mddev->resync_max_sectors = mddev->dev_sectors; | 4893 | mddev->resync_max_sectors = mddev->dev_sectors; |
4887 | 4894 | ||
4888 | if (mddev->degraded > 0 && | 4895 | if (mddev->degraded > 0 && |
@@ -4931,7 +4938,7 @@ static int run(mddev_t *mddev) | |||
4931 | { | 4938 | { |
4932 | int data_disks = conf->previous_raid_disks - conf->max_degraded; | 4939 | int data_disks = conf->previous_raid_disks - conf->max_degraded; |
4933 | int stripe = data_disks * | 4940 | int stripe = data_disks * |
4934 | (mddev->chunk_size / PAGE_SIZE); | 4941 | ((mddev->chunk_sectors << 9) / PAGE_SIZE); |
4935 | if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) | 4942 | if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) |
4936 | mddev->queue->backing_dev_info.ra_pages = 2 * stripe; | 4943 | mddev->queue->backing_dev_info.ra_pages = 2 * stripe; |
4937 | } | 4944 | } |
@@ -5021,7 +5028,8 @@ static void status(struct seq_file *seq, mddev_t *mddev) | |||
5021 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; | 5028 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; |
5022 | int i; | 5029 | int i; |
5023 | 5030 | ||
5024 | seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout); | 5031 | seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, |
5032 | mddev->chunk_sectors / 2, mddev->layout); | ||
5025 | seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); | 5033 | seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); |
5026 | for (i = 0; i < conf->raid_disks; i++) | 5034 | for (i = 0; i < conf->raid_disks; i++) |
5027 | seq_printf (seq, "%s", | 5035 | seq_printf (seq, "%s", |
@@ -5169,7 +5177,7 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) | |||
5169 | * any io in the removed space completes, but it hardly seems | 5177 | * any io in the removed space completes, but it hardly seems |
5170 | * worth it. | 5178 | * worth it. |
5171 | */ | 5179 | */ |
5172 | sectors &= ~((sector_t)mddev->chunk_size/512 - 1); | 5180 | sectors &= ~((sector_t)mddev->chunk_sectors - 1); |
5173 | md_set_array_sectors(mddev, raid5_size(mddev, sectors, | 5181 | md_set_array_sectors(mddev, raid5_size(mddev, sectors, |
5174 | mddev->raid_disks)); | 5182 | mddev->raid_disks)); |
5175 | if (mddev->array_sectors > | 5183 | if (mddev->array_sectors > |
@@ -5186,14 +5194,37 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) | |||
5186 | return 0; | 5194 | return 0; |
5187 | } | 5195 | } |
5188 | 5196 | ||
5189 | static int raid5_check_reshape(mddev_t *mddev) | 5197 | static int check_stripe_cache(mddev_t *mddev) |
5198 | { | ||
5199 | /* Can only proceed if there are plenty of stripe_heads. | ||
5200 | * We need a minimum of one full stripe, and for sensible progress | ||
5201 | * it is best to have about 4 times that. | ||
5202 | * If we require 4 times, then the default 256 4K stripe_heads will | ||
5203 | * allow for chunk sizes up to 256K, which is probably OK. | ||
5204 | * If the chunk size is greater, user-space should request more | ||
5205 | * stripe_heads first. | ||
5206 | */ | ||
5207 | raid5_conf_t *conf = mddev->private; | ||
5208 | if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 | ||
5209 | > conf->max_nr_stripes || | ||
5210 | ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 | ||
5211 | > conf->max_nr_stripes) { | ||
5212 | printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", | ||
5213 | ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) | ||
5214 | / STRIPE_SIZE)*4); | ||
5215 | return 0; | ||
5216 | } | ||
5217 | return 1; | ||
5218 | } | ||
5219 | |||
5220 | static int check_reshape(mddev_t *mddev) | ||
5190 | { | 5221 | { |
5191 | raid5_conf_t *conf = mddev_to_conf(mddev); | 5222 | raid5_conf_t *conf = mddev->private; |
5192 | 5223 | ||
5193 | if (mddev->delta_disks == 0 && | 5224 | if (mddev->delta_disks == 0 && |
5194 | mddev->new_layout == mddev->layout && | 5225 | mddev->new_layout == mddev->layout && |
5195 | mddev->new_chunk == mddev->chunk_size) | 5226 | mddev->new_chunk_sectors == mddev->chunk_sectors) |
5196 | return -EINVAL; /* nothing to do */ | 5227 | return 0; /* nothing to do */ |
5197 | if (mddev->bitmap) | 5228 | if (mddev->bitmap) |
5198 | /* Cannot grow a bitmap yet */ | 5229 | /* Cannot grow a bitmap yet */ |
5199 | return -EBUSY; | 5230 | return -EBUSY; |
@@ -5212,28 +5243,15 @@ static int raid5_check_reshape(mddev_t *mddev) | |||
5212 | return -EINVAL; | 5243 | return -EINVAL; |
5213 | } | 5244 | } |
5214 | 5245 | ||
5215 | /* Can only proceed if there are plenty of stripe_heads. | 5246 | if (!check_stripe_cache(mddev)) |
5216 | * We need a minimum of one full stripe, and for sensible progress | ||
5217 | * it is best to have about 4 times that. | ||
5218 | * If we require 4 times, then the default 256 4K stripe_heads will | ||
5219 | * allow for chunk sizes up to 256K, which is probably OK. | ||
5220 | * If the chunk size is greater, user-space should request more | ||
5221 | * stripe_heads first. | ||
5222 | */ | ||
5223 | if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes || | ||
5224 | (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { | ||
5225 | printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", | ||
5226 | (max(mddev->chunk_size, mddev->new_chunk) | ||
5227 | / STRIPE_SIZE)*4); | ||
5228 | return -ENOSPC; | 5247 | return -ENOSPC; |
5229 | } | ||
5230 | 5248 | ||
5231 | return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); | 5249 | return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); |
5232 | } | 5250 | } |
5233 | 5251 | ||
5234 | static int raid5_start_reshape(mddev_t *mddev) | 5252 | static int raid5_start_reshape(mddev_t *mddev) |
5235 | { | 5253 | { |
5236 | raid5_conf_t *conf = mddev_to_conf(mddev); | 5254 | raid5_conf_t *conf = mddev->private; |
5237 | mdk_rdev_t *rdev; | 5255 | mdk_rdev_t *rdev; |
5238 | int spares = 0; | 5256 | int spares = 0; |
5239 | int added_devices = 0; | 5257 | int added_devices = 0; |
@@ -5242,6 +5260,9 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
5242 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | 5260 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
5243 | return -EBUSY; | 5261 | return -EBUSY; |
5244 | 5262 | ||
5263 | if (!check_stripe_cache(mddev)) | ||
5264 | return -ENOSPC; | ||
5265 | |||
5245 | list_for_each_entry(rdev, &mddev->disks, same_set) | 5266 | list_for_each_entry(rdev, &mddev->disks, same_set) |
5246 | if (rdev->raid_disk < 0 && | 5267 | if (rdev->raid_disk < 0 && |
5247 | !test_bit(Faulty, &rdev->flags)) | 5268 | !test_bit(Faulty, &rdev->flags)) |
@@ -5268,8 +5289,8 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
5268 | spin_lock_irq(&conf->device_lock); | 5289 | spin_lock_irq(&conf->device_lock); |
5269 | conf->previous_raid_disks = conf->raid_disks; | 5290 | conf->previous_raid_disks = conf->raid_disks; |
5270 | conf->raid_disks += mddev->delta_disks; | 5291 | conf->raid_disks += mddev->delta_disks; |
5271 | conf->prev_chunk = conf->chunk_size; | 5292 | conf->prev_chunk_sectors = conf->chunk_sectors; |
5272 | conf->chunk_size = mddev->new_chunk; | 5293 | conf->chunk_sectors = mddev->new_chunk_sectors; |
5273 | conf->prev_algo = conf->algorithm; | 5294 | conf->prev_algo = conf->algorithm; |
5274 | conf->algorithm = mddev->new_layout; | 5295 | conf->algorithm = mddev->new_layout; |
5275 | if (mddev->delta_disks < 0) | 5296 | if (mddev->delta_disks < 0) |
@@ -5351,7 +5372,7 @@ static void end_reshape(raid5_conf_t *conf) | |||
5351 | */ | 5372 | */ |
5352 | { | 5373 | { |
5353 | int data_disks = conf->raid_disks - conf->max_degraded; | 5374 | int data_disks = conf->raid_disks - conf->max_degraded; |
5354 | int stripe = data_disks * (conf->chunk_size | 5375 | int stripe = data_disks * ((conf->chunk_sectors << 9) |
5355 | / PAGE_SIZE); | 5376 | / PAGE_SIZE); |
5356 | if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) | 5377 | if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) |
5357 | conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; | 5378 | conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; |
@@ -5365,7 +5386,7 @@ static void end_reshape(raid5_conf_t *conf) | |||
5365 | static void raid5_finish_reshape(mddev_t *mddev) | 5386 | static void raid5_finish_reshape(mddev_t *mddev) |
5366 | { | 5387 | { |
5367 | struct block_device *bdev; | 5388 | struct block_device *bdev; |
5368 | raid5_conf_t *conf = mddev_to_conf(mddev); | 5389 | raid5_conf_t *conf = mddev->private; |
5369 | 5390 | ||
5370 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | 5391 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { |
5371 | 5392 | ||
@@ -5396,7 +5417,7 @@ static void raid5_finish_reshape(mddev_t *mddev) | |||
5396 | raid5_remove_disk(mddev, d); | 5417 | raid5_remove_disk(mddev, d); |
5397 | } | 5418 | } |
5398 | mddev->layout = conf->algorithm; | 5419 | mddev->layout = conf->algorithm; |
5399 | mddev->chunk_size = conf->chunk_size; | 5420 | mddev->chunk_sectors = conf->chunk_sectors; |
5400 | mddev->reshape_position = MaxSector; | 5421 | mddev->reshape_position = MaxSector; |
5401 | mddev->delta_disks = 0; | 5422 | mddev->delta_disks = 0; |
5402 | } | 5423 | } |
@@ -5404,7 +5425,7 @@ static void raid5_finish_reshape(mddev_t *mddev) | |||
5404 | 5425 | ||
5405 | static void raid5_quiesce(mddev_t *mddev, int state) | 5426 | static void raid5_quiesce(mddev_t *mddev, int state) |
5406 | { | 5427 | { |
5407 | raid5_conf_t *conf = mddev_to_conf(mddev); | 5428 | raid5_conf_t *conf = mddev->private; |
5408 | 5429 | ||
5409 | switch(state) { | 5430 | switch(state) { |
5410 | case 2: /* resume for a suspend */ | 5431 | case 2: /* resume for a suspend */ |
@@ -5454,7 +5475,7 @@ static void *raid5_takeover_raid1(mddev_t *mddev) | |||
5454 | 5475 | ||
5455 | mddev->new_level = 5; | 5476 | mddev->new_level = 5; |
5456 | mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; | 5477 | mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; |
5457 | mddev->new_chunk = chunksect << 9; | 5478 | mddev->new_chunk_sectors = chunksect; |
5458 | 5479 | ||
5459 | return setup_conf(mddev); | 5480 | return setup_conf(mddev); |
5460 | } | 5481 | } |
@@ -5493,24 +5514,24 @@ static void *raid5_takeover_raid6(mddev_t *mddev) | |||
5493 | } | 5514 | } |
5494 | 5515 | ||
5495 | 5516 | ||
5496 | static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) | 5517 | static int raid5_check_reshape(mddev_t *mddev) |
5497 | { | 5518 | { |
5498 | /* For a 2-drive array, the layout and chunk size can be changed | 5519 | /* For a 2-drive array, the layout and chunk size can be changed |
5499 | * immediately as no restriping is needed. | 5520 | * immediately as no restriping is needed. |
5500 | * For larger arrays we record the new value - after validation | 5521 | * For larger arrays we record the new value - after validation |
5501 | * to be used by a reshape pass. | 5522 | * to be used by a reshape pass. |
5502 | */ | 5523 | */ |
5503 | raid5_conf_t *conf = mddev_to_conf(mddev); | 5524 | raid5_conf_t *conf = mddev->private; |
5525 | int new_chunk = mddev->new_chunk_sectors; | ||
5504 | 5526 | ||
5505 | if (new_layout >= 0 && !algorithm_valid_raid5(new_layout)) | 5527 | if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) |
5506 | return -EINVAL; | 5528 | return -EINVAL; |
5507 | if (new_chunk > 0) { | 5529 | if (new_chunk > 0) { |
5508 | if (new_chunk & (new_chunk-1)) | 5530 | if (!is_power_of_2(new_chunk)) |
5509 | /* not a power of 2 */ | ||
5510 | return -EINVAL; | 5531 | return -EINVAL; |
5511 | if (new_chunk < PAGE_SIZE) | 5532 | if (new_chunk < (PAGE_SIZE>>9)) |
5512 | return -EINVAL; | 5533 | return -EINVAL; |
5513 | if (mddev->array_sectors & ((new_chunk>>9)-1)) | 5534 | if (mddev->array_sectors & (new_chunk-1)) |
5514 | /* not factor of array size */ | 5535 | /* not factor of array size */ |
5515 | return -EINVAL; | 5536 | return -EINVAL; |
5516 | } | 5537 | } |
@@ -5518,49 +5539,39 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) | |||
5518 | /* They look valid */ | 5539 | /* They look valid */ |
5519 | 5540 | ||
5520 | if (mddev->raid_disks == 2) { | 5541 | if (mddev->raid_disks == 2) { |
5521 | 5542 | /* can make the change immediately */ | |
5522 | if (new_layout >= 0) { | 5543 | if (mddev->new_layout >= 0) { |
5523 | conf->algorithm = new_layout; | 5544 | conf->algorithm = mddev->new_layout; |
5524 | mddev->layout = mddev->new_layout = new_layout; | 5545 | mddev->layout = mddev->new_layout; |
5525 | } | 5546 | } |
5526 | if (new_chunk > 0) { | 5547 | if (new_chunk > 0) { |
5527 | conf->chunk_size = new_chunk; | 5548 | conf->chunk_sectors = new_chunk ; |
5528 | mddev->chunk_size = mddev->new_chunk = new_chunk; | 5549 | mddev->chunk_sectors = new_chunk; |
5529 | } | 5550 | } |
5530 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 5551 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
5531 | md_wakeup_thread(mddev->thread); | 5552 | md_wakeup_thread(mddev->thread); |
5532 | } else { | ||
5533 | if (new_layout >= 0) | ||
5534 | mddev->new_layout = new_layout; | ||
5535 | if (new_chunk > 0) | ||
5536 | mddev->new_chunk = new_chunk; | ||
5537 | } | 5553 | } |
5538 | return 0; | 5554 | return check_reshape(mddev); |
5539 | } | 5555 | } |
5540 | 5556 | ||
5541 | static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk) | 5557 | static int raid6_check_reshape(mddev_t *mddev) |
5542 | { | 5558 | { |
5543 | if (new_layout >= 0 && !algorithm_valid_raid6(new_layout)) | 5559 | int new_chunk = mddev->new_chunk_sectors; |
5560 | |||
5561 | if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) | ||
5544 | return -EINVAL; | 5562 | return -EINVAL; |
5545 | if (new_chunk > 0) { | 5563 | if (new_chunk > 0) { |
5546 | if (new_chunk & (new_chunk-1)) | 5564 | if (!is_power_of_2(new_chunk)) |
5547 | /* not a power of 2 */ | ||
5548 | return -EINVAL; | 5565 | return -EINVAL; |
5549 | if (new_chunk < PAGE_SIZE) | 5566 | if (new_chunk < (PAGE_SIZE >> 9)) |
5550 | return -EINVAL; | 5567 | return -EINVAL; |
5551 | if (mddev->array_sectors & ((new_chunk>>9)-1)) | 5568 | if (mddev->array_sectors & (new_chunk-1)) |
5552 | /* not factor of array size */ | 5569 | /* not factor of array size */ |
5553 | return -EINVAL; | 5570 | return -EINVAL; |
5554 | } | 5571 | } |
5555 | 5572 | ||
5556 | /* They look valid */ | 5573 | /* They look valid */ |
5557 | 5574 | return check_reshape(mddev); | |
5558 | if (new_layout >= 0) | ||
5559 | mddev->new_layout = new_layout; | ||
5560 | if (new_chunk > 0) | ||
5561 | mddev->new_chunk = new_chunk; | ||
5562 | |||
5563 | return 0; | ||
5564 | } | 5575 | } |
5565 | 5576 | ||
5566 | static void *raid5_takeover(mddev_t *mddev) | 5577 | static void *raid5_takeover(mddev_t *mddev) |
@@ -5570,8 +5581,6 @@ static void *raid5_takeover(mddev_t *mddev) | |||
5570 | * raid1 - if there are two drives. We need to know the chunk size | 5581 | * raid1 - if there are two drives. We need to know the chunk size |
5571 | * raid4 - trivial - just use a raid4 layout. | 5582 | * raid4 - trivial - just use a raid4 layout. |
5572 | * raid6 - Providing it is a *_6 layout | 5583 | * raid6 - Providing it is a *_6 layout |
5573 | * | ||
5574 | * For now, just do raid1 | ||
5575 | */ | 5584 | */ |
5576 | 5585 | ||
5577 | if (mddev->level == 1) | 5586 | if (mddev->level == 1) |
@@ -5653,12 +5662,11 @@ static struct mdk_personality raid6_personality = | |||
5653 | .sync_request = sync_request, | 5662 | .sync_request = sync_request, |
5654 | .resize = raid5_resize, | 5663 | .resize = raid5_resize, |
5655 | .size = raid5_size, | 5664 | .size = raid5_size, |
5656 | .check_reshape = raid5_check_reshape, | 5665 | .check_reshape = raid6_check_reshape, |
5657 | .start_reshape = raid5_start_reshape, | 5666 | .start_reshape = raid5_start_reshape, |
5658 | .finish_reshape = raid5_finish_reshape, | 5667 | .finish_reshape = raid5_finish_reshape, |
5659 | .quiesce = raid5_quiesce, | 5668 | .quiesce = raid5_quiesce, |
5660 | .takeover = raid6_takeover, | 5669 | .takeover = raid6_takeover, |
5661 | .reconfig = raid6_reconfig, | ||
5662 | }; | 5670 | }; |
5663 | static struct mdk_personality raid5_personality = | 5671 | static struct mdk_personality raid5_personality = |
5664 | { | 5672 | { |
@@ -5681,7 +5689,6 @@ static struct mdk_personality raid5_personality = | |||
5681 | .finish_reshape = raid5_finish_reshape, | 5689 | .finish_reshape = raid5_finish_reshape, |
5682 | .quiesce = raid5_quiesce, | 5690 | .quiesce = raid5_quiesce, |
5683 | .takeover = raid5_takeover, | 5691 | .takeover = raid5_takeover, |
5684 | .reconfig = raid5_reconfig, | ||
5685 | }; | 5692 | }; |
5686 | 5693 | ||
5687 | static struct mdk_personality raid4_personality = | 5694 | static struct mdk_personality raid4_personality = |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 116d0b44b2a9..2390e0e83daf 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -337,7 +337,8 @@ struct raid5_private_data { | |||
337 | struct hlist_head *stripe_hashtbl; | 337 | struct hlist_head *stripe_hashtbl; |
338 | mddev_t *mddev; | 338 | mddev_t *mddev; |
339 | struct disk_info *spare; | 339 | struct disk_info *spare; |
340 | int chunk_size, level, algorithm; | 340 | int chunk_sectors; |
341 | int level, algorithm; | ||
341 | int max_degraded; | 342 | int max_degraded; |
342 | int raid_disks; | 343 | int raid_disks; |
343 | int max_nr_stripes; | 344 | int max_nr_stripes; |
@@ -353,7 +354,8 @@ struct raid5_private_data { | |||
353 | */ | 354 | */ |
354 | sector_t reshape_safe; | 355 | sector_t reshape_safe; |
355 | int previous_raid_disks; | 356 | int previous_raid_disks; |
356 | int prev_chunk, prev_algo; | 357 | int prev_chunk_sectors; |
358 | int prev_algo; | ||
357 | short generation; /* increments with every reshape */ | 359 | short generation; /* increments with every reshape */ |
358 | unsigned long reshape_checkpoint; /* Time we last updated | 360 | unsigned long reshape_checkpoint; /* Time we last updated |
359 | * metadata */ | 361 | * metadata */ |
@@ -424,8 +426,6 @@ struct raid5_private_data { | |||
424 | 426 | ||
425 | typedef struct raid5_private_data raid5_conf_t; | 427 | typedef struct raid5_private_data raid5_conf_t; |
426 | 428 | ||
427 | #define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private) | ||
428 | |||
429 | /* | 429 | /* |
430 | * Our supported algorithms | 430 | * Our supported algorithms |
431 | */ | 431 | */ |