diff options
| author | Dan Williams <dan.j.williams@intel.com> | 2009-09-08 20:55:21 -0400 |
|---|---|---|
| committer | Dan Williams <dan.j.williams@intel.com> | 2009-09-08 20:55:21 -0400 |
| commit | bbb20089a3275a19e475dbc21320c3742e3ca423 (patch) | |
| tree | 216fdc1cbef450ca688135c5b8969169482d9a48 /drivers/md | |
| parent | 3e48e656903e9fd8bc805c6a2c4264d7808d315b (diff) | |
| parent | 657a77fa7284d8ae28dfa48f1dc5d919bf5b2843 (diff) | |
Merge branch 'dmaengine' into async-tx-next
Conflicts:
crypto/async_tx/async_xor.c
drivers/dma/ioat/dma_v2.h
drivers/dma/ioat/pci.c
drivers/md/raid5.c
Diffstat (limited to 'drivers/md')
43 files changed, 3987 insertions, 1098 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 09c0c6e49ab5..2158377a1359 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig | |||
| @@ -257,6 +257,17 @@ config DM_MIRROR | |||
| 257 | Allow volume managers to mirror logical volumes, also | 257 | Allow volume managers to mirror logical volumes, also |
| 258 | needed for live data migration tools such as 'pvmove'. | 258 | needed for live data migration tools such as 'pvmove'. |
| 259 | 259 | ||
| 260 | config DM_LOG_USERSPACE | ||
| 261 | tristate "Mirror userspace logging (EXPERIMENTAL)" | ||
| 262 | depends on DM_MIRROR && EXPERIMENTAL && NET | ||
| 263 | select CONNECTOR | ||
| 264 | ---help--- | ||
| 265 | The userspace logging module provides a mechanism for | ||
| 266 | relaying the dm-dirty-log API to userspace. Log designs | ||
| 267 | which are more suited to userspace implementation (e.g. | ||
| 268 | shared storage logs) or experimental logs can be implemented | ||
| 269 | by leveraging this framework. | ||
| 270 | |||
| 260 | config DM_ZERO | 271 | config DM_ZERO |
| 261 | tristate "Zero target" | 272 | tristate "Zero target" |
| 262 | depends on BLK_DEV_DM | 273 | depends on BLK_DEV_DM |
| @@ -275,6 +286,25 @@ config DM_MULTIPATH | |||
| 275 | ---help--- | 286 | ---help--- |
| 276 | Allow volume managers to support multipath hardware. | 287 | Allow volume managers to support multipath hardware. |
| 277 | 288 | ||
| 289 | config DM_MULTIPATH_QL | ||
| 290 | tristate "I/O Path Selector based on the number of in-flight I/Os" | ||
| 291 | depends on DM_MULTIPATH | ||
| 292 | ---help--- | ||
| 293 | This path selector is a dynamic load balancer which selects | ||
| 294 | the path with the least number of in-flight I/Os. | ||
| 295 | |||
| 296 | If unsure, say N. | ||
| 297 | |||
| 298 | config DM_MULTIPATH_ST | ||
| 299 | tristate "I/O Path Selector based on the service time" | ||
| 300 | depends on DM_MULTIPATH | ||
| 301 | ---help--- | ||
| 302 | This path selector is a dynamic load balancer which selects | ||
| 303 | the path expected to complete the incoming I/O in the shortest | ||
| 304 | time. | ||
| 305 | |||
| 306 | If unsure, say N. | ||
| 307 | |||
| 278 | config DM_DELAY | 308 | config DM_DELAY |
| 279 | tristate "I/O delaying target (EXPERIMENTAL)" | 309 | tristate "I/O delaying target (EXPERIMENTAL)" |
| 280 | depends on BLK_DEV_DM && EXPERIMENTAL | 310 | depends on BLK_DEV_DM && EXPERIMENTAL |
diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 45cc5951d928..1dc4185bd781 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile | |||
| @@ -8,6 +8,8 @@ dm-multipath-y += dm-path-selector.o dm-mpath.o | |||
| 8 | dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ | 8 | dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ |
| 9 | dm-snap-persistent.o | 9 | dm-snap-persistent.o |
| 10 | dm-mirror-y += dm-raid1.o | 10 | dm-mirror-y += dm-raid1.o |
| 11 | dm-log-userspace-y \ | ||
| 12 | += dm-log-userspace-base.o dm-log-userspace-transfer.o | ||
| 11 | md-mod-y += md.o bitmap.o | 13 | md-mod-y += md.o bitmap.o |
| 12 | raid456-y += raid5.o | 14 | raid456-y += raid5.o |
| 13 | raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ | 15 | raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ |
| @@ -36,8 +38,11 @@ obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o | |||
| 36 | obj-$(CONFIG_DM_CRYPT) += dm-crypt.o | 38 | obj-$(CONFIG_DM_CRYPT) += dm-crypt.o |
| 37 | obj-$(CONFIG_DM_DELAY) += dm-delay.o | 39 | obj-$(CONFIG_DM_DELAY) += dm-delay.o |
| 38 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o | 40 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o |
| 41 | obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o | ||
| 42 | obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o | ||
| 39 | obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o | 43 | obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o |
| 40 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o | 44 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o |
| 45 | obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o | ||
| 41 | obj-$(CONFIG_DM_ZERO) += dm-zero.o | 46 | obj-$(CONFIG_DM_ZERO) += dm-zero.o |
| 42 | 47 | ||
| 43 | quiet_cmd_unroll = UNROLL $@ | 48 | quiet_cmd_unroll = UNROLL $@ |
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 56df1cee8fb3..3319c2fec28e 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c | |||
| @@ -232,7 +232,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, | |||
| 232 | target = rdev->sb_start + offset + index * (PAGE_SIZE/512); | 232 | target = rdev->sb_start + offset + index * (PAGE_SIZE/512); |
| 233 | 233 | ||
| 234 | if (sync_page_io(rdev->bdev, target, | 234 | if (sync_page_io(rdev->bdev, target, |
| 235 | roundup(size, bdev_hardsect_size(rdev->bdev)), | 235 | roundup(size, bdev_logical_block_size(rdev->bdev)), |
| 236 | page, READ)) { | 236 | page, READ)) { |
| 237 | page->index = index; | 237 | page->index = index; |
| 238 | attach_page_buffers(page, NULL); /* so that free_buffer will | 238 | attach_page_buffers(page, NULL); /* so that free_buffer will |
| @@ -287,7 +287,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | |||
| 287 | int size = PAGE_SIZE; | 287 | int size = PAGE_SIZE; |
| 288 | if (page->index == bitmap->file_pages-1) | 288 | if (page->index == bitmap->file_pages-1) |
| 289 | size = roundup(bitmap->last_page_size, | 289 | size = roundup(bitmap->last_page_size, |
| 290 | bdev_hardsect_size(rdev->bdev)); | 290 | bdev_logical_block_size(rdev->bdev)); |
| 291 | /* Just make sure we aren't corrupting data or | 291 | /* Just make sure we aren't corrupting data or |
| 292 | * metadata | 292 | * metadata |
| 293 | */ | 293 | */ |
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 53394e863c74..9933eb861c71 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c | |||
| @@ -1132,6 +1132,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 1132 | goto bad_crypt_queue; | 1132 | goto bad_crypt_queue; |
| 1133 | } | 1133 | } |
| 1134 | 1134 | ||
| 1135 | ti->num_flush_requests = 1; | ||
| 1135 | ti->private = cc; | 1136 | ti->private = cc; |
| 1136 | return 0; | 1137 | return 0; |
| 1137 | 1138 | ||
| @@ -1189,6 +1190,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, | |||
| 1189 | union map_info *map_context) | 1190 | union map_info *map_context) |
| 1190 | { | 1191 | { |
| 1191 | struct dm_crypt_io *io; | 1192 | struct dm_crypt_io *io; |
| 1193 | struct crypt_config *cc; | ||
| 1194 | |||
| 1195 | if (unlikely(bio_empty_barrier(bio))) { | ||
| 1196 | cc = ti->private; | ||
| 1197 | bio->bi_bdev = cc->dev->bdev; | ||
| 1198 | return DM_MAPIO_REMAPPED; | ||
| 1199 | } | ||
| 1192 | 1200 | ||
| 1193 | io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin); | 1201 | io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin); |
| 1194 | 1202 | ||
| @@ -1305,9 +1313,17 @@ static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | |||
| 1305 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | 1313 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); |
| 1306 | } | 1314 | } |
| 1307 | 1315 | ||
| 1316 | static int crypt_iterate_devices(struct dm_target *ti, | ||
| 1317 | iterate_devices_callout_fn fn, void *data) | ||
| 1318 | { | ||
| 1319 | struct crypt_config *cc = ti->private; | ||
| 1320 | |||
| 1321 | return fn(ti, cc->dev, cc->start, data); | ||
| 1322 | } | ||
| 1323 | |||
| 1308 | static struct target_type crypt_target = { | 1324 | static struct target_type crypt_target = { |
| 1309 | .name = "crypt", | 1325 | .name = "crypt", |
| 1310 | .version= {1, 6, 0}, | 1326 | .version = {1, 7, 0}, |
| 1311 | .module = THIS_MODULE, | 1327 | .module = THIS_MODULE, |
| 1312 | .ctr = crypt_ctr, | 1328 | .ctr = crypt_ctr, |
| 1313 | .dtr = crypt_dtr, | 1329 | .dtr = crypt_dtr, |
| @@ -1318,6 +1334,7 @@ static struct target_type crypt_target = { | |||
| 1318 | .resume = crypt_resume, | 1334 | .resume = crypt_resume, |
| 1319 | .message = crypt_message, | 1335 | .message = crypt_message, |
| 1320 | .merge = crypt_merge, | 1336 | .merge = crypt_merge, |
| 1337 | .iterate_devices = crypt_iterate_devices, | ||
| 1321 | }; | 1338 | }; |
| 1322 | 1339 | ||
| 1323 | static int __init dm_crypt_init(void) | 1340 | static int __init dm_crypt_init(void) |
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 559dbb52bc85..4e5b843cd4d7 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c | |||
| @@ -197,6 +197,7 @@ out: | |||
| 197 | mutex_init(&dc->timer_lock); | 197 | mutex_init(&dc->timer_lock); |
| 198 | atomic_set(&dc->may_delay, 1); | 198 | atomic_set(&dc->may_delay, 1); |
| 199 | 199 | ||
| 200 | ti->num_flush_requests = 1; | ||
| 200 | ti->private = dc; | 201 | ti->private = dc; |
| 201 | return 0; | 202 | return 0; |
| 202 | 203 | ||
| @@ -278,8 +279,9 @@ static int delay_map(struct dm_target *ti, struct bio *bio, | |||
| 278 | 279 | ||
| 279 | if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { | 280 | if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { |
| 280 | bio->bi_bdev = dc->dev_write->bdev; | 281 | bio->bi_bdev = dc->dev_write->bdev; |
| 281 | bio->bi_sector = dc->start_write + | 282 | if (bio_sectors(bio)) |
| 282 | (bio->bi_sector - ti->begin); | 283 | bio->bi_sector = dc->start_write + |
| 284 | (bio->bi_sector - ti->begin); | ||
| 283 | 285 | ||
| 284 | return delay_bio(dc, dc->write_delay, bio); | 286 | return delay_bio(dc, dc->write_delay, bio); |
| 285 | } | 287 | } |
| @@ -316,9 +318,26 @@ static int delay_status(struct dm_target *ti, status_type_t type, | |||
| 316 | return 0; | 318 | return 0; |
| 317 | } | 319 | } |
| 318 | 320 | ||
| 321 | static int delay_iterate_devices(struct dm_target *ti, | ||
| 322 | iterate_devices_callout_fn fn, void *data) | ||
| 323 | { | ||
| 324 | struct delay_c *dc = ti->private; | ||
| 325 | int ret = 0; | ||
| 326 | |||
| 327 | ret = fn(ti, dc->dev_read, dc->start_read, data); | ||
| 328 | if (ret) | ||
| 329 | goto out; | ||
| 330 | |||
| 331 | if (dc->dev_write) | ||
| 332 | ret = fn(ti, dc->dev_write, dc->start_write, data); | ||
| 333 | |||
| 334 | out: | ||
| 335 | return ret; | ||
| 336 | } | ||
| 337 | |||
| 319 | static struct target_type delay_target = { | 338 | static struct target_type delay_target = { |
| 320 | .name = "delay", | 339 | .name = "delay", |
| 321 | .version = {1, 0, 2}, | 340 | .version = {1, 1, 0}, |
| 322 | .module = THIS_MODULE, | 341 | .module = THIS_MODULE, |
| 323 | .ctr = delay_ctr, | 342 | .ctr = delay_ctr, |
| 324 | .dtr = delay_dtr, | 343 | .dtr = delay_dtr, |
| @@ -326,6 +345,7 @@ static struct target_type delay_target = { | |||
| 326 | .presuspend = delay_presuspend, | 345 | .presuspend = delay_presuspend, |
| 327 | .resume = delay_resume, | 346 | .resume = delay_resume, |
| 328 | .status = delay_status, | 347 | .status = delay_status, |
| 348 | .iterate_devices = delay_iterate_devices, | ||
| 329 | }; | 349 | }; |
| 330 | 350 | ||
| 331 | static int __init dm_delay_init(void) | 351 | static int __init dm_delay_init(void) |
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index a2e26c242141..c3ae51584b12 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c | |||
| @@ -178,7 +178,7 @@ static int set_chunk_size(struct dm_exception_store *store, | |||
| 178 | } | 178 | } |
| 179 | 179 | ||
| 180 | /* Validate the chunk size against the device block size */ | 180 | /* Validate the chunk size against the device block size */ |
| 181 | if (chunk_size_ulong % (bdev_hardsect_size(store->cow->bdev) >> 9)) { | 181 | if (chunk_size_ulong % (bdev_logical_block_size(store->cow->bdev) >> 9)) { |
| 182 | *error = "Chunk size is not a multiple of device blocksize"; | 182 | *error = "Chunk size is not a multiple of device blocksize"; |
| 183 | return -EINVAL; | 183 | return -EINVAL; |
| 184 | } | 184 | } |
| @@ -216,7 +216,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, | |||
| 216 | return -EINVAL; | 216 | return -EINVAL; |
| 217 | } | 217 | } |
| 218 | 218 | ||
| 219 | type = get_type(argv[1]); | 219 | type = get_type(&persistent); |
| 220 | if (!type) { | 220 | if (!type) { |
| 221 | ti->error = "Exception store type not recognised"; | 221 | ti->error = "Exception store type not recognised"; |
| 222 | r = -EINVAL; | 222 | r = -EINVAL; |
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index 0a2e6e7f67b3..2442c8c07898 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h | |||
| @@ -111,7 +111,7 @@ struct dm_exception_store { | |||
| 111 | /* | 111 | /* |
| 112 | * Funtions to manipulate consecutive chunks | 112 | * Funtions to manipulate consecutive chunks |
| 113 | */ | 113 | */ |
| 114 | # if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) | 114 | # if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) |
| 115 | # define DM_CHUNK_CONSECUTIVE_BITS 8 | 115 | # define DM_CHUNK_CONSECUTIVE_BITS 8 |
| 116 | # define DM_CHUNK_NUMBER_BITS 56 | 116 | # define DM_CHUNK_NUMBER_BITS 56 |
| 117 | 117 | ||
| @@ -156,7 +156,7 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) | |||
| 156 | */ | 156 | */ |
| 157 | static inline sector_t get_dev_size(struct block_device *bdev) | 157 | static inline sector_t get_dev_size(struct block_device *bdev) |
| 158 | { | 158 | { |
| 159 | return bdev->bd_inode->i_size >> SECTOR_SHIFT; | 159 | return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; |
| 160 | } | 160 | } |
| 161 | 161 | ||
| 162 | static inline chunk_t sector_to_chunk(struct dm_exception_store *store, | 162 | static inline chunk_t sector_to_chunk(struct dm_exception_store *store, |
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index e73aabd61cd7..3a2e6a2f8bdd 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c | |||
| @@ -22,6 +22,7 @@ struct dm_io_client { | |||
| 22 | /* FIXME: can we shrink this ? */ | 22 | /* FIXME: can we shrink this ? */ |
| 23 | struct io { | 23 | struct io { |
| 24 | unsigned long error_bits; | 24 | unsigned long error_bits; |
| 25 | unsigned long eopnotsupp_bits; | ||
| 25 | atomic_t count; | 26 | atomic_t count; |
| 26 | struct task_struct *sleeper; | 27 | struct task_struct *sleeper; |
| 27 | struct dm_io_client *client; | 28 | struct dm_io_client *client; |
| @@ -107,8 +108,11 @@ static inline unsigned bio_get_region(struct bio *bio) | |||
| 107 | *---------------------------------------------------------------*/ | 108 | *---------------------------------------------------------------*/ |
| 108 | static void dec_count(struct io *io, unsigned int region, int error) | 109 | static void dec_count(struct io *io, unsigned int region, int error) |
| 109 | { | 110 | { |
| 110 | if (error) | 111 | if (error) { |
| 111 | set_bit(region, &io->error_bits); | 112 | set_bit(region, &io->error_bits); |
| 113 | if (error == -EOPNOTSUPP) | ||
| 114 | set_bit(region, &io->eopnotsupp_bits); | ||
| 115 | } | ||
| 112 | 116 | ||
| 113 | if (atomic_dec_and_test(&io->count)) { | 117 | if (atomic_dec_and_test(&io->count)) { |
| 114 | if (io->sleeper) | 118 | if (io->sleeper) |
| @@ -360,7 +364,9 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, | |||
| 360 | return -EIO; | 364 | return -EIO; |
| 361 | } | 365 | } |
| 362 | 366 | ||
| 367 | retry: | ||
| 363 | io.error_bits = 0; | 368 | io.error_bits = 0; |
| 369 | io.eopnotsupp_bits = 0; | ||
| 364 | atomic_set(&io.count, 1); /* see dispatch_io() */ | 370 | atomic_set(&io.count, 1); /* see dispatch_io() */ |
| 365 | io.sleeper = current; | 371 | io.sleeper = current; |
| 366 | io.client = client; | 372 | io.client = client; |
| @@ -377,6 +383,11 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, | |||
| 377 | } | 383 | } |
| 378 | set_current_state(TASK_RUNNING); | 384 | set_current_state(TASK_RUNNING); |
| 379 | 385 | ||
| 386 | if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) { | ||
| 387 | rw &= ~(1 << BIO_RW_BARRIER); | ||
| 388 | goto retry; | ||
| 389 | } | ||
| 390 | |||
| 380 | if (error_bits) | 391 | if (error_bits) |
| 381 | *error_bits = io.error_bits; | 392 | *error_bits = io.error_bits; |
| 382 | 393 | ||
| @@ -397,6 +408,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, | |||
| 397 | 408 | ||
| 398 | io = mempool_alloc(client->pool, GFP_NOIO); | 409 | io = mempool_alloc(client->pool, GFP_NOIO); |
| 399 | io->error_bits = 0; | 410 | io->error_bits = 0; |
| 411 | io->eopnotsupp_bits = 0; | ||
| 400 | atomic_set(&io->count, 1); /* see dispatch_io() */ | 412 | atomic_set(&io->count, 1); /* see dispatch_io() */ |
| 401 | io->sleeper = NULL; | 413 | io->sleeper = NULL; |
| 402 | io->client = client; | 414 | io->client = client; |
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 823ceba6efa8..7f77f18fcafa 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c | |||
| @@ -276,7 +276,7 @@ retry: | |||
| 276 | up_write(&_hash_lock); | 276 | up_write(&_hash_lock); |
| 277 | } | 277 | } |
| 278 | 278 | ||
| 279 | static int dm_hash_rename(const char *old, const char *new) | 279 | static int dm_hash_rename(uint32_t cookie, const char *old, const char *new) |
| 280 | { | 280 | { |
| 281 | char *new_name, *old_name; | 281 | char *new_name, *old_name; |
| 282 | struct hash_cell *hc; | 282 | struct hash_cell *hc; |
| @@ -333,7 +333,7 @@ static int dm_hash_rename(const char *old, const char *new) | |||
| 333 | dm_table_put(table); | 333 | dm_table_put(table); |
| 334 | } | 334 | } |
| 335 | 335 | ||
| 336 | dm_kobject_uevent(hc->md); | 336 | dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie); |
| 337 | 337 | ||
| 338 | dm_put(hc->md); | 338 | dm_put(hc->md); |
| 339 | up_write(&_hash_lock); | 339 | up_write(&_hash_lock); |
| @@ -680,6 +680,9 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) | |||
| 680 | 680 | ||
| 681 | __hash_remove(hc); | 681 | __hash_remove(hc); |
| 682 | up_write(&_hash_lock); | 682 | up_write(&_hash_lock); |
| 683 | |||
| 684 | dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr); | ||
| 685 | |||
| 683 | dm_put(md); | 686 | dm_put(md); |
| 684 | param->data_size = 0; | 687 | param->data_size = 0; |
| 685 | return 0; | 688 | return 0; |
| @@ -715,7 +718,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size) | |||
| 715 | return r; | 718 | return r; |
| 716 | 719 | ||
| 717 | param->data_size = 0; | 720 | param->data_size = 0; |
| 718 | return dm_hash_rename(param->name, new_name); | 721 | return dm_hash_rename(param->event_nr, param->name, new_name); |
| 719 | } | 722 | } |
| 720 | 723 | ||
| 721 | static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) | 724 | static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) |
| @@ -842,8 +845,11 @@ static int do_resume(struct dm_ioctl *param) | |||
| 842 | if (dm_suspended(md)) | 845 | if (dm_suspended(md)) |
| 843 | r = dm_resume(md); | 846 | r = dm_resume(md); |
| 844 | 847 | ||
| 845 | if (!r) | 848 | |
| 849 | if (!r) { | ||
| 850 | dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr); | ||
| 846 | r = __dev_status(md, param); | 851 | r = __dev_status(md, param); |
| 852 | } | ||
| 847 | 853 | ||
| 848 | dm_put(md); | 854 | dm_put(md); |
| 849 | return r; | 855 | return r; |
| @@ -1044,6 +1050,12 @@ static int populate_table(struct dm_table *table, | |||
| 1044 | next = spec->next; | 1050 | next = spec->next; |
| 1045 | } | 1051 | } |
| 1046 | 1052 | ||
| 1053 | r = dm_table_set_type(table); | ||
| 1054 | if (r) { | ||
| 1055 | DMWARN("unable to set table type"); | ||
| 1056 | return r; | ||
| 1057 | } | ||
| 1058 | |||
| 1047 | return dm_table_complete(table); | 1059 | return dm_table_complete(table); |
| 1048 | } | 1060 | } |
| 1049 | 1061 | ||
| @@ -1089,6 +1101,13 @@ static int table_load(struct dm_ioctl *param, size_t param_size) | |||
| 1089 | goto out; | 1101 | goto out; |
| 1090 | } | 1102 | } |
| 1091 | 1103 | ||
| 1104 | r = dm_table_alloc_md_mempools(t); | ||
| 1105 | if (r) { | ||
| 1106 | DMWARN("unable to allocate mempools for this table"); | ||
| 1107 | dm_table_destroy(t); | ||
| 1108 | goto out; | ||
| 1109 | } | ||
| 1110 | |||
| 1092 | down_write(&_hash_lock); | 1111 | down_write(&_hash_lock); |
| 1093 | hc = dm_get_mdptr(md); | 1112 | hc = dm_get_mdptr(md); |
| 1094 | if (!hc || hc->md != md) { | 1113 | if (!hc || hc->md != md) { |
| @@ -1513,6 +1532,7 @@ static const struct file_operations _ctl_fops = { | |||
| 1513 | static struct miscdevice _dm_misc = { | 1532 | static struct miscdevice _dm_misc = { |
| 1514 | .minor = MISC_DYNAMIC_MINOR, | 1533 | .minor = MISC_DYNAMIC_MINOR, |
| 1515 | .name = DM_NAME, | 1534 | .name = DM_NAME, |
| 1535 | .devnode = "mapper/control", | ||
| 1516 | .fops = &_ctl_fops | 1536 | .fops = &_ctl_fops |
| 1517 | }; | 1537 | }; |
| 1518 | 1538 | ||
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 79fb53e51c70..9184b6deb868 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c | |||
| @@ -53,6 +53,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 53 | goto bad; | 53 | goto bad; |
| 54 | } | 54 | } |
| 55 | 55 | ||
| 56 | ti->num_flush_requests = 1; | ||
| 56 | ti->private = lc; | 57 | ti->private = lc; |
| 57 | return 0; | 58 | return 0; |
| 58 | 59 | ||
| @@ -81,7 +82,8 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio) | |||
| 81 | struct linear_c *lc = ti->private; | 82 | struct linear_c *lc = ti->private; |
| 82 | 83 | ||
| 83 | bio->bi_bdev = lc->dev->bdev; | 84 | bio->bi_bdev = lc->dev->bdev; |
| 84 | bio->bi_sector = linear_map_sector(ti, bio->bi_sector); | 85 | if (bio_sectors(bio)) |
| 86 | bio->bi_sector = linear_map_sector(ti, bio->bi_sector); | ||
| 85 | } | 87 | } |
| 86 | 88 | ||
| 87 | static int linear_map(struct dm_target *ti, struct bio *bio, | 89 | static int linear_map(struct dm_target *ti, struct bio *bio, |
| @@ -132,9 +134,17 @@ static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | |||
| 132 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | 134 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); |
| 133 | } | 135 | } |
| 134 | 136 | ||
| 137 | static int linear_iterate_devices(struct dm_target *ti, | ||
| 138 | iterate_devices_callout_fn fn, void *data) | ||
| 139 | { | ||
| 140 | struct linear_c *lc = ti->private; | ||
| 141 | |||
| 142 | return fn(ti, lc->dev, lc->start, data); | ||
| 143 | } | ||
| 144 | |||
| 135 | static struct target_type linear_target = { | 145 | static struct target_type linear_target = { |
| 136 | .name = "linear", | 146 | .name = "linear", |
| 137 | .version= {1, 0, 3}, | 147 | .version = {1, 1, 0}, |
| 138 | .module = THIS_MODULE, | 148 | .module = THIS_MODULE, |
| 139 | .ctr = linear_ctr, | 149 | .ctr = linear_ctr, |
| 140 | .dtr = linear_dtr, | 150 | .dtr = linear_dtr, |
| @@ -142,6 +152,7 @@ static struct target_type linear_target = { | |||
| 142 | .status = linear_status, | 152 | .status = linear_status, |
| 143 | .ioctl = linear_ioctl, | 153 | .ioctl = linear_ioctl, |
| 144 | .merge = linear_merge, | 154 | .merge = linear_merge, |
| 155 | .iterate_devices = linear_iterate_devices, | ||
| 145 | }; | 156 | }; |
| 146 | 157 | ||
| 147 | int __init dm_linear_init(void) | 158 | int __init dm_linear_init(void) |
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c new file mode 100644 index 000000000000..e69b96560997 --- /dev/null +++ b/drivers/md/dm-log-userspace-base.c | |||
| @@ -0,0 +1,696 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2006-2009 Red Hat, Inc. | ||
| 3 | * | ||
| 4 | * This file is released under the LGPL. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include <linux/bio.h> | ||
| 8 | #include <linux/dm-dirty-log.h> | ||
| 9 | #include <linux/device-mapper.h> | ||
| 10 | #include <linux/dm-log-userspace.h> | ||
| 11 | |||
| 12 | #include "dm-log-userspace-transfer.h" | ||
| 13 | |||
| 14 | struct flush_entry { | ||
| 15 | int type; | ||
| 16 | region_t region; | ||
| 17 | struct list_head list; | ||
| 18 | }; | ||
| 19 | |||
| 20 | struct log_c { | ||
| 21 | struct dm_target *ti; | ||
| 22 | uint32_t region_size; | ||
| 23 | region_t region_count; | ||
| 24 | char uuid[DM_UUID_LEN]; | ||
| 25 | |||
| 26 | char *usr_argv_str; | ||
| 27 | uint32_t usr_argc; | ||
| 28 | |||
| 29 | /* | ||
| 30 | * in_sync_hint gets set when doing is_remote_recovering. It | ||
| 31 | * represents the first region that needs recovery. IOW, the | ||
| 32 | * first zero bit of sync_bits. This can be useful for to limit | ||
| 33 | * traffic for calls like is_remote_recovering and get_resync_work, | ||
| 34 | * but be take care in its use for anything else. | ||
| 35 | */ | ||
| 36 | uint64_t in_sync_hint; | ||
| 37 | |||
| 38 | spinlock_t flush_lock; | ||
| 39 | struct list_head flush_list; /* only for clear and mark requests */ | ||
| 40 | }; | ||
| 41 | |||
| 42 | static mempool_t *flush_entry_pool; | ||
| 43 | |||
| 44 | static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data) | ||
| 45 | { | ||
| 46 | return kmalloc(sizeof(struct flush_entry), gfp_mask); | ||
| 47 | } | ||
| 48 | |||
| 49 | static void flush_entry_free(void *element, void *pool_data) | ||
| 50 | { | ||
| 51 | kfree(element); | ||
| 52 | } | ||
| 53 | |||
| 54 | static int userspace_do_request(struct log_c *lc, const char *uuid, | ||
| 55 | int request_type, char *data, size_t data_size, | ||
| 56 | char *rdata, size_t *rdata_size) | ||
| 57 | { | ||
| 58 | int r; | ||
| 59 | |||
| 60 | /* | ||
| 61 | * If the server isn't there, -ESRCH is returned, | ||
| 62 | * and we must keep trying until the server is | ||
| 63 | * restored. | ||
| 64 | */ | ||
| 65 | retry: | ||
| 66 | r = dm_consult_userspace(uuid, request_type, data, | ||
| 67 | data_size, rdata, rdata_size); | ||
| 68 | |||
| 69 | if (r != -ESRCH) | ||
| 70 | return r; | ||
| 71 | |||
| 72 | DMERR(" Userspace log server not found."); | ||
| 73 | while (1) { | ||
| 74 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 75 | schedule_timeout(2*HZ); | ||
| 76 | DMWARN("Attempting to contact userspace log server..."); | ||
| 77 | r = dm_consult_userspace(uuid, DM_ULOG_CTR, lc->usr_argv_str, | ||
| 78 | strlen(lc->usr_argv_str) + 1, | ||
| 79 | NULL, NULL); | ||
| 80 | if (!r) | ||
| 81 | break; | ||
| 82 | } | ||
| 83 | DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete"); | ||
| 84 | r = dm_consult_userspace(uuid, DM_ULOG_RESUME, NULL, | ||
| 85 | 0, NULL, NULL); | ||
| 86 | if (!r) | ||
| 87 | goto retry; | ||
| 88 | |||
| 89 | DMERR("Error trying to resume userspace log: %d", r); | ||
| 90 | |||
| 91 | return -ESRCH; | ||
| 92 | } | ||
| 93 | |||
| 94 | static int build_constructor_string(struct dm_target *ti, | ||
| 95 | unsigned argc, char **argv, | ||
| 96 | char **ctr_str) | ||
| 97 | { | ||
| 98 | int i, str_size; | ||
| 99 | char *str = NULL; | ||
| 100 | |||
| 101 | *ctr_str = NULL; | ||
| 102 | |||
| 103 | for (i = 0, str_size = 0; i < argc; i++) | ||
| 104 | str_size += strlen(argv[i]) + 1; /* +1 for space between args */ | ||
| 105 | |||
| 106 | str_size += 20; /* Max number of chars in a printed u64 number */ | ||
| 107 | |||
| 108 | str = kzalloc(str_size, GFP_KERNEL); | ||
| 109 | if (!str) { | ||
| 110 | DMWARN("Unable to allocate memory for constructor string"); | ||
| 111 | return -ENOMEM; | ||
| 112 | } | ||
| 113 | |||
| 114 | for (i = 0, str_size = 0; i < argc; i++) | ||
| 115 | str_size += sprintf(str + str_size, "%s ", argv[i]); | ||
| 116 | str_size += sprintf(str + str_size, "%llu", | ||
| 117 | (unsigned long long)ti->len); | ||
| 118 | |||
| 119 | *ctr_str = str; | ||
| 120 | return str_size; | ||
| 121 | } | ||
| 122 | |||
| 123 | /* | ||
| 124 | * userspace_ctr | ||
| 125 | * | ||
| 126 | * argv contains: | ||
| 127 | * <UUID> <other args> | ||
| 128 | * Where 'other args' is the userspace implementation specific log | ||
| 129 | * arguments. An example might be: | ||
| 130 | * <UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync] | ||
| 131 | * | ||
| 132 | * So, this module will strip off the <UUID> for identification purposes | ||
| 133 | * when communicating with userspace about a log; but will pass on everything | ||
| 134 | * else. | ||
| 135 | */ | ||
| 136 | static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, | ||
| 137 | unsigned argc, char **argv) | ||
| 138 | { | ||
| 139 | int r = 0; | ||
| 140 | int str_size; | ||
| 141 | char *ctr_str = NULL; | ||
| 142 | struct log_c *lc = NULL; | ||
| 143 | uint64_t rdata; | ||
| 144 | size_t rdata_size = sizeof(rdata); | ||
| 145 | |||
| 146 | if (argc < 3) { | ||
| 147 | DMWARN("Too few arguments to userspace dirty log"); | ||
| 148 | return -EINVAL; | ||
| 149 | } | ||
| 150 | |||
| 151 | lc = kmalloc(sizeof(*lc), GFP_KERNEL); | ||
| 152 | if (!lc) { | ||
| 153 | DMWARN("Unable to allocate userspace log context."); | ||
| 154 | return -ENOMEM; | ||
| 155 | } | ||
| 156 | |||
| 157 | lc->ti = ti; | ||
| 158 | |||
| 159 | if (strlen(argv[0]) > (DM_UUID_LEN - 1)) { | ||
| 160 | DMWARN("UUID argument too long."); | ||
| 161 | kfree(lc); | ||
| 162 | return -EINVAL; | ||
| 163 | } | ||
| 164 | |||
| 165 | strncpy(lc->uuid, argv[0], DM_UUID_LEN); | ||
| 166 | spin_lock_init(&lc->flush_lock); | ||
| 167 | INIT_LIST_HEAD(&lc->flush_list); | ||
| 168 | |||
| 169 | str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); | ||
| 170 | if (str_size < 0) { | ||
| 171 | kfree(lc); | ||
| 172 | return str_size; | ||
| 173 | } | ||
| 174 | |||
| 175 | /* Send table string */ | ||
| 176 | r = dm_consult_userspace(lc->uuid, DM_ULOG_CTR, | ||
| 177 | ctr_str, str_size, NULL, NULL); | ||
| 178 | |||
| 179 | if (r == -ESRCH) { | ||
| 180 | DMERR("Userspace log server not found"); | ||
| 181 | goto out; | ||
| 182 | } | ||
| 183 | |||
| 184 | /* Since the region size does not change, get it now */ | ||
| 185 | rdata_size = sizeof(rdata); | ||
| 186 | r = dm_consult_userspace(lc->uuid, DM_ULOG_GET_REGION_SIZE, | ||
| 187 | NULL, 0, (char *)&rdata, &rdata_size); | ||
| 188 | |||
| 189 | if (r) { | ||
| 190 | DMERR("Failed to get region size of dirty log"); | ||
| 191 | goto out; | ||
| 192 | } | ||
| 193 | |||
| 194 | lc->region_size = (uint32_t)rdata; | ||
| 195 | lc->region_count = dm_sector_div_up(ti->len, lc->region_size); | ||
| 196 | |||
| 197 | out: | ||
| 198 | if (r) { | ||
| 199 | kfree(lc); | ||
| 200 | kfree(ctr_str); | ||
| 201 | } else { | ||
| 202 | lc->usr_argv_str = ctr_str; | ||
| 203 | lc->usr_argc = argc; | ||
| 204 | log->context = lc; | ||
| 205 | } | ||
| 206 | |||
| 207 | return r; | ||
| 208 | } | ||
| 209 | |||
| 210 | static void userspace_dtr(struct dm_dirty_log *log) | ||
| 211 | { | ||
| 212 | int r; | ||
| 213 | struct log_c *lc = log->context; | ||
| 214 | |||
| 215 | r = dm_consult_userspace(lc->uuid, DM_ULOG_DTR, | ||
| 216 | NULL, 0, | ||
| 217 | NULL, NULL); | ||
| 218 | |||
| 219 | kfree(lc->usr_argv_str); | ||
| 220 | kfree(lc); | ||
| 221 | |||
| 222 | return; | ||
| 223 | } | ||
| 224 | |||
| 225 | static int userspace_presuspend(struct dm_dirty_log *log) | ||
| 226 | { | ||
| 227 | int r; | ||
| 228 | struct log_c *lc = log->context; | ||
| 229 | |||
| 230 | r = dm_consult_userspace(lc->uuid, DM_ULOG_PRESUSPEND, | ||
| 231 | NULL, 0, | ||
| 232 | NULL, NULL); | ||
| 233 | |||
| 234 | return r; | ||
| 235 | } | ||
| 236 | |||
| 237 | static int userspace_postsuspend(struct dm_dirty_log *log) | ||
| 238 | { | ||
| 239 | int r; | ||
| 240 | struct log_c *lc = log->context; | ||
| 241 | |||
| 242 | r = dm_consult_userspace(lc->uuid, DM_ULOG_POSTSUSPEND, | ||
| 243 | NULL, 0, | ||
| 244 | NULL, NULL); | ||
| 245 | |||
| 246 | return r; | ||
| 247 | } | ||
| 248 | |||
| 249 | static int userspace_resume(struct dm_dirty_log *log) | ||
| 250 | { | ||
| 251 | int r; | ||
| 252 | struct log_c *lc = log->context; | ||
| 253 | |||
| 254 | lc->in_sync_hint = 0; | ||
| 255 | r = dm_consult_userspace(lc->uuid, DM_ULOG_RESUME, | ||
| 256 | NULL, 0, | ||
| 257 | NULL, NULL); | ||
| 258 | |||
| 259 | return r; | ||
| 260 | } | ||
| 261 | |||
| 262 | static uint32_t userspace_get_region_size(struct dm_dirty_log *log) | ||
| 263 | { | ||
| 264 | struct log_c *lc = log->context; | ||
| 265 | |||
| 266 | return lc->region_size; | ||
| 267 | } | ||
| 268 | |||
| 269 | /* | ||
| 270 | * userspace_is_clean | ||
| 271 | * | ||
| 272 | * Check whether a region is clean. If there is any sort of | ||
| 273 | * failure when consulting the server, we return not clean. | ||
| 274 | * | ||
| 275 | * Returns: 1 if clean, 0 otherwise | ||
| 276 | */ | ||
| 277 | static int userspace_is_clean(struct dm_dirty_log *log, region_t region) | ||
| 278 | { | ||
| 279 | int r; | ||
| 280 | uint64_t region64 = (uint64_t)region; | ||
| 281 | int64_t is_clean; | ||
| 282 | size_t rdata_size; | ||
| 283 | struct log_c *lc = log->context; | ||
| 284 | |||
| 285 | rdata_size = sizeof(is_clean); | ||
| 286 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN, | ||
| 287 | (char *)®ion64, sizeof(region64), | ||
| 288 | (char *)&is_clean, &rdata_size); | ||
| 289 | |||
| 290 | return (r) ? 0 : (int)is_clean; | ||
| 291 | } | ||
| 292 | |||
| 293 | /* | ||
| 294 | * userspace_in_sync | ||
| 295 | * | ||
| 296 | * Check if the region is in-sync. If there is any sort | ||
| 297 | * of failure when consulting the server, we assume that | ||
| 298 | * the region is not in sync. | ||
| 299 | * | ||
| 300 | * If 'can_block' is set, return immediately | ||
| 301 | * | ||
| 302 | * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK | ||
| 303 | */ | ||
| 304 | static int userspace_in_sync(struct dm_dirty_log *log, region_t region, | ||
| 305 | int can_block) | ||
| 306 | { | ||
| 307 | int r; | ||
| 308 | uint64_t region64 = region; | ||
| 309 | int64_t in_sync; | ||
| 310 | size_t rdata_size; | ||
| 311 | struct log_c *lc = log->context; | ||
| 312 | |||
| 313 | /* | ||
| 314 | * We can never respond directly - even if in_sync_hint is | ||
| 315 | * set. This is because another machine could see a device | ||
| 316 | * failure and mark the region out-of-sync. If we don't go | ||
| 317 | * to userspace to ask, we might think the region is in-sync | ||
| 318 | * and allow a read to pick up data that is stale. (This is | ||
| 319 | * very unlikely if a device actually fails; but it is very | ||
| 320 | * likely if a connection to one device from one machine fails.) | ||
| 321 | * | ||
| 322 | * There still might be a problem if the mirror caches the region | ||
| 323 | * state as in-sync... but then this call would not be made. So, | ||
| 324 | * that is a mirror problem. | ||
| 325 | */ | ||
| 326 | if (!can_block) | ||
| 327 | return -EWOULDBLOCK; | ||
| 328 | |||
| 329 | rdata_size = sizeof(in_sync); | ||
| 330 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC, | ||
| 331 | (char *)®ion64, sizeof(region64), | ||
| 332 | (char *)&in_sync, &rdata_size); | ||
| 333 | return (r) ? 0 : (int)in_sync; | ||
| 334 | } | ||
| 335 | |||
| 336 | /* | ||
| 337 | * userspace_flush | ||
| 338 | * | ||
| 339 | * This function is ok to block. | ||
| 340 | * The flush happens in two stages. First, it sends all | ||
| 341 | * clear/mark requests that are on the list. Then it | ||
| 342 | * tells the server to commit them. This gives the | ||
| 343 | * server a chance to optimise the commit, instead of | ||
| 344 | * doing it for every request. | ||
| 345 | * | ||
| 346 | * Additionally, we could implement another thread that | ||
| 347 | * sends the requests up to the server - reducing the | ||
| 348 | * load on flush. Then the flush would have less in | ||
| 349 | * the list and be responsible for the finishing commit. | ||
| 350 | * | ||
| 351 | * Returns: 0 on success, < 0 on failure | ||
| 352 | */ | ||
| 353 | static int userspace_flush(struct dm_dirty_log *log) | ||
| 354 | { | ||
| 355 | int r = 0; | ||
| 356 | unsigned long flags; | ||
| 357 | struct log_c *lc = log->context; | ||
| 358 | LIST_HEAD(flush_list); | ||
| 359 | struct flush_entry *fe, *tmp_fe; | ||
| 360 | |||
| 361 | spin_lock_irqsave(&lc->flush_lock, flags); | ||
| 362 | list_splice_init(&lc->flush_list, &flush_list); | ||
| 363 | spin_unlock_irqrestore(&lc->flush_lock, flags); | ||
| 364 | |||
| 365 | if (list_empty(&flush_list)) | ||
| 366 | return 0; | ||
| 367 | |||
| 368 | /* | ||
| 369 | * FIXME: Count up requests, group request types, | ||
| 370 | * allocate memory to stick all requests in and | ||
| 371 | * send to server in one go. Failing the allocation, | ||
| 372 | * do it one by one. | ||
| 373 | */ | ||
| 374 | |||
| 375 | list_for_each_entry(fe, &flush_list, list) { | ||
| 376 | r = userspace_do_request(lc, lc->uuid, fe->type, | ||
| 377 | (char *)&fe->region, | ||
| 378 | sizeof(fe->region), | ||
| 379 | NULL, NULL); | ||
| 380 | if (r) | ||
| 381 | goto fail; | ||
| 382 | } | ||
| 383 | |||
| 384 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, | ||
| 385 | NULL, 0, NULL, NULL); | ||
| 386 | |||
| 387 | fail: | ||
| 388 | /* | ||
| 389 | * We can safely remove these entries, even if failure. | ||
| 390 | * Calling code will receive an error and will know that | ||
| 391 | * the log facility has failed. | ||
| 392 | */ | ||
| 393 | list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) { | ||
| 394 | list_del(&fe->list); | ||
| 395 | mempool_free(fe, flush_entry_pool); | ||
| 396 | } | ||
| 397 | |||
| 398 | if (r) | ||
| 399 | dm_table_event(lc->ti->table); | ||
| 400 | |||
| 401 | return r; | ||
| 402 | } | ||
| 403 | |||
| 404 | /* | ||
| 405 | * userspace_mark_region | ||
| 406 | * | ||
| 407 | * This function should avoid blocking unless absolutely required. | ||
| 408 | * (Memory allocation is valid for blocking.) | ||
| 409 | */ | ||
| 410 | static void userspace_mark_region(struct dm_dirty_log *log, region_t region) | ||
| 411 | { | ||
| 412 | unsigned long flags; | ||
| 413 | struct log_c *lc = log->context; | ||
| 414 | struct flush_entry *fe; | ||
| 415 | |||
| 416 | /* Wait for an allocation, but _never_ fail */ | ||
| 417 | fe = mempool_alloc(flush_entry_pool, GFP_NOIO); | ||
| 418 | BUG_ON(!fe); | ||
| 419 | |||
| 420 | spin_lock_irqsave(&lc->flush_lock, flags); | ||
| 421 | fe->type = DM_ULOG_MARK_REGION; | ||
| 422 | fe->region = region; | ||
| 423 | list_add(&fe->list, &lc->flush_list); | ||
| 424 | spin_unlock_irqrestore(&lc->flush_lock, flags); | ||
| 425 | |||
| 426 | return; | ||
| 427 | } | ||
| 428 | |||
| 429 | /* | ||
| 430 | * userspace_clear_region | ||
| 431 | * | ||
| 432 | * This function must not block. | ||
| 433 | * So, the alloc can't block. In the worst case, it is ok to | ||
| 434 | * fail. It would simply mean we can't clear the region. | ||
| 435 | * Does nothing to current sync context, but does mean | ||
| 436 | * the region will be re-sync'ed on a reload of the mirror | ||
| 437 | * even though it is in-sync. | ||
| 438 | */ | ||
| 439 | static void userspace_clear_region(struct dm_dirty_log *log, region_t region) | ||
| 440 | { | ||
| 441 | unsigned long flags; | ||
| 442 | struct log_c *lc = log->context; | ||
| 443 | struct flush_entry *fe; | ||
| 444 | |||
| 445 | /* | ||
| 446 | * If we fail to allocate, we skip the clearing of | ||
| 447 | * the region. This doesn't hurt us in any way, except | ||
| 448 | * to cause the region to be resync'ed when the | ||
| 449 | * device is activated next time. | ||
| 450 | */ | ||
| 451 | fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC); | ||
| 452 | if (!fe) { | ||
| 453 | DMERR("Failed to allocate memory to clear region."); | ||
| 454 | return; | ||
| 455 | } | ||
| 456 | |||
| 457 | spin_lock_irqsave(&lc->flush_lock, flags); | ||
| 458 | fe->type = DM_ULOG_CLEAR_REGION; | ||
| 459 | fe->region = region; | ||
| 460 | list_add(&fe->list, &lc->flush_list); | ||
| 461 | spin_unlock_irqrestore(&lc->flush_lock, flags); | ||
| 462 | |||
| 463 | return; | ||
| 464 | } | ||
| 465 | |||
| 466 | /* | ||
| 467 | * userspace_get_resync_work | ||
| 468 | * | ||
| 469 | * Get a region that needs recovery. It is valid to return | ||
| 470 | * an error for this function. | ||
| 471 | * | ||
| 472 | * Returns: 1 if region filled, 0 if no work, <0 on error | ||
| 473 | */ | ||
| 474 | static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region) | ||
| 475 | { | ||
| 476 | int r; | ||
| 477 | size_t rdata_size; | ||
| 478 | struct log_c *lc = log->context; | ||
| 479 | struct { | ||
| 480 | int64_t i; /* 64-bit for mix arch compatibility */ | ||
| 481 | region_t r; | ||
| 482 | } pkg; | ||
| 483 | |||
| 484 | if (lc->in_sync_hint >= lc->region_count) | ||
| 485 | return 0; | ||
| 486 | |||
| 487 | rdata_size = sizeof(pkg); | ||
| 488 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK, | ||
| 489 | NULL, 0, | ||
| 490 | (char *)&pkg, &rdata_size); | ||
| 491 | |||
| 492 | *region = pkg.r; | ||
| 493 | return (r) ? r : (int)pkg.i; | ||
| 494 | } | ||
| 495 | |||
| 496 | /* | ||
| 497 | * userspace_set_region_sync | ||
| 498 | * | ||
| 499 | * Set the sync status of a given region. This function | ||
| 500 | * must not fail. | ||
| 501 | */ | ||
| 502 | static void userspace_set_region_sync(struct dm_dirty_log *log, | ||
| 503 | region_t region, int in_sync) | ||
| 504 | { | ||
| 505 | int r; | ||
| 506 | struct log_c *lc = log->context; | ||
| 507 | struct { | ||
| 508 | region_t r; | ||
| 509 | int64_t i; | ||
| 510 | } pkg; | ||
| 511 | |||
| 512 | pkg.r = region; | ||
| 513 | pkg.i = (int64_t)in_sync; | ||
| 514 | |||
| 515 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, | ||
| 516 | (char *)&pkg, sizeof(pkg), | ||
| 517 | NULL, NULL); | ||
| 518 | |||
| 519 | /* | ||
| 520 | * It would be nice to be able to report failures. | ||
| 521 | * However, it is easy emough to detect and resolve. | ||
| 522 | */ | ||
| 523 | return; | ||
| 524 | } | ||
| 525 | |||
| 526 | /* | ||
| 527 | * userspace_get_sync_count | ||
| 528 | * | ||
| 529 | * If there is any sort of failure when consulting the server, | ||
| 530 | * we assume that the sync count is zero. | ||
| 531 | * | ||
| 532 | * Returns: sync count on success, 0 on failure | ||
| 533 | */ | ||
| 534 | static region_t userspace_get_sync_count(struct dm_dirty_log *log) | ||
| 535 | { | ||
| 536 | int r; | ||
| 537 | size_t rdata_size; | ||
| 538 | uint64_t sync_count; | ||
| 539 | struct log_c *lc = log->context; | ||
| 540 | |||
| 541 | rdata_size = sizeof(sync_count); | ||
| 542 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT, | ||
| 543 | NULL, 0, | ||
| 544 | (char *)&sync_count, &rdata_size); | ||
| 545 | |||
| 546 | if (r) | ||
| 547 | return 0; | ||
| 548 | |||
| 549 | if (sync_count >= lc->region_count) | ||
| 550 | lc->in_sync_hint = lc->region_count; | ||
| 551 | |||
| 552 | return (region_t)sync_count; | ||
| 553 | } | ||
| 554 | |||
| 555 | /* | ||
| 556 | * userspace_status | ||
| 557 | * | ||
| 558 | * Returns: amount of space consumed | ||
| 559 | */ | ||
| 560 | static int userspace_status(struct dm_dirty_log *log, status_type_t status_type, | ||
| 561 | char *result, unsigned maxlen) | ||
| 562 | { | ||
| 563 | int r = 0; | ||
| 564 | size_t sz = (size_t)maxlen; | ||
| 565 | struct log_c *lc = log->context; | ||
| 566 | |||
| 567 | switch (status_type) { | ||
| 568 | case STATUSTYPE_INFO: | ||
| 569 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO, | ||
| 570 | NULL, 0, | ||
| 571 | result, &sz); | ||
| 572 | |||
| 573 | if (r) { | ||
| 574 | sz = 0; | ||
| 575 | DMEMIT("%s 1 COM_FAILURE", log->type->name); | ||
| 576 | } | ||
| 577 | break; | ||
| 578 | case STATUSTYPE_TABLE: | ||
| 579 | sz = 0; | ||
| 580 | DMEMIT("%s %u %s %s", log->type->name, lc->usr_argc + 1, | ||
| 581 | lc->uuid, lc->usr_argv_str); | ||
| 582 | break; | ||
| 583 | } | ||
| 584 | return (r) ? 0 : (int)sz; | ||
| 585 | } | ||
| 586 | |||
| 587 | /* | ||
| 588 | * userspace_is_remote_recovering | ||
| 589 | * | ||
| 590 | * Returns: 1 if region recovering, 0 otherwise | ||
| 591 | */ | ||
| 592 | static int userspace_is_remote_recovering(struct dm_dirty_log *log, | ||
| 593 | region_t region) | ||
| 594 | { | ||
| 595 | int r; | ||
| 596 | uint64_t region64 = region; | ||
| 597 | struct log_c *lc = log->context; | ||
| 598 | static unsigned long long limit; | ||
| 599 | struct { | ||
| 600 | int64_t is_recovering; | ||
| 601 | uint64_t in_sync_hint; | ||
| 602 | } pkg; | ||
| 603 | size_t rdata_size = sizeof(pkg); | ||
| 604 | |||
| 605 | /* | ||
| 606 | * Once the mirror has been reported to be in-sync, | ||
| 607 | * it will never again ask for recovery work. So, | ||
| 608 | * we can safely say there is not a remote machine | ||
| 609 | * recovering if the device is in-sync. (in_sync_hint | ||
| 610 | * must be reset at resume time.) | ||
| 611 | */ | ||
| 612 | if (region < lc->in_sync_hint) | ||
| 613 | return 0; | ||
| 614 | else if (jiffies < limit) | ||
| 615 | return 1; | ||
| 616 | |||
| 617 | limit = jiffies + (HZ / 4); | ||
| 618 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING, | ||
| 619 | (char *)®ion64, sizeof(region64), | ||
| 620 | (char *)&pkg, &rdata_size); | ||
| 621 | if (r) | ||
| 622 | return 1; | ||
| 623 | |||
| 624 | lc->in_sync_hint = pkg.in_sync_hint; | ||
| 625 | |||
| 626 | return (int)pkg.is_recovering; | ||
| 627 | } | ||
| 628 | |||
| 629 | static struct dm_dirty_log_type _userspace_type = { | ||
| 630 | .name = "userspace", | ||
| 631 | .module = THIS_MODULE, | ||
| 632 | .ctr = userspace_ctr, | ||
| 633 | .dtr = userspace_dtr, | ||
| 634 | .presuspend = userspace_presuspend, | ||
| 635 | .postsuspend = userspace_postsuspend, | ||
| 636 | .resume = userspace_resume, | ||
| 637 | .get_region_size = userspace_get_region_size, | ||
| 638 | .is_clean = userspace_is_clean, | ||
| 639 | .in_sync = userspace_in_sync, | ||
| 640 | .flush = userspace_flush, | ||
| 641 | .mark_region = userspace_mark_region, | ||
| 642 | .clear_region = userspace_clear_region, | ||
| 643 | .get_resync_work = userspace_get_resync_work, | ||
| 644 | .set_region_sync = userspace_set_region_sync, | ||
| 645 | .get_sync_count = userspace_get_sync_count, | ||
| 646 | .status = userspace_status, | ||
| 647 | .is_remote_recovering = userspace_is_remote_recovering, | ||
| 648 | }; | ||
| 649 | |||
| 650 | static int __init userspace_dirty_log_init(void) | ||
| 651 | { | ||
| 652 | int r = 0; | ||
| 653 | |||
| 654 | flush_entry_pool = mempool_create(100, flush_entry_alloc, | ||
| 655 | flush_entry_free, NULL); | ||
| 656 | |||
| 657 | if (!flush_entry_pool) { | ||
| 658 | DMWARN("Unable to create flush_entry_pool: No memory."); | ||
| 659 | return -ENOMEM; | ||
| 660 | } | ||
| 661 | |||
| 662 | r = dm_ulog_tfr_init(); | ||
| 663 | if (r) { | ||
| 664 | DMWARN("Unable to initialize userspace log communications"); | ||
| 665 | mempool_destroy(flush_entry_pool); | ||
| 666 | return r; | ||
| 667 | } | ||
| 668 | |||
| 669 | r = dm_dirty_log_type_register(&_userspace_type); | ||
| 670 | if (r) { | ||
| 671 | DMWARN("Couldn't register userspace dirty log type"); | ||
| 672 | dm_ulog_tfr_exit(); | ||
| 673 | mempool_destroy(flush_entry_pool); | ||
| 674 | return r; | ||
| 675 | } | ||
| 676 | |||
| 677 | DMINFO("version 1.0.0 loaded"); | ||
| 678 | return 0; | ||
| 679 | } | ||
| 680 | |||
| 681 | static void __exit userspace_dirty_log_exit(void) | ||
| 682 | { | ||
| 683 | dm_dirty_log_type_unregister(&_userspace_type); | ||
| 684 | dm_ulog_tfr_exit(); | ||
| 685 | mempool_destroy(flush_entry_pool); | ||
| 686 | |||
| 687 | DMINFO("version 1.0.0 unloaded"); | ||
| 688 | return; | ||
| 689 | } | ||
| 690 | |||
| 691 | module_init(userspace_dirty_log_init); | ||
| 692 | module_exit(userspace_dirty_log_exit); | ||
| 693 | |||
| 694 | MODULE_DESCRIPTION(DM_NAME " userspace dirty log link"); | ||
| 695 | MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>"); | ||
| 696 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c new file mode 100644 index 000000000000..0ca1ee768a1f --- /dev/null +++ b/drivers/md/dm-log-userspace-transfer.c | |||
| @@ -0,0 +1,276 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2006-2009 Red Hat, Inc. | ||
| 3 | * | ||
| 4 | * This file is released under the LGPL. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include <linux/kernel.h> | ||
| 8 | #include <linux/module.h> | ||
| 9 | #include <net/sock.h> | ||
| 10 | #include <linux/workqueue.h> | ||
| 11 | #include <linux/connector.h> | ||
| 12 | #include <linux/device-mapper.h> | ||
| 13 | #include <linux/dm-log-userspace.h> | ||
| 14 | |||
| 15 | #include "dm-log-userspace-transfer.h" | ||
| 16 | |||
| 17 | static uint32_t dm_ulog_seq; | ||
| 18 | |||
| 19 | /* | ||
| 20 | * Netlink/Connector is an unreliable protocol. How long should | ||
| 21 | * we wait for a response before assuming it was lost and retrying? | ||
| 22 | * (If we do receive a response after this time, it will be discarded | ||
| 23 | * and the response to the resent request will be waited for. | ||
| 24 | */ | ||
| 25 | #define DM_ULOG_RETRY_TIMEOUT (15 * HZ) | ||
| 26 | |||
| 27 | /* | ||
| 28 | * Pre-allocated space for speed | ||
| 29 | */ | ||
| 30 | #define DM_ULOG_PREALLOCED_SIZE 512 | ||
| 31 | static struct cn_msg *prealloced_cn_msg; | ||
| 32 | static struct dm_ulog_request *prealloced_ulog_tfr; | ||
| 33 | |||
| 34 | static struct cb_id ulog_cn_id = { | ||
| 35 | .idx = CN_IDX_DM, | ||
| 36 | .val = CN_VAL_DM_USERSPACE_LOG | ||
| 37 | }; | ||
| 38 | |||
| 39 | static DEFINE_MUTEX(dm_ulog_lock); | ||
| 40 | |||
| 41 | struct receiving_pkg { | ||
| 42 | struct list_head list; | ||
| 43 | struct completion complete; | ||
| 44 | |||
| 45 | uint32_t seq; | ||
| 46 | |||
| 47 | int error; | ||
| 48 | size_t *data_size; | ||
| 49 | char *data; | ||
| 50 | }; | ||
| 51 | |||
| 52 | static DEFINE_SPINLOCK(receiving_list_lock); | ||
| 53 | static struct list_head receiving_list; | ||
| 54 | |||
| 55 | static int dm_ulog_sendto_server(struct dm_ulog_request *tfr) | ||
| 56 | { | ||
| 57 | int r; | ||
| 58 | struct cn_msg *msg = prealloced_cn_msg; | ||
| 59 | |||
| 60 | memset(msg, 0, sizeof(struct cn_msg)); | ||
| 61 | |||
| 62 | msg->id.idx = ulog_cn_id.idx; | ||
| 63 | msg->id.val = ulog_cn_id.val; | ||
| 64 | msg->ack = 0; | ||
| 65 | msg->seq = tfr->seq; | ||
| 66 | msg->len = sizeof(struct dm_ulog_request) + tfr->data_size; | ||
| 67 | |||
| 68 | r = cn_netlink_send(msg, 0, gfp_any()); | ||
| 69 | |||
| 70 | return r; | ||
| 71 | } | ||
| 72 | |||
| 73 | /* | ||
| 74 | * Parameters for this function can be either msg or tfr, but not | ||
| 75 | * both. This function fills in the reply for a waiting request. | ||
| 76 | * If just msg is given, then the reply is simply an ACK from userspace | ||
| 77 | * that the request was received. | ||
| 78 | * | ||
| 79 | * Returns: 0 on success, -ENOENT on failure | ||
| 80 | */ | ||
| 81 | static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr) | ||
| 82 | { | ||
| 83 | uint32_t rtn_seq = (msg) ? msg->seq : (tfr) ? tfr->seq : 0; | ||
| 84 | struct receiving_pkg *pkg; | ||
| 85 | |||
| 86 | /* | ||
| 87 | * The 'receiving_pkg' entries in this list are statically | ||
| 88 | * allocated on the stack in 'dm_consult_userspace'. | ||
| 89 | * Each process that is waiting for a reply from the user | ||
| 90 | * space server will have an entry in this list. | ||
| 91 | * | ||
| 92 | * We are safe to do it this way because the stack space | ||
| 93 | * is unique to each process, but still addressable by | ||
| 94 | * other processes. | ||
| 95 | */ | ||
| 96 | list_for_each_entry(pkg, &receiving_list, list) { | ||
| 97 | if (rtn_seq != pkg->seq) | ||
| 98 | continue; | ||
| 99 | |||
| 100 | if (msg) { | ||
| 101 | pkg->error = -msg->ack; | ||
| 102 | /* | ||
| 103 | * If we are trying again, we will need to know our | ||
| 104 | * storage capacity. Otherwise, along with the | ||
| 105 | * error code, we make explicit that we have no data. | ||
| 106 | */ | ||
| 107 | if (pkg->error != -EAGAIN) | ||
| 108 | *(pkg->data_size) = 0; | ||
| 109 | } else if (tfr->data_size > *(pkg->data_size)) { | ||
| 110 | DMERR("Insufficient space to receive package [%u] " | ||
| 111 | "(%u vs %lu)", tfr->request_type, | ||
| 112 | tfr->data_size, *(pkg->data_size)); | ||
| 113 | |||
| 114 | *(pkg->data_size) = 0; | ||
| 115 | pkg->error = -ENOSPC; | ||
| 116 | } else { | ||
| 117 | pkg->error = tfr->error; | ||
| 118 | memcpy(pkg->data, tfr->data, tfr->data_size); | ||
| 119 | *(pkg->data_size) = tfr->data_size; | ||
| 120 | } | ||
| 121 | complete(&pkg->complete); | ||
| 122 | return 0; | ||
| 123 | } | ||
| 124 | |||
| 125 | return -ENOENT; | ||
| 126 | } | ||
| 127 | |||
| 128 | /* | ||
| 129 | * This is the connector callback that delivers data | ||
| 130 | * that was sent from userspace. | ||
| 131 | */ | ||
| 132 | static void cn_ulog_callback(void *data) | ||
| 133 | { | ||
| 134 | struct cn_msg *msg = (struct cn_msg *)data; | ||
| 135 | struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1); | ||
| 136 | |||
| 137 | spin_lock(&receiving_list_lock); | ||
| 138 | if (msg->len == 0) | ||
| 139 | fill_pkg(msg, NULL); | ||
| 140 | else if (msg->len < sizeof(*tfr)) | ||
| 141 | DMERR("Incomplete message received (expected %u, got %u): [%u]", | ||
| 142 | (unsigned)sizeof(*tfr), msg->len, msg->seq); | ||
| 143 | else | ||
| 144 | fill_pkg(NULL, tfr); | ||
| 145 | spin_unlock(&receiving_list_lock); | ||
| 146 | } | ||
| 147 | |||
| 148 | /** | ||
| 149 | * dm_consult_userspace | ||
| 150 | * @uuid: log's uuid (must be DM_UUID_LEN in size) | ||
| 151 | * @request_type: found in include/linux/dm-log-userspace.h | ||
| 152 | * @data: data to tx to the server | ||
| 153 | * @data_size: size of data in bytes | ||
| 154 | * @rdata: place to put return data from server | ||
| 155 | * @rdata_size: value-result (amount of space given/amount of space used) | ||
| 156 | * | ||
| 157 | * rdata_size is undefined on failure. | ||
| 158 | * | ||
| 159 | * Memory used to communicate with userspace is zero'ed | ||
| 160 | * before populating to ensure that no unwanted bits leak | ||
| 161 | * from kernel space to user-space. All userspace log communications | ||
| 162 | * between kernel and user space go through this function. | ||
| 163 | * | ||
| 164 | * Returns: 0 on success, -EXXX on failure | ||
| 165 | **/ | ||
| 166 | int dm_consult_userspace(const char *uuid, int request_type, | ||
| 167 | char *data, size_t data_size, | ||
| 168 | char *rdata, size_t *rdata_size) | ||
| 169 | { | ||
| 170 | int r = 0; | ||
| 171 | size_t dummy = 0; | ||
| 172 | int overhead_size = | ||
| 173 | sizeof(struct dm_ulog_request *) + sizeof(struct cn_msg); | ||
| 174 | struct dm_ulog_request *tfr = prealloced_ulog_tfr; | ||
| 175 | struct receiving_pkg pkg; | ||
| 176 | |||
| 177 | if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) { | ||
| 178 | DMINFO("Size of tfr exceeds preallocated size"); | ||
| 179 | return -EINVAL; | ||
| 180 | } | ||
| 181 | |||
| 182 | if (!rdata_size) | ||
| 183 | rdata_size = &dummy; | ||
| 184 | resend: | ||
| 185 | /* | ||
| 186 | * We serialize the sending of requests so we can | ||
| 187 | * use the preallocated space. | ||
| 188 | */ | ||
| 189 | mutex_lock(&dm_ulog_lock); | ||
| 190 | |||
| 191 | memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - overhead_size); | ||
| 192 | memcpy(tfr->uuid, uuid, DM_UUID_LEN); | ||
| 193 | tfr->seq = dm_ulog_seq++; | ||
| 194 | |||
| 195 | /* | ||
| 196 | * Must be valid request type (all other bits set to | ||
| 197 | * zero). This reserves other bits for possible future | ||
| 198 | * use. | ||
| 199 | */ | ||
| 200 | tfr->request_type = request_type & DM_ULOG_REQUEST_MASK; | ||
| 201 | |||
| 202 | tfr->data_size = data_size; | ||
| 203 | if (data && data_size) | ||
| 204 | memcpy(tfr->data, data, data_size); | ||
| 205 | |||
| 206 | memset(&pkg, 0, sizeof(pkg)); | ||
| 207 | init_completion(&pkg.complete); | ||
| 208 | pkg.seq = tfr->seq; | ||
| 209 | pkg.data_size = rdata_size; | ||
| 210 | pkg.data = rdata; | ||
| 211 | spin_lock(&receiving_list_lock); | ||
| 212 | list_add(&(pkg.list), &receiving_list); | ||
| 213 | spin_unlock(&receiving_list_lock); | ||
| 214 | |||
| 215 | r = dm_ulog_sendto_server(tfr); | ||
| 216 | |||
| 217 | mutex_unlock(&dm_ulog_lock); | ||
| 218 | |||
| 219 | if (r) { | ||
| 220 | DMERR("Unable to send log request [%u] to userspace: %d", | ||
| 221 | request_type, r); | ||
| 222 | spin_lock(&receiving_list_lock); | ||
| 223 | list_del_init(&(pkg.list)); | ||
| 224 | spin_unlock(&receiving_list_lock); | ||
| 225 | |||
| 226 | goto out; | ||
| 227 | } | ||
| 228 | |||
| 229 | r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT); | ||
| 230 | spin_lock(&receiving_list_lock); | ||
| 231 | list_del_init(&(pkg.list)); | ||
| 232 | spin_unlock(&receiving_list_lock); | ||
| 233 | if (!r) { | ||
| 234 | DMWARN("[%s] Request timed out: [%u/%u] - retrying", | ||
| 235 | (strlen(uuid) > 8) ? | ||
| 236 | (uuid + (strlen(uuid) - 8)) : (uuid), | ||
| 237 | request_type, pkg.seq); | ||
| 238 | goto resend; | ||
| 239 | } | ||
| 240 | |||
| 241 | r = pkg.error; | ||
| 242 | if (r == -EAGAIN) | ||
| 243 | goto resend; | ||
| 244 | |||
| 245 | out: | ||
| 246 | return r; | ||
| 247 | } | ||
| 248 | |||
| 249 | int dm_ulog_tfr_init(void) | ||
| 250 | { | ||
| 251 | int r; | ||
| 252 | void *prealloced; | ||
| 253 | |||
| 254 | INIT_LIST_HEAD(&receiving_list); | ||
| 255 | |||
| 256 | prealloced = kmalloc(DM_ULOG_PREALLOCED_SIZE, GFP_KERNEL); | ||
| 257 | if (!prealloced) | ||
| 258 | return -ENOMEM; | ||
| 259 | |||
| 260 | prealloced_cn_msg = prealloced; | ||
| 261 | prealloced_ulog_tfr = prealloced + sizeof(struct cn_msg); | ||
| 262 | |||
| 263 | r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback); | ||
| 264 | if (r) { | ||
| 265 | cn_del_callback(&ulog_cn_id); | ||
| 266 | return r; | ||
| 267 | } | ||
| 268 | |||
| 269 | return 0; | ||
| 270 | } | ||
| 271 | |||
| 272 | void dm_ulog_tfr_exit(void) | ||
| 273 | { | ||
| 274 | cn_del_callback(&ulog_cn_id); | ||
| 275 | kfree(prealloced_cn_msg); | ||
| 276 | } | ||
diff --git a/drivers/md/dm-log-userspace-transfer.h b/drivers/md/dm-log-userspace-transfer.h new file mode 100644 index 000000000000..c26d8e4e2710 --- /dev/null +++ b/drivers/md/dm-log-userspace-transfer.h | |||
| @@ -0,0 +1,18 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2006-2009 Red Hat, Inc. | ||
| 3 | * | ||
| 4 | * This file is released under the LGPL. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #ifndef __DM_LOG_USERSPACE_TRANSFER_H__ | ||
| 8 | #define __DM_LOG_USERSPACE_TRANSFER_H__ | ||
| 9 | |||
| 10 | #define DM_MSG_PREFIX "dm-log-userspace" | ||
| 11 | |||
| 12 | int dm_ulog_tfr_init(void); | ||
| 13 | void dm_ulog_tfr_exit(void); | ||
| 14 | int dm_consult_userspace(const char *uuid, int request_type, | ||
| 15 | char *data, size_t data_size, | ||
| 16 | char *rdata, size_t *rdata_size); | ||
| 17 | |||
| 18 | #endif /* __DM_LOG_USERSPACE_TRANSFER_H__ */ | ||
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index be233bc4d917..9443896ede07 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c | |||
| @@ -412,10 +412,12 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, | |||
| 412 | /* | 412 | /* |
| 413 | * Buffer holds both header and bitset. | 413 | * Buffer holds both header and bitset. |
| 414 | */ | 414 | */ |
| 415 | buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + | 415 | buf_size = |
| 416 | bitset_size, ti->limits.hardsect_size); | 416 | dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + bitset_size, |
| 417 | bdev_logical_block_size(lc->header_location. | ||
| 418 | bdev)); | ||
| 417 | 419 | ||
| 418 | if (buf_size > dev->bdev->bd_inode->i_size) { | 420 | if (buf_size > i_size_read(dev->bdev->bd_inode)) { |
| 419 | DMWARN("log device %s too small: need %llu bytes", | 421 | DMWARN("log device %s too small: need %llu bytes", |
| 420 | dev->name, (unsigned long long)buf_size); | 422 | dev->name, (unsigned long long)buf_size); |
| 421 | kfree(lc); | 423 | kfree(lc); |
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 6a386ab4f7eb..c70604a20897 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c | |||
| @@ -8,7 +8,6 @@ | |||
| 8 | #include <linux/device-mapper.h> | 8 | #include <linux/device-mapper.h> |
| 9 | 9 | ||
| 10 | #include "dm-path-selector.h" | 10 | #include "dm-path-selector.h" |
| 11 | #include "dm-bio-record.h" | ||
| 12 | #include "dm-uevent.h" | 11 | #include "dm-uevent.h" |
| 13 | 12 | ||
| 14 | #include <linux/ctype.h> | 13 | #include <linux/ctype.h> |
| @@ -35,6 +34,7 @@ struct pgpath { | |||
| 35 | 34 | ||
| 36 | struct dm_path path; | 35 | struct dm_path path; |
| 37 | struct work_struct deactivate_path; | 36 | struct work_struct deactivate_path; |
| 37 | struct work_struct activate_path; | ||
| 38 | }; | 38 | }; |
| 39 | 39 | ||
| 40 | #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) | 40 | #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) |
| @@ -64,8 +64,6 @@ struct multipath { | |||
| 64 | spinlock_t lock; | 64 | spinlock_t lock; |
| 65 | 65 | ||
| 66 | const char *hw_handler_name; | 66 | const char *hw_handler_name; |
| 67 | struct work_struct activate_path; | ||
| 68 | struct pgpath *pgpath_to_activate; | ||
| 69 | unsigned nr_priority_groups; | 67 | unsigned nr_priority_groups; |
| 70 | struct list_head priority_groups; | 68 | struct list_head priority_groups; |
| 71 | unsigned pg_init_required; /* pg_init needs calling? */ | 69 | unsigned pg_init_required; /* pg_init needs calling? */ |
| @@ -84,7 +82,7 @@ struct multipath { | |||
| 84 | unsigned pg_init_count; /* Number of times pg_init called */ | 82 | unsigned pg_init_count; /* Number of times pg_init called */ |
| 85 | 83 | ||
| 86 | struct work_struct process_queued_ios; | 84 | struct work_struct process_queued_ios; |
| 87 | struct bio_list queued_ios; | 85 | struct list_head queued_ios; |
| 88 | unsigned queue_size; | 86 | unsigned queue_size; |
| 89 | 87 | ||
| 90 | struct work_struct trigger_event; | 88 | struct work_struct trigger_event; |
| @@ -101,7 +99,7 @@ struct multipath { | |||
| 101 | */ | 99 | */ |
| 102 | struct dm_mpath_io { | 100 | struct dm_mpath_io { |
| 103 | struct pgpath *pgpath; | 101 | struct pgpath *pgpath; |
| 104 | struct dm_bio_details details; | 102 | size_t nr_bytes; |
| 105 | }; | 103 | }; |
| 106 | 104 | ||
| 107 | typedef int (*action_fn) (struct pgpath *pgpath); | 105 | typedef int (*action_fn) (struct pgpath *pgpath); |
| @@ -128,6 +126,7 @@ static struct pgpath *alloc_pgpath(void) | |||
| 128 | if (pgpath) { | 126 | if (pgpath) { |
| 129 | pgpath->is_active = 1; | 127 | pgpath->is_active = 1; |
| 130 | INIT_WORK(&pgpath->deactivate_path, deactivate_path); | 128 | INIT_WORK(&pgpath->deactivate_path, deactivate_path); |
| 129 | INIT_WORK(&pgpath->activate_path, activate_path); | ||
| 131 | } | 130 | } |
| 132 | 131 | ||
| 133 | return pgpath; | 132 | return pgpath; |
| @@ -160,7 +159,6 @@ static struct priority_group *alloc_priority_group(void) | |||
| 160 | 159 | ||
| 161 | static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) | 160 | static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) |
| 162 | { | 161 | { |
| 163 | unsigned long flags; | ||
| 164 | struct pgpath *pgpath, *tmp; | 162 | struct pgpath *pgpath, *tmp; |
| 165 | struct multipath *m = ti->private; | 163 | struct multipath *m = ti->private; |
| 166 | 164 | ||
| @@ -169,10 +167,6 @@ static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) | |||
| 169 | if (m->hw_handler_name) | 167 | if (m->hw_handler_name) |
| 170 | scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev)); | 168 | scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev)); |
| 171 | dm_put_device(ti, pgpath->path.dev); | 169 | dm_put_device(ti, pgpath->path.dev); |
| 172 | spin_lock_irqsave(&m->lock, flags); | ||
| 173 | if (m->pgpath_to_activate == pgpath) | ||
| 174 | m->pgpath_to_activate = NULL; | ||
| 175 | spin_unlock_irqrestore(&m->lock, flags); | ||
| 176 | free_pgpath(pgpath); | 170 | free_pgpath(pgpath); |
| 177 | } | 171 | } |
| 178 | } | 172 | } |
| @@ -198,11 +192,11 @@ static struct multipath *alloc_multipath(struct dm_target *ti) | |||
| 198 | m = kzalloc(sizeof(*m), GFP_KERNEL); | 192 | m = kzalloc(sizeof(*m), GFP_KERNEL); |
| 199 | if (m) { | 193 | if (m) { |
| 200 | INIT_LIST_HEAD(&m->priority_groups); | 194 | INIT_LIST_HEAD(&m->priority_groups); |
| 195 | INIT_LIST_HEAD(&m->queued_ios); | ||
| 201 | spin_lock_init(&m->lock); | 196 | spin_lock_init(&m->lock); |
| 202 | m->queue_io = 1; | 197 | m->queue_io = 1; |
| 203 | INIT_WORK(&m->process_queued_ios, process_queued_ios); | 198 | INIT_WORK(&m->process_queued_ios, process_queued_ios); |
| 204 | INIT_WORK(&m->trigger_event, trigger_event); | 199 | INIT_WORK(&m->trigger_event, trigger_event); |
| 205 | INIT_WORK(&m->activate_path, activate_path); | ||
| 206 | m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); | 200 | m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); |
| 207 | if (!m->mpio_pool) { | 201 | if (!m->mpio_pool) { |
| 208 | kfree(m); | 202 | kfree(m); |
| @@ -250,11 +244,12 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath) | |||
| 250 | m->pg_init_count = 0; | 244 | m->pg_init_count = 0; |
| 251 | } | 245 | } |
| 252 | 246 | ||
| 253 | static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) | 247 | static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg, |
| 248 | size_t nr_bytes) | ||
| 254 | { | 249 | { |
| 255 | struct dm_path *path; | 250 | struct dm_path *path; |
| 256 | 251 | ||
| 257 | path = pg->ps.type->select_path(&pg->ps, &m->repeat_count); | 252 | path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes); |
| 258 | if (!path) | 253 | if (!path) |
| 259 | return -ENXIO; | 254 | return -ENXIO; |
| 260 | 255 | ||
| @@ -266,7 +261,7 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) | |||
| 266 | return 0; | 261 | return 0; |
| 267 | } | 262 | } |
| 268 | 263 | ||
| 269 | static void __choose_pgpath(struct multipath *m) | 264 | static void __choose_pgpath(struct multipath *m, size_t nr_bytes) |
| 270 | { | 265 | { |
| 271 | struct priority_group *pg; | 266 | struct priority_group *pg; |
| 272 | unsigned bypassed = 1; | 267 | unsigned bypassed = 1; |
| @@ -278,12 +273,12 @@ static void __choose_pgpath(struct multipath *m) | |||
| 278 | if (m->next_pg) { | 273 | if (m->next_pg) { |
| 279 | pg = m->next_pg; | 274 | pg = m->next_pg; |
| 280 | m->next_pg = NULL; | 275 | m->next_pg = NULL; |
| 281 | if (!__choose_path_in_pg(m, pg)) | 276 | if (!__choose_path_in_pg(m, pg, nr_bytes)) |
| 282 | return; | 277 | return; |
| 283 | } | 278 | } |
| 284 | 279 | ||
| 285 | /* Don't change PG until it has no remaining paths */ | 280 | /* Don't change PG until it has no remaining paths */ |
| 286 | if (m->current_pg && !__choose_path_in_pg(m, m->current_pg)) | 281 | if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes)) |
| 287 | return; | 282 | return; |
| 288 | 283 | ||
| 289 | /* | 284 | /* |
| @@ -295,7 +290,7 @@ static void __choose_pgpath(struct multipath *m) | |||
| 295 | list_for_each_entry(pg, &m->priority_groups, list) { | 290 | list_for_each_entry(pg, &m->priority_groups, list) { |
| 296 | if (pg->bypassed == bypassed) | 291 | if (pg->bypassed == bypassed) |
| 297 | continue; | 292 | continue; |
| 298 | if (!__choose_path_in_pg(m, pg)) | 293 | if (!__choose_path_in_pg(m, pg, nr_bytes)) |
| 299 | return; | 294 | return; |
| 300 | } | 295 | } |
| 301 | } while (bypassed--); | 296 | } while (bypassed--); |
| @@ -322,19 +317,21 @@ static int __must_push_back(struct multipath *m) | |||
| 322 | dm_noflush_suspending(m->ti)); | 317 | dm_noflush_suspending(m->ti)); |
| 323 | } | 318 | } |
| 324 | 319 | ||
| 325 | static int map_io(struct multipath *m, struct bio *bio, | 320 | static int map_io(struct multipath *m, struct request *clone, |
| 326 | struct dm_mpath_io *mpio, unsigned was_queued) | 321 | struct dm_mpath_io *mpio, unsigned was_queued) |
| 327 | { | 322 | { |
| 328 | int r = DM_MAPIO_REMAPPED; | 323 | int r = DM_MAPIO_REMAPPED; |
| 324 | size_t nr_bytes = blk_rq_bytes(clone); | ||
| 329 | unsigned long flags; | 325 | unsigned long flags; |
| 330 | struct pgpath *pgpath; | 326 | struct pgpath *pgpath; |
| 327 | struct block_device *bdev; | ||
| 331 | 328 | ||
| 332 | spin_lock_irqsave(&m->lock, flags); | 329 | spin_lock_irqsave(&m->lock, flags); |
| 333 | 330 | ||
| 334 | /* Do we need to select a new pgpath? */ | 331 | /* Do we need to select a new pgpath? */ |
| 335 | if (!m->current_pgpath || | 332 | if (!m->current_pgpath || |
| 336 | (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) | 333 | (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) |
| 337 | __choose_pgpath(m); | 334 | __choose_pgpath(m, nr_bytes); |
| 338 | 335 | ||
| 339 | pgpath = m->current_pgpath; | 336 | pgpath = m->current_pgpath; |
| 340 | 337 | ||
| @@ -344,21 +341,28 @@ static int map_io(struct multipath *m, struct bio *bio, | |||
| 344 | if ((pgpath && m->queue_io) || | 341 | if ((pgpath && m->queue_io) || |
| 345 | (!pgpath && m->queue_if_no_path)) { | 342 | (!pgpath && m->queue_if_no_path)) { |
| 346 | /* Queue for the daemon to resubmit */ | 343 | /* Queue for the daemon to resubmit */ |
| 347 | bio_list_add(&m->queued_ios, bio); | 344 | list_add_tail(&clone->queuelist, &m->queued_ios); |
| 348 | m->queue_size++; | 345 | m->queue_size++; |
| 349 | if ((m->pg_init_required && !m->pg_init_in_progress) || | 346 | if ((m->pg_init_required && !m->pg_init_in_progress) || |
| 350 | !m->queue_io) | 347 | !m->queue_io) |
| 351 | queue_work(kmultipathd, &m->process_queued_ios); | 348 | queue_work(kmultipathd, &m->process_queued_ios); |
| 352 | pgpath = NULL; | 349 | pgpath = NULL; |
| 353 | r = DM_MAPIO_SUBMITTED; | 350 | r = DM_MAPIO_SUBMITTED; |
| 354 | } else if (pgpath) | 351 | } else if (pgpath) { |
| 355 | bio->bi_bdev = pgpath->path.dev->bdev; | 352 | bdev = pgpath->path.dev->bdev; |
| 356 | else if (__must_push_back(m)) | 353 | clone->q = bdev_get_queue(bdev); |
| 354 | clone->rq_disk = bdev->bd_disk; | ||
| 355 | } else if (__must_push_back(m)) | ||
| 357 | r = DM_MAPIO_REQUEUE; | 356 | r = DM_MAPIO_REQUEUE; |
| 358 | else | 357 | else |
| 359 | r = -EIO; /* Failed */ | 358 | r = -EIO; /* Failed */ |
| 360 | 359 | ||
| 361 | mpio->pgpath = pgpath; | 360 | mpio->pgpath = pgpath; |
| 361 | mpio->nr_bytes = nr_bytes; | ||
| 362 | |||
| 363 | if (r == DM_MAPIO_REMAPPED && pgpath->pg->ps.type->start_io) | ||
| 364 | pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path, | ||
| 365 | nr_bytes); | ||
| 362 | 366 | ||
| 363 | spin_unlock_irqrestore(&m->lock, flags); | 367 | spin_unlock_irqrestore(&m->lock, flags); |
| 364 | 368 | ||
| @@ -396,30 +400,31 @@ static void dispatch_queued_ios(struct multipath *m) | |||
| 396 | { | 400 | { |
| 397 | int r; | 401 | int r; |
| 398 | unsigned long flags; | 402 | unsigned long flags; |
| 399 | struct bio *bio = NULL, *next; | ||
| 400 | struct dm_mpath_io *mpio; | 403 | struct dm_mpath_io *mpio; |
| 401 | union map_info *info; | 404 | union map_info *info; |
| 405 | struct request *clone, *n; | ||
| 406 | LIST_HEAD(cl); | ||
| 402 | 407 | ||
| 403 | spin_lock_irqsave(&m->lock, flags); | 408 | spin_lock_irqsave(&m->lock, flags); |
| 404 | bio = bio_list_get(&m->queued_ios); | 409 | list_splice_init(&m->queued_ios, &cl); |
| 405 | spin_unlock_irqrestore(&m->lock, flags); | 410 | spin_unlock_irqrestore(&m->lock, flags); |
| 406 | 411 | ||
| 407 | while (bio) { | 412 | list_for_each_entry_safe(clone, n, &cl, queuelist) { |
| 408 | next = bio->bi_next; | 413 | list_del_init(&clone->queuelist); |
| 409 | bio->bi_next = NULL; | ||
| 410 | 414 | ||
| 411 | info = dm_get_mapinfo(bio); | 415 | info = dm_get_rq_mapinfo(clone); |
| 412 | mpio = info->ptr; | 416 | mpio = info->ptr; |
| 413 | 417 | ||
| 414 | r = map_io(m, bio, mpio, 1); | 418 | r = map_io(m, clone, mpio, 1); |
| 415 | if (r < 0) | 419 | if (r < 0) { |
| 416 | bio_endio(bio, r); | 420 | mempool_free(mpio, m->mpio_pool); |
| 417 | else if (r == DM_MAPIO_REMAPPED) | 421 | dm_kill_unmapped_request(clone, r); |
| 418 | generic_make_request(bio); | 422 | } else if (r == DM_MAPIO_REMAPPED) |
| 419 | else if (r == DM_MAPIO_REQUEUE) | 423 | dm_dispatch_request(clone); |
| 420 | bio_endio(bio, -EIO); | 424 | else if (r == DM_MAPIO_REQUEUE) { |
| 421 | 425 | mempool_free(mpio, m->mpio_pool); | |
| 422 | bio = next; | 426 | dm_requeue_unmapped_request(clone); |
| 427 | } | ||
| 423 | } | 428 | } |
| 424 | } | 429 | } |
| 425 | 430 | ||
| @@ -427,8 +432,8 @@ static void process_queued_ios(struct work_struct *work) | |||
| 427 | { | 432 | { |
| 428 | struct multipath *m = | 433 | struct multipath *m = |
| 429 | container_of(work, struct multipath, process_queued_ios); | 434 | container_of(work, struct multipath, process_queued_ios); |
| 430 | struct pgpath *pgpath = NULL; | 435 | struct pgpath *pgpath = NULL, *tmp; |
| 431 | unsigned init_required = 0, must_queue = 1; | 436 | unsigned must_queue = 1; |
| 432 | unsigned long flags; | 437 | unsigned long flags; |
| 433 | 438 | ||
| 434 | spin_lock_irqsave(&m->lock, flags); | 439 | spin_lock_irqsave(&m->lock, flags); |
| @@ -437,7 +442,7 @@ static void process_queued_ios(struct work_struct *work) | |||
| 437 | goto out; | 442 | goto out; |
| 438 | 443 | ||
| 439 | if (!m->current_pgpath) | 444 | if (!m->current_pgpath) |
| 440 | __choose_pgpath(m); | 445 | __choose_pgpath(m, 0); |
| 441 | 446 | ||
| 442 | pgpath = m->current_pgpath; | 447 | pgpath = m->current_pgpath; |
| 443 | 448 | ||
| @@ -446,19 +451,15 @@ static void process_queued_ios(struct work_struct *work) | |||
| 446 | must_queue = 0; | 451 | must_queue = 0; |
| 447 | 452 | ||
| 448 | if (m->pg_init_required && !m->pg_init_in_progress && pgpath) { | 453 | if (m->pg_init_required && !m->pg_init_in_progress && pgpath) { |
| 449 | m->pgpath_to_activate = pgpath; | ||
| 450 | m->pg_init_count++; | 454 | m->pg_init_count++; |
| 451 | m->pg_init_required = 0; | 455 | m->pg_init_required = 0; |
| 452 | m->pg_init_in_progress = 1; | 456 | list_for_each_entry(tmp, &pgpath->pg->pgpaths, list) { |
| 453 | init_required = 1; | 457 | if (queue_work(kmpath_handlerd, &tmp->activate_path)) |
| 458 | m->pg_init_in_progress++; | ||
| 459 | } | ||
| 454 | } | 460 | } |
| 455 | |||
| 456 | out: | 461 | out: |
| 457 | spin_unlock_irqrestore(&m->lock, flags); | 462 | spin_unlock_irqrestore(&m->lock, flags); |
| 458 | |||
| 459 | if (init_required) | ||
| 460 | queue_work(kmpath_handlerd, &m->activate_path); | ||
| 461 | |||
| 462 | if (!must_queue) | 463 | if (!must_queue) |
| 463 | dispatch_queued_ios(m); | 464 | dispatch_queued_ios(m); |
| 464 | } | 465 | } |
| @@ -553,6 +554,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg, | |||
| 553 | return -EINVAL; | 554 | return -EINVAL; |
| 554 | } | 555 | } |
| 555 | 556 | ||
| 557 | if (ps_argc > as->argc) { | ||
| 558 | dm_put_path_selector(pst); | ||
| 559 | ti->error = "not enough arguments for path selector"; | ||
| 560 | return -EINVAL; | ||
| 561 | } | ||
| 562 | |||
| 556 | r = pst->create(&pg->ps, ps_argc, as->argv); | 563 | r = pst->create(&pg->ps, ps_argc, as->argv); |
| 557 | if (r) { | 564 | if (r) { |
| 558 | dm_put_path_selector(pst); | 565 | dm_put_path_selector(pst); |
| @@ -591,9 +598,20 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, | |||
| 591 | } | 598 | } |
| 592 | 599 | ||
| 593 | if (m->hw_handler_name) { | 600 | if (m->hw_handler_name) { |
| 594 | r = scsi_dh_attach(bdev_get_queue(p->path.dev->bdev), | 601 | struct request_queue *q = bdev_get_queue(p->path.dev->bdev); |
| 595 | m->hw_handler_name); | 602 | |
| 603 | r = scsi_dh_attach(q, m->hw_handler_name); | ||
| 604 | if (r == -EBUSY) { | ||
| 605 | /* | ||
| 606 | * Already attached to different hw_handler, | ||
| 607 | * try to reattach with correct one. | ||
| 608 | */ | ||
| 609 | scsi_dh_detach(q); | ||
| 610 | r = scsi_dh_attach(q, m->hw_handler_name); | ||
| 611 | } | ||
| 612 | |||
| 596 | if (r < 0) { | 613 | if (r < 0) { |
| 614 | ti->error = "error attaching hardware handler"; | ||
| 597 | dm_put_device(ti, p->path.dev); | 615 | dm_put_device(ti, p->path.dev); |
| 598 | goto bad; | 616 | goto bad; |
| 599 | } | 617 | } |
| @@ -699,6 +717,11 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m) | |||
| 699 | if (!hw_argc) | 717 | if (!hw_argc) |
| 700 | return 0; | 718 | return 0; |
| 701 | 719 | ||
| 720 | if (hw_argc > as->argc) { | ||
| 721 | ti->error = "not enough arguments for hardware handler"; | ||
| 722 | return -EINVAL; | ||
| 723 | } | ||
| 724 | |||
| 702 | m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL); | 725 | m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL); |
| 703 | request_module("scsi_dh_%s", m->hw_handler_name); | 726 | request_module("scsi_dh_%s", m->hw_handler_name); |
| 704 | if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { | 727 | if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { |
| @@ -823,6 +846,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, | |||
| 823 | goto bad; | 846 | goto bad; |
| 824 | } | 847 | } |
| 825 | 848 | ||
| 849 | ti->num_flush_requests = 1; | ||
| 850 | |||
| 826 | return 0; | 851 | return 0; |
| 827 | 852 | ||
| 828 | bad: | 853 | bad: |
| @@ -836,25 +861,29 @@ static void multipath_dtr(struct dm_target *ti) | |||
| 836 | 861 | ||
| 837 | flush_workqueue(kmpath_handlerd); | 862 | flush_workqueue(kmpath_handlerd); |
| 838 | flush_workqueue(kmultipathd); | 863 | flush_workqueue(kmultipathd); |
| 864 | flush_scheduled_work(); | ||
| 839 | free_multipath(m); | 865 | free_multipath(m); |
| 840 | } | 866 | } |
| 841 | 867 | ||
| 842 | /* | 868 | /* |
| 843 | * Map bios, recording original fields for later in case we have to resubmit | 869 | * Map cloned requests |
| 844 | */ | 870 | */ |
| 845 | static int multipath_map(struct dm_target *ti, struct bio *bio, | 871 | static int multipath_map(struct dm_target *ti, struct request *clone, |
| 846 | union map_info *map_context) | 872 | union map_info *map_context) |
| 847 | { | 873 | { |
| 848 | int r; | 874 | int r; |
| 849 | struct dm_mpath_io *mpio; | 875 | struct dm_mpath_io *mpio; |
| 850 | struct multipath *m = (struct multipath *) ti->private; | 876 | struct multipath *m = (struct multipath *) ti->private; |
| 851 | 877 | ||
| 852 | mpio = mempool_alloc(m->mpio_pool, GFP_NOIO); | 878 | mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); |
| 853 | dm_bio_record(&mpio->details, bio); | 879 | if (!mpio) |
| 880 | /* ENOMEM, requeue */ | ||
| 881 | return DM_MAPIO_REQUEUE; | ||
| 882 | memset(mpio, 0, sizeof(*mpio)); | ||
| 854 | 883 | ||
| 855 | map_context->ptr = mpio; | 884 | map_context->ptr = mpio; |
| 856 | bio->bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT); | 885 | clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; |
| 857 | r = map_io(m, bio, mpio, 0); | 886 | r = map_io(m, clone, mpio, 0); |
| 858 | if (r < 0 || r == DM_MAPIO_REQUEUE) | 887 | if (r < 0 || r == DM_MAPIO_REQUEUE) |
| 859 | mempool_free(mpio, m->mpio_pool); | 888 | mempool_free(mpio, m->mpio_pool); |
| 860 | 889 | ||
| @@ -924,9 +953,13 @@ static int reinstate_path(struct pgpath *pgpath) | |||
| 924 | 953 | ||
| 925 | pgpath->is_active = 1; | 954 | pgpath->is_active = 1; |
| 926 | 955 | ||
| 927 | m->current_pgpath = NULL; | 956 | if (!m->nr_valid_paths++ && m->queue_size) { |
| 928 | if (!m->nr_valid_paths++ && m->queue_size) | 957 | m->current_pgpath = NULL; |
| 929 | queue_work(kmultipathd, &m->process_queued_ios); | 958 | queue_work(kmultipathd, &m->process_queued_ios); |
| 959 | } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { | ||
| 960 | if (queue_work(kmpath_handlerd, &pgpath->activate_path)) | ||
| 961 | m->pg_init_in_progress++; | ||
| 962 | } | ||
| 930 | 963 | ||
| 931 | dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti, | 964 | dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti, |
| 932 | pgpath->path.dev->name, m->nr_valid_paths); | 965 | pgpath->path.dev->name, m->nr_valid_paths); |
| @@ -1102,87 +1135,70 @@ static void pg_init_done(struct dm_path *path, int errors) | |||
| 1102 | 1135 | ||
| 1103 | spin_lock_irqsave(&m->lock, flags); | 1136 | spin_lock_irqsave(&m->lock, flags); |
| 1104 | if (errors) { | 1137 | if (errors) { |
| 1105 | DMERR("Could not failover device. Error %d.", errors); | 1138 | if (pgpath == m->current_pgpath) { |
| 1106 | m->current_pgpath = NULL; | 1139 | DMERR("Could not failover device. Error %d.", errors); |
| 1107 | m->current_pg = NULL; | 1140 | m->current_pgpath = NULL; |
| 1141 | m->current_pg = NULL; | ||
| 1142 | } | ||
| 1108 | } else if (!m->pg_init_required) { | 1143 | } else if (!m->pg_init_required) { |
| 1109 | m->queue_io = 0; | 1144 | m->queue_io = 0; |
| 1110 | pg->bypassed = 0; | 1145 | pg->bypassed = 0; |
| 1111 | } | 1146 | } |
| 1112 | 1147 | ||
| 1113 | m->pg_init_in_progress = 0; | 1148 | m->pg_init_in_progress--; |
| 1114 | queue_work(kmultipathd, &m->process_queued_ios); | 1149 | if (!m->pg_init_in_progress) |
| 1150 | queue_work(kmultipathd, &m->process_queued_ios); | ||
| 1115 | spin_unlock_irqrestore(&m->lock, flags); | 1151 | spin_unlock_irqrestore(&m->lock, flags); |
| 1116 | } | 1152 | } |
| 1117 | 1153 | ||
| 1118 | static void activate_path(struct work_struct *work) | 1154 | static void activate_path(struct work_struct *work) |
| 1119 | { | 1155 | { |
| 1120 | int ret; | 1156 | int ret; |
| 1121 | struct multipath *m = | 1157 | struct pgpath *pgpath = |
| 1122 | container_of(work, struct multipath, activate_path); | 1158 | container_of(work, struct pgpath, activate_path); |
| 1123 | struct dm_path *path; | ||
| 1124 | unsigned long flags; | ||
| 1125 | 1159 | ||
| 1126 | spin_lock_irqsave(&m->lock, flags); | 1160 | ret = scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev)); |
| 1127 | path = &m->pgpath_to_activate->path; | 1161 | pg_init_done(&pgpath->path, ret); |
| 1128 | m->pgpath_to_activate = NULL; | ||
| 1129 | spin_unlock_irqrestore(&m->lock, flags); | ||
| 1130 | if (!path) | ||
| 1131 | return; | ||
| 1132 | ret = scsi_dh_activate(bdev_get_queue(path->dev->bdev)); | ||
| 1133 | pg_init_done(path, ret); | ||
| 1134 | } | 1162 | } |
| 1135 | 1163 | ||
| 1136 | /* | 1164 | /* |
| 1137 | * end_io handling | 1165 | * end_io handling |
| 1138 | */ | 1166 | */ |
| 1139 | static int do_end_io(struct multipath *m, struct bio *bio, | 1167 | static int do_end_io(struct multipath *m, struct request *clone, |
| 1140 | int error, struct dm_mpath_io *mpio) | 1168 | int error, struct dm_mpath_io *mpio) |
| 1141 | { | 1169 | { |
| 1170 | /* | ||
| 1171 | * We don't queue any clone request inside the multipath target | ||
| 1172 | * during end I/O handling, since those clone requests don't have | ||
| 1173 | * bio clones. If we queue them inside the multipath target, | ||
| 1174 | * we need to make bio clones, that requires memory allocation. | ||
| 1175 | * (See drivers/md/dm.c:end_clone_bio() about why the clone requests | ||
| 1176 | * don't have bio clones.) | ||
| 1177 | * Instead of queueing the clone request here, we queue the original | ||
| 1178 | * request into dm core, which will remake a clone request and | ||
| 1179 | * clone bios for it and resubmit it later. | ||
| 1180 | */ | ||
| 1181 | int r = DM_ENDIO_REQUEUE; | ||
| 1142 | unsigned long flags; | 1182 | unsigned long flags; |
| 1143 | 1183 | ||
| 1144 | if (!error) | 1184 | if (!error && !clone->errors) |
| 1145 | return 0; /* I/O complete */ | 1185 | return 0; /* I/O complete */ |
| 1146 | 1186 | ||
| 1147 | if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) | ||
| 1148 | return error; | ||
| 1149 | |||
| 1150 | if (error == -EOPNOTSUPP) | 1187 | if (error == -EOPNOTSUPP) |
| 1151 | return error; | 1188 | return error; |
| 1152 | 1189 | ||
| 1153 | spin_lock_irqsave(&m->lock, flags); | ||
| 1154 | if (!m->nr_valid_paths) { | ||
| 1155 | if (__must_push_back(m)) { | ||
| 1156 | spin_unlock_irqrestore(&m->lock, flags); | ||
| 1157 | return DM_ENDIO_REQUEUE; | ||
| 1158 | } else if (!m->queue_if_no_path) { | ||
| 1159 | spin_unlock_irqrestore(&m->lock, flags); | ||
| 1160 | return -EIO; | ||
| 1161 | } else { | ||
| 1162 | spin_unlock_irqrestore(&m->lock, flags); | ||
| 1163 | goto requeue; | ||
| 1164 | } | ||
| 1165 | } | ||
| 1166 | spin_unlock_irqrestore(&m->lock, flags); | ||
| 1167 | |||
| 1168 | if (mpio->pgpath) | 1190 | if (mpio->pgpath) |
| 1169 | fail_path(mpio->pgpath); | 1191 | fail_path(mpio->pgpath); |
| 1170 | 1192 | ||
| 1171 | requeue: | ||
| 1172 | dm_bio_restore(&mpio->details, bio); | ||
| 1173 | |||
| 1174 | /* queue for the daemon to resubmit or fail */ | ||
| 1175 | spin_lock_irqsave(&m->lock, flags); | 1193 | spin_lock_irqsave(&m->lock, flags); |
| 1176 | bio_list_add(&m->queued_ios, bio); | 1194 | if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m)) |
| 1177 | m->queue_size++; | 1195 | r = -EIO; |
| 1178 | if (!m->queue_io) | ||
| 1179 | queue_work(kmultipathd, &m->process_queued_ios); | ||
| 1180 | spin_unlock_irqrestore(&m->lock, flags); | 1196 | spin_unlock_irqrestore(&m->lock, flags); |
| 1181 | 1197 | ||
| 1182 | return DM_ENDIO_INCOMPLETE; /* io not complete */ | 1198 | return r; |
| 1183 | } | 1199 | } |
| 1184 | 1200 | ||
| 1185 | static int multipath_end_io(struct dm_target *ti, struct bio *bio, | 1201 | static int multipath_end_io(struct dm_target *ti, struct request *clone, |
| 1186 | int error, union map_info *map_context) | 1202 | int error, union map_info *map_context) |
| 1187 | { | 1203 | { |
| 1188 | struct multipath *m = ti->private; | 1204 | struct multipath *m = ti->private; |
| @@ -1191,14 +1207,13 @@ static int multipath_end_io(struct dm_target *ti, struct bio *bio, | |||
| 1191 | struct path_selector *ps; | 1207 | struct path_selector *ps; |
| 1192 | int r; | 1208 | int r; |
| 1193 | 1209 | ||
| 1194 | r = do_end_io(m, bio, error, mpio); | 1210 | r = do_end_io(m, clone, error, mpio); |
| 1195 | if (pgpath) { | 1211 | if (pgpath) { |
| 1196 | ps = &pgpath->pg->ps; | 1212 | ps = &pgpath->pg->ps; |
| 1197 | if (ps->type->end_io) | 1213 | if (ps->type->end_io) |
| 1198 | ps->type->end_io(ps, &pgpath->path); | 1214 | ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); |
| 1199 | } | 1215 | } |
| 1200 | if (r != DM_ENDIO_INCOMPLETE) | 1216 | mempool_free(mpio, m->mpio_pool); |
| 1201 | mempool_free(mpio, m->mpio_pool); | ||
| 1202 | 1217 | ||
| 1203 | return r; | 1218 | return r; |
| 1204 | } | 1219 | } |
| @@ -1411,7 +1426,7 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, | |||
| 1411 | spin_lock_irqsave(&m->lock, flags); | 1426 | spin_lock_irqsave(&m->lock, flags); |
| 1412 | 1427 | ||
| 1413 | if (!m->current_pgpath) | 1428 | if (!m->current_pgpath) |
| 1414 | __choose_pgpath(m); | 1429 | __choose_pgpath(m, 0); |
| 1415 | 1430 | ||
| 1416 | if (m->current_pgpath) { | 1431 | if (m->current_pgpath) { |
| 1417 | bdev = m->current_pgpath->path.dev->bdev; | 1432 | bdev = m->current_pgpath->path.dev->bdev; |
| @@ -1428,22 +1443,113 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, | |||
| 1428 | return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); | 1443 | return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); |
| 1429 | } | 1444 | } |
| 1430 | 1445 | ||
| 1446 | static int multipath_iterate_devices(struct dm_target *ti, | ||
| 1447 | iterate_devices_callout_fn fn, void *data) | ||
| 1448 | { | ||
| 1449 | struct multipath *m = ti->private; | ||
| 1450 | struct priority_group *pg; | ||
| 1451 | struct pgpath *p; | ||
| 1452 | int ret = 0; | ||
| 1453 | |||
| 1454 | list_for_each_entry(pg, &m->priority_groups, list) { | ||
| 1455 | list_for_each_entry(p, &pg->pgpaths, list) { | ||
| 1456 | ret = fn(ti, p->path.dev, ti->begin, data); | ||
| 1457 | if (ret) | ||
| 1458 | goto out; | ||
| 1459 | } | ||
| 1460 | } | ||
| 1461 | |||
| 1462 | out: | ||
| 1463 | return ret; | ||
| 1464 | } | ||
| 1465 | |||
| 1466 | static int __pgpath_busy(struct pgpath *pgpath) | ||
| 1467 | { | ||
| 1468 | struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); | ||
| 1469 | |||
| 1470 | return dm_underlying_device_busy(q); | ||
| 1471 | } | ||
| 1472 | |||
| 1473 | /* | ||
| 1474 | * We return "busy", only when we can map I/Os but underlying devices | ||
| 1475 | * are busy (so even if we map I/Os now, the I/Os will wait on | ||
| 1476 | * the underlying queue). | ||
| 1477 | * In other words, if we want to kill I/Os or queue them inside us | ||
| 1478 | * due to map unavailability, we don't return "busy". Otherwise, | ||
| 1479 | * dm core won't give us the I/Os and we can't do what we want. | ||
| 1480 | */ | ||
| 1481 | static int multipath_busy(struct dm_target *ti) | ||
| 1482 | { | ||
| 1483 | int busy = 0, has_active = 0; | ||
| 1484 | struct multipath *m = ti->private; | ||
| 1485 | struct priority_group *pg; | ||
| 1486 | struct pgpath *pgpath; | ||
| 1487 | unsigned long flags; | ||
| 1488 | |||
| 1489 | spin_lock_irqsave(&m->lock, flags); | ||
| 1490 | |||
| 1491 | /* Guess which priority_group will be used at next mapping time */ | ||
| 1492 | if (unlikely(!m->current_pgpath && m->next_pg)) | ||
| 1493 | pg = m->next_pg; | ||
| 1494 | else if (likely(m->current_pg)) | ||
| 1495 | pg = m->current_pg; | ||
| 1496 | else | ||
| 1497 | /* | ||
| 1498 | * We don't know which pg will be used at next mapping time. | ||
| 1499 | * We don't call __choose_pgpath() here to avoid to trigger | ||
| 1500 | * pg_init just by busy checking. | ||
| 1501 | * So we don't know whether underlying devices we will be using | ||
| 1502 | * at next mapping time are busy or not. Just try mapping. | ||
| 1503 | */ | ||
| 1504 | goto out; | ||
| 1505 | |||
| 1506 | /* | ||
| 1507 | * If there is one non-busy active path at least, the path selector | ||
| 1508 | * will be able to select it. So we consider such a pg as not busy. | ||
| 1509 | */ | ||
| 1510 | busy = 1; | ||
| 1511 | list_for_each_entry(pgpath, &pg->pgpaths, list) | ||
| 1512 | if (pgpath->is_active) { | ||
| 1513 | has_active = 1; | ||
| 1514 | |||
| 1515 | if (!__pgpath_busy(pgpath)) { | ||
| 1516 | busy = 0; | ||
| 1517 | break; | ||
| 1518 | } | ||
| 1519 | } | ||
| 1520 | |||
| 1521 | if (!has_active) | ||
| 1522 | /* | ||
| 1523 | * No active path in this pg, so this pg won't be used and | ||
| 1524 | * the current_pg will be changed at next mapping time. | ||
| 1525 | * We need to try mapping to determine it. | ||
| 1526 | */ | ||
| 1527 | busy = 0; | ||
| 1528 | |||
| 1529 | out: | ||
| 1530 | spin_unlock_irqrestore(&m->lock, flags); | ||
| 1531 | |||
| 1532 | return busy; | ||
| 1533 | } | ||
| 1534 | |||
| 1431 | /*----------------------------------------------------------------- | 1535 | /*----------------------------------------------------------------- |
| 1432 | * Module setup | 1536 | * Module setup |
| 1433 | *---------------------------------------------------------------*/ | 1537 | *---------------------------------------------------------------*/ |
| 1434 | static struct target_type multipath_target = { | 1538 | static struct target_type multipath_target = { |
| 1435 | .name = "multipath", | 1539 | .name = "multipath", |
| 1436 | .version = {1, 0, 5}, | 1540 | .version = {1, 1, 0}, |
| 1437 | .module = THIS_MODULE, | 1541 | .module = THIS_MODULE, |
| 1438 | .ctr = multipath_ctr, | 1542 | .ctr = multipath_ctr, |
| 1439 | .dtr = multipath_dtr, | 1543 | .dtr = multipath_dtr, |
| 1440 | .map = multipath_map, | 1544 | .map_rq = multipath_map, |
| 1441 | .end_io = multipath_end_io, | 1545 | .rq_end_io = multipath_end_io, |
| 1442 | .presuspend = multipath_presuspend, | 1546 | .presuspend = multipath_presuspend, |
| 1443 | .resume = multipath_resume, | 1547 | .resume = multipath_resume, |
| 1444 | .status = multipath_status, | 1548 | .status = multipath_status, |
| 1445 | .message = multipath_message, | 1549 | .message = multipath_message, |
| 1446 | .ioctl = multipath_ioctl, | 1550 | .ioctl = multipath_ioctl, |
| 1551 | .iterate_devices = multipath_iterate_devices, | ||
| 1552 | .busy = multipath_busy, | ||
| 1447 | }; | 1553 | }; |
| 1448 | 1554 | ||
| 1449 | static int __init dm_multipath_init(void) | 1555 | static int __init dm_multipath_init(void) |
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h index 27357b85d73d..e7d1fa8b0459 100644 --- a/drivers/md/dm-path-selector.h +++ b/drivers/md/dm-path-selector.h | |||
| @@ -56,7 +56,8 @@ struct path_selector_type { | |||
| 56 | * the path fails. | 56 | * the path fails. |
| 57 | */ | 57 | */ |
| 58 | struct dm_path *(*select_path) (struct path_selector *ps, | 58 | struct dm_path *(*select_path) (struct path_selector *ps, |
| 59 | unsigned *repeat_count); | 59 | unsigned *repeat_count, |
| 60 | size_t nr_bytes); | ||
| 60 | 61 | ||
| 61 | /* | 62 | /* |
| 62 | * Notify the selector that a path has failed. | 63 | * Notify the selector that a path has failed. |
| @@ -75,7 +76,10 @@ struct path_selector_type { | |||
| 75 | int (*status) (struct path_selector *ps, struct dm_path *path, | 76 | int (*status) (struct path_selector *ps, struct dm_path *path, |
| 76 | status_type_t type, char *result, unsigned int maxlen); | 77 | status_type_t type, char *result, unsigned int maxlen); |
| 77 | 78 | ||
| 78 | int (*end_io) (struct path_selector *ps, struct dm_path *path); | 79 | int (*start_io) (struct path_selector *ps, struct dm_path *path, |
| 80 | size_t nr_bytes); | ||
| 81 | int (*end_io) (struct path_selector *ps, struct dm_path *path, | ||
| 82 | size_t nr_bytes); | ||
| 79 | }; | 83 | }; |
| 80 | 84 | ||
| 81 | /* Register a path selector */ | 85 | /* Register a path selector */ |
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c new file mode 100644 index 000000000000..f92b6cea9d9c --- /dev/null +++ b/drivers/md/dm-queue-length.c | |||
| @@ -0,0 +1,263 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2004-2005 IBM Corp. All Rights Reserved. | ||
| 3 | * Copyright (C) 2006-2009 NEC Corporation. | ||
| 4 | * | ||
| 5 | * dm-queue-length.c | ||
| 6 | * | ||
| 7 | * Module Author: Stefan Bader, IBM | ||
| 8 | * Modified by: Kiyoshi Ueda, NEC | ||
| 9 | * | ||
| 10 | * This file is released under the GPL. | ||
| 11 | * | ||
| 12 | * queue-length path selector - choose a path with the least number of | ||
| 13 | * in-flight I/Os. | ||
| 14 | */ | ||
| 15 | |||
| 16 | #include "dm.h" | ||
| 17 | #include "dm-path-selector.h" | ||
| 18 | |||
| 19 | #include <linux/slab.h> | ||
| 20 | #include <linux/ctype.h> | ||
| 21 | #include <linux/errno.h> | ||
| 22 | #include <linux/module.h> | ||
| 23 | #include <asm/atomic.h> | ||
| 24 | |||
| 25 | #define DM_MSG_PREFIX "multipath queue-length" | ||
| 26 | #define QL_MIN_IO 128 | ||
| 27 | #define QL_VERSION "0.1.0" | ||
| 28 | |||
/*
 * Selector-wide state: all known paths, partitioned by health.
 */
struct selector {
	struct list_head valid_paths;	/* selectable paths; head is preferred */
	struct list_head failed_paths;	/* paths removed via ql_fail_path */
};

/*
 * Per-path state, attached to dm_path->pscontext by ql_add_path.
 */
struct path_info {
	struct list_head list;		/* link into one of the selector lists */
	struct dm_path *path;
	unsigned repeat_count;		/* I/Os to issue before re-selecting */
	atomic_t qlen;		/* the number of in-flight I/Os */
};
| 40 | |||
| 41 | static struct selector *alloc_selector(void) | ||
| 42 | { | ||
| 43 | struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
| 44 | |||
| 45 | if (s) { | ||
| 46 | INIT_LIST_HEAD(&s->valid_paths); | ||
| 47 | INIT_LIST_HEAD(&s->failed_paths); | ||
| 48 | } | ||
| 49 | |||
| 50 | return s; | ||
| 51 | } | ||
| 52 | |||
| 53 | static int ql_create(struct path_selector *ps, unsigned argc, char **argv) | ||
| 54 | { | ||
| 55 | struct selector *s = alloc_selector(); | ||
| 56 | |||
| 57 | if (!s) | ||
| 58 | return -ENOMEM; | ||
| 59 | |||
| 60 | ps->context = s; | ||
| 61 | return 0; | ||
| 62 | } | ||
| 63 | |||
| 64 | static void ql_free_paths(struct list_head *paths) | ||
| 65 | { | ||
| 66 | struct path_info *pi, *next; | ||
| 67 | |||
| 68 | list_for_each_entry_safe(pi, next, paths, list) { | ||
| 69 | list_del(&pi->list); | ||
| 70 | kfree(pi); | ||
| 71 | } | ||
| 72 | } | ||
| 73 | |||
| 74 | static void ql_destroy(struct path_selector *ps) | ||
| 75 | { | ||
| 76 | struct selector *s = ps->context; | ||
| 77 | |||
| 78 | ql_free_paths(&s->valid_paths); | ||
| 79 | ql_free_paths(&s->failed_paths); | ||
| 80 | kfree(s); | ||
| 81 | ps->context = NULL; | ||
| 82 | } | ||
| 83 | |||
/*
 * Status callback.  With a NULL @path, emit the selector-level argument
 * count ("0 " - this selector takes no selector args).  With a path,
 * emit either runtime info (current in-flight count) or the table line
 * (repeat_count), depending on @type.  Returns the number of bytes
 * written into @result (tracked by DMEMIT via 'sz').
 */
static int ql_status(struct path_selector *ps, struct dm_path *path,
		     status_type_t type, char *result, unsigned maxlen)
{
	unsigned sz = 0;
	struct path_info *pi;

	/* When called with NULL path, return selector status/args. */
	if (!path)
		DMEMIT("0 ");
	else {
		pi = path->pscontext;

		switch (type) {
		case STATUSTYPE_INFO:
			/* live queue depth for this path */
			DMEMIT("%d ", atomic_read(&pi->qlen));
			break;
		case STATUSTYPE_TABLE:
			/* constructor argument, so the table round-trips */
			DMEMIT("%u ", pi->repeat_count);
			break;
		}
	}

	return sz;
}
| 108 | |||
| 109 | static int ql_add_path(struct path_selector *ps, struct dm_path *path, | ||
| 110 | int argc, char **argv, char **error) | ||
| 111 | { | ||
| 112 | struct selector *s = ps->context; | ||
| 113 | struct path_info *pi; | ||
| 114 | unsigned repeat_count = QL_MIN_IO; | ||
| 115 | |||
| 116 | /* | ||
| 117 | * Arguments: [<repeat_count>] | ||
| 118 | * <repeat_count>: The number of I/Os before switching path. | ||
| 119 | * If not given, default (QL_MIN_IO) is used. | ||
| 120 | */ | ||
| 121 | if (argc > 1) { | ||
| 122 | *error = "queue-length ps: incorrect number of arguments"; | ||
| 123 | return -EINVAL; | ||
| 124 | } | ||
| 125 | |||
| 126 | if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { | ||
| 127 | *error = "queue-length ps: invalid repeat count"; | ||
| 128 | return -EINVAL; | ||
| 129 | } | ||
| 130 | |||
| 131 | /* Allocate the path information structure */ | ||
| 132 | pi = kmalloc(sizeof(*pi), GFP_KERNEL); | ||
| 133 | if (!pi) { | ||
| 134 | *error = "queue-length ps: Error allocating path information"; | ||
| 135 | return -ENOMEM; | ||
| 136 | } | ||
| 137 | |||
| 138 | pi->path = path; | ||
| 139 | pi->repeat_count = repeat_count; | ||
| 140 | atomic_set(&pi->qlen, 0); | ||
| 141 | |||
| 142 | path->pscontext = pi; | ||
| 143 | |||
| 144 | list_add_tail(&pi->list, &s->valid_paths); | ||
| 145 | |||
| 146 | return 0; | ||
| 147 | } | ||
| 148 | |||
| 149 | static void ql_fail_path(struct path_selector *ps, struct dm_path *path) | ||
| 150 | { | ||
| 151 | struct selector *s = ps->context; | ||
| 152 | struct path_info *pi = path->pscontext; | ||
| 153 | |||
| 154 | list_move(&pi->list, &s->failed_paths); | ||
| 155 | } | ||
| 156 | |||
| 157 | static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path) | ||
| 158 | { | ||
| 159 | struct selector *s = ps->context; | ||
| 160 | struct path_info *pi = path->pscontext; | ||
| 161 | |||
| 162 | list_move_tail(&pi->list, &s->valid_paths); | ||
| 163 | |||
| 164 | return 0; | ||
| 165 | } | ||
| 166 | |||
| 167 | /* | ||
| 168 | * Select a path having the minimum number of in-flight I/Os | ||
| 169 | */ | ||
| 170 | static struct dm_path *ql_select_path(struct path_selector *ps, | ||
| 171 | unsigned *repeat_count, size_t nr_bytes) | ||
| 172 | { | ||
| 173 | struct selector *s = ps->context; | ||
| 174 | struct path_info *pi = NULL, *best = NULL; | ||
| 175 | |||
| 176 | if (list_empty(&s->valid_paths)) | ||
| 177 | return NULL; | ||
| 178 | |||
| 179 | /* Change preferred (first in list) path to evenly balance. */ | ||
| 180 | list_move_tail(s->valid_paths.next, &s->valid_paths); | ||
| 181 | |||
| 182 | list_for_each_entry(pi, &s->valid_paths, list) { | ||
| 183 | if (!best || | ||
| 184 | (atomic_read(&pi->qlen) < atomic_read(&best->qlen))) | ||
| 185 | best = pi; | ||
| 186 | |||
| 187 | if (!atomic_read(&best->qlen)) | ||
| 188 | break; | ||
| 189 | } | ||
| 190 | |||
| 191 | if (!best) | ||
| 192 | return NULL; | ||
| 193 | |||
| 194 | *repeat_count = best->repeat_count; | ||
| 195 | |||
| 196 | return best->path; | ||
| 197 | } | ||
| 198 | |||
| 199 | static int ql_start_io(struct path_selector *ps, struct dm_path *path, | ||
| 200 | size_t nr_bytes) | ||
| 201 | { | ||
| 202 | struct path_info *pi = path->pscontext; | ||
| 203 | |||
| 204 | atomic_inc(&pi->qlen); | ||
| 205 | |||
| 206 | return 0; | ||
| 207 | } | ||
| 208 | |||
| 209 | static int ql_end_io(struct path_selector *ps, struct dm_path *path, | ||
| 210 | size_t nr_bytes) | ||
| 211 | { | ||
| 212 | struct path_info *pi = path->pscontext; | ||
| 213 | |||
| 214 | atomic_dec(&pi->qlen); | ||
| 215 | |||
| 216 | return 0; | ||
| 217 | } | ||
| 218 | |||
/*
 * Registration record for the "queue-length" path selector.
 * One table arg and one info arg: the per-path repeat_count.
 */
static struct path_selector_type ql_ps = {
	.name		= "queue-length",
	.module		= THIS_MODULE,
	.table_args	= 1,
	.info_args	= 1,
	.create		= ql_create,
	.destroy	= ql_destroy,
	.status		= ql_status,
	.add_path	= ql_add_path,
	.fail_path	= ql_fail_path,
	.reinstate_path	= ql_reinstate_path,
	.select_path	= ql_select_path,
	.start_io	= ql_start_io,
	.end_io		= ql_end_io,
};
| 234 | |||
| 235 | static int __init dm_ql_init(void) | ||
| 236 | { | ||
| 237 | int r = dm_register_path_selector(&ql_ps); | ||
| 238 | |||
| 239 | if (r < 0) | ||
| 240 | DMERR("register failed %d", r); | ||
| 241 | |||
| 242 | DMINFO("version " QL_VERSION " loaded"); | ||
| 243 | |||
| 244 | return r; | ||
| 245 | } | ||
| 246 | |||
| 247 | static void __exit dm_ql_exit(void) | ||
| 248 | { | ||
| 249 | int r = dm_unregister_path_selector(&ql_ps); | ||
| 250 | |||
| 251 | if (r < 0) | ||
| 252 | DMERR("unregister failed %d", r); | ||
| 253 | } | ||
| 254 | |||
/* Module entry/exit hooks and metadata (IBM authorship retained). */
module_init(dm_ql_init);
module_exit(dm_ql_exit);

MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>");
MODULE_DESCRIPTION(
	"(C) Copyright IBM Corp. 2004,2005 All Rights Reserved.\n"
	DM_NAME " path selector to balance the number of in-flight I/Os"
);
MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 076fbb4e967a..ce8868c768cc 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c | |||
| @@ -1283,9 +1283,23 @@ static int mirror_status(struct dm_target *ti, status_type_t type, | |||
| 1283 | return 0; | 1283 | return 0; |
| 1284 | } | 1284 | } |
| 1285 | 1285 | ||
/*
 * Invoke @fn once per mirror leg with its device and start offset.
 * The walk stops at the first nonzero return from @fn, which is then
 * propagated to the caller.
 */
static int mirror_iterate_devices(struct dm_target *ti,
				  iterate_devices_callout_fn fn, void *data)
{
	struct mirror_set *ms = ti->private;
	int ret = 0;
	unsigned i;

	for (i = 0; !ret && i < ms->nr_mirrors; i++)
		ret = fn(ti, ms->mirror[i].dev,
			 ms->mirror[i].offset, data);

	return ret;
}
| 1299 | |||
| 1286 | static struct target_type mirror_target = { | 1300 | static struct target_type mirror_target = { |
| 1287 | .name = "mirror", | 1301 | .name = "mirror", |
| 1288 | .version = {1, 0, 20}, | 1302 | .version = {1, 12, 0}, |
| 1289 | .module = THIS_MODULE, | 1303 | .module = THIS_MODULE, |
| 1290 | .ctr = mirror_ctr, | 1304 | .ctr = mirror_ctr, |
| 1291 | .dtr = mirror_dtr, | 1305 | .dtr = mirror_dtr, |
| @@ -1295,6 +1309,7 @@ static struct target_type mirror_target = { | |||
| 1295 | .postsuspend = mirror_postsuspend, | 1309 | .postsuspend = mirror_postsuspend, |
| 1296 | .resume = mirror_resume, | 1310 | .resume = mirror_resume, |
| 1297 | .status = mirror_status, | 1311 | .status = mirror_status, |
| 1312 | .iterate_devices = mirror_iterate_devices, | ||
| 1298 | }; | 1313 | }; |
| 1299 | 1314 | ||
| 1300 | static int __init dm_mirror_init(void) | 1315 | static int __init dm_mirror_init(void) |
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 7b899be0b087..36dbe29f2fd6 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c | |||
| @@ -283,7 +283,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region) | |||
| 283 | 283 | ||
| 284 | nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); | 284 | nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); |
| 285 | if (unlikely(!nreg)) | 285 | if (unlikely(!nreg)) |
| 286 | nreg = kmalloc(sizeof(*nreg), GFP_NOIO); | 286 | nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL); |
| 287 | 287 | ||
| 288 | nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? | 288 | nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? |
| 289 | DM_RH_CLEAN : DM_RH_NOSYNC; | 289 | DM_RH_CLEAN : DM_RH_NOSYNC; |
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index cdfbf65b28cb..24752f449bef 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c | |||
| @@ -161,7 +161,7 @@ static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p) | |||
| 161 | } | 161 | } |
| 162 | 162 | ||
| 163 | static struct dm_path *rr_select_path(struct path_selector *ps, | 163 | static struct dm_path *rr_select_path(struct path_selector *ps, |
| 164 | unsigned *repeat_count) | 164 | unsigned *repeat_count, size_t nr_bytes) |
| 165 | { | 165 | { |
| 166 | struct selector *s = (struct selector *) ps->context; | 166 | struct selector *s = (struct selector *) ps->context; |
| 167 | struct path_info *pi = NULL; | 167 | struct path_info *pi = NULL; |
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c new file mode 100644 index 000000000000..cfa668f46c40 --- /dev/null +++ b/drivers/md/dm-service-time.c | |||
| @@ -0,0 +1,339 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved. | ||
| 3 | * | ||
| 4 | * Module Author: Kiyoshi Ueda | ||
| 5 | * | ||
| 6 | * This file is released under the GPL. | ||
| 7 | * | ||
| 8 | * Throughput oriented path selector. | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include "dm.h" | ||
| 12 | #include "dm-path-selector.h" | ||
| 13 | |||
| 14 | #define DM_MSG_PREFIX "multipath service-time" | ||
| 15 | #define ST_MIN_IO 1 | ||
| 16 | #define ST_MAX_RELATIVE_THROUGHPUT 100 | ||
| 17 | #define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7 | ||
| 18 | #define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT) | ||
| 19 | #define ST_VERSION "0.2.0" | ||
| 20 | |||
/*
 * Selector-wide state: all known paths, split by health.
 */
struct selector {
	struct list_head valid_paths;	/* selectable paths; head is preferred */
	struct list_head failed_paths;	/* paths removed via st_fail_path */
};

/*
 * Per-path state, attached to dm_path->pscontext by st_add_path.
 */
struct path_info {
	struct list_head list;		/* link into one of the selector lists */
	struct dm_path *path;
	unsigned repeat_count;		/* I/Os to issue before re-selecting */
	unsigned relative_throughput;	/* weight in [0, ST_MAX_RELATIVE_THROUGHPUT] */
	atomic_t in_flight_size;	/* Total size of in-flight I/Os */
};
| 33 | |||
| 34 | static struct selector *alloc_selector(void) | ||
| 35 | { | ||
| 36 | struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
| 37 | |||
| 38 | if (s) { | ||
| 39 | INIT_LIST_HEAD(&s->valid_paths); | ||
| 40 | INIT_LIST_HEAD(&s->failed_paths); | ||
| 41 | } | ||
| 42 | |||
| 43 | return s; | ||
| 44 | } | ||
| 45 | |||
| 46 | static int st_create(struct path_selector *ps, unsigned argc, char **argv) | ||
| 47 | { | ||
| 48 | struct selector *s = alloc_selector(); | ||
| 49 | |||
| 50 | if (!s) | ||
| 51 | return -ENOMEM; | ||
| 52 | |||
| 53 | ps->context = s; | ||
| 54 | return 0; | ||
| 55 | } | ||
| 56 | |||
| 57 | static void free_paths(struct list_head *paths) | ||
| 58 | { | ||
| 59 | struct path_info *pi, *next; | ||
| 60 | |||
| 61 | list_for_each_entry_safe(pi, next, paths, list) { | ||
| 62 | list_del(&pi->list); | ||
| 63 | kfree(pi); | ||
| 64 | } | ||
| 65 | } | ||
| 66 | |||
| 67 | static void st_destroy(struct path_selector *ps) | ||
| 68 | { | ||
| 69 | struct selector *s = ps->context; | ||
| 70 | |||
| 71 | free_paths(&s->valid_paths); | ||
| 72 | free_paths(&s->failed_paths); | ||
| 73 | kfree(s); | ||
| 74 | ps->context = NULL; | ||
| 75 | } | ||
| 76 | |||
/*
 * Status callback.  With a NULL @path, emit the selector-level argument
 * count ("0 " - this selector takes no selector args).  With a path,
 * emit runtime info (in-flight size and weight) or the table line
 * (repeat_count and weight), depending on @type.  Returns the number
 * of bytes written into @result (tracked by DMEMIT via 'sz').
 */
static int st_status(struct path_selector *ps, struct dm_path *path,
		     status_type_t type, char *result, unsigned maxlen)
{
	unsigned sz = 0;
	struct path_info *pi;

	/* A NULL path asks for selector-level status/args. */
	if (!path)
		DMEMIT("0 ");
	else {
		pi = path->pscontext;

		switch (type) {
		case STATUSTYPE_INFO:
			/* live load and configured weight */
			DMEMIT("%d %u ", atomic_read(&pi->in_flight_size),
			       pi->relative_throughput);
			break;
		case STATUSTYPE_TABLE:
			/* constructor arguments, so the table round-trips */
			DMEMIT("%u %u ", pi->repeat_count,
			       pi->relative_throughput);
			break;
		}
	}

	return sz;
}
| 102 | |||
| 103 | static int st_add_path(struct path_selector *ps, struct dm_path *path, | ||
| 104 | int argc, char **argv, char **error) | ||
| 105 | { | ||
| 106 | struct selector *s = ps->context; | ||
| 107 | struct path_info *pi; | ||
| 108 | unsigned repeat_count = ST_MIN_IO; | ||
| 109 | unsigned relative_throughput = 1; | ||
| 110 | |||
| 111 | /* | ||
| 112 | * Arguments: [<repeat_count> [<relative_throughput>]] | ||
| 113 | * <repeat_count>: The number of I/Os before switching path. | ||
| 114 | * If not given, default (ST_MIN_IO) is used. | ||
| 115 | * <relative_throughput>: The relative throughput value of | ||
| 116 | * the path among all paths in the path-group. | ||
| 117 | * The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT> | ||
| 118 | * If not given, minimum value '1' is used. | ||
| 119 | * If '0' is given, the path isn't selected while | ||
| 120 | * other paths having a positive value are | ||
| 121 | * available. | ||
| 122 | */ | ||
| 123 | if (argc > 2) { | ||
| 124 | *error = "service-time ps: incorrect number of arguments"; | ||
| 125 | return -EINVAL; | ||
| 126 | } | ||
| 127 | |||
| 128 | if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) { | ||
| 129 | *error = "service-time ps: invalid repeat count"; | ||
| 130 | return -EINVAL; | ||
| 131 | } | ||
| 132 | |||
| 133 | if ((argc == 2) && | ||
| 134 | (sscanf(argv[1], "%u", &relative_throughput) != 1 || | ||
| 135 | relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { | ||
| 136 | *error = "service-time ps: invalid relative_throughput value"; | ||
| 137 | return -EINVAL; | ||
| 138 | } | ||
| 139 | |||
| 140 | /* allocate the path */ | ||
| 141 | pi = kmalloc(sizeof(*pi), GFP_KERNEL); | ||
| 142 | if (!pi) { | ||
| 143 | *error = "service-time ps: Error allocating path context"; | ||
| 144 | return -ENOMEM; | ||
| 145 | } | ||
| 146 | |||
| 147 | pi->path = path; | ||
| 148 | pi->repeat_count = repeat_count; | ||
| 149 | pi->relative_throughput = relative_throughput; | ||
| 150 | atomic_set(&pi->in_flight_size, 0); | ||
| 151 | |||
| 152 | path->pscontext = pi; | ||
| 153 | |||
| 154 | list_add_tail(&pi->list, &s->valid_paths); | ||
| 155 | |||
| 156 | return 0; | ||
| 157 | } | ||
| 158 | |||
| 159 | static void st_fail_path(struct path_selector *ps, struct dm_path *path) | ||
| 160 | { | ||
| 161 | struct selector *s = ps->context; | ||
| 162 | struct path_info *pi = path->pscontext; | ||
| 163 | |||
| 164 | list_move(&pi->list, &s->failed_paths); | ||
| 165 | } | ||
| 166 | |||
| 167 | static int st_reinstate_path(struct path_selector *ps, struct dm_path *path) | ||
| 168 | { | ||
| 169 | struct selector *s = ps->context; | ||
| 170 | struct path_info *pi = path->pscontext; | ||
| 171 | |||
| 172 | list_move_tail(&pi->list, &s->valid_paths); | ||
| 173 | |||
| 174 | return 0; | ||
| 175 | } | ||
| 176 | |||
/*
 * Compare the estimated service time of 2 paths, pi1 and pi2,
 * for the incoming I/O.
 *
 * Returns:
 * < 0 : pi1 is better
 * 0   : no difference between pi1 and pi2
 * > 0 : pi2 is better
 *
 * Description:
 * Basically, the service time is estimated by:
 *	('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput'
 * To reduce the calculation, some optimizations are made.
 * (See comments inline)
 *
 * NOTE(review): the size_t differences below are narrowed to the int
 * return type; this relies on the differences fitting in an int -
 * confirm for very large in-flight sizes.
 */
static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
			   size_t incoming)
{
	size_t sz1, sz2, st1, st2;

	sz1 = atomic_read(&pi1->in_flight_size);
	sz2 = atomic_read(&pi2->in_flight_size);

	/*
	 * Case 1: Both have same throughput value. Choose less loaded path.
	 */
	if (pi1->relative_throughput == pi2->relative_throughput)
		return sz1 - sz2;

	/*
	 * Case 2a: Both have same load. Choose higher throughput path.
	 * Case 2b: One path has no throughput value. Choose the other one.
	 */
	if (sz1 == sz2 ||
	    !pi1->relative_throughput || !pi2->relative_throughput)
		return pi2->relative_throughput - pi1->relative_throughput;

	/*
	 * Case 3: Calculate service time. Choose faster path.
	 *	   Service time using pi1:
	 *		st1 = (sz1 + incoming) / pi1->relative_throughput
	 *	   Service time using pi2:
	 *		st2 = (sz2 + incoming) / pi2->relative_throughput
	 *
	 *	   To avoid the division, transform the expression to use
	 *	   multiplication.
	 *	   Because ->relative_throughput > 0 here, if st1 < st2,
	 *	   the expressions below are the same meaning:
	 *		(sz1 + incoming) / pi1->relative_throughput <
	 *		(sz2 + incoming) / pi2->relative_throughput
	 *		(sz1 + incoming) * pi2->relative_throughput <
	 *		(sz2 + incoming) * pi1->relative_throughput
	 *	   So use the later one.
	 */
	sz1 += incoming;
	sz2 += incoming;
	if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE ||
		     sz2 >= ST_MAX_INFLIGHT_SIZE)) {
		/*
		 * Size may be too big for multiplying pi->relative_throughput
		 * and overflow.
		 * To avoid the overflow and mis-selection, shift down both.
		 */
		sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
		sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
	}
	st1 = sz1 * pi2->relative_throughput;
	st2 = sz2 * pi1->relative_throughput;
	if (st1 != st2)
		return st1 - st2;

	/*
	 * Case 4: Service time is equal. Choose higher throughput path.
	 */
	return pi2->relative_throughput - pi1->relative_throughput;
}
| 253 | |||
| 254 | static struct dm_path *st_select_path(struct path_selector *ps, | ||
| 255 | unsigned *repeat_count, size_t nr_bytes) | ||
| 256 | { | ||
| 257 | struct selector *s = ps->context; | ||
| 258 | struct path_info *pi = NULL, *best = NULL; | ||
| 259 | |||
| 260 | if (list_empty(&s->valid_paths)) | ||
| 261 | return NULL; | ||
| 262 | |||
| 263 | /* Change preferred (first in list) path to evenly balance. */ | ||
| 264 | list_move_tail(s->valid_paths.next, &s->valid_paths); | ||
| 265 | |||
| 266 | list_for_each_entry(pi, &s->valid_paths, list) | ||
| 267 | if (!best || (st_compare_load(pi, best, nr_bytes) < 0)) | ||
| 268 | best = pi; | ||
| 269 | |||
| 270 | if (!best) | ||
| 271 | return NULL; | ||
| 272 | |||
| 273 | *repeat_count = best->repeat_count; | ||
| 274 | |||
| 275 | return best->path; | ||
| 276 | } | ||
| 277 | |||
| 278 | static int st_start_io(struct path_selector *ps, struct dm_path *path, | ||
| 279 | size_t nr_bytes) | ||
| 280 | { | ||
| 281 | struct path_info *pi = path->pscontext; | ||
| 282 | |||
| 283 | atomic_add(nr_bytes, &pi->in_flight_size); | ||
| 284 | |||
| 285 | return 0; | ||
| 286 | } | ||
| 287 | |||
| 288 | static int st_end_io(struct path_selector *ps, struct dm_path *path, | ||
| 289 | size_t nr_bytes) | ||
| 290 | { | ||
| 291 | struct path_info *pi = path->pscontext; | ||
| 292 | |||
| 293 | atomic_sub(nr_bytes, &pi->in_flight_size); | ||
| 294 | |||
| 295 | return 0; | ||
| 296 | } | ||
| 297 | |||
/*
 * Registration record for the "service-time" path selector.
 * Two table/info args per path: repeat_count and relative_throughput.
 */
static struct path_selector_type st_ps = {
	.name		= "service-time",
	.module		= THIS_MODULE,
	.table_args	= 2,
	.info_args	= 2,
	.create		= st_create,
	.destroy	= st_destroy,
	.status		= st_status,
	.add_path	= st_add_path,
	.fail_path	= st_fail_path,
	.reinstate_path	= st_reinstate_path,
	.select_path	= st_select_path,
	.start_io	= st_start_io,
	.end_io		= st_end_io,
};
| 313 | |||
| 314 | static int __init dm_st_init(void) | ||
| 315 | { | ||
| 316 | int r = dm_register_path_selector(&st_ps); | ||
| 317 | |||
| 318 | if (r < 0) | ||
| 319 | DMERR("register failed %d", r); | ||
| 320 | |||
| 321 | DMINFO("version " ST_VERSION " loaded"); | ||
| 322 | |||
| 323 | return r; | ||
| 324 | } | ||
| 325 | |||
| 326 | static void __exit dm_st_exit(void) | ||
| 327 | { | ||
| 328 | int r = dm_unregister_path_selector(&st_ps); | ||
| 329 | |||
| 330 | if (r < 0) | ||
| 331 | DMERR("unregister failed %d", r); | ||
| 332 | } | ||
| 333 | |||
/* Module entry/exit hooks and metadata. */
module_init(dm_st_init);
module_exit(dm_st_exit);

MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector");
MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>");
MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index e75c6dd76a9a..6e3fe4f14934 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c | |||
| @@ -282,7 +282,7 @@ static int read_header(struct pstore *ps, int *new_snapshot) | |||
| 282 | */ | 282 | */ |
| 283 | if (!ps->store->chunk_size) { | 283 | if (!ps->store->chunk_size) { |
| 284 | ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, | 284 | ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, |
| 285 | bdev_hardsect_size(ps->store->cow->bdev) >> 9); | 285 | bdev_logical_block_size(ps->store->cow->bdev) >> 9); |
| 286 | ps->store->chunk_mask = ps->store->chunk_size - 1; | 286 | ps->store->chunk_mask = ps->store->chunk_size - 1; |
| 287 | ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; | 287 | ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; |
| 288 | chunk_size_supplied = 0; | 288 | chunk_size_supplied = 0; |
| @@ -636,7 +636,7 @@ static void persistent_commit_exception(struct dm_exception_store *store, | |||
| 636 | /* | 636 | /* |
| 637 | * Commit exceptions to disk. | 637 | * Commit exceptions to disk. |
| 638 | */ | 638 | */ |
| 639 | if (ps->valid && area_io(ps, WRITE)) | 639 | if (ps->valid && area_io(ps, WRITE_BARRIER)) |
| 640 | ps->valid = 0; | 640 | ps->valid = 0; |
| 641 | 641 | ||
| 642 | /* | 642 | /* |
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index d73f17fc7778..d573165cd2b7 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c | |||
| @@ -678,6 +678,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 678 | 678 | ||
| 679 | ti->private = s; | 679 | ti->private = s; |
| 680 | ti->split_io = s->store->chunk_size; | 680 | ti->split_io = s->store->chunk_size; |
| 681 | ti->num_flush_requests = 1; | ||
| 681 | 682 | ||
| 682 | return 0; | 683 | return 0; |
| 683 | 684 | ||
| @@ -1030,6 +1031,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, | |||
| 1030 | chunk_t chunk; | 1031 | chunk_t chunk; |
| 1031 | struct dm_snap_pending_exception *pe = NULL; | 1032 | struct dm_snap_pending_exception *pe = NULL; |
| 1032 | 1033 | ||
| 1034 | if (unlikely(bio_empty_barrier(bio))) { | ||
| 1035 | bio->bi_bdev = s->store->cow->bdev; | ||
| 1036 | return DM_MAPIO_REMAPPED; | ||
| 1037 | } | ||
| 1038 | |||
| 1033 | chunk = sector_to_chunk(s->store, bio->bi_sector); | 1039 | chunk = sector_to_chunk(s->store, bio->bi_sector); |
| 1034 | 1040 | ||
| 1035 | /* Full snapshots are not usable */ | 1041 | /* Full snapshots are not usable */ |
| @@ -1338,6 +1344,8 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 1338 | } | 1344 | } |
| 1339 | 1345 | ||
| 1340 | ti->private = dev; | 1346 | ti->private = dev; |
| 1347 | ti->num_flush_requests = 1; | ||
| 1348 | |||
| 1341 | return 0; | 1349 | return 0; |
| 1342 | } | 1350 | } |
| 1343 | 1351 | ||
| @@ -1353,6 +1361,9 @@ static int origin_map(struct dm_target *ti, struct bio *bio, | |||
| 1353 | struct dm_dev *dev = ti->private; | 1361 | struct dm_dev *dev = ti->private; |
| 1354 | bio->bi_bdev = dev->bdev; | 1362 | bio->bi_bdev = dev->bdev; |
| 1355 | 1363 | ||
| 1364 | if (unlikely(bio_empty_barrier(bio))) | ||
| 1365 | return DM_MAPIO_REMAPPED; | ||
| 1366 | |||
| 1356 | /* Only tell snapshots if this is a write */ | 1367 | /* Only tell snapshots if this is a write */ |
| 1357 | return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; | 1368 | return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; |
| 1358 | } | 1369 | } |
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 41569bc60abc..b240e85ae39a 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c | |||
| @@ -167,6 +167,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 167 | sc->stripes = stripes; | 167 | sc->stripes = stripes; |
| 168 | sc->stripe_width = width; | 168 | sc->stripe_width = width; |
| 169 | ti->split_io = chunk_size; | 169 | ti->split_io = chunk_size; |
| 170 | ti->num_flush_requests = stripes; | ||
| 170 | 171 | ||
| 171 | sc->chunk_mask = ((sector_t) chunk_size) - 1; | 172 | sc->chunk_mask = ((sector_t) chunk_size) - 1; |
| 172 | for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) | 173 | for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) |
| @@ -211,10 +212,18 @@ static int stripe_map(struct dm_target *ti, struct bio *bio, | |||
| 211 | union map_info *map_context) | 212 | union map_info *map_context) |
| 212 | { | 213 | { |
| 213 | struct stripe_c *sc = (struct stripe_c *) ti->private; | 214 | struct stripe_c *sc = (struct stripe_c *) ti->private; |
| 215 | sector_t offset, chunk; | ||
| 216 | uint32_t stripe; | ||
| 214 | 217 | ||
| 215 | sector_t offset = bio->bi_sector - ti->begin; | 218 | if (unlikely(bio_empty_barrier(bio))) { |
| 216 | sector_t chunk = offset >> sc->chunk_shift; | 219 | BUG_ON(map_context->flush_request >= sc->stripes); |
| 217 | uint32_t stripe = sector_div(chunk, sc->stripes); | 220 | bio->bi_bdev = sc->stripe[map_context->flush_request].dev->bdev; |
| 221 | return DM_MAPIO_REMAPPED; | ||
| 222 | } | ||
| 223 | |||
| 224 | offset = bio->bi_sector - ti->begin; | ||
| 225 | chunk = offset >> sc->chunk_shift; | ||
| 226 | stripe = sector_div(chunk, sc->stripes); | ||
| 218 | 227 | ||
| 219 | bio->bi_bdev = sc->stripe[stripe].dev->bdev; | 228 | bio->bi_bdev = sc->stripe[stripe].dev->bdev; |
| 220 | bio->bi_sector = sc->stripe[stripe].physical_start + | 229 | bio->bi_sector = sc->stripe[stripe].physical_start + |
| @@ -304,15 +313,31 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, | |||
| 304 | return error; | 313 | return error; |
| 305 | } | 314 | } |
| 306 | 315 | ||
/*
 * Invoke @fn once per stripe with its device and physical start
 * sector.  The walk stops at the first nonzero return from @fn, which
 * is then propagated.  The do/while assumes sc->stripes >= 1 (the
 * constructor requires at least one stripe).
 */
static int stripe_iterate_devices(struct dm_target *ti,
				  iterate_devices_callout_fn fn, void *data)
{
	struct stripe_c *sc = ti->private;
	int ret = 0;
	unsigned i = 0;

	do
		ret = fn(ti, sc->stripe[i].dev,
			 sc->stripe[i].physical_start, data);
	while (!ret && ++i < sc->stripes);

	return ret;
}
| 330 | |||
| 307 | static struct target_type stripe_target = { | 331 | static struct target_type stripe_target = { |
| 308 | .name = "striped", | 332 | .name = "striped", |
| 309 | .version = {1, 1, 0}, | 333 | .version = {1, 2, 0}, |
| 310 | .module = THIS_MODULE, | 334 | .module = THIS_MODULE, |
| 311 | .ctr = stripe_ctr, | 335 | .ctr = stripe_ctr, |
| 312 | .dtr = stripe_dtr, | 336 | .dtr = stripe_dtr, |
| 313 | .map = stripe_map, | 337 | .map = stripe_map, |
| 314 | .end_io = stripe_end_io, | 338 | .end_io = stripe_end_io, |
| 315 | .status = stripe_status, | 339 | .status = stripe_status, |
| 340 | .iterate_devices = stripe_iterate_devices, | ||
| 316 | }; | 341 | }; |
| 317 | 342 | ||
| 318 | int __init dm_stripe_init(void) | 343 | int __init dm_stripe_init(void) |
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index a2a45e6c7c8b..4b045903a4e2 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c | |||
| @@ -57,12 +57,21 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf) | |||
| 57 | return strlen(buf); | 57 | return strlen(buf); |
| 58 | } | 58 | } |
| 59 | 59 | ||
/*
 * sysfs 'suspended' attribute: emit dm_suspended(md) as a decimal
 * followed by a newline, returning the number of bytes written.
 */
static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
{
	sprintf(buf, "%d\n", dm_suspended(md));

	return strlen(buf);
}
| 66 | |||
| 60 | static DM_ATTR_RO(name); | 67 | static DM_ATTR_RO(name); |
| 61 | static DM_ATTR_RO(uuid); | 68 | static DM_ATTR_RO(uuid); |
| 69 | static DM_ATTR_RO(suspended); | ||
| 62 | 70 | ||
| 63 | static struct attribute *dm_attrs[] = { | 71 | static struct attribute *dm_attrs[] = { |
| 64 | &dm_attr_name.attr, | 72 | &dm_attr_name.attr, |
| 65 | &dm_attr_uuid.attr, | 73 | &dm_attr_uuid.attr, |
| 74 | &dm_attr_suspended.attr, | ||
| 66 | NULL, | 75 | NULL, |
| 67 | }; | 76 | }; |
| 68 | 77 | ||
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 429b50b975d5..4899ebe767c8 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
| @@ -41,6 +41,7 @@ | |||
| 41 | struct dm_table { | 41 | struct dm_table { |
| 42 | struct mapped_device *md; | 42 | struct mapped_device *md; |
| 43 | atomic_t holders; | 43 | atomic_t holders; |
| 44 | unsigned type; | ||
| 44 | 45 | ||
| 45 | /* btree table */ | 46 | /* btree table */ |
| 46 | unsigned int depth; | 47 | unsigned int depth; |
| @@ -62,15 +63,11 @@ struct dm_table { | |||
| 62 | /* a list of devices used by this table */ | 63 | /* a list of devices used by this table */ |
| 63 | struct list_head devices; | 64 | struct list_head devices; |
| 64 | 65 | ||
| 65 | /* | ||
| 66 | * These are optimistic limits taken from all the | ||
| 67 | * targets, some targets will need smaller limits. | ||
| 68 | */ | ||
| 69 | struct io_restrictions limits; | ||
| 70 | |||
| 71 | /* events get handed up using this callback */ | 66 | /* events get handed up using this callback */ |
| 72 | void (*event_fn)(void *); | 67 | void (*event_fn)(void *); |
| 73 | void *event_context; | 68 | void *event_context; |
| 69 | |||
| 70 | struct dm_md_mempools *mempools; | ||
| 74 | }; | 71 | }; |
| 75 | 72 | ||
| 76 | /* | 73 | /* |
| @@ -89,42 +86,6 @@ static unsigned int int_log(unsigned int n, unsigned int base) | |||
| 89 | } | 86 | } |
| 90 | 87 | ||
| 91 | /* | 88 | /* |
| 92 | * Returns the minimum that is _not_ zero, unless both are zero. | ||
| 93 | */ | ||
| 94 | #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | ||
| 95 | |||
| 96 | /* | ||
| 97 | * Combine two io_restrictions, always taking the lower value. | ||
| 98 | */ | ||
| 99 | static void combine_restrictions_low(struct io_restrictions *lhs, | ||
| 100 | struct io_restrictions *rhs) | ||
| 101 | { | ||
| 102 | lhs->max_sectors = | ||
| 103 | min_not_zero(lhs->max_sectors, rhs->max_sectors); | ||
| 104 | |||
| 105 | lhs->max_phys_segments = | ||
| 106 | min_not_zero(lhs->max_phys_segments, rhs->max_phys_segments); | ||
| 107 | |||
| 108 | lhs->max_hw_segments = | ||
| 109 | min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments); | ||
| 110 | |||
| 111 | lhs->hardsect_size = max(lhs->hardsect_size, rhs->hardsect_size); | ||
| 112 | |||
| 113 | lhs->max_segment_size = | ||
| 114 | min_not_zero(lhs->max_segment_size, rhs->max_segment_size); | ||
| 115 | |||
| 116 | lhs->max_hw_sectors = | ||
| 117 | min_not_zero(lhs->max_hw_sectors, rhs->max_hw_sectors); | ||
| 118 | |||
| 119 | lhs->seg_boundary_mask = | ||
| 120 | min_not_zero(lhs->seg_boundary_mask, rhs->seg_boundary_mask); | ||
| 121 | |||
| 122 | lhs->bounce_pfn = min_not_zero(lhs->bounce_pfn, rhs->bounce_pfn); | ||
| 123 | |||
| 124 | lhs->no_cluster |= rhs->no_cluster; | ||
| 125 | } | ||
| 126 | |||
| 127 | /* | ||
| 128 | * Calculate the index of the child node of the n'th node k'th key. | 89 | * Calculate the index of the child node of the n'th node k'th key. |
| 129 | */ | 90 | */ |
| 130 | static inline unsigned int get_child(unsigned int n, unsigned int k) | 91 | static inline unsigned int get_child(unsigned int n, unsigned int k) |
| @@ -266,6 +227,8 @@ static void free_devices(struct list_head *devices) | |||
| 266 | list_for_each_safe(tmp, next, devices) { | 227 | list_for_each_safe(tmp, next, devices) { |
| 267 | struct dm_dev_internal *dd = | 228 | struct dm_dev_internal *dd = |
| 268 | list_entry(tmp, struct dm_dev_internal, list); | 229 | list_entry(tmp, struct dm_dev_internal, list); |
| 230 | DMWARN("dm_table_destroy: dm_put_device call missing for %s", | ||
| 231 | dd->dm_dev.name); | ||
| 269 | kfree(dd); | 232 | kfree(dd); |
| 270 | } | 233 | } |
| 271 | } | 234 | } |
| @@ -295,12 +258,10 @@ void dm_table_destroy(struct dm_table *t) | |||
| 295 | vfree(t->highs); | 258 | vfree(t->highs); |
| 296 | 259 | ||
| 297 | /* free the device list */ | 260 | /* free the device list */ |
| 298 | if (t->devices.next != &t->devices) { | 261 | if (t->devices.next != &t->devices) |
| 299 | DMWARN("devices still present during destroy: " | ||
| 300 | "dm_table_remove_device calls missing"); | ||
| 301 | |||
| 302 | free_devices(&t->devices); | 262 | free_devices(&t->devices); |
| 303 | } | 263 | |
| 264 | dm_free_md_mempools(t->mempools); | ||
| 304 | 265 | ||
| 305 | kfree(t); | 266 | kfree(t); |
| 306 | } | 267 | } |
| @@ -384,15 +345,48 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) | |||
| 384 | /* | 345 | /* |
| 385 | * If possible, this checks an area of a destination device is valid. | 346 | * If possible, this checks an area of a destination device is valid. |
| 386 | */ | 347 | */ |
| 387 | static int check_device_area(struct dm_dev_internal *dd, sector_t start, | 348 | static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev, |
| 388 | sector_t len) | 349 | sector_t start, void *data) |
| 389 | { | 350 | { |
| 390 | sector_t dev_size = dd->dm_dev.bdev->bd_inode->i_size >> SECTOR_SHIFT; | 351 | struct queue_limits *limits = data; |
| 352 | struct block_device *bdev = dev->bdev; | ||
| 353 | sector_t dev_size = | ||
| 354 | i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; | ||
| 355 | unsigned short logical_block_size_sectors = | ||
| 356 | limits->logical_block_size >> SECTOR_SHIFT; | ||
| 357 | char b[BDEVNAME_SIZE]; | ||
| 391 | 358 | ||
| 392 | if (!dev_size) | 359 | if (!dev_size) |
| 393 | return 1; | 360 | return 1; |
| 394 | 361 | ||
| 395 | return ((start < dev_size) && (len <= (dev_size - start))); | 362 | if ((start >= dev_size) || (start + ti->len > dev_size)) { |
| 363 | DMWARN("%s: %s too small for target", | ||
| 364 | dm_device_name(ti->table->md), bdevname(bdev, b)); | ||
| 365 | return 0; | ||
| 366 | } | ||
| 367 | |||
| 368 | if (logical_block_size_sectors <= 1) | ||
| 369 | return 1; | ||
| 370 | |||
| 371 | if (start & (logical_block_size_sectors - 1)) { | ||
| 372 | DMWARN("%s: start=%llu not aligned to h/w " | ||
| 373 | "logical block size %hu of %s", | ||
| 374 | dm_device_name(ti->table->md), | ||
| 375 | (unsigned long long)start, | ||
| 376 | limits->logical_block_size, bdevname(bdev, b)); | ||
| 377 | return 0; | ||
| 378 | } | ||
| 379 | |||
| 380 | if (ti->len & (logical_block_size_sectors - 1)) { | ||
| 381 | DMWARN("%s: len=%llu not aligned to h/w " | ||
| 382 | "logical block size %hu of %s", | ||
| 383 | dm_device_name(ti->table->md), | ||
| 384 | (unsigned long long)ti->len, | ||
| 385 | limits->logical_block_size, bdevname(bdev, b)); | ||
| 386 | return 0; | ||
| 387 | } | ||
| 388 | |||
| 389 | return 1; | ||
| 396 | } | 390 | } |
| 397 | 391 | ||
| 398 | /* | 392 | /* |
| @@ -478,38 +472,32 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, | |||
| 478 | } | 472 | } |
| 479 | atomic_inc(&dd->count); | 473 | atomic_inc(&dd->count); |
| 480 | 474 | ||
| 481 | if (!check_device_area(dd, start, len)) { | ||
| 482 | DMWARN("device %s too small for target", path); | ||
| 483 | dm_put_device(ti, &dd->dm_dev); | ||
| 484 | return -EINVAL; | ||
| 485 | } | ||
| 486 | |||
| 487 | *result = &dd->dm_dev; | 475 | *result = &dd->dm_dev; |
| 488 | |||
| 489 | return 0; | 476 | return 0; |
| 490 | } | 477 | } |
| 491 | 478 | ||
| 492 | void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) | 479 | /* |
| 480 | * Returns the minimum that is _not_ zero, unless both are zero. | ||
| 481 | */ | ||
| 482 | #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | ||
| 483 | |||
| 484 | int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, | ||
| 485 | sector_t start, void *data) | ||
| 493 | { | 486 | { |
| 487 | struct queue_limits *limits = data; | ||
| 488 | struct block_device *bdev = dev->bdev; | ||
| 494 | struct request_queue *q = bdev_get_queue(bdev); | 489 | struct request_queue *q = bdev_get_queue(bdev); |
| 495 | struct io_restrictions *rs = &ti->limits; | ||
| 496 | char b[BDEVNAME_SIZE]; | 490 | char b[BDEVNAME_SIZE]; |
| 497 | 491 | ||
| 498 | if (unlikely(!q)) { | 492 | if (unlikely(!q)) { |
| 499 | DMWARN("%s: Cannot set limits for nonexistent device %s", | 493 | DMWARN("%s: Cannot set limits for nonexistent device %s", |
| 500 | dm_device_name(ti->table->md), bdevname(bdev, b)); | 494 | dm_device_name(ti->table->md), bdevname(bdev, b)); |
| 501 | return; | 495 | return 0; |
| 502 | } | 496 | } |
| 503 | 497 | ||
| 504 | /* | 498 | if (blk_stack_limits(limits, &q->limits, start) < 0) |
| 505 | * Combine the device limits low. | 499 | DMWARN("%s: target device %s is misaligned", |
| 506 | * | 500 | dm_device_name(ti->table->md), bdevname(bdev, b)); |
| 507 | * FIXME: if we move an io_restriction struct | ||
| 508 | * into q this would just be a call to | ||
| 509 | * combine_restrictions_low() | ||
| 510 | */ | ||
| 511 | rs->max_sectors = | ||
| 512 | min_not_zero(rs->max_sectors, q->max_sectors); | ||
| 513 | 501 | ||
| 514 | /* | 502 | /* |
| 515 | * Check if merge fn is supported. | 503 | * Check if merge fn is supported. |
| @@ -518,47 +506,21 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) | |||
| 518 | */ | 506 | */ |
| 519 | 507 | ||
| 520 | if (q->merge_bvec_fn && !ti->type->merge) | 508 | if (q->merge_bvec_fn && !ti->type->merge) |
| 521 | rs->max_sectors = | 509 | limits->max_sectors = |
| 522 | min_not_zero(rs->max_sectors, | 510 | min_not_zero(limits->max_sectors, |
| 523 | (unsigned int) (PAGE_SIZE >> 9)); | 511 | (unsigned int) (PAGE_SIZE >> 9)); |
| 524 | 512 | return 0; | |
| 525 | rs->max_phys_segments = | ||
| 526 | min_not_zero(rs->max_phys_segments, | ||
| 527 | q->max_phys_segments); | ||
| 528 | |||
| 529 | rs->max_hw_segments = | ||
| 530 | min_not_zero(rs->max_hw_segments, q->max_hw_segments); | ||
| 531 | |||
| 532 | rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size); | ||
| 533 | |||
| 534 | rs->max_segment_size = | ||
| 535 | min_not_zero(rs->max_segment_size, q->max_segment_size); | ||
| 536 | |||
| 537 | rs->max_hw_sectors = | ||
| 538 | min_not_zero(rs->max_hw_sectors, q->max_hw_sectors); | ||
| 539 | |||
| 540 | rs->seg_boundary_mask = | ||
| 541 | min_not_zero(rs->seg_boundary_mask, | ||
| 542 | q->seg_boundary_mask); | ||
| 543 | |||
| 544 | rs->bounce_pfn = min_not_zero(rs->bounce_pfn, q->bounce_pfn); | ||
| 545 | |||
| 546 | rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); | ||
| 547 | } | 513 | } |
| 548 | EXPORT_SYMBOL_GPL(dm_set_device_limits); | 514 | EXPORT_SYMBOL_GPL(dm_set_device_limits); |
| 549 | 515 | ||
| 550 | int dm_get_device(struct dm_target *ti, const char *path, sector_t start, | 516 | int dm_get_device(struct dm_target *ti, const char *path, sector_t start, |
| 551 | sector_t len, fmode_t mode, struct dm_dev **result) | 517 | sector_t len, fmode_t mode, struct dm_dev **result) |
| 552 | { | 518 | { |
| 553 | int r = __table_get_device(ti->table, ti, path, | 519 | return __table_get_device(ti->table, ti, path, |
| 554 | start, len, mode, result); | 520 | start, len, mode, result); |
| 555 | |||
| 556 | if (!r) | ||
| 557 | dm_set_device_limits(ti, (*result)->bdev); | ||
| 558 | |||
| 559 | return r; | ||
| 560 | } | 521 | } |
| 561 | 522 | ||
| 523 | |||
| 562 | /* | 524 | /* |
| 563 | * Decrement a devices use count and remove it if necessary. | 525 | * Decrement a devices use count and remove it if necessary. |
| 564 | */ | 526 | */ |
| @@ -673,24 +635,78 @@ int dm_split_args(int *argc, char ***argvp, char *input) | |||
| 673 | return 0; | 635 | return 0; |
| 674 | } | 636 | } |
| 675 | 637 | ||
| 676 | static void check_for_valid_limits(struct io_restrictions *rs) | 638 | /* |
| 639 | * Impose necessary and sufficient conditions on a devices's table such | ||
| 640 | * that any incoming bio which respects its logical_block_size can be | ||
| 641 | * processed successfully. If it falls across the boundary between | ||
| 642 | * two or more targets, the size of each piece it gets split into must | ||
| 643 | * be compatible with the logical_block_size of the target processing it. | ||
| 644 | */ | ||
| 645 | static int validate_hardware_logical_block_alignment(struct dm_table *table, | ||
| 646 | struct queue_limits *limits) | ||
| 677 | { | 647 | { |
| 678 | if (!rs->max_sectors) | 648 | /* |
| 679 | rs->max_sectors = SAFE_MAX_SECTORS; | 649 | * This function uses arithmetic modulo the logical_block_size |
| 680 | if (!rs->max_hw_sectors) | 650 | * (in units of 512-byte sectors). |
| 681 | rs->max_hw_sectors = SAFE_MAX_SECTORS; | 651 | */ |
| 682 | if (!rs->max_phys_segments) | 652 | unsigned short device_logical_block_size_sects = |
| 683 | rs->max_phys_segments = MAX_PHYS_SEGMENTS; | 653 | limits->logical_block_size >> SECTOR_SHIFT; |
| 684 | if (!rs->max_hw_segments) | 654 | |
| 685 | rs->max_hw_segments = MAX_HW_SEGMENTS; | 655 | /* |
| 686 | if (!rs->hardsect_size) | 656 | * Offset of the start of the next table entry, mod logical_block_size. |
| 687 | rs->hardsect_size = 1 << SECTOR_SHIFT; | 657 | */ |
| 688 | if (!rs->max_segment_size) | 658 | unsigned short next_target_start = 0; |
| 689 | rs->max_segment_size = MAX_SEGMENT_SIZE; | 659 | |
| 690 | if (!rs->seg_boundary_mask) | 660 | /* |
| 691 | rs->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; | 661 | * Given an aligned bio that extends beyond the end of a |
| 692 | if (!rs->bounce_pfn) | 662 | * target, how many sectors must the next target handle? |
| 693 | rs->bounce_pfn = -1; | 663 | */ |
| 664 | unsigned short remaining = 0; | ||
| 665 | |||
| 666 | struct dm_target *uninitialized_var(ti); | ||
| 667 | struct queue_limits ti_limits; | ||
| 668 | unsigned i = 0; | ||
| 669 | |||
| 670 | /* | ||
| 671 | * Check each entry in the table in turn. | ||
| 672 | */ | ||
| 673 | while (i < dm_table_get_num_targets(table)) { | ||
| 674 | ti = dm_table_get_target(table, i++); | ||
| 675 | |||
| 676 | blk_set_default_limits(&ti_limits); | ||
| 677 | |||
| 678 | /* combine all target devices' limits */ | ||
| 679 | if (ti->type->iterate_devices) | ||
| 680 | ti->type->iterate_devices(ti, dm_set_device_limits, | ||
| 681 | &ti_limits); | ||
| 682 | |||
| 683 | /* | ||
| 684 | * If the remaining sectors fall entirely within this | ||
| 685 | * table entry are they compatible with its logical_block_size? | ||
| 686 | */ | ||
| 687 | if (remaining < ti->len && | ||
| 688 | remaining & ((ti_limits.logical_block_size >> | ||
| 689 | SECTOR_SHIFT) - 1)) | ||
| 690 | break; /* Error */ | ||
| 691 | |||
| 692 | next_target_start = | ||
| 693 | (unsigned short) ((next_target_start + ti->len) & | ||
| 694 | (device_logical_block_size_sects - 1)); | ||
| 695 | remaining = next_target_start ? | ||
| 696 | device_logical_block_size_sects - next_target_start : 0; | ||
| 697 | } | ||
| 698 | |||
| 699 | if (remaining) { | ||
| 700 | DMWARN("%s: table line %u (start sect %llu len %llu) " | ||
| 701 | "not aligned to h/w logical block size %hu", | ||
| 702 | dm_device_name(table->md), i, | ||
| 703 | (unsigned long long) ti->begin, | ||
| 704 | (unsigned long long) ti->len, | ||
| 705 | limits->logical_block_size); | ||
| 706 | return -EINVAL; | ||
| 707 | } | ||
| 708 | |||
| 709 | return 0; | ||
| 694 | } | 710 | } |
| 695 | 711 | ||
| 696 | int dm_table_add_target(struct dm_table *t, const char *type, | 712 | int dm_table_add_target(struct dm_table *t, const char *type, |
| @@ -745,9 +761,6 @@ int dm_table_add_target(struct dm_table *t, const char *type, | |||
| 745 | 761 | ||
| 746 | t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; | 762 | t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; |
| 747 | 763 | ||
| 748 | /* FIXME: the plan is to combine high here and then have | ||
| 749 | * the merge fn apply the target level restrictions. */ | ||
| 750 | combine_restrictions_low(&t->limits, &tgt->limits); | ||
| 751 | return 0; | 764 | return 0; |
| 752 | 765 | ||
| 753 | bad: | 766 | bad: |
| @@ -756,6 +769,104 @@ int dm_table_add_target(struct dm_table *t, const char *type, | |||
| 756 | return r; | 769 | return r; |
| 757 | } | 770 | } |
| 758 | 771 | ||
| 772 | int dm_table_set_type(struct dm_table *t) | ||
| 773 | { | ||
| 774 | unsigned i; | ||
| 775 | unsigned bio_based = 0, request_based = 0; | ||
| 776 | struct dm_target *tgt; | ||
| 777 | struct dm_dev_internal *dd; | ||
| 778 | struct list_head *devices; | ||
| 779 | |||
| 780 | for (i = 0; i < t->num_targets; i++) { | ||
| 781 | tgt = t->targets + i; | ||
| 782 | if (dm_target_request_based(tgt)) | ||
| 783 | request_based = 1; | ||
| 784 | else | ||
| 785 | bio_based = 1; | ||
| 786 | |||
| 787 | if (bio_based && request_based) { | ||
| 788 | DMWARN("Inconsistent table: different target types" | ||
| 789 | " can't be mixed up"); | ||
| 790 | return -EINVAL; | ||
| 791 | } | ||
| 792 | } | ||
| 793 | |||
| 794 | if (bio_based) { | ||
| 795 | /* We must use this table as bio-based */ | ||
| 796 | t->type = DM_TYPE_BIO_BASED; | ||
| 797 | return 0; | ||
| 798 | } | ||
| 799 | |||
| 800 | BUG_ON(!request_based); /* No targets in this table */ | ||
| 801 | |||
| 802 | /* Non-request-stackable devices can't be used for request-based dm */ | ||
| 803 | devices = dm_table_get_devices(t); | ||
| 804 | list_for_each_entry(dd, devices, list) { | ||
| 805 | if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev.bdev))) { | ||
| 806 | DMWARN("table load rejected: including" | ||
| 807 | " non-request-stackable devices"); | ||
| 808 | return -EINVAL; | ||
| 809 | } | ||
| 810 | } | ||
| 811 | |||
| 812 | /* | ||
| 813 | * Request-based dm supports only tables that have a single target now. | ||
| 814 | * To support multiple targets, request splitting support is needed, | ||
| 815 | * and that needs lots of changes in the block-layer. | ||
| 816 | * (e.g. request completion process for partial completion.) | ||
| 817 | */ | ||
| 818 | if (t->num_targets > 1) { | ||
| 819 | DMWARN("Request-based dm doesn't support multiple targets yet"); | ||
| 820 | return -EINVAL; | ||
| 821 | } | ||
| 822 | |||
| 823 | t->type = DM_TYPE_REQUEST_BASED; | ||
| 824 | |||
| 825 | return 0; | ||
| 826 | } | ||
| 827 | |||
| 828 | unsigned dm_table_get_type(struct dm_table *t) | ||
| 829 | { | ||
| 830 | return t->type; | ||
| 831 | } | ||
| 832 | |||
| 833 | bool dm_table_bio_based(struct dm_table *t) | ||
| 834 | { | ||
| 835 | return dm_table_get_type(t) == DM_TYPE_BIO_BASED; | ||
| 836 | } | ||
| 837 | |||
| 838 | bool dm_table_request_based(struct dm_table *t) | ||
| 839 | { | ||
| 840 | return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED; | ||
| 841 | } | ||
| 842 | |||
| 843 | int dm_table_alloc_md_mempools(struct dm_table *t) | ||
| 844 | { | ||
| 845 | unsigned type = dm_table_get_type(t); | ||
| 846 | |||
| 847 | if (unlikely(type == DM_TYPE_NONE)) { | ||
| 848 | DMWARN("no table type is set, can't allocate mempools"); | ||
| 849 | return -EINVAL; | ||
| 850 | } | ||
| 851 | |||
| 852 | t->mempools = dm_alloc_md_mempools(type); | ||
| 853 | if (!t->mempools) | ||
| 854 | return -ENOMEM; | ||
| 855 | |||
| 856 | return 0; | ||
| 857 | } | ||
| 858 | |||
| 859 | void dm_table_free_md_mempools(struct dm_table *t) | ||
| 860 | { | ||
| 861 | dm_free_md_mempools(t->mempools); | ||
| 862 | t->mempools = NULL; | ||
| 863 | } | ||
| 864 | |||
| 865 | struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t) | ||
| 866 | { | ||
| 867 | return t->mempools; | ||
| 868 | } | ||
| 869 | |||
| 759 | static int setup_indexes(struct dm_table *t) | 870 | static int setup_indexes(struct dm_table *t) |
| 760 | { | 871 | { |
| 761 | int i; | 872 | int i; |
| @@ -790,8 +901,6 @@ int dm_table_complete(struct dm_table *t) | |||
| 790 | int r = 0; | 901 | int r = 0; |
| 791 | unsigned int leaf_nodes; | 902 | unsigned int leaf_nodes; |
| 792 | 903 | ||
| 793 | check_for_valid_limits(&t->limits); | ||
| 794 | |||
| 795 | /* how many indexes will the btree have ? */ | 904 | /* how many indexes will the btree have ? */ |
| 796 | leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); | 905 | leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); |
| 797 | t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); | 906 | t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); |
| @@ -867,6 +976,57 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) | |||
| 867 | } | 976 | } |
| 868 | 977 | ||
| 869 | /* | 978 | /* |
| 979 | * Establish the new table's queue_limits and validate them. | ||
| 980 | */ | ||
| 981 | int dm_calculate_queue_limits(struct dm_table *table, | ||
| 982 | struct queue_limits *limits) | ||
| 983 | { | ||
| 984 | struct dm_target *uninitialized_var(ti); | ||
| 985 | struct queue_limits ti_limits; | ||
| 986 | unsigned i = 0; | ||
| 987 | |||
| 988 | blk_set_default_limits(limits); | ||
| 989 | |||
| 990 | while (i < dm_table_get_num_targets(table)) { | ||
| 991 | blk_set_default_limits(&ti_limits); | ||
| 992 | |||
| 993 | ti = dm_table_get_target(table, i++); | ||
| 994 | |||
| 995 | if (!ti->type->iterate_devices) | ||
| 996 | goto combine_limits; | ||
| 997 | |||
| 998 | /* | ||
| 999 | * Combine queue limits of all the devices this target uses. | ||
| 1000 | */ | ||
| 1001 | ti->type->iterate_devices(ti, dm_set_device_limits, | ||
| 1002 | &ti_limits); | ||
| 1003 | |||
| 1004 | /* | ||
| 1005 | * Check each device area is consistent with the target's | ||
| 1006 | * overall queue limits. | ||
| 1007 | */ | ||
| 1008 | if (!ti->type->iterate_devices(ti, device_area_is_valid, | ||
| 1009 | &ti_limits)) | ||
| 1010 | return -EINVAL; | ||
| 1011 | |||
| 1012 | combine_limits: | ||
| 1013 | /* | ||
| 1014 | * Merge this target's queue limits into the overall limits | ||
| 1015 | * for the table. | ||
| 1016 | */ | ||
| 1017 | if (blk_stack_limits(limits, &ti_limits, 0) < 0) | ||
| 1018 | DMWARN("%s: target device " | ||
| 1019 | "(start sect %llu len %llu) " | ||
| 1020 | "is misaligned", | ||
| 1021 | dm_device_name(table->md), | ||
| 1022 | (unsigned long long) ti->begin, | ||
| 1023 | (unsigned long long) ti->len); | ||
| 1024 | } | ||
| 1025 | |||
| 1026 | return validate_hardware_logical_block_alignment(table, limits); | ||
| 1027 | } | ||
| 1028 | |||
| 1029 | /* | ||
| 870 | * Set the integrity profile for this device if all devices used have | 1030 | * Set the integrity profile for this device if all devices used have |
| 871 | * matching profiles. | 1031 | * matching profiles. |
| 872 | */ | 1032 | */ |
| @@ -905,27 +1065,42 @@ no_integrity: | |||
| 905 | return; | 1065 | return; |
| 906 | } | 1066 | } |
| 907 | 1067 | ||
| 908 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) | 1068 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, |
| 1069 | struct queue_limits *limits) | ||
| 909 | { | 1070 | { |
| 910 | /* | 1071 | /* |
| 911 | * Make sure we obey the optimistic sub devices | 1072 | * Each target device in the table has a data area that should normally |
| 912 | * restrictions. | 1073 | * be aligned such that the DM device's alignment_offset is 0. |
| 1074 | * FIXME: Propagate alignment_offsets up the stack and warn of | ||
| 1075 | * sub-optimal or inconsistent settings. | ||
| 1076 | */ | ||
| 1077 | limits->alignment_offset = 0; | ||
| 1078 | limits->misaligned = 0; | ||
| 1079 | |||
| 1080 | /* | ||
| 1081 | * Copy table's limits to the DM device's request_queue | ||
| 913 | */ | 1082 | */ |
| 914 | blk_queue_max_sectors(q, t->limits.max_sectors); | 1083 | q->limits = *limits; |
| 915 | q->max_phys_segments = t->limits.max_phys_segments; | 1084 | |
| 916 | q->max_hw_segments = t->limits.max_hw_segments; | 1085 | if (limits->no_cluster) |
| 917 | q->hardsect_size = t->limits.hardsect_size; | ||
| 918 | q->max_segment_size = t->limits.max_segment_size; | ||
| 919 | q->max_hw_sectors = t->limits.max_hw_sectors; | ||
| 920 | q->seg_boundary_mask = t->limits.seg_boundary_mask; | ||
| 921 | q->bounce_pfn = t->limits.bounce_pfn; | ||
| 922 | |||
| 923 | if (t->limits.no_cluster) | ||
| 924 | queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); | 1086 | queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); |
| 925 | else | 1087 | else |
| 926 | queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); | 1088 | queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); |
| 927 | 1089 | ||
| 928 | dm_table_set_integrity(t); | 1090 | dm_table_set_integrity(t); |
| 1091 | |||
| 1092 | /* | ||
| 1093 | * QUEUE_FLAG_STACKABLE must be set after all queue settings are | ||
| 1094 | * visible to other CPUs because, once the flag is set, incoming bios | ||
| 1095 | * are processed by request-based dm, which refers to the queue | ||
| 1096 | * settings. | ||
| 1097 | * Until the flag set, bios are passed to bio-based dm and queued to | ||
| 1098 | * md->deferred where queue settings are not needed yet. | ||
| 1099 | * Those bios are passed to request-based dm at the resume time. | ||
| 1100 | */ | ||
| 1101 | smp_mb(); | ||
| 1102 | if (dm_table_request_based(t)) | ||
| 1103 | queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q); | ||
| 929 | } | 1104 | } |
| 930 | 1105 | ||
| 931 | unsigned int dm_table_get_num_targets(struct dm_table *t) | 1106 | unsigned int dm_table_get_num_targets(struct dm_table *t) |
| @@ -1021,6 +1196,20 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) | |||
| 1021 | return r; | 1196 | return r; |
| 1022 | } | 1197 | } |
| 1023 | 1198 | ||
| 1199 | int dm_table_any_busy_target(struct dm_table *t) | ||
| 1200 | { | ||
| 1201 | unsigned i; | ||
| 1202 | struct dm_target *ti; | ||
| 1203 | |||
| 1204 | for (i = 0; i < t->num_targets; i++) { | ||
| 1205 | ti = t->targets + i; | ||
| 1206 | if (ti->type->busy && ti->type->busy(ti)) | ||
| 1207 | return 1; | ||
| 1208 | } | ||
| 1209 | |||
| 1210 | return 0; | ||
| 1211 | } | ||
| 1212 | |||
| 1024 | void dm_table_unplug_all(struct dm_table *t) | 1213 | void dm_table_unplug_all(struct dm_table *t) |
| 1025 | { | 1214 | { |
| 1026 | struct dm_dev_internal *dd; | 1215 | struct dm_dev_internal *dd; |
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 424f7b048c30..3c6d4ee8921d 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
| @@ -19,11 +19,18 @@ | |||
| 19 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
| 20 | #include <linux/idr.h> | 20 | #include <linux/idr.h> |
| 21 | #include <linux/hdreg.h> | 21 | #include <linux/hdreg.h> |
| 22 | #include <linux/blktrace_api.h> | 22 | |
| 23 | #include <trace/block.h> | 23 | #include <trace/events/block.h> |
| 24 | 24 | ||
| 25 | #define DM_MSG_PREFIX "core" | 25 | #define DM_MSG_PREFIX "core" |
| 26 | 26 | ||
| 27 | /* | ||
| 28 | * Cookies are numeric values sent with CHANGE and REMOVE | ||
| 29 | * uevents while resuming, removing or renaming the device. | ||
| 30 | */ | ||
| 31 | #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" | ||
| 32 | #define DM_COOKIE_LENGTH 24 | ||
| 33 | |||
| 27 | static const char *_name = DM_NAME; | 34 | static const char *_name = DM_NAME; |
| 28 | 35 | ||
| 29 | static unsigned int major = 0; | 36 | static unsigned int major = 0; |
| @@ -53,8 +60,6 @@ struct dm_target_io { | |||
| 53 | union map_info info; | 60 | union map_info info; |
| 54 | }; | 61 | }; |
| 55 | 62 | ||
| 56 | DEFINE_TRACE(block_bio_complete); | ||
| 57 | |||
| 58 | /* | 63 | /* |
| 59 | * For request-based dm. | 64 | * For request-based dm. |
| 60 | * One of these is allocated per request. | 65 | * One of these is allocated per request. |
| @@ -73,7 +78,7 @@ struct dm_rq_target_io { | |||
| 73 | */ | 78 | */ |
| 74 | struct dm_rq_clone_bio_info { | 79 | struct dm_rq_clone_bio_info { |
| 75 | struct bio *orig; | 80 | struct bio *orig; |
| 76 | struct request *rq; | 81 | struct dm_rq_target_io *tio; |
| 77 | }; | 82 | }; |
| 78 | 83 | ||
| 79 | union map_info *dm_get_mapinfo(struct bio *bio) | 84 | union map_info *dm_get_mapinfo(struct bio *bio) |
| @@ -83,6 +88,14 @@ union map_info *dm_get_mapinfo(struct bio *bio) | |||
| 83 | return NULL; | 88 | return NULL; |
| 84 | } | 89 | } |
| 85 | 90 | ||
| 91 | union map_info *dm_get_rq_mapinfo(struct request *rq) | ||
| 92 | { | ||
| 93 | if (rq && rq->end_io_data) | ||
| 94 | return &((struct dm_rq_target_io *)rq->end_io_data)->info; | ||
| 95 | return NULL; | ||
| 96 | } | ||
| 97 | EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); | ||
| 98 | |||
| 86 | #define MINOR_ALLOCED ((void *)-1) | 99 | #define MINOR_ALLOCED ((void *)-1) |
| 87 | 100 | ||
| 88 | /* | 101 | /* |
| @@ -159,13 +172,31 @@ struct mapped_device { | |||
| 159 | * freeze/thaw support require holding onto a super block | 172 | * freeze/thaw support require holding onto a super block |
| 160 | */ | 173 | */ |
| 161 | struct super_block *frozen_sb; | 174 | struct super_block *frozen_sb; |
| 162 | struct block_device *suspended_bdev; | 175 | struct block_device *bdev; |
| 163 | 176 | ||
| 164 | /* forced geometry settings */ | 177 | /* forced geometry settings */ |
| 165 | struct hd_geometry geometry; | 178 | struct hd_geometry geometry; |
| 166 | 179 | ||
| 180 | /* marker of flush suspend for request-based dm */ | ||
| 181 | struct request suspend_rq; | ||
| 182 | |||
| 183 | /* For saving the address of __make_request for request based dm */ | ||
| 184 | make_request_fn *saved_make_request_fn; | ||
| 185 | |||
| 167 | /* sysfs handle */ | 186 | /* sysfs handle */ |
| 168 | struct kobject kobj; | 187 | struct kobject kobj; |
| 188 | |||
| 189 | /* zero-length barrier that will be cloned and submitted to targets */ | ||
| 190 | struct bio barrier_bio; | ||
| 191 | }; | ||
| 192 | |||
| 193 | /* | ||
| 194 | * For mempools pre-allocation at the table loading time. | ||
| 195 | */ | ||
| 196 | struct dm_md_mempools { | ||
| 197 | mempool_t *io_pool; | ||
| 198 | mempool_t *tio_pool; | ||
| 199 | struct bio_set *bs; | ||
| 169 | }; | 200 | }; |
| 170 | 201 | ||
| 171 | #define MIN_IOS 256 | 202 | #define MIN_IOS 256 |
| @@ -393,14 +424,29 @@ static void free_io(struct mapped_device *md, struct dm_io *io) | |||
| 393 | mempool_free(io, md->io_pool); | 424 | mempool_free(io, md->io_pool); |
| 394 | } | 425 | } |
| 395 | 426 | ||
| 396 | static struct dm_target_io *alloc_tio(struct mapped_device *md) | 427 | static void free_tio(struct mapped_device *md, struct dm_target_io *tio) |
| 428 | { | ||
| 429 | mempool_free(tio, md->tio_pool); | ||
| 430 | } | ||
| 431 | |||
| 432 | static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md) | ||
| 397 | { | 433 | { |
| 398 | return mempool_alloc(md->tio_pool, GFP_NOIO); | 434 | return mempool_alloc(md->tio_pool, GFP_ATOMIC); |
| 399 | } | 435 | } |
| 400 | 436 | ||
| 401 | static void free_tio(struct mapped_device *md, struct dm_target_io *tio) | 437 | static void free_rq_tio(struct dm_rq_target_io *tio) |
| 402 | { | 438 | { |
| 403 | mempool_free(tio, md->tio_pool); | 439 | mempool_free(tio, tio->md->tio_pool); |
| 440 | } | ||
| 441 | |||
| 442 | static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md) | ||
| 443 | { | ||
| 444 | return mempool_alloc(md->io_pool, GFP_ATOMIC); | ||
| 445 | } | ||
| 446 | |||
| 447 | static void free_bio_info(struct dm_rq_clone_bio_info *info) | ||
| 448 | { | ||
| 449 | mempool_free(info, info->tio->md->io_pool); | ||
| 404 | } | 450 | } |
| 405 | 451 | ||
| 406 | static void start_io_acct(struct dm_io *io) | 452 | static void start_io_acct(struct dm_io *io) |
| @@ -466,12 +512,13 @@ static void queue_io(struct mapped_device *md, struct bio *bio) | |||
| 466 | struct dm_table *dm_get_table(struct mapped_device *md) | 512 | struct dm_table *dm_get_table(struct mapped_device *md) |
| 467 | { | 513 | { |
| 468 | struct dm_table *t; | 514 | struct dm_table *t; |
| 515 | unsigned long flags; | ||
| 469 | 516 | ||
| 470 | read_lock(&md->map_lock); | 517 | read_lock_irqsave(&md->map_lock, flags); |
| 471 | t = md->map; | 518 | t = md->map; |
| 472 | if (t) | 519 | if (t) |
| 473 | dm_table_get(t); | 520 | dm_table_get(t); |
| 474 | read_unlock(&md->map_lock); | 521 | read_unlock_irqrestore(&md->map_lock, flags); |
| 475 | 522 | ||
| 476 | return t; | 523 | return t; |
| 477 | } | 524 | } |
| @@ -538,9 +585,11 @@ static void dec_pending(struct dm_io *io, int error) | |||
| 538 | * Target requested pushing back the I/O. | 585 | * Target requested pushing back the I/O. |
| 539 | */ | 586 | */ |
| 540 | spin_lock_irqsave(&md->deferred_lock, flags); | 587 | spin_lock_irqsave(&md->deferred_lock, flags); |
| 541 | if (__noflush_suspending(md)) | 588 | if (__noflush_suspending(md)) { |
| 542 | bio_list_add_head(&md->deferred, io->bio); | 589 | if (!bio_barrier(io->bio)) |
| 543 | else | 590 | bio_list_add_head(&md->deferred, |
| 591 | io->bio); | ||
| 592 | } else | ||
| 544 | /* noflush suspend was interrupted. */ | 593 | /* noflush suspend was interrupted. */ |
| 545 | io->error = -EIO; | 594 | io->error = -EIO; |
| 546 | spin_unlock_irqrestore(&md->deferred_lock, flags); | 595 | spin_unlock_irqrestore(&md->deferred_lock, flags); |
| @@ -555,7 +604,8 @@ static void dec_pending(struct dm_io *io, int error) | |||
| 555 | * a per-device variable for error reporting. | 604 | * a per-device variable for error reporting. |
| 556 | * Note that you can't touch the bio after end_io_acct | 605 | * Note that you can't touch the bio after end_io_acct |
| 557 | */ | 606 | */ |
| 558 | md->barrier_error = io_error; | 607 | if (!md->barrier_error && io_error != -EOPNOTSUPP) |
| 608 | md->barrier_error = io_error; | ||
| 559 | end_io_acct(io); | 609 | end_io_acct(io); |
| 560 | } else { | 610 | } else { |
| 561 | end_io_acct(io); | 611 | end_io_acct(io); |
| @@ -609,6 +659,262 @@ static void clone_endio(struct bio *bio, int error) | |||
| 609 | dec_pending(io, error); | 659 | dec_pending(io, error); |
| 610 | } | 660 | } |
| 611 | 661 | ||
| 662 | /* | ||
| 663 | * Partial completion handling for request-based dm | ||
| 664 | */ | ||
| 665 | static void end_clone_bio(struct bio *clone, int error) | ||
| 666 | { | ||
| 667 | struct dm_rq_clone_bio_info *info = clone->bi_private; | ||
| 668 | struct dm_rq_target_io *tio = info->tio; | ||
| 669 | struct bio *bio = info->orig; | ||
| 670 | unsigned int nr_bytes = info->orig->bi_size; | ||
| 671 | |||
| 672 | bio_put(clone); | ||
| 673 | |||
| 674 | if (tio->error) | ||
| 675 | /* | ||
| 676 | * An error has already been detected on the request. | ||
| 677 | * Once error occurred, just let clone->end_io() handle | ||
| 678 | * the remainder. | ||
| 679 | */ | ||
| 680 | return; | ||
| 681 | else if (error) { | ||
| 682 | /* | ||
| 683 | * Don't notice the error to the upper layer yet. | ||
| 684 | * The error handling decision is made by the target driver, | ||
| 685 | * when the request is completed. | ||
| 686 | */ | ||
| 687 | tio->error = error; | ||
| 688 | return; | ||
| 689 | } | ||
| 690 | |||
| 691 | /* | ||
| 692 | * I/O for the bio successfully completed. | ||
| 693 | * Notice the data completion to the upper layer. | ||
| 694 | */ | ||
| 695 | |||
| 696 | /* | ||
| 697 | * bios are processed from the head of the list. | ||
| 698 | * So the completing bio should always be rq->bio. | ||
| 699 | * If it's not, something wrong is happening. | ||
| 700 | */ | ||
| 701 | if (tio->orig->bio != bio) | ||
| 702 | DMERR("bio completion is going in the middle of the request"); | ||
| 703 | |||
| 704 | /* | ||
| 705 | * Update the original request. | ||
| 706 | * Do not use blk_end_request() here, because it may complete | ||
| 707 | * the original request before the clone, and break the ordering. | ||
| 708 | */ | ||
| 709 | blk_update_request(tio->orig, 0, nr_bytes); | ||
| 710 | } | ||
| 711 | |||
| 712 | /* | ||
| 713 | * Don't touch any member of the md after calling this function because | ||
| 714 | * the md may be freed in dm_put() at the end of this function. | ||
| 715 | * Or do dm_get() before calling this function and dm_put() later. | ||
| 716 | */ | ||
| 717 | static void rq_completed(struct mapped_device *md, int run_queue) | ||
| 718 | { | ||
| 719 | int wakeup_waiters = 0; | ||
| 720 | struct request_queue *q = md->queue; | ||
| 721 | unsigned long flags; | ||
| 722 | |||
| 723 | spin_lock_irqsave(q->queue_lock, flags); | ||
| 724 | if (!queue_in_flight(q)) | ||
| 725 | wakeup_waiters = 1; | ||
| 726 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 727 | |||
| 728 | /* nudge anyone waiting on suspend queue */ | ||
| 729 | if (wakeup_waiters) | ||
| 730 | wake_up(&md->wait); | ||
| 731 | |||
| 732 | if (run_queue) | ||
| 733 | blk_run_queue(q); | ||
| 734 | |||
| 735 | /* | ||
| 736 | * dm_put() must be at the end of this function. See the comment above | ||
| 737 | */ | ||
| 738 | dm_put(md); | ||
| 739 | } | ||
| 740 | |||
| 741 | static void dm_unprep_request(struct request *rq) | ||
| 742 | { | ||
| 743 | struct request *clone = rq->special; | ||
| 744 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
| 745 | |||
| 746 | rq->special = NULL; | ||
| 747 | rq->cmd_flags &= ~REQ_DONTPREP; | ||
| 748 | |||
| 749 | blk_rq_unprep_clone(clone); | ||
| 750 | free_rq_tio(tio); | ||
| 751 | } | ||
| 752 | |||
| 753 | /* | ||
| 754 | * Requeue the original request of a clone. | ||
| 755 | */ | ||
| 756 | void dm_requeue_unmapped_request(struct request *clone) | ||
| 757 | { | ||
| 758 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
| 759 | struct mapped_device *md = tio->md; | ||
| 760 | struct request *rq = tio->orig; | ||
| 761 | struct request_queue *q = rq->q; | ||
| 762 | unsigned long flags; | ||
| 763 | |||
| 764 | dm_unprep_request(rq); | ||
| 765 | |||
| 766 | spin_lock_irqsave(q->queue_lock, flags); | ||
| 767 | if (elv_queue_empty(q)) | ||
| 768 | blk_plug_device(q); | ||
| 769 | blk_requeue_request(q, rq); | ||
| 770 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 771 | |||
| 772 | rq_completed(md, 0); | ||
| 773 | } | ||
| 774 | EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); | ||
| 775 | |||
| 776 | static void __stop_queue(struct request_queue *q) | ||
| 777 | { | ||
| 778 | blk_stop_queue(q); | ||
| 779 | } | ||
| 780 | |||
| 781 | static void stop_queue(struct request_queue *q) | ||
| 782 | { | ||
| 783 | unsigned long flags; | ||
| 784 | |||
| 785 | spin_lock_irqsave(q->queue_lock, flags); | ||
| 786 | __stop_queue(q); | ||
| 787 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 788 | } | ||
| 789 | |||
| 790 | static void __start_queue(struct request_queue *q) | ||
| 791 | { | ||
| 792 | if (blk_queue_stopped(q)) | ||
| 793 | blk_start_queue(q); | ||
| 794 | } | ||
| 795 | |||
| 796 | static void start_queue(struct request_queue *q) | ||
| 797 | { | ||
| 798 | unsigned long flags; | ||
| 799 | |||
| 800 | spin_lock_irqsave(q->queue_lock, flags); | ||
| 801 | __start_queue(q); | ||
| 802 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 803 | } | ||
| 804 | |||
| 805 | /* | ||
| 806 | * Complete the clone and the original request. | ||
| 807 | * Must be called without queue lock. | ||
| 808 | */ | ||
| 809 | static void dm_end_request(struct request *clone, int error) | ||
| 810 | { | ||
| 811 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
| 812 | struct mapped_device *md = tio->md; | ||
| 813 | struct request *rq = tio->orig; | ||
| 814 | |||
| 815 | if (blk_pc_request(rq)) { | ||
| 816 | rq->errors = clone->errors; | ||
| 817 | rq->resid_len = clone->resid_len; | ||
| 818 | |||
| 819 | if (rq->sense) | ||
| 820 | /* | ||
| 821 | * We are using the sense buffer of the original | ||
| 822 | * request. | ||
| 823 | * So setting the length of the sense data is enough. | ||
| 824 | */ | ||
| 825 | rq->sense_len = clone->sense_len; | ||
| 826 | } | ||
| 827 | |||
| 828 | BUG_ON(clone->bio); | ||
| 829 | free_rq_tio(tio); | ||
| 830 | |||
| 831 | blk_end_request_all(rq, error); | ||
| 832 | |||
| 833 | rq_completed(md, 1); | ||
| 834 | } | ||
| 835 | |||
| 836 | /* | ||
| 837 | * Request completion handler for request-based dm | ||
| 838 | */ | ||
| 839 | static void dm_softirq_done(struct request *rq) | ||
| 840 | { | ||
| 841 | struct request *clone = rq->completion_data; | ||
| 842 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
| 843 | dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; | ||
| 844 | int error = tio->error; | ||
| 845 | |||
| 846 | if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io) | ||
| 847 | error = rq_end_io(tio->ti, clone, error, &tio->info); | ||
| 848 | |||
| 849 | if (error <= 0) | ||
| 850 | /* The target wants to complete the I/O */ | ||
| 851 | dm_end_request(clone, error); | ||
| 852 | else if (error == DM_ENDIO_INCOMPLETE) | ||
| 853 | /* The target will handle the I/O */ | ||
| 854 | return; | ||
| 855 | else if (error == DM_ENDIO_REQUEUE) | ||
| 856 | /* The target wants to requeue the I/O */ | ||
| 857 | dm_requeue_unmapped_request(clone); | ||
| 858 | else { | ||
| 859 | DMWARN("unimplemented target endio return value: %d", error); | ||
| 860 | BUG(); | ||
| 861 | } | ||
| 862 | } | ||
| 863 | |||
| 864 | /* | ||
| 865 | * Complete the clone and the original request with the error status | ||
| 866 | * through softirq context. | ||
| 867 | */ | ||
| 868 | static void dm_complete_request(struct request *clone, int error) | ||
| 869 | { | ||
| 870 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
| 871 | struct request *rq = tio->orig; | ||
| 872 | |||
| 873 | tio->error = error; | ||
| 874 | rq->completion_data = clone; | ||
| 875 | blk_complete_request(rq); | ||
| 876 | } | ||
| 877 | |||
| 878 | /* | ||
| 879 | * Complete the not-mapped clone and the original request with the error status | ||
| 880 | * through softirq context. | ||
| 881 | * Target's rq_end_io() function isn't called. | ||
| 882 | * This may be used when the target's map_rq() function fails. | ||
| 883 | */ | ||
| 884 | void dm_kill_unmapped_request(struct request *clone, int error) | ||
| 885 | { | ||
| 886 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
| 887 | struct request *rq = tio->orig; | ||
| 888 | |||
| 889 | rq->cmd_flags |= REQ_FAILED; | ||
| 890 | dm_complete_request(clone, error); | ||
| 891 | } | ||
| 892 | EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); | ||
| 893 | |||
| 894 | /* | ||
| 895 | * Called with the queue lock held | ||
| 896 | */ | ||
| 897 | static void end_clone_request(struct request *clone, int error) | ||
| 898 | { | ||
| 899 | /* | ||
| 900 | * For just cleaning up the information of the queue in which | ||
| 901 | * the clone was dispatched. | ||
| 902 | * The clone is *NOT* freed actually here because it is alloced from | ||
| 903 | * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. | ||
| 904 | */ | ||
| 905 | __blk_put_request(clone->q, clone); | ||
| 906 | |||
| 907 | /* | ||
| 908 | * Actual request completion is done in a softirq context which doesn't | ||
| 909 | * hold the queue lock. Otherwise, deadlock could occur because: | ||
| 910 | * - another request may be submitted by the upper level driver | ||
| 911 | * of the stacking during the completion | ||
| 912 | * - the submission which requires queue lock may be done | ||
| 913 | * against this queue | ||
| 914 | */ | ||
| 915 | dm_complete_request(clone, error); | ||
| 916 | } | ||
| 917 | |||
| 612 | static sector_t max_io_len(struct mapped_device *md, | 918 | static sector_t max_io_len(struct mapped_device *md, |
| 613 | sector_t sector, struct dm_target *ti) | 919 | sector_t sector, struct dm_target *ti) |
| 614 | { | 920 | { |
| @@ -636,11 +942,6 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, | |||
| 636 | sector_t sector; | 942 | sector_t sector; |
| 637 | struct mapped_device *md; | 943 | struct mapped_device *md; |
| 638 | 944 | ||
| 639 | /* | ||
| 640 | * Sanity checks. | ||
| 641 | */ | ||
| 642 | BUG_ON(!clone->bi_size); | ||
| 643 | |||
| 644 | clone->bi_end_io = clone_endio; | 945 | clone->bi_end_io = clone_endio; |
| 645 | clone->bi_private = tio; | 946 | clone->bi_private = tio; |
| 646 | 947 | ||
| @@ -656,8 +957,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, | |||
| 656 | /* the bio has been remapped so dispatch it */ | 957 | /* the bio has been remapped so dispatch it */ |
| 657 | 958 | ||
| 658 | trace_block_remap(bdev_get_queue(clone->bi_bdev), clone, | 959 | trace_block_remap(bdev_get_queue(clone->bi_bdev), clone, |
| 659 | tio->io->bio->bi_bdev->bd_dev, | 960 | tio->io->bio->bi_bdev->bd_dev, sector); |
| 660 | clone->bi_sector, sector); | ||
| 661 | 961 | ||
| 662 | generic_make_request(clone); | 962 | generic_make_request(clone); |
| 663 | } else if (r < 0 || r == DM_MAPIO_REQUEUE) { | 963 | } else if (r < 0 || r == DM_MAPIO_REQUEUE) { |
| @@ -755,6 +1055,48 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector, | |||
| 755 | return clone; | 1055 | return clone; |
| 756 | } | 1056 | } |
| 757 | 1057 | ||
| 1058 | static struct dm_target_io *alloc_tio(struct clone_info *ci, | ||
| 1059 | struct dm_target *ti) | ||
| 1060 | { | ||
| 1061 | struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); | ||
| 1062 | |||
| 1063 | tio->io = ci->io; | ||
| 1064 | tio->ti = ti; | ||
| 1065 | memset(&tio->info, 0, sizeof(tio->info)); | ||
| 1066 | |||
| 1067 | return tio; | ||
| 1068 | } | ||
| 1069 | |||
| 1070 | static void __flush_target(struct clone_info *ci, struct dm_target *ti, | ||
| 1071 | unsigned flush_nr) | ||
| 1072 | { | ||
| 1073 | struct dm_target_io *tio = alloc_tio(ci, ti); | ||
| 1074 | struct bio *clone; | ||
| 1075 | |||
| 1076 | tio->info.flush_request = flush_nr; | ||
| 1077 | |||
| 1078 | clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); | ||
| 1079 | __bio_clone(clone, ci->bio); | ||
| 1080 | clone->bi_destructor = dm_bio_destructor; | ||
| 1081 | |||
| 1082 | __map_bio(ti, clone, tio); | ||
| 1083 | } | ||
| 1084 | |||
| 1085 | static int __clone_and_map_empty_barrier(struct clone_info *ci) | ||
| 1086 | { | ||
| 1087 | unsigned target_nr = 0, flush_nr; | ||
| 1088 | struct dm_target *ti; | ||
| 1089 | |||
| 1090 | while ((ti = dm_table_get_target(ci->map, target_nr++))) | ||
| 1091 | for (flush_nr = 0; flush_nr < ti->num_flush_requests; | ||
| 1092 | flush_nr++) | ||
| 1093 | __flush_target(ci, ti, flush_nr); | ||
| 1094 | |||
| 1095 | ci->sector_count = 0; | ||
| 1096 | |||
| 1097 | return 0; | ||
| 1098 | } | ||
| 1099 | |||
| 758 | static int __clone_and_map(struct clone_info *ci) | 1100 | static int __clone_and_map(struct clone_info *ci) |
| 759 | { | 1101 | { |
| 760 | struct bio *clone, *bio = ci->bio; | 1102 | struct bio *clone, *bio = ci->bio; |
| @@ -762,6 +1104,9 @@ static int __clone_and_map(struct clone_info *ci) | |||
| 762 | sector_t len = 0, max; | 1104 | sector_t len = 0, max; |
| 763 | struct dm_target_io *tio; | 1105 | struct dm_target_io *tio; |
| 764 | 1106 | ||
| 1107 | if (unlikely(bio_empty_barrier(bio))) | ||
| 1108 | return __clone_and_map_empty_barrier(ci); | ||
| 1109 | |||
| 765 | ti = dm_table_find_target(ci->map, ci->sector); | 1110 | ti = dm_table_find_target(ci->map, ci->sector); |
| 766 | if (!dm_target_is_valid(ti)) | 1111 | if (!dm_target_is_valid(ti)) |
| 767 | return -EIO; | 1112 | return -EIO; |
| @@ -771,10 +1116,7 @@ static int __clone_and_map(struct clone_info *ci) | |||
| 771 | /* | 1116 | /* |
| 772 | * Allocate a target io object. | 1117 | * Allocate a target io object. |
| 773 | */ | 1118 | */ |
| 774 | tio = alloc_tio(ci->md); | 1119 | tio = alloc_tio(ci, ti); |
| 775 | tio->io = ci->io; | ||
| 776 | tio->ti = ti; | ||
| 777 | memset(&tio->info, 0, sizeof(tio->info)); | ||
| 778 | 1120 | ||
| 779 | if (ci->sector_count <= max) { | 1121 | if (ci->sector_count <= max) { |
| 780 | /* | 1122 | /* |
| @@ -830,10 +1172,7 @@ static int __clone_and_map(struct clone_info *ci) | |||
| 830 | 1172 | ||
| 831 | max = max_io_len(ci->md, ci->sector, ti); | 1173 | max = max_io_len(ci->md, ci->sector, ti); |
| 832 | 1174 | ||
| 833 | tio = alloc_tio(ci->md); | 1175 | tio = alloc_tio(ci, ti); |
| 834 | tio->io = ci->io; | ||
| 835 | tio->ti = ti; | ||
| 836 | memset(&tio->info, 0, sizeof(tio->info)); | ||
| 837 | } | 1176 | } |
| 838 | 1177 | ||
| 839 | len = min(remaining, max); | 1178 | len = min(remaining, max); |
| @@ -868,7 +1207,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) | |||
| 868 | if (!bio_barrier(bio)) | 1207 | if (!bio_barrier(bio)) |
| 869 | bio_io_error(bio); | 1208 | bio_io_error(bio); |
| 870 | else | 1209 | else |
| 871 | md->barrier_error = -EIO; | 1210 | if (!md->barrier_error) |
| 1211 | md->barrier_error = -EIO; | ||
| 872 | return; | 1212 | return; |
| 873 | } | 1213 | } |
| 874 | 1214 | ||
| @@ -881,6 +1221,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) | |||
| 881 | ci.io->md = md; | 1221 | ci.io->md = md; |
| 882 | ci.sector = bio->bi_sector; | 1222 | ci.sector = bio->bi_sector; |
| 883 | ci.sector_count = bio_sectors(bio); | 1223 | ci.sector_count = bio_sectors(bio); |
| 1224 | if (unlikely(bio_empty_barrier(bio))) | ||
| 1225 | ci.sector_count = 1; | ||
| 884 | ci.idx = bio->bi_idx; | 1226 | ci.idx = bio->bi_idx; |
| 885 | 1227 | ||
| 886 | start_io_acct(ci.io); | 1228 | start_io_acct(ci.io); |
| @@ -928,6 +1270,16 @@ static int dm_merge_bvec(struct request_queue *q, | |||
| 928 | */ | 1270 | */ |
| 929 | if (max_size && ti->type->merge) | 1271 | if (max_size && ti->type->merge) |
| 930 | max_size = ti->type->merge(ti, bvm, biovec, max_size); | 1272 | max_size = ti->type->merge(ti, bvm, biovec, max_size); |
| 1273 | /* | ||
| 1274 | * If the target doesn't support merge method and some of the devices | ||
| 1275 | * provided their merge_bvec method (we know this by looking at | ||
| 1276 | * queue_max_hw_sectors), then we can't allow bios with multiple vector | ||
| 1277 | * entries. So always set max_size to 0, and the code below allows | ||
| 1278 | * just one page. | ||
| 1279 | */ | ||
| 1280 | else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) | ||
| 1281 | |||
| 1282 | max_size = 0; | ||
| 931 | 1283 | ||
| 932 | out_table: | 1284 | out_table: |
| 933 | dm_table_put(map); | 1285 | dm_table_put(map); |
| @@ -946,7 +1298,7 @@ out: | |||
| 946 | * The request function that just remaps the bio built up by | 1298 | * The request function that just remaps the bio built up by |
| 947 | * dm_merge_bvec. | 1299 | * dm_merge_bvec. |
| 948 | */ | 1300 | */ |
| 949 | static int dm_request(struct request_queue *q, struct bio *bio) | 1301 | static int _dm_request(struct request_queue *q, struct bio *bio) |
| 950 | { | 1302 | { |
| 951 | int rw = bio_data_dir(bio); | 1303 | int rw = bio_data_dir(bio); |
| 952 | struct mapped_device *md = q->queuedata; | 1304 | struct mapped_device *md = q->queuedata; |
| @@ -983,12 +1335,274 @@ static int dm_request(struct request_queue *q, struct bio *bio) | |||
| 983 | return 0; | 1335 | return 0; |
| 984 | } | 1336 | } |
| 985 | 1337 | ||
| 1338 | static int dm_make_request(struct request_queue *q, struct bio *bio) | ||
| 1339 | { | ||
| 1340 | struct mapped_device *md = q->queuedata; | ||
| 1341 | |||
| 1342 | if (unlikely(bio_barrier(bio))) { | ||
| 1343 | bio_endio(bio, -EOPNOTSUPP); | ||
| 1344 | return 0; | ||
| 1345 | } | ||
| 1346 | |||
| 1347 | return md->saved_make_request_fn(q, bio); /* call __make_request() */ | ||
| 1348 | } | ||
| 1349 | |||
| 1350 | static int dm_request_based(struct mapped_device *md) | ||
| 1351 | { | ||
| 1352 | return blk_queue_stackable(md->queue); | ||
| 1353 | } | ||
| 1354 | |||
| 1355 | static int dm_request(struct request_queue *q, struct bio *bio) | ||
| 1356 | { | ||
| 1357 | struct mapped_device *md = q->queuedata; | ||
| 1358 | |||
| 1359 | if (dm_request_based(md)) | ||
| 1360 | return dm_make_request(q, bio); | ||
| 1361 | |||
| 1362 | return _dm_request(q, bio); | ||
| 1363 | } | ||
| 1364 | |||
| 1365 | void dm_dispatch_request(struct request *rq) | ||
| 1366 | { | ||
| 1367 | int r; | ||
| 1368 | |||
| 1369 | if (blk_queue_io_stat(rq->q)) | ||
| 1370 | rq->cmd_flags |= REQ_IO_STAT; | ||
| 1371 | |||
| 1372 | rq->start_time = jiffies; | ||
| 1373 | r = blk_insert_cloned_request(rq->q, rq); | ||
| 1374 | if (r) | ||
| 1375 | dm_complete_request(rq, r); | ||
| 1376 | } | ||
| 1377 | EXPORT_SYMBOL_GPL(dm_dispatch_request); | ||
| 1378 | |||
| 1379 | static void dm_rq_bio_destructor(struct bio *bio) | ||
| 1380 | { | ||
| 1381 | struct dm_rq_clone_bio_info *info = bio->bi_private; | ||
| 1382 | struct mapped_device *md = info->tio->md; | ||
| 1383 | |||
| 1384 | free_bio_info(info); | ||
| 1385 | bio_free(bio, md->bs); | ||
| 1386 | } | ||
| 1387 | |||
| 1388 | static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, | ||
| 1389 | void *data) | ||
| 1390 | { | ||
| 1391 | struct dm_rq_target_io *tio = data; | ||
| 1392 | struct mapped_device *md = tio->md; | ||
| 1393 | struct dm_rq_clone_bio_info *info = alloc_bio_info(md); | ||
| 1394 | |||
| 1395 | if (!info) | ||
| 1396 | return -ENOMEM; | ||
| 1397 | |||
| 1398 | info->orig = bio_orig; | ||
| 1399 | info->tio = tio; | ||
| 1400 | bio->bi_end_io = end_clone_bio; | ||
| 1401 | bio->bi_private = info; | ||
| 1402 | bio->bi_destructor = dm_rq_bio_destructor; | ||
| 1403 | |||
| 1404 | return 0; | ||
| 1405 | } | ||
| 1406 | |||
| 1407 | static int setup_clone(struct request *clone, struct request *rq, | ||
| 1408 | struct dm_rq_target_io *tio) | ||
| 1409 | { | ||
| 1410 | int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, | ||
| 1411 | dm_rq_bio_constructor, tio); | ||
| 1412 | |||
| 1413 | if (r) | ||
| 1414 | return r; | ||
| 1415 | |||
| 1416 | clone->cmd = rq->cmd; | ||
| 1417 | clone->cmd_len = rq->cmd_len; | ||
| 1418 | clone->sense = rq->sense; | ||
| 1419 | clone->buffer = rq->buffer; | ||
| 1420 | clone->end_io = end_clone_request; | ||
| 1421 | clone->end_io_data = tio; | ||
| 1422 | |||
| 1423 | return 0; | ||
| 1424 | } | ||
| 1425 | |||
| 1426 | static int dm_rq_flush_suspending(struct mapped_device *md) | ||
| 1427 | { | ||
| 1428 | return !md->suspend_rq.special; | ||
| 1429 | } | ||
| 1430 | |||
| 1431 | /* | ||
| 1432 | * Called with the queue lock held. | ||
| 1433 | */ | ||
| 1434 | static int dm_prep_fn(struct request_queue *q, struct request *rq) | ||
| 1435 | { | ||
| 1436 | struct mapped_device *md = q->queuedata; | ||
| 1437 | struct dm_rq_target_io *tio; | ||
| 1438 | struct request *clone; | ||
| 1439 | |||
| 1440 | if (unlikely(rq == &md->suspend_rq)) { | ||
| 1441 | if (dm_rq_flush_suspending(md)) | ||
| 1442 | return BLKPREP_OK; | ||
| 1443 | else | ||
| 1444 | /* The flush suspend was interrupted */ | ||
| 1445 | return BLKPREP_KILL; | ||
| 1446 | } | ||
| 1447 | |||
| 1448 | if (unlikely(rq->special)) { | ||
| 1449 | DMWARN("Already has something in rq->special."); | ||
| 1450 | return BLKPREP_KILL; | ||
| 1451 | } | ||
| 1452 | |||
| 1453 | tio = alloc_rq_tio(md); /* Only one for each original request */ | ||
| 1454 | if (!tio) | ||
| 1455 | /* -ENOMEM */ | ||
| 1456 | return BLKPREP_DEFER; | ||
| 1457 | |||
| 1458 | tio->md = md; | ||
| 1459 | tio->ti = NULL; | ||
| 1460 | tio->orig = rq; | ||
| 1461 | tio->error = 0; | ||
| 1462 | memset(&tio->info, 0, sizeof(tio->info)); | ||
| 1463 | |||
| 1464 | clone = &tio->clone; | ||
| 1465 | if (setup_clone(clone, rq, tio)) { | ||
| 1466 | /* -ENOMEM */ | ||
| 1467 | free_rq_tio(tio); | ||
| 1468 | return BLKPREP_DEFER; | ||
| 1469 | } | ||
| 1470 | |||
| 1471 | rq->special = clone; | ||
| 1472 | rq->cmd_flags |= REQ_DONTPREP; | ||
| 1473 | |||
| 1474 | return BLKPREP_OK; | ||
| 1475 | } | ||
| 1476 | |||
| 1477 | static void map_request(struct dm_target *ti, struct request *rq, | ||
| 1478 | struct mapped_device *md) | ||
| 1479 | { | ||
| 1480 | int r; | ||
| 1481 | struct request *clone = rq->special; | ||
| 1482 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
| 1483 | |||
| 1484 | /* | ||
| 1485 | * Hold the md reference here for the in-flight I/O. | ||
| 1486 | * We can't rely on the reference count by device opener, | ||
| 1487 | * because the device may be closed during the request completion | ||
| 1488 | * when all bios are completed. | ||
| 1489 | * See the comment in rq_completed() too. | ||
| 1490 | */ | ||
| 1491 | dm_get(md); | ||
| 1492 | |||
| 1493 | tio->ti = ti; | ||
| 1494 | r = ti->type->map_rq(ti, clone, &tio->info); | ||
| 1495 | switch (r) { | ||
| 1496 | case DM_MAPIO_SUBMITTED: | ||
| 1497 | /* The target has taken the I/O to submit by itself later */ | ||
| 1498 | break; | ||
| 1499 | case DM_MAPIO_REMAPPED: | ||
| 1500 | /* The target has remapped the I/O so dispatch it */ | ||
| 1501 | dm_dispatch_request(clone); | ||
| 1502 | break; | ||
| 1503 | case DM_MAPIO_REQUEUE: | ||
| 1504 | /* The target wants to requeue the I/O */ | ||
| 1505 | dm_requeue_unmapped_request(clone); | ||
| 1506 | break; | ||
| 1507 | default: | ||
| 1508 | if (r > 0) { | ||
| 1509 | DMWARN("unimplemented target map return value: %d", r); | ||
| 1510 | BUG(); | ||
| 1511 | } | ||
| 1512 | |||
| 1513 | /* The target wants to complete the I/O */ | ||
| 1514 | dm_kill_unmapped_request(clone, r); | ||
| 1515 | break; | ||
| 1516 | } | ||
| 1517 | } | ||
| 1518 | |||
| 1519 | /* | ||
| 1520 | * q->request_fn for request-based dm. | ||
| 1521 | * Called with the queue lock held. | ||
| 1522 | */ | ||
| 1523 | static void dm_request_fn(struct request_queue *q) | ||
| 1524 | { | ||
| 1525 | struct mapped_device *md = q->queuedata; | ||
| 1526 | struct dm_table *map = dm_get_table(md); | ||
| 1527 | struct dm_target *ti; | ||
| 1528 | struct request *rq; | ||
| 1529 | |||
| 1530 | /* | ||
| 1531 | * For noflush suspend, check blk_queue_stopped() to immediately | ||
| 1532 | * quit I/O dispatching. | ||
| 1533 | */ | ||
| 1534 | while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { | ||
| 1535 | rq = blk_peek_request(q); | ||
| 1536 | if (!rq) | ||
| 1537 | goto plug_and_out; | ||
| 1538 | |||
| 1539 | if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend maker */ | ||
| 1540 | if (queue_in_flight(q)) | ||
| 1541 | /* Not quiet yet. Wait more */ | ||
| 1542 | goto plug_and_out; | ||
| 1543 | |||
| 1544 | /* This device should be quiet now */ | ||
| 1545 | __stop_queue(q); | ||
| 1546 | blk_start_request(rq); | ||
| 1547 | __blk_end_request_all(rq, 0); | ||
| 1548 | wake_up(&md->wait); | ||
| 1549 | goto out; | ||
| 1550 | } | ||
| 1551 | |||
| 1552 | ti = dm_table_find_target(map, blk_rq_pos(rq)); | ||
| 1553 | if (ti->type->busy && ti->type->busy(ti)) | ||
| 1554 | goto plug_and_out; | ||
| 1555 | |||
| 1556 | blk_start_request(rq); | ||
| 1557 | spin_unlock(q->queue_lock); | ||
| 1558 | map_request(ti, rq, md); | ||
| 1559 | spin_lock_irq(q->queue_lock); | ||
| 1560 | } | ||
| 1561 | |||
| 1562 | goto out; | ||
| 1563 | |||
| 1564 | plug_and_out: | ||
| 1565 | if (!elv_queue_empty(q)) | ||
| 1566 | /* Some requests still remain, retry later */ | ||
| 1567 | blk_plug_device(q); | ||
| 1568 | |||
| 1569 | out: | ||
| 1570 | dm_table_put(map); | ||
| 1571 | |||
| 1572 | return; | ||
| 1573 | } | ||
| 1574 | |||
| 1575 | int dm_underlying_device_busy(struct request_queue *q) | ||
| 1576 | { | ||
| 1577 | return blk_lld_busy(q); | ||
| 1578 | } | ||
| 1579 | EXPORT_SYMBOL_GPL(dm_underlying_device_busy); | ||
| 1580 | |||
| 1581 | static int dm_lld_busy(struct request_queue *q) | ||
| 1582 | { | ||
| 1583 | int r; | ||
| 1584 | struct mapped_device *md = q->queuedata; | ||
| 1585 | struct dm_table *map = dm_get_table(md); | ||
| 1586 | |||
| 1587 | if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) | ||
| 1588 | r = 1; | ||
| 1589 | else | ||
| 1590 | r = dm_table_any_busy_target(map); | ||
| 1591 | |||
| 1592 | dm_table_put(map); | ||
| 1593 | |||
| 1594 | return r; | ||
| 1595 | } | ||
| 1596 | |||
| 986 | static void dm_unplug_all(struct request_queue *q) | 1597 | static void dm_unplug_all(struct request_queue *q) |
| 987 | { | 1598 | { |
| 988 | struct mapped_device *md = q->queuedata; | 1599 | struct mapped_device *md = q->queuedata; |
| 989 | struct dm_table *map = dm_get_table(md); | 1600 | struct dm_table *map = dm_get_table(md); |
| 990 | 1601 | ||
| 991 | if (map) { | 1602 | if (map) { |
| 1603 | if (dm_request_based(md)) | ||
| 1604 | generic_unplug_device(q); | ||
| 1605 | |||
| 992 | dm_table_unplug_all(map); | 1606 | dm_table_unplug_all(map); |
| 993 | dm_table_put(map); | 1607 | dm_table_put(map); |
| 994 | } | 1608 | } |
| @@ -1003,7 +1617,16 @@ static int dm_any_congested(void *congested_data, int bdi_bits) | |||
| 1003 | if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { | 1617 | if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { |
| 1004 | map = dm_get_table(md); | 1618 | map = dm_get_table(md); |
| 1005 | if (map) { | 1619 | if (map) { |
| 1006 | r = dm_table_any_congested(map, bdi_bits); | 1620 | /* |
| 1621 | * Request-based dm cares about only own queue for | ||
| 1622 | * the query about congestion status of request_queue | ||
| 1623 | */ | ||
| 1624 | if (dm_request_based(md)) | ||
| 1625 | r = md->queue->backing_dev_info.state & | ||
| 1626 | bdi_bits; | ||
| 1627 | else | ||
| 1628 | r = dm_table_any_congested(map, bdi_bits); | ||
| 1629 | |||
| 1007 | dm_table_put(map); | 1630 | dm_table_put(map); |
| 1008 | } | 1631 | } |
| 1009 | } | 1632 | } |
| @@ -1126,30 +1749,32 @@ static struct mapped_device *alloc_dev(int minor) | |||
| 1126 | INIT_LIST_HEAD(&md->uevent_list); | 1749 | INIT_LIST_HEAD(&md->uevent_list); |
| 1127 | spin_lock_init(&md->uevent_lock); | 1750 | spin_lock_init(&md->uevent_lock); |
| 1128 | 1751 | ||
| 1129 | md->queue = blk_alloc_queue(GFP_KERNEL); | 1752 | md->queue = blk_init_queue(dm_request_fn, NULL); |
| 1130 | if (!md->queue) | 1753 | if (!md->queue) |
| 1131 | goto bad_queue; | 1754 | goto bad_queue; |
| 1132 | 1755 | ||
| 1756 | /* | ||
| 1757 | * Request-based dm devices cannot be stacked on top of bio-based dm | ||
| 1758 | * devices. The type of this dm device has not been decided yet, | ||
| 1759 | * although we initialized the queue using blk_init_queue(). | ||
| 1760 | * The type is decided at the first table loading time. | ||
| 1761 | * To prevent problematic device stacking, clear the queue flag | ||
| 1762 | * for request stacking support until then. | ||
| 1763 | * | ||
| 1764 | * This queue is new, so no concurrency on the queue_flags. | ||
| 1765 | */ | ||
| 1766 | queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); | ||
| 1767 | md->saved_make_request_fn = md->queue->make_request_fn; | ||
| 1133 | md->queue->queuedata = md; | 1768 | md->queue->queuedata = md; |
| 1134 | md->queue->backing_dev_info.congested_fn = dm_any_congested; | 1769 | md->queue->backing_dev_info.congested_fn = dm_any_congested; |
| 1135 | md->queue->backing_dev_info.congested_data = md; | 1770 | md->queue->backing_dev_info.congested_data = md; |
| 1136 | blk_queue_make_request(md->queue, dm_request); | 1771 | blk_queue_make_request(md->queue, dm_request); |
| 1137 | blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL); | ||
| 1138 | blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); | 1772 | blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); |
| 1139 | md->queue->unplug_fn = dm_unplug_all; | 1773 | md->queue->unplug_fn = dm_unplug_all; |
| 1140 | blk_queue_merge_bvec(md->queue, dm_merge_bvec); | 1774 | blk_queue_merge_bvec(md->queue, dm_merge_bvec); |
| 1141 | 1775 | blk_queue_softirq_done(md->queue, dm_softirq_done); | |
| 1142 | md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); | 1776 | blk_queue_prep_rq(md->queue, dm_prep_fn); |
| 1143 | if (!md->io_pool) | 1777 | blk_queue_lld_busy(md->queue, dm_lld_busy); |
| 1144 | goto bad_io_pool; | ||
| 1145 | |||
| 1146 | md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache); | ||
| 1147 | if (!md->tio_pool) | ||
| 1148 | goto bad_tio_pool; | ||
| 1149 | |||
| 1150 | md->bs = bioset_create(16, 0); | ||
| 1151 | if (!md->bs) | ||
| 1152 | goto bad_no_bioset; | ||
| 1153 | 1778 | ||
| 1154 | md->disk = alloc_disk(1); | 1779 | md->disk = alloc_disk(1); |
| 1155 | if (!md->disk) | 1780 | if (!md->disk) |
| @@ -1173,6 +1798,10 @@ static struct mapped_device *alloc_dev(int minor) | |||
| 1173 | if (!md->wq) | 1798 | if (!md->wq) |
| 1174 | goto bad_thread; | 1799 | goto bad_thread; |
| 1175 | 1800 | ||
| 1801 | md->bdev = bdget_disk(md->disk, 0); | ||
| 1802 | if (!md->bdev) | ||
| 1803 | goto bad_bdev; | ||
| 1804 | |||
| 1176 | /* Populate the mapping, nobody knows we exist yet */ | 1805 | /* Populate the mapping, nobody knows we exist yet */ |
| 1177 | spin_lock(&_minor_lock); | 1806 | spin_lock(&_minor_lock); |
| 1178 | old_md = idr_replace(&_minor_idr, md, minor); | 1807 | old_md = idr_replace(&_minor_idr, md, minor); |
| @@ -1182,15 +1811,11 @@ static struct mapped_device *alloc_dev(int minor) | |||
| 1182 | 1811 | ||
| 1183 | return md; | 1812 | return md; |
| 1184 | 1813 | ||
| 1814 | bad_bdev: | ||
| 1815 | destroy_workqueue(md->wq); | ||
| 1185 | bad_thread: | 1816 | bad_thread: |
| 1186 | put_disk(md->disk); | 1817 | put_disk(md->disk); |
| 1187 | bad_disk: | 1818 | bad_disk: |
| 1188 | bioset_free(md->bs); | ||
| 1189 | bad_no_bioset: | ||
| 1190 | mempool_destroy(md->tio_pool); | ||
| 1191 | bad_tio_pool: | ||
| 1192 | mempool_destroy(md->io_pool); | ||
| 1193 | bad_io_pool: | ||
| 1194 | blk_cleanup_queue(md->queue); | 1819 | blk_cleanup_queue(md->queue); |
| 1195 | bad_queue: | 1820 | bad_queue: |
| 1196 | free_minor(minor); | 1821 | free_minor(minor); |
| @@ -1207,14 +1832,15 @@ static void free_dev(struct mapped_device *md) | |||
| 1207 | { | 1832 | { |
| 1208 | int minor = MINOR(disk_devt(md->disk)); | 1833 | int minor = MINOR(disk_devt(md->disk)); |
| 1209 | 1834 | ||
| 1210 | if (md->suspended_bdev) { | 1835 | unlock_fs(md); |
| 1211 | unlock_fs(md); | 1836 | bdput(md->bdev); |
| 1212 | bdput(md->suspended_bdev); | ||
| 1213 | } | ||
| 1214 | destroy_workqueue(md->wq); | 1837 | destroy_workqueue(md->wq); |
| 1215 | mempool_destroy(md->tio_pool); | 1838 | if (md->tio_pool) |
| 1216 | mempool_destroy(md->io_pool); | 1839 | mempool_destroy(md->tio_pool); |
| 1217 | bioset_free(md->bs); | 1840 | if (md->io_pool) |
| 1841 | mempool_destroy(md->io_pool); | ||
| 1842 | if (md->bs) | ||
| 1843 | bioset_free(md->bs); | ||
| 1218 | blk_integrity_unregister(md->disk); | 1844 | blk_integrity_unregister(md->disk); |
| 1219 | del_gendisk(md->disk); | 1845 | del_gendisk(md->disk); |
| 1220 | free_minor(minor); | 1846 | free_minor(minor); |
| @@ -1229,6 +1855,29 @@ static void free_dev(struct mapped_device *md) | |||
| 1229 | kfree(md); | 1855 | kfree(md); |
| 1230 | } | 1856 | } |
| 1231 | 1857 | ||
| 1858 | static void __bind_mempools(struct mapped_device *md, struct dm_table *t) | ||
| 1859 | { | ||
| 1860 | struct dm_md_mempools *p; | ||
| 1861 | |||
| 1862 | if (md->io_pool && md->tio_pool && md->bs) | ||
| 1863 | /* the md already has necessary mempools */ | ||
| 1864 | goto out; | ||
| 1865 | |||
| 1866 | p = dm_table_get_md_mempools(t); | ||
| 1867 | BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); | ||
| 1868 | |||
| 1869 | md->io_pool = p->io_pool; | ||
| 1870 | p->io_pool = NULL; | ||
| 1871 | md->tio_pool = p->tio_pool; | ||
| 1872 | p->tio_pool = NULL; | ||
| 1873 | md->bs = p->bs; | ||
| 1874 | p->bs = NULL; | ||
| 1875 | |||
| 1876 | out: | ||
| 1877 | /* mempool bind completed, now no need any mempools in the table */ | ||
| 1878 | dm_table_free_md_mempools(t); | ||
| 1879 | } | ||
| 1880 | |||
| 1232 | /* | 1881 | /* |
| 1233 | * Bind a table to the device. | 1882 | * Bind a table to the device. |
| 1234 | */ | 1883 | */ |
| @@ -1252,15 +1901,17 @@ static void __set_size(struct mapped_device *md, sector_t size) | |||
| 1252 | { | 1901 | { |
| 1253 | set_capacity(md->disk, size); | 1902 | set_capacity(md->disk, size); |
| 1254 | 1903 | ||
| 1255 | mutex_lock(&md->suspended_bdev->bd_inode->i_mutex); | 1904 | mutex_lock(&md->bdev->bd_inode->i_mutex); |
| 1256 | i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); | 1905 | i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); |
| 1257 | mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex); | 1906 | mutex_unlock(&md->bdev->bd_inode->i_mutex); |
| 1258 | } | 1907 | } |
| 1259 | 1908 | ||
| 1260 | static int __bind(struct mapped_device *md, struct dm_table *t) | 1909 | static int __bind(struct mapped_device *md, struct dm_table *t, |
| 1910 | struct queue_limits *limits) | ||
| 1261 | { | 1911 | { |
| 1262 | struct request_queue *q = md->queue; | 1912 | struct request_queue *q = md->queue; |
| 1263 | sector_t size; | 1913 | sector_t size; |
| 1914 | unsigned long flags; | ||
| 1264 | 1915 | ||
| 1265 | size = dm_table_get_size(t); | 1916 | size = dm_table_get_size(t); |
| 1266 | 1917 | ||
| @@ -1270,8 +1921,7 @@ static int __bind(struct mapped_device *md, struct dm_table *t) | |||
| 1270 | if (size != get_capacity(md->disk)) | 1921 | if (size != get_capacity(md->disk)) |
| 1271 | memset(&md->geometry, 0, sizeof(md->geometry)); | 1922 | memset(&md->geometry, 0, sizeof(md->geometry)); |
| 1272 | 1923 | ||
| 1273 | if (md->suspended_bdev) | 1924 | __set_size(md, size); |
| 1274 | __set_size(md, size); | ||
| 1275 | 1925 | ||
| 1276 | if (!size) { | 1926 | if (!size) { |
| 1277 | dm_table_destroy(t); | 1927 | dm_table_destroy(t); |
| @@ -1280,10 +1930,22 @@ static int __bind(struct mapped_device *md, struct dm_table *t) | |||
| 1280 | 1930 | ||
| 1281 | dm_table_event_callback(t, event_callback, md); | 1931 | dm_table_event_callback(t, event_callback, md); |
| 1282 | 1932 | ||
| 1283 | write_lock(&md->map_lock); | 1933 | /* |
| 1934 | * The queue hasn't been stopped yet, if the old table type wasn't | ||
| 1935 | * for request-based during suspension. So stop it to prevent | ||
| 1936 | * I/O mapping before resume. | ||
| 1937 | * This must be done before setting the queue restrictions, | ||
| 1938 | * because request-based dm may be run just after the setting. | ||
| 1939 | */ | ||
| 1940 | if (dm_table_request_based(t) && !blk_queue_stopped(q)) | ||
| 1941 | stop_queue(q); | ||
| 1942 | |||
| 1943 | __bind_mempools(md, t); | ||
| 1944 | |||
| 1945 | write_lock_irqsave(&md->map_lock, flags); | ||
| 1284 | md->map = t; | 1946 | md->map = t; |
| 1285 | dm_table_set_restrictions(t, q); | 1947 | dm_table_set_restrictions(t, q, limits); |
| 1286 | write_unlock(&md->map_lock); | 1948 | write_unlock_irqrestore(&md->map_lock, flags); |
| 1287 | 1949 | ||
| 1288 | return 0; | 1950 | return 0; |
| 1289 | } | 1951 | } |
| @@ -1291,14 +1953,15 @@ static int __bind(struct mapped_device *md, struct dm_table *t) | |||
| 1291 | static void __unbind(struct mapped_device *md) | 1953 | static void __unbind(struct mapped_device *md) |
| 1292 | { | 1954 | { |
| 1293 | struct dm_table *map = md->map; | 1955 | struct dm_table *map = md->map; |
| 1956 | unsigned long flags; | ||
| 1294 | 1957 | ||
| 1295 | if (!map) | 1958 | if (!map) |
| 1296 | return; | 1959 | return; |
| 1297 | 1960 | ||
| 1298 | dm_table_event_callback(map, NULL, NULL); | 1961 | dm_table_event_callback(map, NULL, NULL); |
| 1299 | write_lock(&md->map_lock); | 1962 | write_lock_irqsave(&md->map_lock, flags); |
| 1300 | md->map = NULL; | 1963 | md->map = NULL; |
| 1301 | write_unlock(&md->map_lock); | 1964 | write_unlock_irqrestore(&md->map_lock, flags); |
| 1302 | dm_table_destroy(map); | 1965 | dm_table_destroy(map); |
| 1303 | } | 1966 | } |
| 1304 | 1967 | ||
| @@ -1402,6 +2065,8 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
| 1402 | { | 2065 | { |
| 1403 | int r = 0; | 2066 | int r = 0; |
| 1404 | DECLARE_WAITQUEUE(wait, current); | 2067 | DECLARE_WAITQUEUE(wait, current); |
| 2068 | struct request_queue *q = md->queue; | ||
| 2069 | unsigned long flags; | ||
| 1405 | 2070 | ||
| 1406 | dm_unplug_all(md->queue); | 2071 | dm_unplug_all(md->queue); |
| 1407 | 2072 | ||
| @@ -1411,7 +2076,14 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
| 1411 | set_current_state(interruptible); | 2076 | set_current_state(interruptible); |
| 1412 | 2077 | ||
| 1413 | smp_mb(); | 2078 | smp_mb(); |
| 1414 | if (!atomic_read(&md->pending)) | 2079 | if (dm_request_based(md)) { |
| 2080 | spin_lock_irqsave(q->queue_lock, flags); | ||
| 2081 | if (!queue_in_flight(q) && blk_queue_stopped(q)) { | ||
| 2082 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 2083 | break; | ||
| 2084 | } | ||
| 2085 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 2086 | } else if (!atomic_read(&md->pending)) | ||
| 1415 | break; | 2087 | break; |
| 1416 | 2088 | ||
| 1417 | if (interruptible == TASK_INTERRUPTIBLE && | 2089 | if (interruptible == TASK_INTERRUPTIBLE && |
| @@ -1429,34 +2101,36 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
| 1429 | return r; | 2101 | return r; |
| 1430 | } | 2102 | } |
| 1431 | 2103 | ||
| 1432 | static int dm_flush(struct mapped_device *md) | 2104 | static void dm_flush(struct mapped_device *md) |
| 1433 | { | 2105 | { |
| 1434 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); | 2106 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); |
| 1435 | return 0; | 2107 | |
| 2108 | bio_init(&md->barrier_bio); | ||
| 2109 | md->barrier_bio.bi_bdev = md->bdev; | ||
| 2110 | md->barrier_bio.bi_rw = WRITE_BARRIER; | ||
| 2111 | __split_and_process_bio(md, &md->barrier_bio); | ||
| 2112 | |||
| 2113 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); | ||
| 1436 | } | 2114 | } |
| 1437 | 2115 | ||
| 1438 | static void process_barrier(struct mapped_device *md, struct bio *bio) | 2116 | static void process_barrier(struct mapped_device *md, struct bio *bio) |
| 1439 | { | 2117 | { |
| 1440 | int error = dm_flush(md); | 2118 | md->barrier_error = 0; |
| 1441 | 2119 | ||
| 1442 | if (unlikely(error)) { | 2120 | dm_flush(md); |
| 1443 | bio_endio(bio, error); | ||
| 1444 | return; | ||
| 1445 | } | ||
| 1446 | if (bio_empty_barrier(bio)) { | ||
| 1447 | bio_endio(bio, 0); | ||
| 1448 | return; | ||
| 1449 | } | ||
| 1450 | |||
| 1451 | __split_and_process_bio(md, bio); | ||
| 1452 | 2121 | ||
| 1453 | error = dm_flush(md); | 2122 | if (!bio_empty_barrier(bio)) { |
| 1454 | 2123 | __split_and_process_bio(md, bio); | |
| 1455 | if (!error && md->barrier_error) | 2124 | dm_flush(md); |
| 1456 | error = md->barrier_error; | 2125 | } |
| 1457 | 2126 | ||
| 1458 | if (md->barrier_error != DM_ENDIO_REQUEUE) | 2127 | if (md->barrier_error != DM_ENDIO_REQUEUE) |
| 1459 | bio_endio(bio, error); | 2128 | bio_endio(bio, md->barrier_error); |
| 2129 | else { | ||
| 2130 | spin_lock_irq(&md->deferred_lock); | ||
| 2131 | bio_list_add_head(&md->deferred, bio); | ||
| 2132 | spin_unlock_irq(&md->deferred_lock); | ||
| 2133 | } | ||
| 1460 | } | 2134 | } |
| 1461 | 2135 | ||
| 1462 | /* | 2136 | /* |
| @@ -1482,10 +2156,14 @@ static void dm_wq_work(struct work_struct *work) | |||
| 1482 | 2156 | ||
| 1483 | up_write(&md->io_lock); | 2157 | up_write(&md->io_lock); |
| 1484 | 2158 | ||
| 1485 | if (bio_barrier(c)) | 2159 | if (dm_request_based(md)) |
| 1486 | process_barrier(md, c); | 2160 | generic_make_request(c); |
| 1487 | else | 2161 | else { |
| 1488 | __split_and_process_bio(md, c); | 2162 | if (bio_barrier(c)) |
| 2163 | process_barrier(md, c); | ||
| 2164 | else | ||
| 2165 | __split_and_process_bio(md, c); | ||
| 2166 | } | ||
| 1489 | 2167 | ||
| 1490 | down_write(&md->io_lock); | 2168 | down_write(&md->io_lock); |
| 1491 | } | 2169 | } |
| @@ -1505,6 +2183,7 @@ static void dm_queue_flush(struct mapped_device *md) | |||
| 1505 | */ | 2183 | */ |
| 1506 | int dm_swap_table(struct mapped_device *md, struct dm_table *table) | 2184 | int dm_swap_table(struct mapped_device *md, struct dm_table *table) |
| 1507 | { | 2185 | { |
| 2186 | struct queue_limits limits; | ||
| 1508 | int r = -EINVAL; | 2187 | int r = -EINVAL; |
| 1509 | 2188 | ||
| 1510 | mutex_lock(&md->suspend_lock); | 2189 | mutex_lock(&md->suspend_lock); |
| @@ -1513,19 +2192,96 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table) | |||
| 1513 | if (!dm_suspended(md)) | 2192 | if (!dm_suspended(md)) |
| 1514 | goto out; | 2193 | goto out; |
| 1515 | 2194 | ||
| 1516 | /* without bdev, the device size cannot be changed */ | 2195 | r = dm_calculate_queue_limits(table, &limits); |
| 1517 | if (!md->suspended_bdev) | 2196 | if (r) |
| 1518 | if (get_capacity(md->disk) != dm_table_get_size(table)) | 2197 | goto out; |
| 1519 | goto out; | 2198 | |
| 2199 | /* cannot change the device type, once a table is bound */ | ||
| 2200 | if (md->map && | ||
| 2201 | (dm_table_get_type(md->map) != dm_table_get_type(table))) { | ||
| 2202 | DMWARN("can't change the device type after a table is bound"); | ||
| 2203 | goto out; | ||
| 2204 | } | ||
| 2205 | |||
| 2206 | /* | ||
| 2207 | * It is enought that blk_queue_ordered() is called only once when | ||
| 2208 | * the first bio-based table is bound. | ||
| 2209 | * | ||
| 2210 | * This setting should be moved to alloc_dev() when request-based dm | ||
| 2211 | * supports barrier. | ||
| 2212 | */ | ||
| 2213 | if (!md->map && dm_table_bio_based(table)) | ||
| 2214 | blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL); | ||
| 1520 | 2215 | ||
| 1521 | __unbind(md); | 2216 | __unbind(md); |
| 1522 | r = __bind(md, table); | 2217 | r = __bind(md, table, &limits); |
| 1523 | 2218 | ||
| 1524 | out: | 2219 | out: |
| 1525 | mutex_unlock(&md->suspend_lock); | 2220 | mutex_unlock(&md->suspend_lock); |
| 1526 | return r; | 2221 | return r; |
| 1527 | } | 2222 | } |
| 1528 | 2223 | ||
| 2224 | static void dm_rq_invalidate_suspend_marker(struct mapped_device *md) | ||
| 2225 | { | ||
| 2226 | md->suspend_rq.special = (void *)0x1; | ||
| 2227 | } | ||
| 2228 | |||
| 2229 | static void dm_rq_abort_suspend(struct mapped_device *md, int noflush) | ||
| 2230 | { | ||
| 2231 | struct request_queue *q = md->queue; | ||
| 2232 | unsigned long flags; | ||
| 2233 | |||
| 2234 | spin_lock_irqsave(q->queue_lock, flags); | ||
| 2235 | if (!noflush) | ||
| 2236 | dm_rq_invalidate_suspend_marker(md); | ||
| 2237 | __start_queue(q); | ||
| 2238 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 2239 | } | ||
| 2240 | |||
| 2241 | static void dm_rq_start_suspend(struct mapped_device *md, int noflush) | ||
| 2242 | { | ||
| 2243 | struct request *rq = &md->suspend_rq; | ||
| 2244 | struct request_queue *q = md->queue; | ||
| 2245 | |||
| 2246 | if (noflush) | ||
| 2247 | stop_queue(q); | ||
| 2248 | else { | ||
| 2249 | blk_rq_init(q, rq); | ||
| 2250 | blk_insert_request(q, rq, 0, NULL); | ||
| 2251 | } | ||
| 2252 | } | ||
| 2253 | |||
| 2254 | static int dm_rq_suspend_available(struct mapped_device *md, int noflush) | ||
| 2255 | { | ||
| 2256 | int r = 1; | ||
| 2257 | struct request *rq = &md->suspend_rq; | ||
| 2258 | struct request_queue *q = md->queue; | ||
| 2259 | unsigned long flags; | ||
| 2260 | |||
| 2261 | if (noflush) | ||
| 2262 | return r; | ||
| 2263 | |||
| 2264 | /* The marker must be protected by queue lock if it is in use */ | ||
| 2265 | spin_lock_irqsave(q->queue_lock, flags); | ||
| 2266 | if (unlikely(rq->ref_count)) { | ||
| 2267 | /* | ||
| 2268 | * This can happen, when the previous flush suspend was | ||
| 2269 | * interrupted, the marker is still in the queue and | ||
| 2270 | * this flush suspend has been invoked, because we don't | ||
| 2271 | * remove the marker at the time of suspend interruption. | ||
| 2272 | * We have only one marker per mapped_device, so we can't | ||
| 2273 | * start another flush suspend while it is in use. | ||
| 2274 | */ | ||
| 2275 | BUG_ON(!rq->special); /* The marker should be invalidated */ | ||
| 2276 | DMWARN("Invalidating the previous flush suspend is still in" | ||
| 2277 | " progress. Please retry later."); | ||
| 2278 | r = 0; | ||
| 2279 | } | ||
| 2280 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 2281 | |||
| 2282 | return r; | ||
| 2283 | } | ||
| 2284 | |||
| 1529 | /* | 2285 | /* |
| 1530 | * Functions to lock and unlock any filesystem running on the | 2286 | * Functions to lock and unlock any filesystem running on the |
| 1531 | * device. | 2287 | * device. |
| @@ -1536,7 +2292,7 @@ static int lock_fs(struct mapped_device *md) | |||
| 1536 | 2292 | ||
| 1537 | WARN_ON(md->frozen_sb); | 2293 | WARN_ON(md->frozen_sb); |
| 1538 | 2294 | ||
| 1539 | md->frozen_sb = freeze_bdev(md->suspended_bdev); | 2295 | md->frozen_sb = freeze_bdev(md->bdev); |
| 1540 | if (IS_ERR(md->frozen_sb)) { | 2296 | if (IS_ERR(md->frozen_sb)) { |
| 1541 | r = PTR_ERR(md->frozen_sb); | 2297 | r = PTR_ERR(md->frozen_sb); |
| 1542 | md->frozen_sb = NULL; | 2298 | md->frozen_sb = NULL; |
| @@ -1545,9 +2301,6 @@ static int lock_fs(struct mapped_device *md) | |||
| 1545 | 2301 | ||
| 1546 | set_bit(DMF_FROZEN, &md->flags); | 2302 | set_bit(DMF_FROZEN, &md->flags); |
| 1547 | 2303 | ||
| 1548 | /* don't bdput right now, we don't want the bdev | ||
| 1549 | * to go away while it is locked. | ||
| 1550 | */ | ||
| 1551 | return 0; | 2304 | return 0; |
| 1552 | } | 2305 | } |
| 1553 | 2306 | ||
| @@ -1556,7 +2309,7 @@ static void unlock_fs(struct mapped_device *md) | |||
| 1556 | if (!test_bit(DMF_FROZEN, &md->flags)) | 2309 | if (!test_bit(DMF_FROZEN, &md->flags)) |
| 1557 | return; | 2310 | return; |
| 1558 | 2311 | ||
| 1559 | thaw_bdev(md->suspended_bdev, md->frozen_sb); | 2312 | thaw_bdev(md->bdev, md->frozen_sb); |
| 1560 | md->frozen_sb = NULL; | 2313 | md->frozen_sb = NULL; |
| 1561 | clear_bit(DMF_FROZEN, &md->flags); | 2314 | clear_bit(DMF_FROZEN, &md->flags); |
| 1562 | } | 2315 | } |
| @@ -1568,6 +2321,53 @@ static void unlock_fs(struct mapped_device *md) | |||
| 1568 | * dm_bind_table, dm_suspend must be called to flush any in | 2321 | * dm_bind_table, dm_suspend must be called to flush any in |
| 1569 | * flight bios and ensure that any further io gets deferred. | 2322 | * flight bios and ensure that any further io gets deferred. |
| 1570 | */ | 2323 | */ |
| 2324 | /* | ||
| 2325 | * Suspend mechanism in request-based dm. | ||
| 2326 | * | ||
| 2327 | * After the suspend starts, further incoming requests are kept in | ||
| 2328 | * the request_queue and deferred. | ||
| 2329 | * Remaining requests in the request_queue at the start of suspend are flushed | ||
| 2330 | * if it is flush suspend. | ||
| 2331 | * The suspend completes when the following conditions have been satisfied, | ||
| 2332 | * so wait for it: | ||
| 2333 | * 1. q->in_flight is 0 (which means no in_flight request) | ||
| 2334 | * 2. queue has been stopped (which means no request dispatching) | ||
| 2335 | * | ||
| 2336 | * | ||
| 2337 | * Noflush suspend | ||
| 2338 | * --------------- | ||
| 2339 | * Noflush suspend doesn't need to dispatch remaining requests. | ||
| 2340 | * So stop the queue immediately. Then, wait for all in_flight requests | ||
| 2341 | * to be completed or requeued. | ||
| 2342 | * | ||
| 2343 | * To abort noflush suspend, start the queue. | ||
| 2344 | * | ||
| 2345 | * | ||
| 2346 | * Flush suspend | ||
| 2347 | * ------------- | ||
| 2348 | * Flush suspend needs to dispatch remaining requests. So stop the queue | ||
| 2349 | * after the remaining requests are completed. (Requeued request must be also | ||
| 2350 | * re-dispatched and completed. Until then, we can't stop the queue.) | ||
| 2351 | * | ||
| 2352 | * During flushing the remaining requests, further incoming requests are also | ||
| 2353 | * inserted to the same queue. To distinguish which requests are to be | ||
| 2354 | * flushed, we insert a marker request to the queue at the time of starting | ||
| 2355 | * flush suspend, like a barrier. | ||
| 2356 | * The dispatching is blocked when the marker is found on the top of the queue. | ||
| 2357 | * And the queue is stopped when all in_flight requests are completed, since | ||
| 2358 | * that means the remaining requests are completely flushed. | ||
| 2359 | * Then, the marker is removed from the queue. | ||
| 2360 | * | ||
| 2361 | * To abort flush suspend, we also need to take care of the marker, not only | ||
| 2362 | * starting the queue. | ||
| 2363 | * We don't remove the marker forcibly from the queue since it's against | ||
| 2364 | * the block-layer manner. Instead, we put a invalidated mark on the marker. | ||
| 2365 | * When the invalidated marker is found on the top of the queue, it is | ||
| 2366 | * immediately removed from the queue, so it doesn't block dispatching. | ||
| 2367 | * Because we have only one marker per mapped_device, we can't start another | ||
| 2368 | * flush suspend until the invalidated marker is removed from the queue. | ||
| 2369 | * So fail and return with -EBUSY in such a case. | ||
| 2370 | */ | ||
| 1571 | int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | 2371 | int dm_suspend(struct mapped_device *md, unsigned suspend_flags) |
| 1572 | { | 2372 | { |
| 1573 | struct dm_table *map = NULL; | 2373 | struct dm_table *map = NULL; |
| @@ -1582,6 +2382,11 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
| 1582 | goto out_unlock; | 2382 | goto out_unlock; |
| 1583 | } | 2383 | } |
| 1584 | 2384 | ||
| 2385 | if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) { | ||
| 2386 | r = -EBUSY; | ||
| 2387 | goto out_unlock; | ||
| 2388 | } | ||
| 2389 | |||
| 1585 | map = dm_get_table(md); | 2390 | map = dm_get_table(md); |
| 1586 | 2391 | ||
| 1587 | /* | 2392 | /* |
| @@ -1594,24 +2399,14 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
| 1594 | /* This does not get reverted if there's an error later. */ | 2399 | /* This does not get reverted if there's an error later. */ |
| 1595 | dm_table_presuspend_targets(map); | 2400 | dm_table_presuspend_targets(map); |
| 1596 | 2401 | ||
| 1597 | /* bdget() can stall if the pending I/Os are not flushed */ | 2402 | /* |
| 1598 | if (!noflush) { | 2403 | * Flush I/O to the device. noflush supersedes do_lockfs, |
| 1599 | md->suspended_bdev = bdget_disk(md->disk, 0); | 2404 | * because lock_fs() needs to flush I/Os. |
| 1600 | if (!md->suspended_bdev) { | 2405 | */ |
| 1601 | DMWARN("bdget failed in dm_suspend"); | 2406 | if (!noflush && do_lockfs) { |
| 1602 | r = -ENOMEM; | 2407 | r = lock_fs(md); |
| 2408 | if (r) | ||
| 1603 | goto out; | 2409 | goto out; |
| 1604 | } | ||
| 1605 | |||
| 1606 | /* | ||
| 1607 | * Flush I/O to the device. noflush supersedes do_lockfs, | ||
| 1608 | * because lock_fs() needs to flush I/Os. | ||
| 1609 | */ | ||
| 1610 | if (do_lockfs) { | ||
| 1611 | r = lock_fs(md); | ||
| 1612 | if (r) | ||
| 1613 | goto out; | ||
| 1614 | } | ||
| 1615 | } | 2410 | } |
| 1616 | 2411 | ||
| 1617 | /* | 2412 | /* |
| @@ -1637,6 +2432,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
| 1637 | 2432 | ||
| 1638 | flush_workqueue(md->wq); | 2433 | flush_workqueue(md->wq); |
| 1639 | 2434 | ||
| 2435 | if (dm_request_based(md)) | ||
| 2436 | dm_rq_start_suspend(md, noflush); | ||
| 2437 | |||
| 1640 | /* | 2438 | /* |
| 1641 | * At this point no more requests are entering target request routines. | 2439 | * At this point no more requests are entering target request routines. |
| 1642 | * We call dm_wait_for_completion to wait for all existing requests | 2440 | * We call dm_wait_for_completion to wait for all existing requests |
| @@ -1653,6 +2451,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
| 1653 | if (r < 0) { | 2451 | if (r < 0) { |
| 1654 | dm_queue_flush(md); | 2452 | dm_queue_flush(md); |
| 1655 | 2453 | ||
| 2454 | if (dm_request_based(md)) | ||
| 2455 | dm_rq_abort_suspend(md, noflush); | ||
| 2456 | |||
| 1656 | unlock_fs(md); | 2457 | unlock_fs(md); |
| 1657 | goto out; /* pushback list is already flushed, so skip flush */ | 2458 | goto out; /* pushback list is already flushed, so skip flush */ |
| 1658 | } | 2459 | } |
| @@ -1668,11 +2469,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
| 1668 | set_bit(DMF_SUSPENDED, &md->flags); | 2469 | set_bit(DMF_SUSPENDED, &md->flags); |
| 1669 | 2470 | ||
| 1670 | out: | 2471 | out: |
| 1671 | if (r && md->suspended_bdev) { | ||
| 1672 | bdput(md->suspended_bdev); | ||
| 1673 | md->suspended_bdev = NULL; | ||
| 1674 | } | ||
| 1675 | |||
| 1676 | dm_table_put(map); | 2472 | dm_table_put(map); |
| 1677 | 2473 | ||
| 1678 | out_unlock: | 2474 | out_unlock: |
| @@ -1699,21 +2495,20 @@ int dm_resume(struct mapped_device *md) | |||
| 1699 | 2495 | ||
| 1700 | dm_queue_flush(md); | 2496 | dm_queue_flush(md); |
| 1701 | 2497 | ||
| 1702 | unlock_fs(md); | 2498 | /* |
| 2499 | * Flushing deferred I/Os must be done after targets are resumed | ||
| 2500 | * so that mapping of targets can work correctly. | ||
| 2501 | * Request-based dm is queueing the deferred I/Os in its request_queue. | ||
| 2502 | */ | ||
| 2503 | if (dm_request_based(md)) | ||
| 2504 | start_queue(md->queue); | ||
| 1703 | 2505 | ||
| 1704 | if (md->suspended_bdev) { | 2506 | unlock_fs(md); |
| 1705 | bdput(md->suspended_bdev); | ||
| 1706 | md->suspended_bdev = NULL; | ||
| 1707 | } | ||
| 1708 | 2507 | ||
| 1709 | clear_bit(DMF_SUSPENDED, &md->flags); | 2508 | clear_bit(DMF_SUSPENDED, &md->flags); |
| 1710 | 2509 | ||
| 1711 | dm_table_unplug_all(map); | 2510 | dm_table_unplug_all(map); |
| 1712 | |||
| 1713 | dm_kobject_uevent(md); | ||
| 1714 | |||
| 1715 | r = 0; | 2511 | r = 0; |
| 1716 | |||
| 1717 | out: | 2512 | out: |
| 1718 | dm_table_put(map); | 2513 | dm_table_put(map); |
| 1719 | mutex_unlock(&md->suspend_lock); | 2514 | mutex_unlock(&md->suspend_lock); |
| @@ -1724,9 +2519,19 @@ out: | |||
| 1724 | /*----------------------------------------------------------------- | 2519 | /*----------------------------------------------------------------- |
| 1725 | * Event notification. | 2520 | * Event notification. |
| 1726 | *---------------------------------------------------------------*/ | 2521 | *---------------------------------------------------------------*/ |
| 1727 | void dm_kobject_uevent(struct mapped_device *md) | 2522 | void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, |
| 1728 | { | 2523 | unsigned cookie) |
| 1729 | kobject_uevent(&disk_to_dev(md->disk)->kobj, KOBJ_CHANGE); | 2524 | { |
| 2525 | char udev_cookie[DM_COOKIE_LENGTH]; | ||
| 2526 | char *envp[] = { udev_cookie, NULL }; | ||
| 2527 | |||
| 2528 | if (!cookie) | ||
| 2529 | kobject_uevent(&disk_to_dev(md->disk)->kobj, action); | ||
| 2530 | else { | ||
| 2531 | snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", | ||
| 2532 | DM_COOKIE_ENV_VAR_NAME, cookie); | ||
| 2533 | kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp); | ||
| 2534 | } | ||
| 1730 | } | 2535 | } |
| 1731 | 2536 | ||
| 1732 | uint32_t dm_next_uevent_seq(struct mapped_device *md) | 2537 | uint32_t dm_next_uevent_seq(struct mapped_device *md) |
| @@ -1780,6 +2585,10 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj) | |||
| 1780 | if (&md->kobj != kobj) | 2585 | if (&md->kobj != kobj) |
| 1781 | return NULL; | 2586 | return NULL; |
| 1782 | 2587 | ||
| 2588 | if (test_bit(DMF_FREEING, &md->flags) || | ||
| 2589 | test_bit(DMF_DELETING, &md->flags)) | ||
| 2590 | return NULL; | ||
| 2591 | |||
| 1783 | dm_get(md); | 2592 | dm_get(md); |
| 1784 | return md; | 2593 | return md; |
| 1785 | } | 2594 | } |
| @@ -1800,6 +2609,61 @@ int dm_noflush_suspending(struct dm_target *ti) | |||
| 1800 | } | 2609 | } |
| 1801 | EXPORT_SYMBOL_GPL(dm_noflush_suspending); | 2610 | EXPORT_SYMBOL_GPL(dm_noflush_suspending); |
| 1802 | 2611 | ||
| 2612 | struct dm_md_mempools *dm_alloc_md_mempools(unsigned type) | ||
| 2613 | { | ||
| 2614 | struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL); | ||
| 2615 | |||
| 2616 | if (!pools) | ||
| 2617 | return NULL; | ||
| 2618 | |||
| 2619 | pools->io_pool = (type == DM_TYPE_BIO_BASED) ? | ||
| 2620 | mempool_create_slab_pool(MIN_IOS, _io_cache) : | ||
| 2621 | mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache); | ||
| 2622 | if (!pools->io_pool) | ||
| 2623 | goto free_pools_and_out; | ||
| 2624 | |||
| 2625 | pools->tio_pool = (type == DM_TYPE_BIO_BASED) ? | ||
| 2626 | mempool_create_slab_pool(MIN_IOS, _tio_cache) : | ||
| 2627 | mempool_create_slab_pool(MIN_IOS, _rq_tio_cache); | ||
| 2628 | if (!pools->tio_pool) | ||
| 2629 | goto free_io_pool_and_out; | ||
| 2630 | |||
| 2631 | pools->bs = (type == DM_TYPE_BIO_BASED) ? | ||
| 2632 | bioset_create(16, 0) : bioset_create(MIN_IOS, 0); | ||
| 2633 | if (!pools->bs) | ||
| 2634 | goto free_tio_pool_and_out; | ||
| 2635 | |||
| 2636 | return pools; | ||
| 2637 | |||
| 2638 | free_tio_pool_and_out: | ||
| 2639 | mempool_destroy(pools->tio_pool); | ||
| 2640 | |||
| 2641 | free_io_pool_and_out: | ||
| 2642 | mempool_destroy(pools->io_pool); | ||
| 2643 | |||
| 2644 | free_pools_and_out: | ||
| 2645 | kfree(pools); | ||
| 2646 | |||
| 2647 | return NULL; | ||
| 2648 | } | ||
| 2649 | |||
| 2650 | void dm_free_md_mempools(struct dm_md_mempools *pools) | ||
| 2651 | { | ||
| 2652 | if (!pools) | ||
| 2653 | return; | ||
| 2654 | |||
| 2655 | if (pools->io_pool) | ||
| 2656 | mempool_destroy(pools->io_pool); | ||
| 2657 | |||
| 2658 | if (pools->tio_pool) | ||
| 2659 | mempool_destroy(pools->tio_pool); | ||
| 2660 | |||
| 2661 | if (pools->bs) | ||
| 2662 | bioset_free(pools->bs); | ||
| 2663 | |||
| 2664 | kfree(pools); | ||
| 2665 | } | ||
| 2666 | |||
| 1803 | static struct block_device_operations dm_blk_dops = { | 2667 | static struct block_device_operations dm_blk_dops = { |
| 1804 | .open = dm_blk_open, | 2668 | .open = dm_blk_open, |
| 1805 | .release = dm_blk_close, | 2669 | .release = dm_blk_close, |
diff --git a/drivers/md/dm.h b/drivers/md/dm.h index a31506d93e91..23278ae80f08 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h | |||
| @@ -23,6 +23,13 @@ | |||
| 23 | #define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) | 23 | #define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) |
| 24 | 24 | ||
| 25 | /* | 25 | /* |
| 26 | * Type of table and mapped_device's mempool | ||
| 27 | */ | ||
| 28 | #define DM_TYPE_NONE 0 | ||
| 29 | #define DM_TYPE_BIO_BASED 1 | ||
| 30 | #define DM_TYPE_REQUEST_BASED 2 | ||
| 31 | |||
| 32 | /* | ||
| 26 | * List of devices that a metadevice uses and should open/close. | 33 | * List of devices that a metadevice uses and should open/close. |
| 27 | */ | 34 | */ |
| 28 | struct dm_dev_internal { | 35 | struct dm_dev_internal { |
| @@ -32,6 +39,7 @@ struct dm_dev_internal { | |||
| 32 | }; | 39 | }; |
| 33 | 40 | ||
| 34 | struct dm_table; | 41 | struct dm_table; |
| 42 | struct dm_md_mempools; | ||
| 35 | 43 | ||
| 36 | /*----------------------------------------------------------------- | 44 | /*----------------------------------------------------------------- |
| 37 | * Internal table functions. | 45 | * Internal table functions. |
| @@ -41,18 +49,34 @@ void dm_table_event_callback(struct dm_table *t, | |||
| 41 | void (*fn)(void *), void *context); | 49 | void (*fn)(void *), void *context); |
| 42 | struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); | 50 | struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); |
| 43 | struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); | 51 | struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); |
| 44 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q); | 52 | int dm_calculate_queue_limits(struct dm_table *table, |
| 53 | struct queue_limits *limits); | ||
| 54 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, | ||
| 55 | struct queue_limits *limits); | ||
| 45 | struct list_head *dm_table_get_devices(struct dm_table *t); | 56 | struct list_head *dm_table_get_devices(struct dm_table *t); |
| 46 | void dm_table_presuspend_targets(struct dm_table *t); | 57 | void dm_table_presuspend_targets(struct dm_table *t); |
| 47 | void dm_table_postsuspend_targets(struct dm_table *t); | 58 | void dm_table_postsuspend_targets(struct dm_table *t); |
| 48 | int dm_table_resume_targets(struct dm_table *t); | 59 | int dm_table_resume_targets(struct dm_table *t); |
| 49 | int dm_table_any_congested(struct dm_table *t, int bdi_bits); | 60 | int dm_table_any_congested(struct dm_table *t, int bdi_bits); |
| 61 | int dm_table_any_busy_target(struct dm_table *t); | ||
| 62 | int dm_table_set_type(struct dm_table *t); | ||
| 63 | unsigned dm_table_get_type(struct dm_table *t); | ||
| 64 | bool dm_table_bio_based(struct dm_table *t); | ||
| 65 | bool dm_table_request_based(struct dm_table *t); | ||
| 66 | int dm_table_alloc_md_mempools(struct dm_table *t); | ||
| 67 | void dm_table_free_md_mempools(struct dm_table *t); | ||
| 68 | struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); | ||
| 50 | 69 | ||
| 51 | /* | 70 | /* |
| 52 | * To check the return value from dm_table_find_target(). | 71 | * To check the return value from dm_table_find_target(). |
| 53 | */ | 72 | */ |
| 54 | #define dm_target_is_valid(t) ((t)->table) | 73 | #define dm_target_is_valid(t) ((t)->table) |
| 55 | 74 | ||
| 75 | /* | ||
| 76 | * To check whether the target type is request-based or not (bio-based). | ||
| 77 | */ | ||
| 78 | #define dm_target_request_based(t) ((t)->type->map_rq != NULL) | ||
| 79 | |||
| 56 | /*----------------------------------------------------------------- | 80 | /*----------------------------------------------------------------- |
| 57 | * A registry of target types. | 81 | * A registry of target types. |
| 58 | *---------------------------------------------------------------*/ | 82 | *---------------------------------------------------------------*/ |
| @@ -92,9 +116,16 @@ void dm_stripe_exit(void); | |||
| 92 | int dm_open_count(struct mapped_device *md); | 116 | int dm_open_count(struct mapped_device *md); |
| 93 | int dm_lock_for_deletion(struct mapped_device *md); | 117 | int dm_lock_for_deletion(struct mapped_device *md); |
| 94 | 118 | ||
| 95 | void dm_kobject_uevent(struct mapped_device *md); | 119 | void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, |
| 120 | unsigned cookie); | ||
| 96 | 121 | ||
| 97 | int dm_kcopyd_init(void); | 122 | int dm_kcopyd_init(void); |
| 98 | void dm_kcopyd_exit(void); | 123 | void dm_kcopyd_exit(void); |
| 99 | 124 | ||
| 125 | /* | ||
| 126 | * Mempool operations | ||
| 127 | */ | ||
| 128 | struct dm_md_mempools *dm_alloc_md_mempools(unsigned type); | ||
| 129 | void dm_free_md_mempools(struct dm_md_mempools *pools); | ||
| 130 | |||
| 100 | #endif | 131 | #endif |
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 8695809b24b0..87d88dbb667f 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c | |||
| @@ -255,14 +255,14 @@ static void status(struct seq_file *seq, mddev_t *mddev) | |||
| 255 | } | 255 | } |
| 256 | 256 | ||
| 257 | 257 | ||
| 258 | static int reconfig(mddev_t *mddev, int layout, int chunk_size) | 258 | static int reshape(mddev_t *mddev) |
| 259 | { | 259 | { |
| 260 | int mode = layout & ModeMask; | 260 | int mode = mddev->new_layout & ModeMask; |
| 261 | int count = layout >> ModeShift; | 261 | int count = mddev->new_layout >> ModeShift; |
| 262 | conf_t *conf = mddev->private; | 262 | conf_t *conf = mddev->private; |
| 263 | 263 | ||
| 264 | if (chunk_size != -1) | 264 | if (mddev->new_layout < 0) |
| 265 | return -EINVAL; | 265 | return 0; |
| 266 | 266 | ||
| 267 | /* new layout */ | 267 | /* new layout */ |
| 268 | if (mode == ClearFaults) | 268 | if (mode == ClearFaults) |
| @@ -279,6 +279,7 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size) | |||
| 279 | atomic_set(&conf->counters[mode], count); | 279 | atomic_set(&conf->counters[mode], count); |
| 280 | } else | 280 | } else |
| 281 | return -EINVAL; | 281 | return -EINVAL; |
| 282 | mddev->new_layout = -1; | ||
| 282 | mddev->layout = -1; /* makes sure further changes come through */ | 283 | mddev->layout = -1; /* makes sure further changes come through */ |
| 283 | return 0; | 284 | return 0; |
| 284 | } | 285 | } |
| @@ -298,8 +299,12 @@ static int run(mddev_t *mddev) | |||
| 298 | { | 299 | { |
| 299 | mdk_rdev_t *rdev; | 300 | mdk_rdev_t *rdev; |
| 300 | int i; | 301 | int i; |
| 302 | conf_t *conf; | ||
| 303 | |||
| 304 | if (md_check_no_bitmap(mddev)) | ||
| 305 | return -EINVAL; | ||
| 301 | 306 | ||
| 302 | conf_t *conf = kmalloc(sizeof(*conf), GFP_KERNEL); | 307 | conf = kmalloc(sizeof(*conf), GFP_KERNEL); |
| 303 | if (!conf) | 308 | if (!conf) |
| 304 | return -ENOMEM; | 309 | return -ENOMEM; |
| 305 | 310 | ||
| @@ -315,7 +320,7 @@ static int run(mddev_t *mddev) | |||
| 315 | md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); | 320 | md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); |
| 316 | mddev->private = conf; | 321 | mddev->private = conf; |
| 317 | 322 | ||
| 318 | reconfig(mddev, mddev->layout, -1); | 323 | reshape(mddev); |
| 319 | 324 | ||
| 320 | return 0; | 325 | return 0; |
| 321 | } | 326 | } |
| @@ -338,7 +343,7 @@ static struct mdk_personality faulty_personality = | |||
| 338 | .run = run, | 343 | .run = run, |
| 339 | .stop = stop, | 344 | .stop = stop, |
| 340 | .status = status, | 345 | .status = status, |
| 341 | .reconfig = reconfig, | 346 | .check_reshape = reshape, |
| 342 | .size = faulty_size, | 347 | .size = faulty_size, |
| 343 | }; | 348 | }; |
| 344 | 349 | ||
diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 7a36e38393a1..15c8b7b25a9b 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c | |||
| @@ -27,19 +27,27 @@ | |||
| 27 | */ | 27 | */ |
| 28 | static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) | 28 | static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) |
| 29 | { | 29 | { |
| 30 | dev_info_t *hash; | 30 | int lo, mid, hi; |
| 31 | linear_conf_t *conf = mddev_to_conf(mddev); | 31 | linear_conf_t *conf; |
| 32 | sector_t idx = sector >> conf->sector_shift; | 32 | |
| 33 | lo = 0; | ||
| 34 | hi = mddev->raid_disks - 1; | ||
| 35 | conf = rcu_dereference(mddev->private); | ||
| 33 | 36 | ||
| 34 | /* | 37 | /* |
| 35 | * sector_div(a,b) returns the remainer and sets a to a/b | 38 | * Binary Search |
| 36 | */ | 39 | */ |
| 37 | (void)sector_div(idx, conf->spacing); | ||
| 38 | hash = conf->hash_table[idx]; | ||
| 39 | 40 | ||
| 40 | while (sector >= hash->num_sectors + hash->start_sector) | 41 | while (hi > lo) { |
| 41 | hash++; | 42 | |
| 42 | return hash; | 43 | mid = (hi + lo) / 2; |
| 44 | if (sector < conf->disks[mid].end_sector) | ||
| 45 | hi = mid; | ||
| 46 | else | ||
| 47 | lo = mid + 1; | ||
| 48 | } | ||
| 49 | |||
| 50 | return conf->disks + lo; | ||
| 43 | } | 51 | } |
| 44 | 52 | ||
| 45 | /** | 53 | /** |
| @@ -59,8 +67,10 @@ static int linear_mergeable_bvec(struct request_queue *q, | |||
| 59 | unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; | 67 | unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; |
| 60 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); | 68 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); |
| 61 | 69 | ||
| 70 | rcu_read_lock(); | ||
| 62 | dev0 = which_dev(mddev, sector); | 71 | dev0 = which_dev(mddev, sector); |
| 63 | maxsectors = dev0->num_sectors - (sector - dev0->start_sector); | 72 | maxsectors = dev0->end_sector - sector; |
| 73 | rcu_read_unlock(); | ||
| 64 | 74 | ||
| 65 | if (maxsectors < bio_sectors) | 75 | if (maxsectors < bio_sectors) |
| 66 | maxsectors = 0; | 76 | maxsectors = 0; |
| @@ -79,46 +89,57 @@ static int linear_mergeable_bvec(struct request_queue *q, | |||
| 79 | static void linear_unplug(struct request_queue *q) | 89 | static void linear_unplug(struct request_queue *q) |
| 80 | { | 90 | { |
| 81 | mddev_t *mddev = q->queuedata; | 91 | mddev_t *mddev = q->queuedata; |
| 82 | linear_conf_t *conf = mddev_to_conf(mddev); | 92 | linear_conf_t *conf; |
| 83 | int i; | 93 | int i; |
| 84 | 94 | ||
| 95 | rcu_read_lock(); | ||
| 96 | conf = rcu_dereference(mddev->private); | ||
| 97 | |||
| 85 | for (i=0; i < mddev->raid_disks; i++) { | 98 | for (i=0; i < mddev->raid_disks; i++) { |
| 86 | struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev); | 99 | struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev); |
| 87 | blk_unplug(r_queue); | 100 | blk_unplug(r_queue); |
| 88 | } | 101 | } |
| 102 | rcu_read_unlock(); | ||
| 89 | } | 103 | } |
| 90 | 104 | ||
| 91 | static int linear_congested(void *data, int bits) | 105 | static int linear_congested(void *data, int bits) |
| 92 | { | 106 | { |
| 93 | mddev_t *mddev = data; | 107 | mddev_t *mddev = data; |
| 94 | linear_conf_t *conf = mddev_to_conf(mddev); | 108 | linear_conf_t *conf; |
| 95 | int i, ret = 0; | 109 | int i, ret = 0; |
| 96 | 110 | ||
| 111 | rcu_read_lock(); | ||
| 112 | conf = rcu_dereference(mddev->private); | ||
| 113 | |||
| 97 | for (i = 0; i < mddev->raid_disks && !ret ; i++) { | 114 | for (i = 0; i < mddev->raid_disks && !ret ; i++) { |
| 98 | struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); | 115 | struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); |
| 99 | ret |= bdi_congested(&q->backing_dev_info, bits); | 116 | ret |= bdi_congested(&q->backing_dev_info, bits); |
| 100 | } | 117 | } |
| 118 | |||
| 119 | rcu_read_unlock(); | ||
| 101 | return ret; | 120 | return ret; |
| 102 | } | 121 | } |
| 103 | 122 | ||
| 104 | static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks) | 123 | static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks) |
| 105 | { | 124 | { |
| 106 | linear_conf_t *conf = mddev_to_conf(mddev); | 125 | linear_conf_t *conf; |
| 126 | sector_t array_sectors; | ||
| 107 | 127 | ||
| 128 | rcu_read_lock(); | ||
| 129 | conf = rcu_dereference(mddev->private); | ||
| 108 | WARN_ONCE(sectors || raid_disks, | 130 | WARN_ONCE(sectors || raid_disks, |
| 109 | "%s does not support generic reshape\n", __func__); | 131 | "%s does not support generic reshape\n", __func__); |
| 132 | array_sectors = conf->array_sectors; | ||
| 133 | rcu_read_unlock(); | ||
| 110 | 134 | ||
| 111 | return conf->array_sectors; | 135 | return array_sectors; |
| 112 | } | 136 | } |
| 113 | 137 | ||
| 114 | static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | 138 | static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) |
| 115 | { | 139 | { |
| 116 | linear_conf_t *conf; | 140 | linear_conf_t *conf; |
| 117 | dev_info_t **table; | ||
| 118 | mdk_rdev_t *rdev; | 141 | mdk_rdev_t *rdev; |
| 119 | int i, nb_zone, cnt; | 142 | int i, cnt; |
| 120 | sector_t min_sectors; | ||
| 121 | sector_t curr_sector; | ||
| 122 | 143 | ||
| 123 | conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t), | 144 | conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t), |
| 124 | GFP_KERNEL); | 145 | GFP_KERNEL); |
| @@ -131,6 +152,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | |||
| 131 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 152 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
| 132 | int j = rdev->raid_disk; | 153 | int j = rdev->raid_disk; |
| 133 | dev_info_t *disk = conf->disks + j; | 154 | dev_info_t *disk = conf->disks + j; |
| 155 | sector_t sectors; | ||
| 134 | 156 | ||
| 135 | if (j < 0 || j >= raid_disks || disk->rdev) { | 157 | if (j < 0 || j >= raid_disks || disk->rdev) { |
| 136 | printk("linear: disk numbering problem. Aborting!\n"); | 158 | printk("linear: disk numbering problem. Aborting!\n"); |
| @@ -138,6 +160,11 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | |||
| 138 | } | 160 | } |
| 139 | 161 | ||
| 140 | disk->rdev = rdev; | 162 | disk->rdev = rdev; |
| 163 | if (mddev->chunk_sectors) { | ||
| 164 | sectors = rdev->sectors; | ||
| 165 | sector_div(sectors, mddev->chunk_sectors); | ||
| 166 | rdev->sectors = sectors * mddev->chunk_sectors; | ||
| 167 | } | ||
| 141 | 168 | ||
| 142 | blk_queue_stack_limits(mddev->queue, | 169 | blk_queue_stack_limits(mddev->queue, |
| 143 | rdev->bdev->bd_disk->queue); | 170 | rdev->bdev->bd_disk->queue); |
| @@ -146,105 +173,27 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | |||
| 146 | * a one page request is never in violation. | 173 | * a one page request is never in violation. |
| 147 | */ | 174 | */ |
| 148 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 175 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && |
| 149 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 176 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) |
| 150 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 177 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
| 151 | 178 | ||
| 152 | disk->num_sectors = rdev->sectors; | ||
| 153 | conf->array_sectors += rdev->sectors; | 179 | conf->array_sectors += rdev->sectors; |
| 154 | |||
| 155 | cnt++; | 180 | cnt++; |
| 181 | |||
| 156 | } | 182 | } |
| 157 | if (cnt != raid_disks) { | 183 | if (cnt != raid_disks) { |
| 158 | printk("linear: not enough drives present. Aborting!\n"); | 184 | printk("linear: not enough drives present. Aborting!\n"); |
| 159 | goto out; | 185 | goto out; |
| 160 | } | 186 | } |
| 161 | 187 | ||
| 162 | min_sectors = conf->array_sectors; | ||
| 163 | sector_div(min_sectors, PAGE_SIZE/sizeof(struct dev_info *)); | ||
| 164 | if (min_sectors == 0) | ||
| 165 | min_sectors = 1; | ||
| 166 | |||
| 167 | /* min_sectors is the minimum spacing that will fit the hash | ||
| 168 | * table in one PAGE. This may be much smaller than needed. | ||
| 169 | * We find the smallest non-terminal set of consecutive devices | ||
| 170 | * that is larger than min_sectors and use the size of that as | ||
| 171 | * the actual spacing | ||
| 172 | */ | ||
| 173 | conf->spacing = conf->array_sectors; | ||
| 174 | for (i=0; i < cnt-1 ; i++) { | ||
| 175 | sector_t tmp = 0; | ||
| 176 | int j; | ||
| 177 | for (j = i; j < cnt - 1 && tmp < min_sectors; j++) | ||
| 178 | tmp += conf->disks[j].num_sectors; | ||
| 179 | if (tmp >= min_sectors && tmp < conf->spacing) | ||
| 180 | conf->spacing = tmp; | ||
| 181 | } | ||
| 182 | |||
| 183 | /* spacing may be too large for sector_div to work with, | ||
| 184 | * so we might need to pre-shift | ||
| 185 | */ | ||
| 186 | conf->sector_shift = 0; | ||
| 187 | if (sizeof(sector_t) > sizeof(u32)) { | ||
| 188 | sector_t space = conf->spacing; | ||
| 189 | while (space > (sector_t)(~(u32)0)) { | ||
| 190 | space >>= 1; | ||
| 191 | conf->sector_shift++; | ||
| 192 | } | ||
| 193 | } | ||
| 194 | /* | 188 | /* |
| 195 | * This code was restructured to work around a gcc-2.95.3 internal | 189 | * Here we calculate the device offsets. |
| 196 | * compiler error. Alter it with care. | ||
| 197 | */ | 190 | */ |
| 198 | { | 191 | conf->disks[0].end_sector = conf->disks[0].rdev->sectors; |
| 199 | sector_t sz; | ||
| 200 | unsigned round; | ||
| 201 | unsigned long base; | ||
| 202 | |||
| 203 | sz = conf->array_sectors >> conf->sector_shift; | ||
| 204 | sz += 1; /* force round-up */ | ||
| 205 | base = conf->spacing >> conf->sector_shift; | ||
| 206 | round = sector_div(sz, base); | ||
| 207 | nb_zone = sz + (round ? 1 : 0); | ||
| 208 | } | ||
| 209 | BUG_ON(nb_zone > PAGE_SIZE / sizeof(struct dev_info *)); | ||
| 210 | |||
| 211 | conf->hash_table = kmalloc (sizeof (struct dev_info *) * nb_zone, | ||
| 212 | GFP_KERNEL); | ||
| 213 | if (!conf->hash_table) | ||
| 214 | goto out; | ||
| 215 | 192 | ||
| 216 | /* | ||
| 217 | * Here we generate the linear hash table | ||
| 218 | * First calculate the device offsets. | ||
| 219 | */ | ||
| 220 | conf->disks[0].start_sector = 0; | ||
| 221 | for (i = 1; i < raid_disks; i++) | 193 | for (i = 1; i < raid_disks; i++) |
| 222 | conf->disks[i].start_sector = | 194 | conf->disks[i].end_sector = |
| 223 | conf->disks[i-1].start_sector + | 195 | conf->disks[i-1].end_sector + |
| 224 | conf->disks[i-1].num_sectors; | 196 | conf->disks[i].rdev->sectors; |
| 225 | |||
| 226 | table = conf->hash_table; | ||
| 227 | i = 0; | ||
| 228 | for (curr_sector = 0; | ||
| 229 | curr_sector < conf->array_sectors; | ||
| 230 | curr_sector += conf->spacing) { | ||
| 231 | |||
| 232 | while (i < raid_disks-1 && | ||
| 233 | curr_sector >= conf->disks[i+1].start_sector) | ||
| 234 | i++; | ||
| 235 | |||
| 236 | *table ++ = conf->disks + i; | ||
| 237 | } | ||
| 238 | |||
| 239 | if (conf->sector_shift) { | ||
| 240 | conf->spacing >>= conf->sector_shift; | ||
| 241 | /* round spacing up so that when we divide by it, | ||
| 242 | * we err on the side of "too-low", which is safest. | ||
| 243 | */ | ||
| 244 | conf->spacing++; | ||
| 245 | } | ||
| 246 | |||
| 247 | BUG_ON(table - conf->hash_table > nb_zone); | ||
| 248 | 197 | ||
| 249 | return conf; | 198 | return conf; |
| 250 | 199 | ||
| @@ -257,6 +206,8 @@ static int linear_run (mddev_t *mddev) | |||
| 257 | { | 206 | { |
| 258 | linear_conf_t *conf; | 207 | linear_conf_t *conf; |
| 259 | 208 | ||
| 209 | if (md_check_no_bitmap(mddev)) | ||
| 210 | return -EINVAL; | ||
| 260 | mddev->queue->queue_lock = &mddev->queue->__queue_lock; | 211 | mddev->queue->queue_lock = &mddev->queue->__queue_lock; |
| 261 | conf = linear_conf(mddev, mddev->raid_disks); | 212 | conf = linear_conf(mddev, mddev->raid_disks); |
| 262 | 213 | ||
| @@ -272,6 +223,12 @@ static int linear_run (mddev_t *mddev) | |||
| 272 | return 0; | 223 | return 0; |
| 273 | } | 224 | } |
| 274 | 225 | ||
| 226 | static void free_conf(struct rcu_head *head) | ||
| 227 | { | ||
| 228 | linear_conf_t *conf = container_of(head, linear_conf_t, rcu); | ||
| 229 | kfree(conf); | ||
| 230 | } | ||
| 231 | |||
| 275 | static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) | 232 | static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) |
| 276 | { | 233 | { |
| 277 | /* Adding a drive to a linear array allows the array to grow. | 234 | /* Adding a drive to a linear array allows the array to grow. |
| @@ -282,7 +239,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 282 | * The current one is never freed until the array is stopped. | 239 | * The current one is never freed until the array is stopped. |
| 283 | * This avoids races. | 240 | * This avoids races. |
| 284 | */ | 241 | */ |
| 285 | linear_conf_t *newconf; | 242 | linear_conf_t *newconf, *oldconf; |
| 286 | 243 | ||
| 287 | if (rdev->saved_raid_disk != mddev->raid_disks) | 244 | if (rdev->saved_raid_disk != mddev->raid_disks) |
| 288 | return -EINVAL; | 245 | return -EINVAL; |
| @@ -294,25 +251,29 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 294 | if (!newconf) | 251 | if (!newconf) |
| 295 | return -ENOMEM; | 252 | return -ENOMEM; |
| 296 | 253 | ||
| 297 | newconf->prev = mddev_to_conf(mddev); | 254 | oldconf = rcu_dereference(mddev->private); |
| 298 | mddev->private = newconf; | ||
| 299 | mddev->raid_disks++; | 255 | mddev->raid_disks++; |
| 256 | rcu_assign_pointer(mddev->private, newconf); | ||
| 300 | md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); | 257 | md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); |
| 301 | set_capacity(mddev->gendisk, mddev->array_sectors); | 258 | set_capacity(mddev->gendisk, mddev->array_sectors); |
| 259 | call_rcu(&oldconf->rcu, free_conf); | ||
| 302 | return 0; | 260 | return 0; |
| 303 | } | 261 | } |
| 304 | 262 | ||
| 305 | static int linear_stop (mddev_t *mddev) | 263 | static int linear_stop (mddev_t *mddev) |
| 306 | { | 264 | { |
| 307 | linear_conf_t *conf = mddev_to_conf(mddev); | 265 | linear_conf_t *conf = mddev->private; |
| 308 | 266 | ||
| 267 | /* | ||
| 268 | * We do not require rcu protection here since | ||
| 269 | * we hold reconfig_mutex for both linear_add and | ||
| 270 | * linear_stop, so they cannot race. | ||
| 271 | * We should make sure any old 'conf's are properly | ||
| 272 | * freed though. | ||
| 273 | */ | ||
| 274 | rcu_barrier(); | ||
| 309 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | 275 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ |
| 310 | do { | 276 | kfree(conf); |
| 311 | linear_conf_t *t = conf->prev; | ||
| 312 | kfree(conf->hash_table); | ||
| 313 | kfree(conf); | ||
| 314 | conf = t; | ||
| 315 | } while (conf); | ||
| 316 | 277 | ||
| 317 | return 0; | 278 | return 0; |
| 318 | } | 279 | } |
| @@ -322,6 +283,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) | |||
| 322 | const int rw = bio_data_dir(bio); | 283 | const int rw = bio_data_dir(bio); |
| 323 | mddev_t *mddev = q->queuedata; | 284 | mddev_t *mddev = q->queuedata; |
| 324 | dev_info_t *tmp_dev; | 285 | dev_info_t *tmp_dev; |
| 286 | sector_t start_sector; | ||
| 325 | int cpu; | 287 | int cpu; |
| 326 | 288 | ||
| 327 | if (unlikely(bio_barrier(bio))) { | 289 | if (unlikely(bio_barrier(bio))) { |
| @@ -335,33 +297,36 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) | |||
| 335 | bio_sectors(bio)); | 297 | bio_sectors(bio)); |
| 336 | part_stat_unlock(); | 298 | part_stat_unlock(); |
| 337 | 299 | ||
| 300 | rcu_read_lock(); | ||
| 338 | tmp_dev = which_dev(mddev, bio->bi_sector); | 301 | tmp_dev = which_dev(mddev, bio->bi_sector); |
| 339 | 302 | start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; | |
| 340 | if (unlikely(bio->bi_sector >= (tmp_dev->num_sectors + | 303 | |
| 341 | tmp_dev->start_sector) | 304 | |
| 342 | || (bio->bi_sector < | 305 | if (unlikely(bio->bi_sector >= (tmp_dev->end_sector) |
| 343 | tmp_dev->start_sector))) { | 306 | || (bio->bi_sector < start_sector))) { |
| 344 | char b[BDEVNAME_SIZE]; | 307 | char b[BDEVNAME_SIZE]; |
| 345 | 308 | ||
| 346 | printk("linear_make_request: Sector %llu out of bounds on " | 309 | printk("linear_make_request: Sector %llu out of bounds on " |
| 347 | "dev %s: %llu sectors, offset %llu\n", | 310 | "dev %s: %llu sectors, offset %llu\n", |
| 348 | (unsigned long long)bio->bi_sector, | 311 | (unsigned long long)bio->bi_sector, |
| 349 | bdevname(tmp_dev->rdev->bdev, b), | 312 | bdevname(tmp_dev->rdev->bdev, b), |
| 350 | (unsigned long long)tmp_dev->num_sectors, | 313 | (unsigned long long)tmp_dev->rdev->sectors, |
| 351 | (unsigned long long)tmp_dev->start_sector); | 314 | (unsigned long long)start_sector); |
| 315 | rcu_read_unlock(); | ||
| 352 | bio_io_error(bio); | 316 | bio_io_error(bio); |
| 353 | return 0; | 317 | return 0; |
| 354 | } | 318 | } |
| 355 | if (unlikely(bio->bi_sector + (bio->bi_size >> 9) > | 319 | if (unlikely(bio->bi_sector + (bio->bi_size >> 9) > |
| 356 | tmp_dev->start_sector + tmp_dev->num_sectors)) { | 320 | tmp_dev->end_sector)) { |
| 357 | /* This bio crosses a device boundary, so we have to | 321 | /* This bio crosses a device boundary, so we have to |
| 358 | * split it. | 322 | * split it. |
| 359 | */ | 323 | */ |
| 360 | struct bio_pair *bp; | 324 | struct bio_pair *bp; |
| 325 | sector_t end_sector = tmp_dev->end_sector; | ||
| 326 | |||
| 327 | rcu_read_unlock(); | ||
| 361 | 328 | ||
| 362 | bp = bio_split(bio, | 329 | bp = bio_split(bio, end_sector - bio->bi_sector); |
| 363 | tmp_dev->start_sector + tmp_dev->num_sectors | ||
| 364 | - bio->bi_sector); | ||
| 365 | 330 | ||
| 366 | if (linear_make_request(q, &bp->bio1)) | 331 | if (linear_make_request(q, &bp->bio1)) |
| 367 | generic_make_request(&bp->bio1); | 332 | generic_make_request(&bp->bio1); |
| @@ -372,8 +337,9 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) | |||
| 372 | } | 337 | } |
| 373 | 338 | ||
| 374 | bio->bi_bdev = tmp_dev->rdev->bdev; | 339 | bio->bi_bdev = tmp_dev->rdev->bdev; |
| 375 | bio->bi_sector = bio->bi_sector - tmp_dev->start_sector | 340 | bio->bi_sector = bio->bi_sector - start_sector |
| 376 | + tmp_dev->rdev->data_offset; | 341 | + tmp_dev->rdev->data_offset; |
| 342 | rcu_read_unlock(); | ||
| 377 | 343 | ||
| 378 | return 1; | 344 | return 1; |
| 379 | } | 345 | } |
| @@ -381,7 +347,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) | |||
| 381 | static void linear_status (struct seq_file *seq, mddev_t *mddev) | 347 | static void linear_status (struct seq_file *seq, mddev_t *mddev) |
| 382 | { | 348 | { |
| 383 | 349 | ||
| 384 | seq_printf(seq, " %dk rounding", mddev->chunk_size/1024); | 350 | seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); |
| 385 | } | 351 | } |
| 386 | 352 | ||
| 387 | 353 | ||
diff --git a/drivers/md/linear.h b/drivers/md/linear.h index bf8179587f95..0ce29b61605a 100644 --- a/drivers/md/linear.h +++ b/drivers/md/linear.h | |||
| @@ -3,27 +3,19 @@ | |||
| 3 | 3 | ||
| 4 | struct dev_info { | 4 | struct dev_info { |
| 5 | mdk_rdev_t *rdev; | 5 | mdk_rdev_t *rdev; |
| 6 | sector_t num_sectors; | 6 | sector_t end_sector; |
| 7 | sector_t start_sector; | ||
| 8 | }; | 7 | }; |
| 9 | 8 | ||
| 10 | typedef struct dev_info dev_info_t; | 9 | typedef struct dev_info dev_info_t; |
| 11 | 10 | ||
| 12 | struct linear_private_data | 11 | struct linear_private_data |
| 13 | { | 12 | { |
| 14 | struct linear_private_data *prev; /* earlier version */ | ||
| 15 | dev_info_t **hash_table; | ||
| 16 | sector_t spacing; | ||
| 17 | sector_t array_sectors; | 13 | sector_t array_sectors; |
| 18 | int sector_shift; /* shift before dividing | ||
| 19 | * by spacing | ||
| 20 | */ | ||
| 21 | dev_info_t disks[0]; | 14 | dev_info_t disks[0]; |
| 15 | struct rcu_head rcu; | ||
| 22 | }; | 16 | }; |
| 23 | 17 | ||
| 24 | 18 | ||
| 25 | typedef struct linear_private_data linear_conf_t; | 19 | typedef struct linear_private_data linear_conf_t; |
| 26 | 20 | ||
| 27 | #define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private) | ||
| 28 | |||
| 29 | #endif | 21 | #endif |
diff --git a/drivers/md/md.c b/drivers/md/md.c index 641b211fe3fe..09be637d52cb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
| @@ -440,15 +440,6 @@ static inline sector_t calc_dev_sboffset(struct block_device *bdev) | |||
| 440 | return MD_NEW_SIZE_SECTORS(num_sectors); | 440 | return MD_NEW_SIZE_SECTORS(num_sectors); |
| 441 | } | 441 | } |
| 442 | 442 | ||
| 443 | static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size) | ||
| 444 | { | ||
| 445 | sector_t num_sectors = rdev->sb_start; | ||
| 446 | |||
| 447 | if (chunk_size) | ||
| 448 | num_sectors &= ~((sector_t)chunk_size/512 - 1); | ||
| 449 | return num_sectors; | ||
| 450 | } | ||
| 451 | |||
| 452 | static int alloc_disk_sb(mdk_rdev_t * rdev) | 443 | static int alloc_disk_sb(mdk_rdev_t * rdev) |
| 453 | { | 444 | { |
| 454 | if (rdev->sb_page) | 445 | if (rdev->sb_page) |
| @@ -745,6 +736,24 @@ struct super_type { | |||
| 745 | }; | 736 | }; |
| 746 | 737 | ||
| 747 | /* | 738 | /* |
| 739 | * Check that the given mddev has no bitmap. | ||
| 740 | * | ||
| 741 | * This function is called from the run method of all personalities that do not | ||
| 742 | * support bitmaps. It prints an error message and returns non-zero if mddev | ||
| 743 | * has a bitmap. Otherwise, it returns 0. | ||
| 744 | * | ||
| 745 | */ | ||
| 746 | int md_check_no_bitmap(mddev_t *mddev) | ||
| 747 | { | ||
| 748 | if (!mddev->bitmap_file && !mddev->bitmap_offset) | ||
| 749 | return 0; | ||
| 750 | printk(KERN_ERR "%s: bitmaps are not supported for %s\n", | ||
| 751 | mdname(mddev), mddev->pers->name); | ||
| 752 | return 1; | ||
| 753 | } | ||
| 754 | EXPORT_SYMBOL(md_check_no_bitmap); | ||
| 755 | |||
| 756 | /* | ||
| 748 | * load_super for 0.90.0 | 757 | * load_super for 0.90.0 |
| 749 | */ | 758 | */ |
| 750 | static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | 759 | static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) |
| @@ -797,17 +806,6 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
| 797 | rdev->data_offset = 0; | 806 | rdev->data_offset = 0; |
| 798 | rdev->sb_size = MD_SB_BYTES; | 807 | rdev->sb_size = MD_SB_BYTES; |
| 799 | 808 | ||
| 800 | if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) { | ||
| 801 | if (sb->level != 1 && sb->level != 4 | ||
| 802 | && sb->level != 5 && sb->level != 6 | ||
| 803 | && sb->level != 10) { | ||
| 804 | /* FIXME use a better test */ | ||
| 805 | printk(KERN_WARNING | ||
| 806 | "md: bitmaps not supported for this level.\n"); | ||
| 807 | goto abort; | ||
| 808 | } | ||
| 809 | } | ||
| 810 | |||
| 811 | if (sb->level == LEVEL_MULTIPATH) | 809 | if (sb->level == LEVEL_MULTIPATH) |
| 812 | rdev->desc_nr = -1; | 810 | rdev->desc_nr = -1; |
| 813 | else | 811 | else |
| @@ -836,7 +834,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
| 836 | else | 834 | else |
| 837 | ret = 0; | 835 | ret = 0; |
| 838 | } | 836 | } |
| 839 | rdev->sectors = calc_num_sectors(rdev, sb->chunk_size); | 837 | rdev->sectors = rdev->sb_start; |
| 840 | 838 | ||
| 841 | if (rdev->sectors < sb->size * 2 && sb->level > 1) | 839 | if (rdev->sectors < sb->size * 2 && sb->level > 1) |
| 842 | /* "this cannot possibly happen" ... */ | 840 | /* "this cannot possibly happen" ... */ |
| @@ -866,7 +864,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 866 | mddev->minor_version = sb->minor_version; | 864 | mddev->minor_version = sb->minor_version; |
| 867 | mddev->patch_version = sb->patch_version; | 865 | mddev->patch_version = sb->patch_version; |
| 868 | mddev->external = 0; | 866 | mddev->external = 0; |
| 869 | mddev->chunk_size = sb->chunk_size; | 867 | mddev->chunk_sectors = sb->chunk_size >> 9; |
| 870 | mddev->ctime = sb->ctime; | 868 | mddev->ctime = sb->ctime; |
| 871 | mddev->utime = sb->utime; | 869 | mddev->utime = sb->utime; |
| 872 | mddev->level = sb->level; | 870 | mddev->level = sb->level; |
| @@ -883,13 +881,13 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 883 | mddev->delta_disks = sb->delta_disks; | 881 | mddev->delta_disks = sb->delta_disks; |
| 884 | mddev->new_level = sb->new_level; | 882 | mddev->new_level = sb->new_level; |
| 885 | mddev->new_layout = sb->new_layout; | 883 | mddev->new_layout = sb->new_layout; |
| 886 | mddev->new_chunk = sb->new_chunk; | 884 | mddev->new_chunk_sectors = sb->new_chunk >> 9; |
| 887 | } else { | 885 | } else { |
| 888 | mddev->reshape_position = MaxSector; | 886 | mddev->reshape_position = MaxSector; |
| 889 | mddev->delta_disks = 0; | 887 | mddev->delta_disks = 0; |
| 890 | mddev->new_level = mddev->level; | 888 | mddev->new_level = mddev->level; |
| 891 | mddev->new_layout = mddev->layout; | 889 | mddev->new_layout = mddev->layout; |
| 892 | mddev->new_chunk = mddev->chunk_size; | 890 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
| 893 | } | 891 | } |
| 894 | 892 | ||
| 895 | if (sb->state & (1<<MD_SB_CLEAN)) | 893 | if (sb->state & (1<<MD_SB_CLEAN)) |
| @@ -1004,7 +1002,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1004 | sb->new_level = mddev->new_level; | 1002 | sb->new_level = mddev->new_level; |
| 1005 | sb->delta_disks = mddev->delta_disks; | 1003 | sb->delta_disks = mddev->delta_disks; |
| 1006 | sb->new_layout = mddev->new_layout; | 1004 | sb->new_layout = mddev->new_layout; |
| 1007 | sb->new_chunk = mddev->new_chunk; | 1005 | sb->new_chunk = mddev->new_chunk_sectors << 9; |
| 1008 | } | 1006 | } |
| 1009 | mddev->minor_version = sb->minor_version; | 1007 | mddev->minor_version = sb->minor_version; |
| 1010 | if (mddev->in_sync) | 1008 | if (mddev->in_sync) |
| @@ -1018,7 +1016,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1018 | sb->recovery_cp = 0; | 1016 | sb->recovery_cp = 0; |
| 1019 | 1017 | ||
| 1020 | sb->layout = mddev->layout; | 1018 | sb->layout = mddev->layout; |
| 1021 | sb->chunk_size = mddev->chunk_size; | 1019 | sb->chunk_size = mddev->chunk_sectors << 9; |
| 1022 | 1020 | ||
| 1023 | if (mddev->bitmap && mddev->bitmap_file == NULL) | 1021 | if (mddev->bitmap && mddev->bitmap_file == NULL) |
| 1024 | sb->state |= (1<<MD_SB_BITMAP_PRESENT); | 1022 | sb->state |= (1<<MD_SB_BITMAP_PRESENT); |
| @@ -1185,24 +1183,13 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
| 1185 | bdevname(rdev->bdev,b)); | 1183 | bdevname(rdev->bdev,b)); |
| 1186 | return -EINVAL; | 1184 | return -EINVAL; |
| 1187 | } | 1185 | } |
| 1188 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) { | ||
| 1189 | if (sb->level != cpu_to_le32(1) && | ||
| 1190 | sb->level != cpu_to_le32(4) && | ||
| 1191 | sb->level != cpu_to_le32(5) && | ||
| 1192 | sb->level != cpu_to_le32(6) && | ||
| 1193 | sb->level != cpu_to_le32(10)) { | ||
| 1194 | printk(KERN_WARNING | ||
| 1195 | "md: bitmaps not supported for this level.\n"); | ||
| 1196 | return -EINVAL; | ||
| 1197 | } | ||
| 1198 | } | ||
| 1199 | 1186 | ||
| 1200 | rdev->preferred_minor = 0xffff; | 1187 | rdev->preferred_minor = 0xffff; |
| 1201 | rdev->data_offset = le64_to_cpu(sb->data_offset); | 1188 | rdev->data_offset = le64_to_cpu(sb->data_offset); |
| 1202 | atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); | 1189 | atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); |
| 1203 | 1190 | ||
| 1204 | rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; | 1191 | rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; |
| 1205 | bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; | 1192 | bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; |
| 1206 | if (rdev->sb_size & bmask) | 1193 | if (rdev->sb_size & bmask) |
| 1207 | rdev->sb_size = (rdev->sb_size | bmask) + 1; | 1194 | rdev->sb_size = (rdev->sb_size | bmask) + 1; |
| 1208 | 1195 | ||
| @@ -1248,9 +1235,6 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
| 1248 | if (rdev->sectors < le64_to_cpu(sb->data_size)) | 1235 | if (rdev->sectors < le64_to_cpu(sb->data_size)) |
| 1249 | return -EINVAL; | 1236 | return -EINVAL; |
| 1250 | rdev->sectors = le64_to_cpu(sb->data_size); | 1237 | rdev->sectors = le64_to_cpu(sb->data_size); |
| 1251 | if (le32_to_cpu(sb->chunksize)) | ||
| 1252 | rdev->sectors &= ~((sector_t)le32_to_cpu(sb->chunksize) - 1); | ||
| 1253 | |||
| 1254 | if (le64_to_cpu(sb->size) > rdev->sectors) | 1238 | if (le64_to_cpu(sb->size) > rdev->sectors) |
| 1255 | return -EINVAL; | 1239 | return -EINVAL; |
| 1256 | return ret; | 1240 | return ret; |
| @@ -1271,7 +1255,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1271 | mddev->major_version = 1; | 1255 | mddev->major_version = 1; |
| 1272 | mddev->patch_version = 0; | 1256 | mddev->patch_version = 0; |
| 1273 | mddev->external = 0; | 1257 | mddev->external = 0; |
| 1274 | mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; | 1258 | mddev->chunk_sectors = le32_to_cpu(sb->chunksize); |
| 1275 | mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); | 1259 | mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); |
| 1276 | mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); | 1260 | mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); |
| 1277 | mddev->level = le32_to_cpu(sb->level); | 1261 | mddev->level = le32_to_cpu(sb->level); |
| @@ -1297,13 +1281,13 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1297 | mddev->delta_disks = le32_to_cpu(sb->delta_disks); | 1281 | mddev->delta_disks = le32_to_cpu(sb->delta_disks); |
| 1298 | mddev->new_level = le32_to_cpu(sb->new_level); | 1282 | mddev->new_level = le32_to_cpu(sb->new_level); |
| 1299 | mddev->new_layout = le32_to_cpu(sb->new_layout); | 1283 | mddev->new_layout = le32_to_cpu(sb->new_layout); |
| 1300 | mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9; | 1284 | mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); |
| 1301 | } else { | 1285 | } else { |
| 1302 | mddev->reshape_position = MaxSector; | 1286 | mddev->reshape_position = MaxSector; |
| 1303 | mddev->delta_disks = 0; | 1287 | mddev->delta_disks = 0; |
| 1304 | mddev->new_level = mddev->level; | 1288 | mddev->new_level = mddev->level; |
| 1305 | mddev->new_layout = mddev->layout; | 1289 | mddev->new_layout = mddev->layout; |
| 1306 | mddev->new_chunk = mddev->chunk_size; | 1290 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
| 1307 | } | 1291 | } |
| 1308 | 1292 | ||
| 1309 | } else if (mddev->pers == NULL) { | 1293 | } else if (mddev->pers == NULL) { |
| @@ -1375,7 +1359,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1375 | 1359 | ||
| 1376 | sb->raid_disks = cpu_to_le32(mddev->raid_disks); | 1360 | sb->raid_disks = cpu_to_le32(mddev->raid_disks); |
| 1377 | sb->size = cpu_to_le64(mddev->dev_sectors); | 1361 | sb->size = cpu_to_le64(mddev->dev_sectors); |
| 1378 | sb->chunksize = cpu_to_le32(mddev->chunk_size >> 9); | 1362 | sb->chunksize = cpu_to_le32(mddev->chunk_sectors); |
| 1379 | sb->level = cpu_to_le32(mddev->level); | 1363 | sb->level = cpu_to_le32(mddev->level); |
| 1380 | sb->layout = cpu_to_le32(mddev->layout); | 1364 | sb->layout = cpu_to_le32(mddev->layout); |
| 1381 | 1365 | ||
| @@ -1402,7 +1386,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1402 | sb->new_layout = cpu_to_le32(mddev->new_layout); | 1386 | sb->new_layout = cpu_to_le32(mddev->new_layout); |
| 1403 | sb->delta_disks = cpu_to_le32(mddev->delta_disks); | 1387 | sb->delta_disks = cpu_to_le32(mddev->delta_disks); |
| 1404 | sb->new_level = cpu_to_le32(mddev->new_level); | 1388 | sb->new_level = cpu_to_le32(mddev->new_level); |
| 1405 | sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); | 1389 | sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); |
| 1406 | } | 1390 | } |
| 1407 | 1391 | ||
| 1408 | max_dev = 0; | 1392 | max_dev = 0; |
| @@ -1897,6 +1881,7 @@ static void md_update_sb(mddev_t * mddev, int force_change) | |||
| 1897 | int sync_req; | 1881 | int sync_req; |
| 1898 | int nospares = 0; | 1882 | int nospares = 0; |
| 1899 | 1883 | ||
| 1884 | mddev->utime = get_seconds(); | ||
| 1900 | if (mddev->external) | 1885 | if (mddev->external) |
| 1901 | return; | 1886 | return; |
| 1902 | repeat: | 1887 | repeat: |
| @@ -1926,7 +1911,6 @@ repeat: | |||
| 1926 | nospares = 0; | 1911 | nospares = 0; |
| 1927 | 1912 | ||
| 1928 | sync_req = mddev->in_sync; | 1913 | sync_req = mddev->in_sync; |
| 1929 | mddev->utime = get_seconds(); | ||
| 1930 | 1914 | ||
| 1931 | /* If this is just a dirty<->clean transition, and the array is clean | 1915 | /* If this is just a dirty<->clean transition, and the array is clean |
| 1932 | * and 'events' is odd, we can roll back to the previous clean state */ | 1916 | * and 'events' is odd, we can roll back to the previous clean state */ |
| @@ -2597,15 +2581,6 @@ static void analyze_sbs(mddev_t * mddev) | |||
| 2597 | clear_bit(In_sync, &rdev->flags); | 2581 | clear_bit(In_sync, &rdev->flags); |
| 2598 | } | 2582 | } |
| 2599 | } | 2583 | } |
| 2600 | |||
| 2601 | |||
| 2602 | |||
| 2603 | if (mddev->recovery_cp != MaxSector && | ||
| 2604 | mddev->level >= 1) | ||
| 2605 | printk(KERN_ERR "md: %s: raid array is not clean" | ||
| 2606 | " -- starting background reconstruction\n", | ||
| 2607 | mdname(mddev)); | ||
| 2608 | |||
| 2609 | } | 2584 | } |
| 2610 | 2585 | ||
| 2611 | static void md_safemode_timeout(unsigned long data); | 2586 | static void md_safemode_timeout(unsigned long data); |
| @@ -2746,7 +2721,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
| 2746 | if (IS_ERR(priv)) { | 2721 | if (IS_ERR(priv)) { |
| 2747 | mddev->new_level = mddev->level; | 2722 | mddev->new_level = mddev->level; |
| 2748 | mddev->new_layout = mddev->layout; | 2723 | mddev->new_layout = mddev->layout; |
| 2749 | mddev->new_chunk = mddev->chunk_size; | 2724 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
| 2750 | mddev->raid_disks -= mddev->delta_disks; | 2725 | mddev->raid_disks -= mddev->delta_disks; |
| 2751 | mddev->delta_disks = 0; | 2726 | mddev->delta_disks = 0; |
| 2752 | module_put(pers->owner); | 2727 | module_put(pers->owner); |
| @@ -2764,7 +2739,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
| 2764 | strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); | 2739 | strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); |
| 2765 | mddev->level = mddev->new_level; | 2740 | mddev->level = mddev->new_level; |
| 2766 | mddev->layout = mddev->new_layout; | 2741 | mddev->layout = mddev->new_layout; |
| 2767 | mddev->chunk_size = mddev->new_chunk; | 2742 | mddev->chunk_sectors = mddev->new_chunk_sectors; |
| 2768 | mddev->delta_disks = 0; | 2743 | mddev->delta_disks = 0; |
| 2769 | pers->run(mddev); | 2744 | pers->run(mddev); |
| 2770 | mddev_resume(mddev); | 2745 | mddev_resume(mddev); |
| @@ -2800,11 +2775,14 @@ layout_store(mddev_t *mddev, const char *buf, size_t len) | |||
| 2800 | 2775 | ||
| 2801 | if (mddev->pers) { | 2776 | if (mddev->pers) { |
| 2802 | int err; | 2777 | int err; |
| 2803 | if (mddev->pers->reconfig == NULL) | 2778 | if (mddev->pers->check_reshape == NULL) |
| 2804 | return -EBUSY; | 2779 | return -EBUSY; |
| 2805 | err = mddev->pers->reconfig(mddev, n, -1); | 2780 | mddev->new_layout = n; |
| 2806 | if (err) | 2781 | err = mddev->pers->check_reshape(mddev); |
| 2782 | if (err) { | ||
| 2783 | mddev->new_layout = mddev->layout; | ||
| 2807 | return err; | 2784 | return err; |
| 2785 | } | ||
| 2808 | } else { | 2786 | } else { |
| 2809 | mddev->new_layout = n; | 2787 | mddev->new_layout = n; |
| 2810 | if (mddev->reshape_position == MaxSector) | 2788 | if (mddev->reshape_position == MaxSector) |
| @@ -2857,10 +2835,11 @@ static ssize_t | |||
| 2857 | chunk_size_show(mddev_t *mddev, char *page) | 2835 | chunk_size_show(mddev_t *mddev, char *page) |
| 2858 | { | 2836 | { |
| 2859 | if (mddev->reshape_position != MaxSector && | 2837 | if (mddev->reshape_position != MaxSector && |
| 2860 | mddev->chunk_size != mddev->new_chunk) | 2838 | mddev->chunk_sectors != mddev->new_chunk_sectors) |
| 2861 | return sprintf(page, "%d (%d)\n", mddev->new_chunk, | 2839 | return sprintf(page, "%d (%d)\n", |
| 2862 | mddev->chunk_size); | 2840 | mddev->new_chunk_sectors << 9, |
| 2863 | return sprintf(page, "%d\n", mddev->chunk_size); | 2841 | mddev->chunk_sectors << 9); |
| 2842 | return sprintf(page, "%d\n", mddev->chunk_sectors << 9); | ||
| 2864 | } | 2843 | } |
| 2865 | 2844 | ||
| 2866 | static ssize_t | 2845 | static ssize_t |
| @@ -2874,15 +2853,18 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len) | |||
| 2874 | 2853 | ||
| 2875 | if (mddev->pers) { | 2854 | if (mddev->pers) { |
| 2876 | int err; | 2855 | int err; |
| 2877 | if (mddev->pers->reconfig == NULL) | 2856 | if (mddev->pers->check_reshape == NULL) |
| 2878 | return -EBUSY; | 2857 | return -EBUSY; |
| 2879 | err = mddev->pers->reconfig(mddev, -1, n); | 2858 | mddev->new_chunk_sectors = n >> 9; |
| 2880 | if (err) | 2859 | err = mddev->pers->check_reshape(mddev); |
| 2860 | if (err) { | ||
| 2861 | mddev->new_chunk_sectors = mddev->chunk_sectors; | ||
| 2881 | return err; | 2862 | return err; |
| 2863 | } | ||
| 2882 | } else { | 2864 | } else { |
| 2883 | mddev->new_chunk = n; | 2865 | mddev->new_chunk_sectors = n >> 9; |
| 2884 | if (mddev->reshape_position == MaxSector) | 2866 | if (mddev->reshape_position == MaxSector) |
| 2885 | mddev->chunk_size = n; | 2867 | mddev->chunk_sectors = n >> 9; |
| 2886 | } | 2868 | } |
| 2887 | return len; | 2869 | return len; |
| 2888 | } | 2870 | } |
| @@ -3527,8 +3509,9 @@ min_sync_store(mddev_t *mddev, const char *buf, size_t len) | |||
| 3527 | return -EBUSY; | 3509 | return -EBUSY; |
| 3528 | 3510 | ||
| 3529 | /* Must be a multiple of chunk_size */ | 3511 | /* Must be a multiple of chunk_size */ |
| 3530 | if (mddev->chunk_size) { | 3512 | if (mddev->chunk_sectors) { |
| 3531 | if (min & (sector_t)((mddev->chunk_size>>9)-1)) | 3513 | sector_t temp = min; |
| 3514 | if (sector_div(temp, mddev->chunk_sectors)) | ||
| 3532 | return -EINVAL; | 3515 | return -EINVAL; |
| 3533 | } | 3516 | } |
| 3534 | mddev->resync_min = min; | 3517 | mddev->resync_min = min; |
| @@ -3564,8 +3547,9 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len) | |||
| 3564 | return -EBUSY; | 3547 | return -EBUSY; |
| 3565 | 3548 | ||
| 3566 | /* Must be a multiple of chunk_size */ | 3549 | /* Must be a multiple of chunk_size */ |
| 3567 | if (mddev->chunk_size) { | 3550 | if (mddev->chunk_sectors) { |
| 3568 | if (max & (sector_t)((mddev->chunk_size>>9)-1)) | 3551 | sector_t temp = max; |
| 3552 | if (sector_div(temp, mddev->chunk_sectors)) | ||
| 3569 | return -EINVAL; | 3553 | return -EINVAL; |
| 3570 | } | 3554 | } |
| 3571 | mddev->resync_max = max; | 3555 | mddev->resync_max = max; |
| @@ -3656,7 +3640,7 @@ reshape_position_store(mddev_t *mddev, const char *buf, size_t len) | |||
| 3656 | mddev->delta_disks = 0; | 3640 | mddev->delta_disks = 0; |
| 3657 | mddev->new_level = mddev->level; | 3641 | mddev->new_level = mddev->level; |
| 3658 | mddev->new_layout = mddev->layout; | 3642 | mddev->new_layout = mddev->layout; |
| 3659 | mddev->new_chunk = mddev->chunk_size; | 3643 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
| 3660 | return len; | 3644 | return len; |
| 3661 | } | 3645 | } |
| 3662 | 3646 | ||
| @@ -3976,11 +3960,9 @@ static int start_dirty_degraded; | |||
| 3976 | static int do_md_run(mddev_t * mddev) | 3960 | static int do_md_run(mddev_t * mddev) |
| 3977 | { | 3961 | { |
| 3978 | int err; | 3962 | int err; |
| 3979 | int chunk_size; | ||
| 3980 | mdk_rdev_t *rdev; | 3963 | mdk_rdev_t *rdev; |
| 3981 | struct gendisk *disk; | 3964 | struct gendisk *disk; |
| 3982 | struct mdk_personality *pers; | 3965 | struct mdk_personality *pers; |
| 3983 | char b[BDEVNAME_SIZE]; | ||
| 3984 | 3966 | ||
| 3985 | if (list_empty(&mddev->disks)) | 3967 | if (list_empty(&mddev->disks)) |
| 3986 | /* cannot run an array with no devices.. */ | 3968 | /* cannot run an array with no devices.. */ |
| @@ -3998,38 +3980,6 @@ static int do_md_run(mddev_t * mddev) | |||
| 3998 | analyze_sbs(mddev); | 3980 | analyze_sbs(mddev); |
| 3999 | } | 3981 | } |
| 4000 | 3982 | ||
| 4001 | chunk_size = mddev->chunk_size; | ||
| 4002 | |||
| 4003 | if (chunk_size) { | ||
| 4004 | if (chunk_size > MAX_CHUNK_SIZE) { | ||
| 4005 | printk(KERN_ERR "too big chunk_size: %d > %d\n", | ||
| 4006 | chunk_size, MAX_CHUNK_SIZE); | ||
| 4007 | return -EINVAL; | ||
| 4008 | } | ||
| 4009 | /* | ||
| 4010 | * chunk-size has to be a power of 2 | ||
| 4011 | */ | ||
| 4012 | if ( (1 << ffz(~chunk_size)) != chunk_size) { | ||
| 4013 | printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); | ||
| 4014 | return -EINVAL; | ||
| 4015 | } | ||
| 4016 | |||
| 4017 | /* devices must have minimum size of one chunk */ | ||
| 4018 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
| 4019 | if (test_bit(Faulty, &rdev->flags)) | ||
| 4020 | continue; | ||
| 4021 | if (rdev->sectors < chunk_size / 512) { | ||
| 4022 | printk(KERN_WARNING | ||
| 4023 | "md: Dev %s smaller than chunk_size:" | ||
| 4024 | " %llu < %d\n", | ||
| 4025 | bdevname(rdev->bdev,b), | ||
| 4026 | (unsigned long long)rdev->sectors, | ||
| 4027 | chunk_size / 512); | ||
| 4028 | return -EINVAL; | ||
| 4029 | } | ||
| 4030 | } | ||
| 4031 | } | ||
| 4032 | |||
| 4033 | if (mddev->level != LEVEL_NONE) | 3983 | if (mddev->level != LEVEL_NONE) |
| 4034 | request_module("md-level-%d", mddev->level); | 3984 | request_module("md-level-%d", mddev->level); |
| 4035 | else if (mddev->clevel[0]) | 3985 | else if (mddev->clevel[0]) |
| @@ -4405,7 +4355,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
| 4405 | mddev->flags = 0; | 4355 | mddev->flags = 0; |
| 4406 | mddev->ro = 0; | 4356 | mddev->ro = 0; |
| 4407 | mddev->metadata_type[0] = 0; | 4357 | mddev->metadata_type[0] = 0; |
| 4408 | mddev->chunk_size = 0; | 4358 | mddev->chunk_sectors = 0; |
| 4409 | mddev->ctime = mddev->utime = 0; | 4359 | mddev->ctime = mddev->utime = 0; |
| 4410 | mddev->layout = 0; | 4360 | mddev->layout = 0; |
| 4411 | mddev->max_disks = 0; | 4361 | mddev->max_disks = 0; |
| @@ -4413,7 +4363,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
| 4413 | mddev->delta_disks = 0; | 4363 | mddev->delta_disks = 0; |
| 4414 | mddev->new_level = LEVEL_NONE; | 4364 | mddev->new_level = LEVEL_NONE; |
| 4415 | mddev->new_layout = 0; | 4365 | mddev->new_layout = 0; |
| 4416 | mddev->new_chunk = 0; | 4366 | mddev->new_chunk_sectors = 0; |
| 4417 | mddev->curr_resync = 0; | 4367 | mddev->curr_resync = 0; |
| 4418 | mddev->resync_mismatches = 0; | 4368 | mddev->resync_mismatches = 0; |
| 4419 | mddev->suspend_lo = mddev->suspend_hi = 0; | 4369 | mddev->suspend_lo = mddev->suspend_hi = 0; |
| @@ -4618,7 +4568,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg) | |||
| 4618 | info.spare_disks = spare; | 4568 | info.spare_disks = spare; |
| 4619 | 4569 | ||
| 4620 | info.layout = mddev->layout; | 4570 | info.layout = mddev->layout; |
| 4621 | info.chunk_size = mddev->chunk_size; | 4571 | info.chunk_size = mddev->chunk_sectors << 9; |
| 4622 | 4572 | ||
| 4623 | if (copy_to_user(arg, &info, sizeof(info))) | 4573 | if (copy_to_user(arg, &info, sizeof(info))) |
| 4624 | return -EFAULT; | 4574 | return -EFAULT; |
| @@ -4843,7 +4793,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
| 4843 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; | 4793 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; |
| 4844 | } else | 4794 | } else |
| 4845 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); | 4795 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); |
| 4846 | rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); | 4796 | rdev->sectors = rdev->sb_start; |
| 4847 | 4797 | ||
| 4848 | err = bind_rdev_to_array(rdev, mddev); | 4798 | err = bind_rdev_to_array(rdev, mddev); |
| 4849 | if (err) { | 4799 | if (err) { |
| @@ -4913,7 +4863,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) | |||
| 4913 | else | 4863 | else |
| 4914 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; | 4864 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; |
| 4915 | 4865 | ||
| 4916 | rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); | 4866 | rdev->sectors = rdev->sb_start; |
| 4917 | 4867 | ||
| 4918 | if (test_bit(Faulty, &rdev->flags)) { | 4868 | if (test_bit(Faulty, &rdev->flags)) { |
| 4919 | printk(KERN_WARNING | 4869 | printk(KERN_WARNING |
| @@ -5062,7 +5012,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) | |||
| 5062 | mddev->external = 0; | 5012 | mddev->external = 0; |
| 5063 | 5013 | ||
| 5064 | mddev->layout = info->layout; | 5014 | mddev->layout = info->layout; |
| 5065 | mddev->chunk_size = info->chunk_size; | 5015 | mddev->chunk_sectors = info->chunk_size >> 9; |
| 5066 | 5016 | ||
| 5067 | mddev->max_disks = MD_SB_DISKS; | 5017 | mddev->max_disks = MD_SB_DISKS; |
| 5068 | 5018 | ||
| @@ -5081,7 +5031,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) | |||
| 5081 | get_random_bytes(mddev->uuid, 16); | 5031 | get_random_bytes(mddev->uuid, 16); |
| 5082 | 5032 | ||
| 5083 | mddev->new_level = mddev->level; | 5033 | mddev->new_level = mddev->level; |
| 5084 | mddev->new_chunk = mddev->chunk_size; | 5034 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
| 5085 | mddev->new_layout = mddev->layout; | 5035 | mddev->new_layout = mddev->layout; |
| 5086 | mddev->delta_disks = 0; | 5036 | mddev->delta_disks = 0; |
| 5087 | 5037 | ||
| @@ -5191,7 +5141,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | |||
| 5191 | mddev->level != info->level || | 5141 | mddev->level != info->level || |
| 5192 | /* mddev->layout != info->layout || */ | 5142 | /* mddev->layout != info->layout || */ |
| 5193 | !mddev->persistent != info->not_persistent|| | 5143 | !mddev->persistent != info->not_persistent|| |
| 5194 | mddev->chunk_size != info->chunk_size || | 5144 | mddev->chunk_sectors != info->chunk_size >> 9 || |
| 5195 | /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ | 5145 | /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ |
| 5196 | ((state^info->state) & 0xfffffe00) | 5146 | ((state^info->state) & 0xfffffe00) |
| 5197 | ) | 5147 | ) |
| @@ -5215,10 +5165,15 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | |||
| 5215 | * we don't need to do anything at the md level, the | 5165 | * we don't need to do anything at the md level, the |
| 5216 | * personality will take care of it all. | 5166 | * personality will take care of it all. |
| 5217 | */ | 5167 | */ |
| 5218 | if (mddev->pers->reconfig == NULL) | 5168 | if (mddev->pers->check_reshape == NULL) |
| 5219 | return -EINVAL; | 5169 | return -EINVAL; |
| 5220 | else | 5170 | else { |
| 5221 | return mddev->pers->reconfig(mddev, info->layout, -1); | 5171 | mddev->new_layout = info->layout; |
| 5172 | rv = mddev->pers->check_reshape(mddev); | ||
| 5173 | if (rv) | ||
| 5174 | mddev->new_layout = mddev->layout; | ||
| 5175 | return rv; | ||
| 5176 | } | ||
| 5222 | } | 5177 | } |
| 5223 | if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) | 5178 | if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) |
| 5224 | rv = update_size(mddev, (sector_t)info->size * 2); | 5179 | rv = update_size(mddev, (sector_t)info->size * 2); |
| @@ -6717,7 +6672,8 @@ void md_check_recovery(mddev_t *mddev) | |||
| 6717 | */ | 6672 | */ |
| 6718 | 6673 | ||
| 6719 | if (mddev->reshape_position != MaxSector) { | 6674 | if (mddev->reshape_position != MaxSector) { |
| 6720 | if (mddev->pers->check_reshape(mddev) != 0) | 6675 | if (mddev->pers->check_reshape == NULL || |
| 6676 | mddev->pers->check_reshape(mddev) != 0) | ||
| 6721 | /* Cannot proceed */ | 6677 | /* Cannot proceed */ |
| 6722 | goto unlock; | 6678 | goto unlock; |
| 6723 | set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | 6679 | set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); |
diff --git a/drivers/md/md.h b/drivers/md/md.h index 8227ab909d44..9430a110db93 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
| @@ -30,13 +30,6 @@ typedef struct mddev_s mddev_t; | |||
| 30 | typedef struct mdk_rdev_s mdk_rdev_t; | 30 | typedef struct mdk_rdev_s mdk_rdev_t; |
| 31 | 31 | ||
| 32 | /* | 32 | /* |
| 33 | * options passed in raidrun: | ||
| 34 | */ | ||
| 35 | |||
| 36 | /* Currently this must fit in an 'int' */ | ||
| 37 | #define MAX_CHUNK_SIZE (1<<30) | ||
| 38 | |||
| 39 | /* | ||
| 40 | * MD's 'extended' device | 33 | * MD's 'extended' device |
| 41 | */ | 34 | */ |
| 42 | struct mdk_rdev_s | 35 | struct mdk_rdev_s |
| @@ -145,7 +138,7 @@ struct mddev_s | |||
| 145 | int external; /* metadata is | 138 | int external; /* metadata is |
| 146 | * managed externally */ | 139 | * managed externally */ |
| 147 | char metadata_type[17]; /* externally set*/ | 140 | char metadata_type[17]; /* externally set*/ |
| 148 | int chunk_size; | 141 | int chunk_sectors; |
| 149 | time_t ctime, utime; | 142 | time_t ctime, utime; |
| 150 | int level, layout; | 143 | int level, layout; |
| 151 | char clevel[16]; | 144 | char clevel[16]; |
| @@ -166,7 +159,8 @@ struct mddev_s | |||
| 166 | * If reshape_position is MaxSector, then no reshape is happening (yet). | 159 | * If reshape_position is MaxSector, then no reshape is happening (yet). |
| 167 | */ | 160 | */ |
| 168 | sector_t reshape_position; | 161 | sector_t reshape_position; |
| 169 | int delta_disks, new_level, new_layout, new_chunk; | 162 | int delta_disks, new_level, new_layout; |
| 163 | int new_chunk_sectors; | ||
| 170 | 164 | ||
| 171 | struct mdk_thread_s *thread; /* management thread */ | 165 | struct mdk_thread_s *thread; /* management thread */ |
| 172 | struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ | 166 | struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ |
| @@ -325,7 +319,6 @@ struct mdk_personality | |||
| 325 | int (*check_reshape) (mddev_t *mddev); | 319 | int (*check_reshape) (mddev_t *mddev); |
| 326 | int (*start_reshape) (mddev_t *mddev); | 320 | int (*start_reshape) (mddev_t *mddev); |
| 327 | void (*finish_reshape) (mddev_t *mddev); | 321 | void (*finish_reshape) (mddev_t *mddev); |
| 328 | int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); | ||
| 329 | /* quiesce moves between quiescence states | 322 | /* quiesce moves between quiescence states |
| 330 | * 0 - fully active | 323 | * 0 - fully active |
| 331 | * 1 - no new requests allowed | 324 | * 1 - no new requests allowed |
| @@ -437,5 +430,6 @@ extern void md_new_event(mddev_t *mddev); | |||
| 437 | extern int md_allow_write(mddev_t *mddev); | 430 | extern int md_allow_write(mddev_t *mddev); |
| 438 | extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); | 431 | extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); |
| 439 | extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); | 432 | extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); |
| 433 | extern int md_check_no_bitmap(mddev_t *mddev); | ||
| 440 | 434 | ||
| 441 | #endif /* _MD_MD_H */ | 435 | #endif /* _MD_MD_H */ |
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 41ced0cbe823..cbe368fa6598 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c | |||
| @@ -58,7 +58,7 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh) | |||
| 58 | { | 58 | { |
| 59 | unsigned long flags; | 59 | unsigned long flags; |
| 60 | mddev_t *mddev = mp_bh->mddev; | 60 | mddev_t *mddev = mp_bh->mddev; |
| 61 | multipath_conf_t *conf = mddev_to_conf(mddev); | 61 | multipath_conf_t *conf = mddev->private; |
| 62 | 62 | ||
| 63 | spin_lock_irqsave(&conf->device_lock, flags); | 63 | spin_lock_irqsave(&conf->device_lock, flags); |
| 64 | list_add(&mp_bh->retry_list, &conf->retry_list); | 64 | list_add(&mp_bh->retry_list, &conf->retry_list); |
| @@ -75,7 +75,7 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh) | |||
| 75 | static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) | 75 | static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) |
| 76 | { | 76 | { |
| 77 | struct bio *bio = mp_bh->master_bio; | 77 | struct bio *bio = mp_bh->master_bio; |
| 78 | multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); | 78 | multipath_conf_t *conf = mp_bh->mddev->private; |
| 79 | 79 | ||
| 80 | bio_endio(bio, err); | 80 | bio_endio(bio, err); |
| 81 | mempool_free(mp_bh, conf->pool); | 81 | mempool_free(mp_bh, conf->pool); |
| @@ -85,7 +85,7 @@ static void multipath_end_request(struct bio *bio, int error) | |||
| 85 | { | 85 | { |
| 86 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 86 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
| 87 | struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); | 87 | struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); |
| 88 | multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); | 88 | multipath_conf_t *conf = mp_bh->mddev->private; |
| 89 | mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; | 89 | mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; |
| 90 | 90 | ||
| 91 | if (uptodate) | 91 | if (uptodate) |
| @@ -107,7 +107,7 @@ static void multipath_end_request(struct bio *bio, int error) | |||
| 107 | 107 | ||
| 108 | static void unplug_slaves(mddev_t *mddev) | 108 | static void unplug_slaves(mddev_t *mddev) |
| 109 | { | 109 | { |
| 110 | multipath_conf_t *conf = mddev_to_conf(mddev); | 110 | multipath_conf_t *conf = mddev->private; |
| 111 | int i; | 111 | int i; |
| 112 | 112 | ||
| 113 | rcu_read_lock(); | 113 | rcu_read_lock(); |
| @@ -138,7 +138,7 @@ static void multipath_unplug(struct request_queue *q) | |||
| 138 | static int multipath_make_request (struct request_queue *q, struct bio * bio) | 138 | static int multipath_make_request (struct request_queue *q, struct bio * bio) |
| 139 | { | 139 | { |
| 140 | mddev_t *mddev = q->queuedata; | 140 | mddev_t *mddev = q->queuedata; |
| 141 | multipath_conf_t *conf = mddev_to_conf(mddev); | 141 | multipath_conf_t *conf = mddev->private; |
| 142 | struct multipath_bh * mp_bh; | 142 | struct multipath_bh * mp_bh; |
| 143 | struct multipath_info *multipath; | 143 | struct multipath_info *multipath; |
| 144 | const int rw = bio_data_dir(bio); | 144 | const int rw = bio_data_dir(bio); |
| @@ -180,7 +180,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio) | |||
| 180 | 180 | ||
| 181 | static void multipath_status (struct seq_file *seq, mddev_t *mddev) | 181 | static void multipath_status (struct seq_file *seq, mddev_t *mddev) |
| 182 | { | 182 | { |
| 183 | multipath_conf_t *conf = mddev_to_conf(mddev); | 183 | multipath_conf_t *conf = mddev->private; |
| 184 | int i; | 184 | int i; |
| 185 | 185 | ||
| 186 | seq_printf (seq, " [%d/%d] [", conf->raid_disks, | 186 | seq_printf (seq, " [%d/%d] [", conf->raid_disks, |
| @@ -195,7 +195,7 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev) | |||
| 195 | static int multipath_congested(void *data, int bits) | 195 | static int multipath_congested(void *data, int bits) |
| 196 | { | 196 | { |
| 197 | mddev_t *mddev = data; | 197 | mddev_t *mddev = data; |
| 198 | multipath_conf_t *conf = mddev_to_conf(mddev); | 198 | multipath_conf_t *conf = mddev->private; |
| 199 | int i, ret = 0; | 199 | int i, ret = 0; |
| 200 | 200 | ||
| 201 | rcu_read_lock(); | 201 | rcu_read_lock(); |
| @@ -220,7 +220,7 @@ static int multipath_congested(void *data, int bits) | |||
| 220 | */ | 220 | */ |
| 221 | static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) | 221 | static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) |
| 222 | { | 222 | { |
| 223 | multipath_conf_t *conf = mddev_to_conf(mddev); | 223 | multipath_conf_t *conf = mddev->private; |
| 224 | 224 | ||
| 225 | if (conf->working_disks <= 1) { | 225 | if (conf->working_disks <= 1) { |
| 226 | /* | 226 | /* |
| @@ -303,7 +303,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 303 | * merge_bvec_fn will be involved in multipath.) | 303 | * merge_bvec_fn will be involved in multipath.) |
| 304 | */ | 304 | */ |
| 305 | if (q->merge_bvec_fn && | 305 | if (q->merge_bvec_fn && |
| 306 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 306 | queue_max_sectors(q) > (PAGE_SIZE>>9)) |
| 307 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 307 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
| 308 | 308 | ||
| 309 | conf->working_disks++; | 309 | conf->working_disks++; |
| @@ -367,7 +367,7 @@ static void multipathd (mddev_t *mddev) | |||
| 367 | struct multipath_bh *mp_bh; | 367 | struct multipath_bh *mp_bh; |
| 368 | struct bio *bio; | 368 | struct bio *bio; |
| 369 | unsigned long flags; | 369 | unsigned long flags; |
| 370 | multipath_conf_t *conf = mddev_to_conf(mddev); | 370 | multipath_conf_t *conf = mddev->private; |
| 371 | struct list_head *head = &conf->retry_list; | 371 | struct list_head *head = &conf->retry_list; |
| 372 | 372 | ||
| 373 | md_check_recovery(mddev); | 373 | md_check_recovery(mddev); |
| @@ -421,6 +421,9 @@ static int multipath_run (mddev_t *mddev) | |||
| 421 | struct multipath_info *disk; | 421 | struct multipath_info *disk; |
| 422 | mdk_rdev_t *rdev; | 422 | mdk_rdev_t *rdev; |
| 423 | 423 | ||
| 424 | if (md_check_no_bitmap(mddev)) | ||
| 425 | return -EINVAL; | ||
| 426 | |||
| 424 | if (mddev->level != LEVEL_MULTIPATH) { | 427 | if (mddev->level != LEVEL_MULTIPATH) { |
| 425 | printk("multipath: %s: raid level not set to multipath IO (%d)\n", | 428 | printk("multipath: %s: raid level not set to multipath IO (%d)\n", |
| 426 | mdname(mddev), mddev->level); | 429 | mdname(mddev), mddev->level); |
| @@ -467,7 +470,7 @@ static int multipath_run (mddev_t *mddev) | |||
| 467 | * violating it, not that we ever expect a device with | 470 | * violating it, not that we ever expect a device with |
| 468 | * a merge_bvec_fn to be involved in multipath */ | 471 | * a merge_bvec_fn to be involved in multipath */ |
| 469 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 472 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && |
| 470 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 473 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) |
| 471 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 474 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
| 472 | 475 | ||
| 473 | if (!test_bit(Faulty, &rdev->flags)) | 476 | if (!test_bit(Faulty, &rdev->flags)) |
| @@ -531,7 +534,7 @@ out: | |||
| 531 | 534 | ||
| 532 | static int multipath_stop (mddev_t *mddev) | 535 | static int multipath_stop (mddev_t *mddev) |
| 533 | { | 536 | { |
| 534 | multipath_conf_t *conf = mddev_to_conf(mddev); | 537 | multipath_conf_t *conf = mddev->private; |
| 535 | 538 | ||
| 536 | md_unregister_thread(mddev->thread); | 539 | md_unregister_thread(mddev->thread); |
| 537 | mddev->thread = NULL; | 540 | mddev->thread = NULL; |
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h index 6fa70b400cda..d1c2a8d78395 100644 --- a/drivers/md/multipath.h +++ b/drivers/md/multipath.h | |||
| @@ -19,12 +19,6 @@ struct multipath_private_data { | |||
| 19 | typedef struct multipath_private_data multipath_conf_t; | 19 | typedef struct multipath_private_data multipath_conf_t; |
| 20 | 20 | ||
| 21 | /* | 21 | /* |
| 22 | * this is the only point in the RAID code where we violate | ||
| 23 | * C type safety. mddev->private is an 'opaque' pointer. | ||
| 24 | */ | ||
| 25 | #define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private) | ||
| 26 | |||
| 27 | /* | ||
| 28 | * this is our 'private' 'collective' MULTIPATH buffer head. | 22 | * this is our 'private' 'collective' MULTIPATH buffer head. |
| 29 | * it contains information about what kind of IO operations were started | 23 | * it contains information about what kind of IO operations were started |
| 30 | * for this MULTIPATH operation, and about their status: | 24 | * for this MULTIPATH operation, and about their status: |
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c08d7559be55..ab4a489d8695 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
| @@ -26,8 +26,8 @@ | |||
| 26 | static void raid0_unplug(struct request_queue *q) | 26 | static void raid0_unplug(struct request_queue *q) |
| 27 | { | 27 | { |
| 28 | mddev_t *mddev = q->queuedata; | 28 | mddev_t *mddev = q->queuedata; |
| 29 | raid0_conf_t *conf = mddev_to_conf(mddev); | 29 | raid0_conf_t *conf = mddev->private; |
| 30 | mdk_rdev_t **devlist = conf->strip_zone[0].dev; | 30 | mdk_rdev_t **devlist = conf->devlist; |
| 31 | int i; | 31 | int i; |
| 32 | 32 | ||
| 33 | for (i=0; i<mddev->raid_disks; i++) { | 33 | for (i=0; i<mddev->raid_disks; i++) { |
| @@ -40,8 +40,8 @@ static void raid0_unplug(struct request_queue *q) | |||
| 40 | static int raid0_congested(void *data, int bits) | 40 | static int raid0_congested(void *data, int bits) |
| 41 | { | 41 | { |
| 42 | mddev_t *mddev = data; | 42 | mddev_t *mddev = data; |
| 43 | raid0_conf_t *conf = mddev_to_conf(mddev); | 43 | raid0_conf_t *conf = mddev->private; |
| 44 | mdk_rdev_t **devlist = conf->strip_zone[0].dev; | 44 | mdk_rdev_t **devlist = conf->devlist; |
| 45 | int i, ret = 0; | 45 | int i, ret = 0; |
| 46 | 46 | ||
| 47 | for (i = 0; i < mddev->raid_disks && !ret ; i++) { | 47 | for (i = 0; i < mddev->raid_disks && !ret ; i++) { |
| @@ -52,27 +52,60 @@ static int raid0_congested(void *data, int bits) | |||
| 52 | return ret; | 52 | return ret; |
| 53 | } | 53 | } |
| 54 | 54 | ||
| 55 | /* | ||
| 56 | * inform the user of the raid configuration | ||
| 57 | */ | ||
| 58 | static void dump_zones(mddev_t *mddev) | ||
| 59 | { | ||
| 60 | int j, k, h; | ||
| 61 | sector_t zone_size = 0; | ||
| 62 | sector_t zone_start = 0; | ||
| 63 | char b[BDEVNAME_SIZE]; | ||
| 64 | raid0_conf_t *conf = mddev->private; | ||
| 65 | printk(KERN_INFO "******* %s configuration *********\n", | ||
| 66 | mdname(mddev)); | ||
| 67 | h = 0; | ||
| 68 | for (j = 0; j < conf->nr_strip_zones; j++) { | ||
| 69 | printk(KERN_INFO "zone%d=[", j); | ||
| 70 | for (k = 0; k < conf->strip_zone[j].nb_dev; k++) | ||
| 71 | printk("%s/", | ||
| 72 | bdevname(conf->devlist[j*mddev->raid_disks | ||
| 73 | + k]->bdev, b)); | ||
| 74 | printk("]\n"); | ||
| 75 | |||
| 76 | zone_size = conf->strip_zone[j].zone_end - zone_start; | ||
| 77 | printk(KERN_INFO " zone offset=%llukb " | ||
| 78 | "device offset=%llukb size=%llukb\n", | ||
| 79 | (unsigned long long)zone_start>>1, | ||
| 80 | (unsigned long long)conf->strip_zone[j].dev_start>>1, | ||
| 81 | (unsigned long long)zone_size>>1); | ||
| 82 | zone_start = conf->strip_zone[j].zone_end; | ||
| 83 | } | ||
| 84 | printk(KERN_INFO "**********************************\n\n"); | ||
| 85 | } | ||
| 55 | 86 | ||
| 56 | static int create_strip_zones (mddev_t *mddev) | 87 | static int create_strip_zones(mddev_t *mddev) |
| 57 | { | 88 | { |
| 58 | int i, c, j; | 89 | int i, c, j, err; |
| 59 | sector_t current_start, curr_zone_start; | 90 | sector_t curr_zone_end, sectors; |
| 60 | sector_t min_spacing; | 91 | mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev, **dev; |
| 61 | raid0_conf_t *conf = mddev_to_conf(mddev); | ||
| 62 | mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev; | ||
| 63 | struct strip_zone *zone; | 92 | struct strip_zone *zone; |
| 64 | int cnt; | 93 | int cnt; |
| 65 | char b[BDEVNAME_SIZE]; | 94 | char b[BDEVNAME_SIZE]; |
| 66 | 95 | raid0_conf_t *conf = kzalloc(sizeof(*conf), GFP_KERNEL); | |
| 67 | /* | 96 | |
| 68 | * The number of 'same size groups' | 97 | if (!conf) |
| 69 | */ | 98 | return -ENOMEM; |
| 70 | conf->nr_strip_zones = 0; | ||
| 71 | |||
| 72 | list_for_each_entry(rdev1, &mddev->disks, same_set) { | 99 | list_for_each_entry(rdev1, &mddev->disks, same_set) { |
| 73 | printk(KERN_INFO "raid0: looking at %s\n", | 100 | printk(KERN_INFO "raid0: looking at %s\n", |
| 74 | bdevname(rdev1->bdev,b)); | 101 | bdevname(rdev1->bdev,b)); |
| 75 | c = 0; | 102 | c = 0; |
| 103 | |||
| 104 | /* round size to chunk_size */ | ||
| 105 | sectors = rdev1->sectors; | ||
| 106 | sector_div(sectors, mddev->chunk_sectors); | ||
| 107 | rdev1->sectors = sectors * mddev->chunk_sectors; | ||
| 108 | |||
| 76 | list_for_each_entry(rdev2, &mddev->disks, same_set) { | 109 | list_for_each_entry(rdev2, &mddev->disks, same_set) { |
| 77 | printk(KERN_INFO "raid0: comparing %s(%llu)", | 110 | printk(KERN_INFO "raid0: comparing %s(%llu)", |
| 78 | bdevname(rdev1->bdev,b), | 111 | bdevname(rdev1->bdev,b), |
| @@ -103,16 +136,16 @@ static int create_strip_zones (mddev_t *mddev) | |||
| 103 | } | 136 | } |
| 104 | } | 137 | } |
| 105 | printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones); | 138 | printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones); |
| 106 | 139 | err = -ENOMEM; | |
| 107 | conf->strip_zone = kzalloc(sizeof(struct strip_zone)* | 140 | conf->strip_zone = kzalloc(sizeof(struct strip_zone)* |
| 108 | conf->nr_strip_zones, GFP_KERNEL); | 141 | conf->nr_strip_zones, GFP_KERNEL); |
| 109 | if (!conf->strip_zone) | 142 | if (!conf->strip_zone) |
| 110 | return 1; | 143 | goto abort; |
| 111 | conf->devlist = kzalloc(sizeof(mdk_rdev_t*)* | 144 | conf->devlist = kzalloc(sizeof(mdk_rdev_t*)* |
| 112 | conf->nr_strip_zones*mddev->raid_disks, | 145 | conf->nr_strip_zones*mddev->raid_disks, |
| 113 | GFP_KERNEL); | 146 | GFP_KERNEL); |
| 114 | if (!conf->devlist) | 147 | if (!conf->devlist) |
| 115 | return 1; | 148 | goto abort; |
| 116 | 149 | ||
| 117 | /* The first zone must contain all devices, so here we check that | 150 | /* The first zone must contain all devices, so here we check that |
| 118 | * there is a proper alignment of slots to devices and find them all | 151 | * there is a proper alignment of slots to devices and find them all |
| @@ -120,7 +153,8 @@ static int create_strip_zones (mddev_t *mddev) | |||
| 120 | zone = &conf->strip_zone[0]; | 153 | zone = &conf->strip_zone[0]; |
| 121 | cnt = 0; | 154 | cnt = 0; |
| 122 | smallest = NULL; | 155 | smallest = NULL; |
| 123 | zone->dev = conf->devlist; | 156 | dev = conf->devlist; |
| 157 | err = -EINVAL; | ||
| 124 | list_for_each_entry(rdev1, &mddev->disks, same_set) { | 158 | list_for_each_entry(rdev1, &mddev->disks, same_set) { |
| 125 | int j = rdev1->raid_disk; | 159 | int j = rdev1->raid_disk; |
| 126 | 160 | ||
| @@ -129,12 +163,12 @@ static int create_strip_zones (mddev_t *mddev) | |||
| 129 | "aborting!\n", j); | 163 | "aborting!\n", j); |
| 130 | goto abort; | 164 | goto abort; |
| 131 | } | 165 | } |
| 132 | if (zone->dev[j]) { | 166 | if (dev[j]) { |
| 133 | printk(KERN_ERR "raid0: multiple devices for %d - " | 167 | printk(KERN_ERR "raid0: multiple devices for %d - " |
| 134 | "aborting!\n", j); | 168 | "aborting!\n", j); |
| 135 | goto abort; | 169 | goto abort; |
| 136 | } | 170 | } |
| 137 | zone->dev[j] = rdev1; | 171 | dev[j] = rdev1; |
| 138 | 172 | ||
| 139 | blk_queue_stack_limits(mddev->queue, | 173 | blk_queue_stack_limits(mddev->queue, |
| 140 | rdev1->bdev->bd_disk->queue); | 174 | rdev1->bdev->bd_disk->queue); |
| @@ -144,7 +178,7 @@ static int create_strip_zones (mddev_t *mddev) | |||
| 144 | */ | 178 | */ |
| 145 | 179 | ||
| 146 | if (rdev1->bdev->bd_disk->queue->merge_bvec_fn && | 180 | if (rdev1->bdev->bd_disk->queue->merge_bvec_fn && |
| 147 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 181 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) |
| 148 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 182 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
| 149 | 183 | ||
| 150 | if (!smallest || (rdev1->sectors < smallest->sectors)) | 184 | if (!smallest || (rdev1->sectors < smallest->sectors)) |
| @@ -157,34 +191,32 @@ static int create_strip_zones (mddev_t *mddev) | |||
| 157 | goto abort; | 191 | goto abort; |
| 158 | } | 192 | } |
| 159 | zone->nb_dev = cnt; | 193 | zone->nb_dev = cnt; |
| 160 | zone->sectors = smallest->sectors * cnt; | 194 | zone->zone_end = smallest->sectors * cnt; |
| 161 | zone->zone_start = 0; | ||
| 162 | 195 | ||
| 163 | current_start = smallest->sectors; | 196 | curr_zone_end = zone->zone_end; |
| 164 | curr_zone_start = zone->sectors; | ||
| 165 | 197 | ||
| 166 | /* now do the other zones */ | 198 | /* now do the other zones */ |
| 167 | for (i = 1; i < conf->nr_strip_zones; i++) | 199 | for (i = 1; i < conf->nr_strip_zones; i++) |
| 168 | { | 200 | { |
| 169 | zone = conf->strip_zone + i; | 201 | zone = conf->strip_zone + i; |
| 170 | zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks; | 202 | dev = conf->devlist + i * mddev->raid_disks; |
| 171 | 203 | ||
| 172 | printk(KERN_INFO "raid0: zone %d\n", i); | 204 | printk(KERN_INFO "raid0: zone %d\n", i); |
| 173 | zone->dev_start = current_start; | 205 | zone->dev_start = smallest->sectors; |
| 174 | smallest = NULL; | 206 | smallest = NULL; |
| 175 | c = 0; | 207 | c = 0; |
| 176 | 208 | ||
| 177 | for (j=0; j<cnt; j++) { | 209 | for (j=0; j<cnt; j++) { |
| 178 | char b[BDEVNAME_SIZE]; | 210 | char b[BDEVNAME_SIZE]; |
| 179 | rdev = conf->strip_zone[0].dev[j]; | 211 | rdev = conf->devlist[j]; |
| 180 | printk(KERN_INFO "raid0: checking %s ...", | 212 | printk(KERN_INFO "raid0: checking %s ...", |
| 181 | bdevname(rdev->bdev, b)); | 213 | bdevname(rdev->bdev, b)); |
| 182 | if (rdev->sectors <= current_start) { | 214 | if (rdev->sectors <= zone->dev_start) { |
| 183 | printk(KERN_INFO " nope.\n"); | 215 | printk(KERN_INFO " nope.\n"); |
| 184 | continue; | 216 | continue; |
| 185 | } | 217 | } |
| 186 | printk(KERN_INFO " contained as device %d\n", c); | 218 | printk(KERN_INFO " contained as device %d\n", c); |
| 187 | zone->dev[c] = rdev; | 219 | dev[c] = rdev; |
| 188 | c++; | 220 | c++; |
| 189 | if (!smallest || rdev->sectors < smallest->sectors) { | 221 | if (!smallest || rdev->sectors < smallest->sectors) { |
| 190 | smallest = rdev; | 222 | smallest = rdev; |
| @@ -194,47 +226,39 @@ static int create_strip_zones (mddev_t *mddev) | |||
| 194 | } | 226 | } |
| 195 | 227 | ||
| 196 | zone->nb_dev = c; | 228 | zone->nb_dev = c; |
| 197 | zone->sectors = (smallest->sectors - current_start) * c; | 229 | sectors = (smallest->sectors - zone->dev_start) * c; |
| 198 | printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", | 230 | printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", |
| 199 | zone->nb_dev, (unsigned long long)zone->sectors); | 231 | zone->nb_dev, (unsigned long long)sectors); |
| 200 | 232 | ||
| 201 | zone->zone_start = curr_zone_start; | 233 | curr_zone_end += sectors; |
| 202 | curr_zone_start += zone->sectors; | 234 | zone->zone_end = curr_zone_end; |
| 203 | 235 | ||
| 204 | current_start = smallest->sectors; | ||
| 205 | printk(KERN_INFO "raid0: current zone start: %llu\n", | 236 | printk(KERN_INFO "raid0: current zone start: %llu\n", |
| 206 | (unsigned long long)current_start); | 237 | (unsigned long long)smallest->sectors); |
| 207 | } | ||
| 208 | |||
| 209 | /* Now find appropriate hash spacing. | ||
| 210 | * We want a number which causes most hash entries to cover | ||
| 211 | * at most two strips, but the hash table must be at most | ||
| 212 | * 1 PAGE. We choose the smallest strip, or contiguous collection | ||
| 213 | * of strips, that has big enough size. We never consider the last | ||
| 214 | * strip though as it's size has no bearing on the efficacy of the hash | ||
| 215 | * table. | ||
| 216 | */ | ||
| 217 | conf->spacing = curr_zone_start; | ||
| 218 | min_spacing = curr_zone_start; | ||
| 219 | sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*)); | ||
| 220 | for (i=0; i < conf->nr_strip_zones-1; i++) { | ||
| 221 | sector_t s = 0; | ||
| 222 | for (j = i; j < conf->nr_strip_zones - 1 && | ||
| 223 | s < min_spacing; j++) | ||
| 224 | s += conf->strip_zone[j].sectors; | ||
| 225 | if (s >= min_spacing && s < conf->spacing) | ||
| 226 | conf->spacing = s; | ||
| 227 | } | 238 | } |
| 228 | |||
| 229 | mddev->queue->unplug_fn = raid0_unplug; | 239 | mddev->queue->unplug_fn = raid0_unplug; |
| 230 | |||
| 231 | mddev->queue->backing_dev_info.congested_fn = raid0_congested; | 240 | mddev->queue->backing_dev_info.congested_fn = raid0_congested; |
| 232 | mddev->queue->backing_dev_info.congested_data = mddev; | 241 | mddev->queue->backing_dev_info.congested_data = mddev; |
| 233 | 242 | ||
| 243 | /* | ||
| 244 | * now since we have the hard sector sizes, we can make sure | ||
| 245 | * chunk size is a multiple of that sector size | ||
| 246 | */ | ||
| 247 | if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) { | ||
| 248 | printk(KERN_ERR "%s chunk_size of %d not valid\n", | ||
| 249 | mdname(mddev), | ||
| 250 | mddev->chunk_sectors << 9); | ||
| 251 | goto abort; | ||
| 252 | } | ||
| 234 | printk(KERN_INFO "raid0: done.\n"); | 253 | printk(KERN_INFO "raid0: done.\n"); |
| 254 | mddev->private = conf; | ||
| 235 | return 0; | 255 | return 0; |
| 236 | abort: | 256 | abort: |
| 237 | return 1; | 257 | kfree(conf->strip_zone); |
| 258 | kfree(conf->devlist); | ||
| 259 | kfree(conf); | ||
| 260 | mddev->private = NULL; | ||
| 261 | return err; | ||
| 238 | } | 262 | } |
| 239 | 263 | ||
| 240 | /** | 264 | /** |
| @@ -252,10 +276,15 @@ static int raid0_mergeable_bvec(struct request_queue *q, | |||
| 252 | mddev_t *mddev = q->queuedata; | 276 | mddev_t *mddev = q->queuedata; |
| 253 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); | 277 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); |
| 254 | int max; | 278 | int max; |
| 255 | unsigned int chunk_sectors = mddev->chunk_size >> 9; | 279 | unsigned int chunk_sectors = mddev->chunk_sectors; |
| 256 | unsigned int bio_sectors = bvm->bi_size >> 9; | 280 | unsigned int bio_sectors = bvm->bi_size >> 9; |
| 257 | 281 | ||
| 258 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; | 282 | if (is_power_of_2(chunk_sectors)) |
| 283 | max = (chunk_sectors - ((sector & (chunk_sectors-1)) | ||
| 284 | + bio_sectors)) << 9; | ||
| 285 | else | ||
| 286 | max = (chunk_sectors - (sector_div(sector, chunk_sectors) | ||
| 287 | + bio_sectors)) << 9; | ||
| 259 | if (max < 0) max = 0; /* bio_add cannot handle a negative return */ | 288 | if (max < 0) max = 0; /* bio_add cannot handle a negative return */ |
| 260 | if (max <= biovec->bv_len && bio_sectors == 0) | 289 | if (max <= biovec->bv_len && bio_sectors == 0) |
| 261 | return biovec->bv_len; | 290 | return biovec->bv_len; |
| @@ -277,84 +306,28 @@ static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks) | |||
| 277 | return array_sectors; | 306 | return array_sectors; |
| 278 | } | 307 | } |
| 279 | 308 | ||
| 280 | static int raid0_run (mddev_t *mddev) | 309 | static int raid0_run(mddev_t *mddev) |
| 281 | { | 310 | { |
| 282 | unsigned cur=0, i=0, nb_zone; | 311 | int ret; |
| 283 | s64 sectors; | ||
| 284 | raid0_conf_t *conf; | ||
| 285 | 312 | ||
| 286 | if (mddev->chunk_size == 0) { | 313 | if (mddev->chunk_sectors == 0) { |
| 287 | printk(KERN_ERR "md/raid0: non-zero chunk size required.\n"); | 314 | printk(KERN_ERR "md/raid0: chunk size must be set.\n"); |
| 288 | return -EINVAL; | 315 | return -EINVAL; |
| 289 | } | 316 | } |
| 290 | printk(KERN_INFO "%s: setting max_sectors to %d, segment boundary to %d\n", | 317 | if (md_check_no_bitmap(mddev)) |
| 291 | mdname(mddev), | 318 | return -EINVAL; |
| 292 | mddev->chunk_size >> 9, | 319 | blk_queue_max_sectors(mddev->queue, mddev->chunk_sectors); |
| 293 | (mddev->chunk_size>>1)-1); | ||
| 294 | blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9); | ||
| 295 | blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1); | ||
| 296 | mddev->queue->queue_lock = &mddev->queue->__queue_lock; | 320 | mddev->queue->queue_lock = &mddev->queue->__queue_lock; |
| 297 | 321 | ||
| 298 | conf = kmalloc(sizeof (raid0_conf_t), GFP_KERNEL); | 322 | ret = create_strip_zones(mddev); |
| 299 | if (!conf) | 323 | if (ret < 0) |
| 300 | goto out; | 324 | return ret; |
| 301 | mddev->private = (void *)conf; | ||
| 302 | |||
| 303 | conf->strip_zone = NULL; | ||
| 304 | conf->devlist = NULL; | ||
| 305 | if (create_strip_zones (mddev)) | ||
| 306 | goto out_free_conf; | ||
| 307 | 325 | ||
| 308 | /* calculate array device size */ | 326 | /* calculate array device size */ |
| 309 | md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); | 327 | md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); |
| 310 | 328 | ||
| 311 | printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", | 329 | printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", |
| 312 | (unsigned long long)mddev->array_sectors); | 330 | (unsigned long long)mddev->array_sectors); |
| 313 | printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n", | ||
| 314 | (unsigned long long)conf->spacing); | ||
| 315 | { | ||
| 316 | sector_t s = raid0_size(mddev, 0, 0); | ||
| 317 | sector_t space = conf->spacing; | ||
| 318 | int round; | ||
| 319 | conf->sector_shift = 0; | ||
| 320 | if (sizeof(sector_t) > sizeof(u32)) { | ||
| 321 | /*shift down space and s so that sector_div will work */ | ||
| 322 | while (space > (sector_t) (~(u32)0)) { | ||
| 323 | s >>= 1; | ||
| 324 | space >>= 1; | ||
| 325 | s += 1; /* force round-up */ | ||
| 326 | conf->sector_shift++; | ||
| 327 | } | ||
| 328 | } | ||
| 329 | round = sector_div(s, (u32)space) ? 1 : 0; | ||
| 330 | nb_zone = s + round; | ||
| 331 | } | ||
| 332 | printk(KERN_INFO "raid0 : nb_zone is %d.\n", nb_zone); | ||
| 333 | |||
| 334 | printk(KERN_INFO "raid0 : Allocating %zu bytes for hash.\n", | ||
| 335 | nb_zone*sizeof(struct strip_zone*)); | ||
| 336 | conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL); | ||
| 337 | if (!conf->hash_table) | ||
| 338 | goto out_free_conf; | ||
| 339 | sectors = conf->strip_zone[cur].sectors; | ||
| 340 | |||
| 341 | conf->hash_table[0] = conf->strip_zone + cur; | ||
| 342 | for (i=1; i< nb_zone; i++) { | ||
| 343 | while (sectors <= conf->spacing) { | ||
| 344 | cur++; | ||
| 345 | sectors += conf->strip_zone[cur].sectors; | ||
| 346 | } | ||
| 347 | sectors -= conf->spacing; | ||
| 348 | conf->hash_table[i] = conf->strip_zone + cur; | ||
| 349 | } | ||
| 350 | if (conf->sector_shift) { | ||
| 351 | conf->spacing >>= conf->sector_shift; | ||
| 352 | /* round spacing up so when we divide by it, we | ||
| 353 | * err on the side of too-low, which is safest | ||
| 354 | */ | ||
| 355 | conf->spacing++; | ||
| 356 | } | ||
| 357 | |||
| 358 | /* calculate the max read-ahead size. | 331 | /* calculate the max read-ahead size. |
| 359 | * For read-ahead of large files to be effective, we need to | 332 | * For read-ahead of large files to be effective, we need to |
| 360 | * readahead at least twice a whole stripe. i.e. number of devices | 333 | * readahead at least twice a whole stripe. i.e. number of devices |
| @@ -365,48 +338,107 @@ static int raid0_run (mddev_t *mddev) | |||
| 365 | * chunksize should be used in that case. | 338 | * chunksize should be used in that case. |
| 366 | */ | 339 | */ |
| 367 | { | 340 | { |
| 368 | int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_SIZE; | 341 | int stripe = mddev->raid_disks * |
| 342 | (mddev->chunk_sectors << 9) / PAGE_SIZE; | ||
| 369 | if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) | 343 | if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) |
| 370 | mddev->queue->backing_dev_info.ra_pages = 2* stripe; | 344 | mddev->queue->backing_dev_info.ra_pages = 2* stripe; |
| 371 | } | 345 | } |
| 372 | 346 | ||
| 373 | |||
| 374 | blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); | 347 | blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); |
| 348 | dump_zones(mddev); | ||
| 375 | return 0; | 349 | return 0; |
| 350 | } | ||
| 376 | 351 | ||
| 377 | out_free_conf: | 352 | static int raid0_stop(mddev_t *mddev) |
| 353 | { | ||
| 354 | raid0_conf_t *conf = mddev->private; | ||
| 355 | |||
| 356 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | ||
| 378 | kfree(conf->strip_zone); | 357 | kfree(conf->strip_zone); |
| 379 | kfree(conf->devlist); | 358 | kfree(conf->devlist); |
| 380 | kfree(conf); | 359 | kfree(conf); |
| 381 | mddev->private = NULL; | 360 | mddev->private = NULL; |
| 382 | out: | 361 | return 0; |
| 383 | return -ENOMEM; | ||
| 384 | } | 362 | } |
| 385 | 363 | ||
| 386 | static int raid0_stop (mddev_t *mddev) | 364 | /* Find the zone which holds a particular offset |
| 365 | * Update *sectorp to be an offset in that zone | ||
| 366 | */ | ||
| 367 | static struct strip_zone *find_zone(struct raid0_private_data *conf, | ||
| 368 | sector_t *sectorp) | ||
| 387 | { | 369 | { |
| 388 | raid0_conf_t *conf = mddev_to_conf(mddev); | 370 | int i; |
| 371 | struct strip_zone *z = conf->strip_zone; | ||
| 372 | sector_t sector = *sectorp; | ||
| 373 | |||
| 374 | for (i = 0; i < conf->nr_strip_zones; i++) | ||
| 375 | if (sector < z[i].zone_end) { | ||
| 376 | if (i) | ||
| 377 | *sectorp = sector - z[i-1].zone_end; | ||
| 378 | return z + i; | ||
| 379 | } | ||
| 380 | BUG(); | ||
| 381 | } | ||
| 389 | 382 | ||
| 390 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | 383 | /* |
| 391 | kfree(conf->hash_table); | 384 | * remaps the bio to the target device. we separate two flows. |
| 392 | conf->hash_table = NULL; | 385 | * power 2 flow and a general flow for the sake of perfromance |
| 393 | kfree(conf->strip_zone); | 386 | */ |
| 394 | conf->strip_zone = NULL; | 387 | static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone, |
| 395 | kfree(conf); | 388 | sector_t sector, sector_t *sector_offset) |
| 396 | mddev->private = NULL; | 389 | { |
| 390 | unsigned int sect_in_chunk; | ||
| 391 | sector_t chunk; | ||
| 392 | raid0_conf_t *conf = mddev->private; | ||
| 393 | unsigned int chunk_sects = mddev->chunk_sectors; | ||
| 394 | |||
| 395 | if (is_power_of_2(chunk_sects)) { | ||
| 396 | int chunksect_bits = ffz(~chunk_sects); | ||
| 397 | /* find the sector offset inside the chunk */ | ||
| 398 | sect_in_chunk = sector & (chunk_sects - 1); | ||
| 399 | sector >>= chunksect_bits; | ||
| 400 | /* chunk in zone */ | ||
| 401 | chunk = *sector_offset; | ||
| 402 | /* quotient is the chunk in real device*/ | ||
| 403 | sector_div(chunk, zone->nb_dev << chunksect_bits); | ||
| 404 | } else{ | ||
| 405 | sect_in_chunk = sector_div(sector, chunk_sects); | ||
| 406 | chunk = *sector_offset; | ||
| 407 | sector_div(chunk, chunk_sects * zone->nb_dev); | ||
| 408 | } | ||
| 409 | /* | ||
| 410 | * position the bio over the real device | ||
| 411 | * real sector = chunk in device + starting of zone | ||
| 412 | * + the position in the chunk | ||
| 413 | */ | ||
| 414 | *sector_offset = (chunk * chunk_sects) + sect_in_chunk; | ||
| 415 | return conf->devlist[(zone - conf->strip_zone)*mddev->raid_disks | ||
| 416 | + sector_div(sector, zone->nb_dev)]; | ||
| 417 | } | ||
| 397 | 418 | ||
| 398 | return 0; | 419 | /* |
| 420 | * Is io distribute over 1 or more chunks ? | ||
| 421 | */ | ||
| 422 | static inline int is_io_in_chunk_boundary(mddev_t *mddev, | ||
| 423 | unsigned int chunk_sects, struct bio *bio) | ||
| 424 | { | ||
| 425 | if (likely(is_power_of_2(chunk_sects))) { | ||
| 426 | return chunk_sects >= ((bio->bi_sector & (chunk_sects-1)) | ||
| 427 | + (bio->bi_size >> 9)); | ||
| 428 | } else{ | ||
| 429 | sector_t sector = bio->bi_sector; | ||
| 430 | return chunk_sects >= (sector_div(sector, chunk_sects) | ||
| 431 | + (bio->bi_size >> 9)); | ||
| 432 | } | ||
| 399 | } | 433 | } |
| 400 | 434 | ||
| 401 | static int raid0_make_request (struct request_queue *q, struct bio *bio) | 435 | static int raid0_make_request(struct request_queue *q, struct bio *bio) |
| 402 | { | 436 | { |
| 403 | mddev_t *mddev = q->queuedata; | 437 | mddev_t *mddev = q->queuedata; |
| 404 | unsigned int sect_in_chunk, chunksect_bits, chunk_sects; | 438 | unsigned int chunk_sects; |
| 405 | raid0_conf_t *conf = mddev_to_conf(mddev); | 439 | sector_t sector_offset; |
| 406 | struct strip_zone *zone; | 440 | struct strip_zone *zone; |
| 407 | mdk_rdev_t *tmp_dev; | 441 | mdk_rdev_t *tmp_dev; |
| 408 | sector_t chunk; | ||
| 409 | sector_t sector, rsect; | ||
| 410 | const int rw = bio_data_dir(bio); | 442 | const int rw = bio_data_dir(bio); |
| 411 | int cpu; | 443 | int cpu; |
| 412 | 444 | ||
| @@ -421,11 +453,9 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) | |||
| 421 | bio_sectors(bio)); | 453 | bio_sectors(bio)); |
| 422 | part_stat_unlock(); | 454 | part_stat_unlock(); |
| 423 | 455 | ||
| 424 | chunk_sects = mddev->chunk_size >> 9; | 456 | chunk_sects = mddev->chunk_sectors; |
| 425 | chunksect_bits = ffz(~chunk_sects); | 457 | if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) { |
| 426 | sector = bio->bi_sector; | 458 | sector_t sector = bio->bi_sector; |
| 427 | |||
| 428 | if (unlikely(chunk_sects < (bio->bi_sector & (chunk_sects - 1)) + (bio->bi_size >> 9))) { | ||
| 429 | struct bio_pair *bp; | 459 | struct bio_pair *bp; |
| 430 | /* Sanity check -- queue functions should prevent this happening */ | 460 | /* Sanity check -- queue functions should prevent this happening */ |
| 431 | if (bio->bi_vcnt != 1 || | 461 | if (bio->bi_vcnt != 1 || |
| @@ -434,7 +464,12 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) | |||
| 434 | /* This is a one page bio that upper layers | 464 | /* This is a one page bio that upper layers |
| 435 | * refuse to split for us, so we need to split it. | 465 | * refuse to split for us, so we need to split it. |
| 436 | */ | 466 | */ |
| 437 | bp = bio_split(bio, chunk_sects - (bio->bi_sector & (chunk_sects - 1))); | 467 | if (likely(is_power_of_2(chunk_sects))) |
| 468 | bp = bio_split(bio, chunk_sects - (sector & | ||
| 469 | (chunk_sects-1))); | ||
| 470 | else | ||
| 471 | bp = bio_split(bio, chunk_sects - | ||
| 472 | sector_div(sector, chunk_sects)); | ||
| 438 | if (raid0_make_request(q, &bp->bio1)) | 473 | if (raid0_make_request(q, &bp->bio1)) |
| 439 | generic_make_request(&bp->bio1); | 474 | generic_make_request(&bp->bio1); |
| 440 | if (raid0_make_request(q, &bp->bio2)) | 475 | if (raid0_make_request(q, &bp->bio2)) |
| @@ -443,34 +478,14 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) | |||
| 443 | bio_pair_release(bp); | 478 | bio_pair_release(bp); |
| 444 | return 0; | 479 | return 0; |
| 445 | } | 480 | } |
| 446 | |||
| 447 | |||
| 448 | { | ||
| 449 | sector_t x = sector >> conf->sector_shift; | ||
| 450 | sector_div(x, (u32)conf->spacing); | ||
| 451 | zone = conf->hash_table[x]; | ||
| 452 | } | ||
| 453 | 481 | ||
| 454 | while (sector >= zone->zone_start + zone->sectors) | 482 | sector_offset = bio->bi_sector; |
| 455 | zone++; | 483 | zone = find_zone(mddev->private, §or_offset); |
| 456 | 484 | tmp_dev = map_sector(mddev, zone, bio->bi_sector, | |
| 457 | sect_in_chunk = bio->bi_sector & (chunk_sects - 1); | 485 | §or_offset); |
| 458 | |||
| 459 | |||
| 460 | { | ||
| 461 | sector_t x = (sector - zone->zone_start) >> chunksect_bits; | ||
| 462 | |||
| 463 | sector_div(x, zone->nb_dev); | ||
| 464 | chunk = x; | ||
| 465 | |||
| 466 | x = sector >> chunksect_bits; | ||
| 467 | tmp_dev = zone->dev[sector_div(x, zone->nb_dev)]; | ||
| 468 | } | ||
| 469 | rsect = (chunk << chunksect_bits) + zone->dev_start + sect_in_chunk; | ||
| 470 | |||
| 471 | bio->bi_bdev = tmp_dev->bdev; | 486 | bio->bi_bdev = tmp_dev->bdev; |
| 472 | bio->bi_sector = rsect + tmp_dev->data_offset; | 487 | bio->bi_sector = sector_offset + zone->dev_start + |
| 473 | 488 | tmp_dev->data_offset; | |
| 474 | /* | 489 | /* |
| 475 | * Let the main block layer submit the IO and resolve recursion: | 490 | * Let the main block layer submit the IO and resolve recursion: |
| 476 | */ | 491 | */ |
| @@ -485,31 +500,35 @@ bad_map: | |||
| 485 | return 0; | 500 | return 0; |
| 486 | } | 501 | } |
| 487 | 502 | ||
| 488 | static void raid0_status (struct seq_file *seq, mddev_t *mddev) | 503 | static void raid0_status(struct seq_file *seq, mddev_t *mddev) |
| 489 | { | 504 | { |
| 490 | #undef MD_DEBUG | 505 | #undef MD_DEBUG |
| 491 | #ifdef MD_DEBUG | 506 | #ifdef MD_DEBUG |
| 492 | int j, k, h; | 507 | int j, k, h; |
| 493 | char b[BDEVNAME_SIZE]; | 508 | char b[BDEVNAME_SIZE]; |
| 494 | raid0_conf_t *conf = mddev_to_conf(mddev); | 509 | raid0_conf_t *conf = mddev->private; |
| 495 | 510 | ||
| 511 | sector_t zone_size; | ||
| 512 | sector_t zone_start = 0; | ||
| 496 | h = 0; | 513 | h = 0; |
| 514 | |||
| 497 | for (j = 0; j < conf->nr_strip_zones; j++) { | 515 | for (j = 0; j < conf->nr_strip_zones; j++) { |
| 498 | seq_printf(seq, " z%d", j); | 516 | seq_printf(seq, " z%d", j); |
| 499 | if (conf->hash_table[h] == conf->strip_zone+j) | ||
| 500 | seq_printf(seq, "(h%d)", h++); | ||
| 501 | seq_printf(seq, "=["); | 517 | seq_printf(seq, "=["); |
| 502 | for (k = 0; k < conf->strip_zone[j].nb_dev; k++) | 518 | for (k = 0; k < conf->strip_zone[j].nb_dev; k++) |
| 503 | seq_printf(seq, "%s/", bdevname( | 519 | seq_printf(seq, "%s/", bdevname( |
| 504 | conf->strip_zone[j].dev[k]->bdev,b)); | 520 | conf->devlist[j*mddev->raid_disks + k] |
| 505 | 521 | ->bdev, b)); | |
| 506 | seq_printf(seq, "] zs=%d ds=%d s=%d\n", | 522 | |
| 507 | conf->strip_zone[j].zone_start, | 523 | zone_size = conf->strip_zone[j].zone_end - zone_start; |
| 508 | conf->strip_zone[j].dev_start, | 524 | seq_printf(seq, "] ze=%lld ds=%lld s=%lld\n", |
| 509 | conf->strip_zone[j].sectors); | 525 | (unsigned long long)zone_start>>1, |
| 526 | (unsigned long long)conf->strip_zone[j].dev_start>>1, | ||
| 527 | (unsigned long long)zone_size>>1); | ||
| 528 | zone_start = conf->strip_zone[j].zone_end; | ||
| 510 | } | 529 | } |
| 511 | #endif | 530 | #endif |
| 512 | seq_printf(seq, " %dk chunks", mddev->chunk_size/1024); | 531 | seq_printf(seq, " %dk chunks", mddev->chunk_sectors / 2); |
| 513 | return; | 532 | return; |
| 514 | } | 533 | } |
| 515 | 534 | ||
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index 824b12eb1d4f..91f8e876ee64 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h | |||
| @@ -3,26 +3,18 @@ | |||
| 3 | 3 | ||
| 4 | struct strip_zone | 4 | struct strip_zone |
| 5 | { | 5 | { |
| 6 | sector_t zone_start; /* Zone offset in md_dev (in sectors) */ | 6 | sector_t zone_end; /* Start of the next zone (in sectors) */ |
| 7 | sector_t dev_start; /* Zone offset in real dev (in sectors) */ | 7 | sector_t dev_start; /* Zone offset in real dev (in sectors) */ |
| 8 | sector_t sectors; /* Zone size in sectors */ | ||
| 9 | int nb_dev; /* # of devices attached to the zone */ | 8 | int nb_dev; /* # of devices attached to the zone */ |
| 10 | mdk_rdev_t **dev; /* Devices attached to the zone */ | ||
| 11 | }; | 9 | }; |
| 12 | 10 | ||
| 13 | struct raid0_private_data | 11 | struct raid0_private_data |
| 14 | { | 12 | { |
| 15 | struct strip_zone **hash_table; /* Table of indexes into strip_zone */ | ||
| 16 | struct strip_zone *strip_zone; | 13 | struct strip_zone *strip_zone; |
| 17 | mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ | 14 | mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ |
| 18 | int nr_strip_zones; | 15 | int nr_strip_zones; |
| 19 | |||
| 20 | sector_t spacing; | ||
| 21 | int sector_shift; /* shift this before divide by spacing */ | ||
| 22 | }; | 16 | }; |
| 23 | 17 | ||
| 24 | typedef struct raid0_private_data raid0_conf_t; | 18 | typedef struct raid0_private_data raid0_conf_t; |
| 25 | 19 | ||
| 26 | #define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private) | ||
| 27 | |||
| 28 | #endif | 20 | #endif |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 36df9109cde1..89939a7aef57 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
| @@ -182,7 +182,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio) | |||
| 182 | 182 | ||
| 183 | static void free_r1bio(r1bio_t *r1_bio) | 183 | static void free_r1bio(r1bio_t *r1_bio) |
| 184 | { | 184 | { |
| 185 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 185 | conf_t *conf = r1_bio->mddev->private; |
| 186 | 186 | ||
| 187 | /* | 187 | /* |
| 188 | * Wake up any possible resync thread that waits for the device | 188 | * Wake up any possible resync thread that waits for the device |
| @@ -196,7 +196,7 @@ static void free_r1bio(r1bio_t *r1_bio) | |||
| 196 | 196 | ||
| 197 | static void put_buf(r1bio_t *r1_bio) | 197 | static void put_buf(r1bio_t *r1_bio) |
| 198 | { | 198 | { |
| 199 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 199 | conf_t *conf = r1_bio->mddev->private; |
| 200 | int i; | 200 | int i; |
| 201 | 201 | ||
| 202 | for (i=0; i<conf->raid_disks; i++) { | 202 | for (i=0; i<conf->raid_disks; i++) { |
| @@ -214,7 +214,7 @@ static void reschedule_retry(r1bio_t *r1_bio) | |||
| 214 | { | 214 | { |
| 215 | unsigned long flags; | 215 | unsigned long flags; |
| 216 | mddev_t *mddev = r1_bio->mddev; | 216 | mddev_t *mddev = r1_bio->mddev; |
| 217 | conf_t *conf = mddev_to_conf(mddev); | 217 | conf_t *conf = mddev->private; |
| 218 | 218 | ||
| 219 | spin_lock_irqsave(&conf->device_lock, flags); | 219 | spin_lock_irqsave(&conf->device_lock, flags); |
| 220 | list_add(&r1_bio->retry_list, &conf->retry_list); | 220 | list_add(&r1_bio->retry_list, &conf->retry_list); |
| @@ -253,7 +253,7 @@ static void raid_end_bio_io(r1bio_t *r1_bio) | |||
| 253 | */ | 253 | */ |
| 254 | static inline void update_head_pos(int disk, r1bio_t *r1_bio) | 254 | static inline void update_head_pos(int disk, r1bio_t *r1_bio) |
| 255 | { | 255 | { |
| 256 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 256 | conf_t *conf = r1_bio->mddev->private; |
| 257 | 257 | ||
| 258 | conf->mirrors[disk].head_position = | 258 | conf->mirrors[disk].head_position = |
| 259 | r1_bio->sector + (r1_bio->sectors); | 259 | r1_bio->sector + (r1_bio->sectors); |
| @@ -264,7 +264,7 @@ static void raid1_end_read_request(struct bio *bio, int error) | |||
| 264 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 264 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
| 265 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | 265 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); |
| 266 | int mirror; | 266 | int mirror; |
| 267 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 267 | conf_t *conf = r1_bio->mddev->private; |
| 268 | 268 | ||
| 269 | mirror = r1_bio->read_disk; | 269 | mirror = r1_bio->read_disk; |
| 270 | /* | 270 | /* |
| @@ -309,7 +309,7 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
| 309 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 309 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
| 310 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | 310 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); |
| 311 | int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); | 311 | int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); |
| 312 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 312 | conf_t *conf = r1_bio->mddev->private; |
| 313 | struct bio *to_put = NULL; | 313 | struct bio *to_put = NULL; |
| 314 | 314 | ||
| 315 | 315 | ||
| @@ -541,7 +541,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
| 541 | 541 | ||
| 542 | static void unplug_slaves(mddev_t *mddev) | 542 | static void unplug_slaves(mddev_t *mddev) |
| 543 | { | 543 | { |
| 544 | conf_t *conf = mddev_to_conf(mddev); | 544 | conf_t *conf = mddev->private; |
| 545 | int i; | 545 | int i; |
| 546 | 546 | ||
| 547 | rcu_read_lock(); | 547 | rcu_read_lock(); |
| @@ -573,7 +573,7 @@ static void raid1_unplug(struct request_queue *q) | |||
| 573 | static int raid1_congested(void *data, int bits) | 573 | static int raid1_congested(void *data, int bits) |
| 574 | { | 574 | { |
| 575 | mddev_t *mddev = data; | 575 | mddev_t *mddev = data; |
| 576 | conf_t *conf = mddev_to_conf(mddev); | 576 | conf_t *conf = mddev->private; |
| 577 | int i, ret = 0; | 577 | int i, ret = 0; |
| 578 | 578 | ||
| 579 | rcu_read_lock(); | 579 | rcu_read_lock(); |
| @@ -772,7 +772,7 @@ do_sync_io: | |||
| 772 | static int make_request(struct request_queue *q, struct bio * bio) | 772 | static int make_request(struct request_queue *q, struct bio * bio) |
| 773 | { | 773 | { |
| 774 | mddev_t *mddev = q->queuedata; | 774 | mddev_t *mddev = q->queuedata; |
| 775 | conf_t *conf = mddev_to_conf(mddev); | 775 | conf_t *conf = mddev->private; |
| 776 | mirror_info_t *mirror; | 776 | mirror_info_t *mirror; |
| 777 | r1bio_t *r1_bio; | 777 | r1bio_t *r1_bio; |
| 778 | struct bio *read_bio; | 778 | struct bio *read_bio; |
| @@ -991,7 +991,7 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
| 991 | 991 | ||
| 992 | static void status(struct seq_file *seq, mddev_t *mddev) | 992 | static void status(struct seq_file *seq, mddev_t *mddev) |
| 993 | { | 993 | { |
| 994 | conf_t *conf = mddev_to_conf(mddev); | 994 | conf_t *conf = mddev->private; |
| 995 | int i; | 995 | int i; |
| 996 | 996 | ||
| 997 | seq_printf(seq, " [%d/%d] [", conf->raid_disks, | 997 | seq_printf(seq, " [%d/%d] [", conf->raid_disks, |
| @@ -1010,7 +1010,7 @@ static void status(struct seq_file *seq, mddev_t *mddev) | |||
| 1010 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) | 1010 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) |
| 1011 | { | 1011 | { |
| 1012 | char b[BDEVNAME_SIZE]; | 1012 | char b[BDEVNAME_SIZE]; |
| 1013 | conf_t *conf = mddev_to_conf(mddev); | 1013 | conf_t *conf = mddev->private; |
| 1014 | 1014 | ||
| 1015 | /* | 1015 | /* |
| 1016 | * If it is not operational, then we have already marked it as dead | 1016 | * If it is not operational, then we have already marked it as dead |
| @@ -1130,7 +1130,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1130 | * a one page request is never in violation. | 1130 | * a one page request is never in violation. |
| 1131 | */ | 1131 | */ |
| 1132 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 1132 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && |
| 1133 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 1133 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) |
| 1134 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 1134 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
| 1135 | 1135 | ||
| 1136 | p->head_position = 0; | 1136 | p->head_position = 0; |
| @@ -1214,7 +1214,7 @@ static void end_sync_write(struct bio *bio, int error) | |||
| 1214 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 1214 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
| 1215 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | 1215 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); |
| 1216 | mddev_t *mddev = r1_bio->mddev; | 1216 | mddev_t *mddev = r1_bio->mddev; |
| 1217 | conf_t *conf = mddev_to_conf(mddev); | 1217 | conf_t *conf = mddev->private; |
| 1218 | int i; | 1218 | int i; |
| 1219 | int mirror=0; | 1219 | int mirror=0; |
| 1220 | 1220 | ||
| @@ -1248,7 +1248,7 @@ static void end_sync_write(struct bio *bio, int error) | |||
| 1248 | 1248 | ||
| 1249 | static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) | 1249 | static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) |
| 1250 | { | 1250 | { |
| 1251 | conf_t *conf = mddev_to_conf(mddev); | 1251 | conf_t *conf = mddev->private; |
| 1252 | int i; | 1252 | int i; |
| 1253 | int disks = conf->raid_disks; | 1253 | int disks = conf->raid_disks; |
| 1254 | struct bio *bio, *wbio; | 1254 | struct bio *bio, *wbio; |
| @@ -1562,7 +1562,7 @@ static void raid1d(mddev_t *mddev) | |||
| 1562 | r1bio_t *r1_bio; | 1562 | r1bio_t *r1_bio; |
| 1563 | struct bio *bio; | 1563 | struct bio *bio; |
| 1564 | unsigned long flags; | 1564 | unsigned long flags; |
| 1565 | conf_t *conf = mddev_to_conf(mddev); | 1565 | conf_t *conf = mddev->private; |
| 1566 | struct list_head *head = &conf->retry_list; | 1566 | struct list_head *head = &conf->retry_list; |
| 1567 | int unplug=0; | 1567 | int unplug=0; |
| 1568 | mdk_rdev_t *rdev; | 1568 | mdk_rdev_t *rdev; |
| @@ -1585,7 +1585,7 @@ static void raid1d(mddev_t *mddev) | |||
| 1585 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1585 | spin_unlock_irqrestore(&conf->device_lock, flags); |
| 1586 | 1586 | ||
| 1587 | mddev = r1_bio->mddev; | 1587 | mddev = r1_bio->mddev; |
| 1588 | conf = mddev_to_conf(mddev); | 1588 | conf = mddev->private; |
| 1589 | if (test_bit(R1BIO_IsSync, &r1_bio->state)) { | 1589 | if (test_bit(R1BIO_IsSync, &r1_bio->state)) { |
| 1590 | sync_request_write(mddev, r1_bio); | 1590 | sync_request_write(mddev, r1_bio); |
| 1591 | unplug = 1; | 1591 | unplug = 1; |
| @@ -1706,7 +1706,7 @@ static int init_resync(conf_t *conf) | |||
| 1706 | 1706 | ||
| 1707 | static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) | 1707 | static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) |
| 1708 | { | 1708 | { |
| 1709 | conf_t *conf = mddev_to_conf(mddev); | 1709 | conf_t *conf = mddev->private; |
| 1710 | r1bio_t *r1_bio; | 1710 | r1bio_t *r1_bio; |
| 1711 | struct bio *bio; | 1711 | struct bio *bio; |
| 1712 | sector_t max_sector, nr_sectors; | 1712 | sector_t max_sector, nr_sectors; |
| @@ -1996,7 +1996,7 @@ static int run(mddev_t *mddev) | |||
| 1996 | * a one page request is never in violation. | 1996 | * a one page request is never in violation. |
| 1997 | */ | 1997 | */ |
| 1998 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 1998 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && |
| 1999 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 1999 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) |
| 2000 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 2000 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
| 2001 | 2001 | ||
| 2002 | disk->head_position = 0; | 2002 | disk->head_position = 0; |
| @@ -2052,6 +2052,10 @@ static int run(mddev_t *mddev) | |||
| 2052 | goto out_free_conf; | 2052 | goto out_free_conf; |
| 2053 | } | 2053 | } |
| 2054 | 2054 | ||
| 2055 | if (mddev->recovery_cp != MaxSector) | ||
| 2056 | printk(KERN_NOTICE "raid1: %s is not clean" | ||
| 2057 | " -- starting background reconstruction\n", | ||
| 2058 | mdname(mddev)); | ||
| 2055 | printk(KERN_INFO | 2059 | printk(KERN_INFO |
| 2056 | "raid1: raid set %s active with %d out of %d mirrors\n", | 2060 | "raid1: raid set %s active with %d out of %d mirrors\n", |
| 2057 | mdname(mddev), mddev->raid_disks - mddev->degraded, | 2061 | mdname(mddev), mddev->raid_disks - mddev->degraded, |
| @@ -2087,7 +2091,7 @@ out: | |||
| 2087 | 2091 | ||
| 2088 | static int stop(mddev_t *mddev) | 2092 | static int stop(mddev_t *mddev) |
| 2089 | { | 2093 | { |
| 2090 | conf_t *conf = mddev_to_conf(mddev); | 2094 | conf_t *conf = mddev->private; |
| 2091 | struct bitmap *bitmap = mddev->bitmap; | 2095 | struct bitmap *bitmap = mddev->bitmap; |
| 2092 | int behind_wait = 0; | 2096 | int behind_wait = 0; |
| 2093 | 2097 | ||
| @@ -2155,16 +2159,16 @@ static int raid1_reshape(mddev_t *mddev) | |||
| 2155 | mempool_t *newpool, *oldpool; | 2159 | mempool_t *newpool, *oldpool; |
| 2156 | struct pool_info *newpoolinfo; | 2160 | struct pool_info *newpoolinfo; |
| 2157 | mirror_info_t *newmirrors; | 2161 | mirror_info_t *newmirrors; |
| 2158 | conf_t *conf = mddev_to_conf(mddev); | 2162 | conf_t *conf = mddev->private; |
| 2159 | int cnt, raid_disks; | 2163 | int cnt, raid_disks; |
| 2160 | unsigned long flags; | 2164 | unsigned long flags; |
| 2161 | int d, d2, err; | 2165 | int d, d2, err; |
| 2162 | 2166 | ||
| 2163 | /* Cannot change chunk_size, layout, or level */ | 2167 | /* Cannot change chunk_size, layout, or level */ |
| 2164 | if (mddev->chunk_size != mddev->new_chunk || | 2168 | if (mddev->chunk_sectors != mddev->new_chunk_sectors || |
| 2165 | mddev->layout != mddev->new_layout || | 2169 | mddev->layout != mddev->new_layout || |
| 2166 | mddev->level != mddev->new_level) { | 2170 | mddev->level != mddev->new_level) { |
| 2167 | mddev->new_chunk = mddev->chunk_size; | 2171 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
| 2168 | mddev->new_layout = mddev->layout; | 2172 | mddev->new_layout = mddev->layout; |
| 2169 | mddev->new_level = mddev->level; | 2173 | mddev->new_level = mddev->level; |
| 2170 | return -EINVAL; | 2174 | return -EINVAL; |
| @@ -2252,7 +2256,7 @@ static int raid1_reshape(mddev_t *mddev) | |||
| 2252 | 2256 | ||
| 2253 | static void raid1_quiesce(mddev_t *mddev, int state) | 2257 | static void raid1_quiesce(mddev_t *mddev, int state) |
| 2254 | { | 2258 | { |
| 2255 | conf_t *conf = mddev_to_conf(mddev); | 2259 | conf_t *conf = mddev->private; |
| 2256 | 2260 | ||
| 2257 | switch(state) { | 2261 | switch(state) { |
| 2258 | case 1: | 2262 | case 1: |
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 1620eea3d57c..e87b84deff68 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h | |||
| @@ -64,12 +64,6 @@ struct r1_private_data_s { | |||
| 64 | typedef struct r1_private_data_s conf_t; | 64 | typedef struct r1_private_data_s conf_t; |
| 65 | 65 | ||
| 66 | /* | 66 | /* |
| 67 | * this is the only point in the RAID code where we violate | ||
| 68 | * C type safety. mddev->private is an 'opaque' pointer. | ||
| 69 | */ | ||
| 70 | #define mddev_to_conf(mddev) ((conf_t *) mddev->private) | ||
| 71 | |||
| 72 | /* | ||
| 73 | * this is our 'private' RAID1 bio. | 67 | * this is our 'private' RAID1 bio. |
| 74 | * | 68 | * |
| 75 | * it contains information about what kind of IO operations were started | 69 | * it contains information about what kind of IO operations were started |
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 499620afb44b..ae12ceafe10c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
| @@ -188,7 +188,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio) | |||
| 188 | 188 | ||
| 189 | static void free_r10bio(r10bio_t *r10_bio) | 189 | static void free_r10bio(r10bio_t *r10_bio) |
| 190 | { | 190 | { |
| 191 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | 191 | conf_t *conf = r10_bio->mddev->private; |
| 192 | 192 | ||
| 193 | /* | 193 | /* |
| 194 | * Wake up any possible resync thread that waits for the device | 194 | * Wake up any possible resync thread that waits for the device |
| @@ -202,7 +202,7 @@ static void free_r10bio(r10bio_t *r10_bio) | |||
| 202 | 202 | ||
| 203 | static void put_buf(r10bio_t *r10_bio) | 203 | static void put_buf(r10bio_t *r10_bio) |
| 204 | { | 204 | { |
| 205 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | 205 | conf_t *conf = r10_bio->mddev->private; |
| 206 | 206 | ||
| 207 | mempool_free(r10_bio, conf->r10buf_pool); | 207 | mempool_free(r10_bio, conf->r10buf_pool); |
| 208 | 208 | ||
| @@ -213,7 +213,7 @@ static void reschedule_retry(r10bio_t *r10_bio) | |||
| 213 | { | 213 | { |
| 214 | unsigned long flags; | 214 | unsigned long flags; |
| 215 | mddev_t *mddev = r10_bio->mddev; | 215 | mddev_t *mddev = r10_bio->mddev; |
| 216 | conf_t *conf = mddev_to_conf(mddev); | 216 | conf_t *conf = mddev->private; |
| 217 | 217 | ||
| 218 | spin_lock_irqsave(&conf->device_lock, flags); | 218 | spin_lock_irqsave(&conf->device_lock, flags); |
| 219 | list_add(&r10_bio->retry_list, &conf->retry_list); | 219 | list_add(&r10_bio->retry_list, &conf->retry_list); |
| @@ -245,7 +245,7 @@ static void raid_end_bio_io(r10bio_t *r10_bio) | |||
| 245 | */ | 245 | */ |
| 246 | static inline void update_head_pos(int slot, r10bio_t *r10_bio) | 246 | static inline void update_head_pos(int slot, r10bio_t *r10_bio) |
| 247 | { | 247 | { |
| 248 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | 248 | conf_t *conf = r10_bio->mddev->private; |
| 249 | 249 | ||
| 250 | conf->mirrors[r10_bio->devs[slot].devnum].head_position = | 250 | conf->mirrors[r10_bio->devs[slot].devnum].head_position = |
| 251 | r10_bio->devs[slot].addr + (r10_bio->sectors); | 251 | r10_bio->devs[slot].addr + (r10_bio->sectors); |
| @@ -256,7 +256,7 @@ static void raid10_end_read_request(struct bio *bio, int error) | |||
| 256 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 256 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
| 257 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | 257 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); |
| 258 | int slot, dev; | 258 | int slot, dev; |
| 259 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | 259 | conf_t *conf = r10_bio->mddev->private; |
| 260 | 260 | ||
| 261 | 261 | ||
| 262 | slot = r10_bio->read_slot; | 262 | slot = r10_bio->read_slot; |
| @@ -297,7 +297,7 @@ static void raid10_end_write_request(struct bio *bio, int error) | |||
| 297 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 297 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
| 298 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | 298 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); |
| 299 | int slot, dev; | 299 | int slot, dev; |
| 300 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | 300 | conf_t *conf = r10_bio->mddev->private; |
| 301 | 301 | ||
| 302 | for (slot = 0; slot < conf->copies; slot++) | 302 | for (slot = 0; slot < conf->copies; slot++) |
| 303 | if (r10_bio->devs[slot].bio == bio) | 303 | if (r10_bio->devs[slot].bio == bio) |
| @@ -461,7 +461,7 @@ static int raid10_mergeable_bvec(struct request_queue *q, | |||
| 461 | mddev_t *mddev = q->queuedata; | 461 | mddev_t *mddev = q->queuedata; |
| 462 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); | 462 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); |
| 463 | int max; | 463 | int max; |
| 464 | unsigned int chunk_sectors = mddev->chunk_size >> 9; | 464 | unsigned int chunk_sectors = mddev->chunk_sectors; |
| 465 | unsigned int bio_sectors = bvm->bi_size >> 9; | 465 | unsigned int bio_sectors = bvm->bi_size >> 9; |
| 466 | 466 | ||
| 467 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; | 467 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; |
| @@ -596,7 +596,7 @@ rb_out: | |||
| 596 | 596 | ||
| 597 | static void unplug_slaves(mddev_t *mddev) | 597 | static void unplug_slaves(mddev_t *mddev) |
| 598 | { | 598 | { |
| 599 | conf_t *conf = mddev_to_conf(mddev); | 599 | conf_t *conf = mddev->private; |
| 600 | int i; | 600 | int i; |
| 601 | 601 | ||
| 602 | rcu_read_lock(); | 602 | rcu_read_lock(); |
| @@ -628,7 +628,7 @@ static void raid10_unplug(struct request_queue *q) | |||
| 628 | static int raid10_congested(void *data, int bits) | 628 | static int raid10_congested(void *data, int bits) |
| 629 | { | 629 | { |
| 630 | mddev_t *mddev = data; | 630 | mddev_t *mddev = data; |
| 631 | conf_t *conf = mddev_to_conf(mddev); | 631 | conf_t *conf = mddev->private; |
| 632 | int i, ret = 0; | 632 | int i, ret = 0; |
| 633 | 633 | ||
| 634 | rcu_read_lock(); | 634 | rcu_read_lock(); |
| @@ -788,7 +788,7 @@ static void unfreeze_array(conf_t *conf) | |||
| 788 | static int make_request(struct request_queue *q, struct bio * bio) | 788 | static int make_request(struct request_queue *q, struct bio * bio) |
| 789 | { | 789 | { |
| 790 | mddev_t *mddev = q->queuedata; | 790 | mddev_t *mddev = q->queuedata; |
| 791 | conf_t *conf = mddev_to_conf(mddev); | 791 | conf_t *conf = mddev->private; |
| 792 | mirror_info_t *mirror; | 792 | mirror_info_t *mirror; |
| 793 | r10bio_t *r10_bio; | 793 | r10bio_t *r10_bio; |
| 794 | struct bio *read_bio; | 794 | struct bio *read_bio; |
| @@ -981,11 +981,11 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
| 981 | 981 | ||
| 982 | static void status(struct seq_file *seq, mddev_t *mddev) | 982 | static void status(struct seq_file *seq, mddev_t *mddev) |
| 983 | { | 983 | { |
| 984 | conf_t *conf = mddev_to_conf(mddev); | 984 | conf_t *conf = mddev->private; |
| 985 | int i; | 985 | int i; |
| 986 | 986 | ||
| 987 | if (conf->near_copies < conf->raid_disks) | 987 | if (conf->near_copies < conf->raid_disks) |
| 988 | seq_printf(seq, " %dK chunks", mddev->chunk_size/1024); | 988 | seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); |
| 989 | if (conf->near_copies > 1) | 989 | if (conf->near_copies > 1) |
| 990 | seq_printf(seq, " %d near-copies", conf->near_copies); | 990 | seq_printf(seq, " %d near-copies", conf->near_copies); |
| 991 | if (conf->far_copies > 1) { | 991 | if (conf->far_copies > 1) { |
| @@ -1006,7 +1006,7 @@ static void status(struct seq_file *seq, mddev_t *mddev) | |||
| 1006 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) | 1006 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) |
| 1007 | { | 1007 | { |
| 1008 | char b[BDEVNAME_SIZE]; | 1008 | char b[BDEVNAME_SIZE]; |
| 1009 | conf_t *conf = mddev_to_conf(mddev); | 1009 | conf_t *conf = mddev->private; |
| 1010 | 1010 | ||
| 1011 | /* | 1011 | /* |
| 1012 | * If it is not operational, then we have already marked it as dead | 1012 | * If it is not operational, then we have already marked it as dead |
| @@ -1158,8 +1158,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1158 | * a one page request is never in violation. | 1158 | * a one page request is never in violation. |
| 1159 | */ | 1159 | */ |
| 1160 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 1160 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && |
| 1161 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 1161 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) |
| 1162 | mddev->queue->max_sectors = (PAGE_SIZE>>9); | 1162 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
| 1163 | 1163 | ||
| 1164 | p->head_position = 0; | 1164 | p->head_position = 0; |
| 1165 | rdev->raid_disk = mirror; | 1165 | rdev->raid_disk = mirror; |
| @@ -1215,7 +1215,7 @@ abort: | |||
| 1215 | static void end_sync_read(struct bio *bio, int error) | 1215 | static void end_sync_read(struct bio *bio, int error) |
| 1216 | { | 1216 | { |
| 1217 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | 1217 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); |
| 1218 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | 1218 | conf_t *conf = r10_bio->mddev->private; |
| 1219 | int i,d; | 1219 | int i,d; |
| 1220 | 1220 | ||
| 1221 | for (i=0; i<conf->copies; i++) | 1221 | for (i=0; i<conf->copies; i++) |
| @@ -1253,7 +1253,7 @@ static void end_sync_write(struct bio *bio, int error) | |||
| 1253 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 1253 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
| 1254 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | 1254 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); |
| 1255 | mddev_t *mddev = r10_bio->mddev; | 1255 | mddev_t *mddev = r10_bio->mddev; |
| 1256 | conf_t *conf = mddev_to_conf(mddev); | 1256 | conf_t *conf = mddev->private; |
| 1257 | int i,d; | 1257 | int i,d; |
| 1258 | 1258 | ||
| 1259 | for (i = 0; i < conf->copies; i++) | 1259 | for (i = 0; i < conf->copies; i++) |
| @@ -1300,7 +1300,7 @@ static void end_sync_write(struct bio *bio, int error) | |||
| 1300 | */ | 1300 | */ |
| 1301 | static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) | 1301 | static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) |
| 1302 | { | 1302 | { |
| 1303 | conf_t *conf = mddev_to_conf(mddev); | 1303 | conf_t *conf = mddev->private; |
| 1304 | int i, first; | 1304 | int i, first; |
| 1305 | struct bio *tbio, *fbio; | 1305 | struct bio *tbio, *fbio; |
| 1306 | 1306 | ||
| @@ -1400,7 +1400,7 @@ done: | |||
| 1400 | 1400 | ||
| 1401 | static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) | 1401 | static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) |
| 1402 | { | 1402 | { |
| 1403 | conf_t *conf = mddev_to_conf(mddev); | 1403 | conf_t *conf = mddev->private; |
| 1404 | int i, d; | 1404 | int i, d; |
| 1405 | struct bio *bio, *wbio; | 1405 | struct bio *bio, *wbio; |
| 1406 | 1406 | ||
| @@ -1549,7 +1549,7 @@ static void raid10d(mddev_t *mddev) | |||
| 1549 | r10bio_t *r10_bio; | 1549 | r10bio_t *r10_bio; |
| 1550 | struct bio *bio; | 1550 | struct bio *bio; |
| 1551 | unsigned long flags; | 1551 | unsigned long flags; |
| 1552 | conf_t *conf = mddev_to_conf(mddev); | 1552 | conf_t *conf = mddev->private; |
| 1553 | struct list_head *head = &conf->retry_list; | 1553 | struct list_head *head = &conf->retry_list; |
| 1554 | int unplug=0; | 1554 | int unplug=0; |
| 1555 | mdk_rdev_t *rdev; | 1555 | mdk_rdev_t *rdev; |
| @@ -1572,7 +1572,7 @@ static void raid10d(mddev_t *mddev) | |||
| 1572 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1572 | spin_unlock_irqrestore(&conf->device_lock, flags); |
| 1573 | 1573 | ||
| 1574 | mddev = r10_bio->mddev; | 1574 | mddev = r10_bio->mddev; |
| 1575 | conf = mddev_to_conf(mddev); | 1575 | conf = mddev->private; |
| 1576 | if (test_bit(R10BIO_IsSync, &r10_bio->state)) { | 1576 | if (test_bit(R10BIO_IsSync, &r10_bio->state)) { |
| 1577 | sync_request_write(mddev, r10_bio); | 1577 | sync_request_write(mddev, r10_bio); |
| 1578 | unplug = 1; | 1578 | unplug = 1; |
| @@ -1680,7 +1680,7 @@ static int init_resync(conf_t *conf) | |||
| 1680 | 1680 | ||
| 1681 | static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) | 1681 | static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) |
| 1682 | { | 1682 | { |
| 1683 | conf_t *conf = mddev_to_conf(mddev); | 1683 | conf_t *conf = mddev->private; |
| 1684 | r10bio_t *r10_bio; | 1684 | r10bio_t *r10_bio; |
| 1685 | struct bio *biolist = NULL, *bio; | 1685 | struct bio *biolist = NULL, *bio; |
| 1686 | sector_t max_sector, nr_sectors; | 1686 | sector_t max_sector, nr_sectors; |
| @@ -2026,7 +2026,7 @@ static sector_t | |||
| 2026 | raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) | 2026 | raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) |
| 2027 | { | 2027 | { |
| 2028 | sector_t size; | 2028 | sector_t size; |
| 2029 | conf_t *conf = mddev_to_conf(mddev); | 2029 | conf_t *conf = mddev->private; |
| 2030 | 2030 | ||
| 2031 | if (!raid_disks) | 2031 | if (!raid_disks) |
| 2032 | raid_disks = mddev->raid_disks; | 2032 | raid_disks = mddev->raid_disks; |
| @@ -2050,9 +2050,10 @@ static int run(mddev_t *mddev) | |||
| 2050 | int nc, fc, fo; | 2050 | int nc, fc, fo; |
| 2051 | sector_t stride, size; | 2051 | sector_t stride, size; |
| 2052 | 2052 | ||
| 2053 | if (mddev->chunk_size < PAGE_SIZE) { | 2053 | if (mddev->chunk_sectors < (PAGE_SIZE >> 9) || |
| 2054 | !is_power_of_2(mddev->chunk_sectors)) { | ||
| 2054 | printk(KERN_ERR "md/raid10: chunk size must be " | 2055 | printk(KERN_ERR "md/raid10: chunk size must be " |
| 2055 | "at least PAGE_SIZE(%ld).\n", PAGE_SIZE); | 2056 | "at least PAGE_SIZE(%ld) and be a power of 2.\n", PAGE_SIZE); |
| 2056 | return -EINVAL; | 2057 | return -EINVAL; |
| 2057 | } | 2058 | } |
| 2058 | 2059 | ||
| @@ -2095,8 +2096,8 @@ static int run(mddev_t *mddev) | |||
| 2095 | conf->far_copies = fc; | 2096 | conf->far_copies = fc; |
| 2096 | conf->copies = nc*fc; | 2097 | conf->copies = nc*fc; |
| 2097 | conf->far_offset = fo; | 2098 | conf->far_offset = fo; |
| 2098 | conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; | 2099 | conf->chunk_mask = mddev->chunk_sectors - 1; |
| 2099 | conf->chunk_shift = ffz(~mddev->chunk_size) - 9; | 2100 | conf->chunk_shift = ffz(~mddev->chunk_sectors); |
| 2100 | size = mddev->dev_sectors >> conf->chunk_shift; | 2101 | size = mddev->dev_sectors >> conf->chunk_shift; |
| 2101 | sector_div(size, fc); | 2102 | sector_div(size, fc); |
| 2102 | size = size * conf->raid_disks; | 2103 | size = size * conf->raid_disks; |
| @@ -2145,8 +2146,8 @@ static int run(mddev_t *mddev) | |||
| 2145 | * a one page request is never in violation. | 2146 | * a one page request is never in violation. |
| 2146 | */ | 2147 | */ |
| 2147 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 2148 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && |
| 2148 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 2149 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) |
| 2149 | mddev->queue->max_sectors = (PAGE_SIZE>>9); | 2150 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
| 2150 | 2151 | ||
| 2151 | disk->head_position = 0; | 2152 | disk->head_position = 0; |
| 2152 | } | 2153 | } |
| @@ -2185,6 +2186,10 @@ static int run(mddev_t *mddev) | |||
| 2185 | goto out_free_conf; | 2186 | goto out_free_conf; |
| 2186 | } | 2187 | } |
| 2187 | 2188 | ||
| 2189 | if (mddev->recovery_cp != MaxSector) | ||
| 2190 | printk(KERN_NOTICE "raid10: %s is not clean" | ||
| 2191 | " -- starting background reconstruction\n", | ||
| 2192 | mdname(mddev)); | ||
| 2188 | printk(KERN_INFO | 2193 | printk(KERN_INFO |
| 2189 | "raid10: raid set %s active with %d out of %d devices\n", | 2194 | "raid10: raid set %s active with %d out of %d devices\n", |
| 2190 | mdname(mddev), mddev->raid_disks - mddev->degraded, | 2195 | mdname(mddev), mddev->raid_disks - mddev->degraded, |
| @@ -2204,7 +2209,8 @@ static int run(mddev_t *mddev) | |||
| 2204 | * maybe... | 2209 | * maybe... |
| 2205 | */ | 2210 | */ |
| 2206 | { | 2211 | { |
| 2207 | int stripe = conf->raid_disks * (mddev->chunk_size / PAGE_SIZE); | 2212 | int stripe = conf->raid_disks * |
| 2213 | ((mddev->chunk_sectors << 9) / PAGE_SIZE); | ||
| 2208 | stripe /= conf->near_copies; | 2214 | stripe /= conf->near_copies; |
| 2209 | if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) | 2215 | if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) |
| 2210 | mddev->queue->backing_dev_info.ra_pages = 2* stripe; | 2216 | mddev->queue->backing_dev_info.ra_pages = 2* stripe; |
| @@ -2227,7 +2233,7 @@ out: | |||
| 2227 | 2233 | ||
| 2228 | static int stop(mddev_t *mddev) | 2234 | static int stop(mddev_t *mddev) |
| 2229 | { | 2235 | { |
| 2230 | conf_t *conf = mddev_to_conf(mddev); | 2236 | conf_t *conf = mddev->private; |
| 2231 | 2237 | ||
| 2232 | raise_barrier(conf, 0); | 2238 | raise_barrier(conf, 0); |
| 2233 | lower_barrier(conf); | 2239 | lower_barrier(conf); |
| @@ -2245,7 +2251,7 @@ static int stop(mddev_t *mddev) | |||
| 2245 | 2251 | ||
| 2246 | static void raid10_quiesce(mddev_t *mddev, int state) | 2252 | static void raid10_quiesce(mddev_t *mddev, int state) |
| 2247 | { | 2253 | { |
| 2248 | conf_t *conf = mddev_to_conf(mddev); | 2254 | conf_t *conf = mddev->private; |
| 2249 | 2255 | ||
| 2250 | switch(state) { | 2256 | switch(state) { |
| 2251 | case 1: | 2257 | case 1: |
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 244dbe507a54..59cd1efb8d30 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h | |||
| @@ -62,12 +62,6 @@ struct r10_private_data_s { | |||
| 62 | typedef struct r10_private_data_s conf_t; | 62 | typedef struct r10_private_data_s conf_t; |
| 63 | 63 | ||
| 64 | /* | 64 | /* |
| 65 | * this is the only point in the RAID code where we violate | ||
| 66 | * C type safety. mddev->private is an 'opaque' pointer. | ||
| 67 | */ | ||
| 68 | #define mddev_to_conf(mddev) ((conf_t *) mddev->private) | ||
| 69 | |||
| 70 | /* | ||
| 71 | * this is our 'private' RAID10 bio. | 65 | * this is our 'private' RAID10 bio. |
| 72 | * | 66 | * |
| 73 | * it contains information about what kind of IO operations were started | 67 | * it contains information about what kind of IO operations were started |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 54ef8d75541d..cac6f4d3a143 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
| @@ -1617,8 +1617,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
| 1617 | sector_t new_sector; | 1617 | sector_t new_sector; |
| 1618 | int algorithm = previous ? conf->prev_algo | 1618 | int algorithm = previous ? conf->prev_algo |
| 1619 | : conf->algorithm; | 1619 | : conf->algorithm; |
| 1620 | int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) | 1620 | int sectors_per_chunk = previous ? conf->prev_chunk_sectors |
| 1621 | : (conf->chunk_size >> 9); | 1621 | : conf->chunk_sectors; |
| 1622 | int raid_disks = previous ? conf->previous_raid_disks | 1622 | int raid_disks = previous ? conf->previous_raid_disks |
| 1623 | : conf->raid_disks; | 1623 | : conf->raid_disks; |
| 1624 | int data_disks = raid_disks - conf->max_degraded; | 1624 | int data_disks = raid_disks - conf->max_degraded; |
| @@ -1823,8 +1823,8 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) | |||
| 1823 | int raid_disks = sh->disks; | 1823 | int raid_disks = sh->disks; |
| 1824 | int data_disks = raid_disks - conf->max_degraded; | 1824 | int data_disks = raid_disks - conf->max_degraded; |
| 1825 | sector_t new_sector = sh->sector, check; | 1825 | sector_t new_sector = sh->sector, check; |
| 1826 | int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) | 1826 | int sectors_per_chunk = previous ? conf->prev_chunk_sectors |
| 1827 | : (conf->chunk_size >> 9); | 1827 | : conf->chunk_sectors; |
| 1828 | int algorithm = previous ? conf->prev_algo | 1828 | int algorithm = previous ? conf->prev_algo |
| 1829 | : conf->algorithm; | 1829 | : conf->algorithm; |
| 1830 | sector_t stripe; | 1830 | sector_t stripe; |
| @@ -2098,8 +2098,7 @@ static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, | |||
| 2098 | struct stripe_head *sh) | 2098 | struct stripe_head *sh) |
| 2099 | { | 2099 | { |
| 2100 | int sectors_per_chunk = | 2100 | int sectors_per_chunk = |
| 2101 | previous ? (conf->prev_chunk >> 9) | 2101 | previous ? conf->prev_chunk_sectors : conf->chunk_sectors; |
| 2102 | : (conf->chunk_size >> 9); | ||
| 2103 | int dd_idx; | 2102 | int dd_idx; |
| 2104 | int chunk_offset = sector_div(stripe, sectors_per_chunk); | 2103 | int chunk_offset = sector_div(stripe, sectors_per_chunk); |
| 2105 | int disks = previous ? conf->previous_raid_disks : conf->raid_disks; | 2104 | int disks = previous ? conf->previous_raid_disks : conf->raid_disks; |
| @@ -3496,7 +3495,7 @@ static void activate_bit_delay(raid5_conf_t *conf) | |||
| 3496 | 3495 | ||
| 3497 | static void unplug_slaves(mddev_t *mddev) | 3496 | static void unplug_slaves(mddev_t *mddev) |
| 3498 | { | 3497 | { |
| 3499 | raid5_conf_t *conf = mddev_to_conf(mddev); | 3498 | raid5_conf_t *conf = mddev->private; |
| 3500 | int i; | 3499 | int i; |
| 3501 | 3500 | ||
| 3502 | rcu_read_lock(); | 3501 | rcu_read_lock(); |
| @@ -3520,7 +3519,7 @@ static void unplug_slaves(mddev_t *mddev) | |||
| 3520 | static void raid5_unplug_device(struct request_queue *q) | 3519 | static void raid5_unplug_device(struct request_queue *q) |
| 3521 | { | 3520 | { |
| 3522 | mddev_t *mddev = q->queuedata; | 3521 | mddev_t *mddev = q->queuedata; |
| 3523 | raid5_conf_t *conf = mddev_to_conf(mddev); | 3522 | raid5_conf_t *conf = mddev->private; |
| 3524 | unsigned long flags; | 3523 | unsigned long flags; |
| 3525 | 3524 | ||
| 3526 | spin_lock_irqsave(&conf->device_lock, flags); | 3525 | spin_lock_irqsave(&conf->device_lock, flags); |
| @@ -3539,7 +3538,7 @@ static void raid5_unplug_device(struct request_queue *q) | |||
| 3539 | static int raid5_congested(void *data, int bits) | 3538 | static int raid5_congested(void *data, int bits) |
| 3540 | { | 3539 | { |
| 3541 | mddev_t *mddev = data; | 3540 | mddev_t *mddev = data; |
| 3542 | raid5_conf_t *conf = mddev_to_conf(mddev); | 3541 | raid5_conf_t *conf = mddev->private; |
| 3543 | 3542 | ||
| 3544 | /* No difference between reads and writes. Just check | 3543 | /* No difference between reads and writes. Just check |
| 3545 | * how busy the stripe_cache is | 3544 | * how busy the stripe_cache is |
| @@ -3564,14 +3563,14 @@ static int raid5_mergeable_bvec(struct request_queue *q, | |||
| 3564 | mddev_t *mddev = q->queuedata; | 3563 | mddev_t *mddev = q->queuedata; |
| 3565 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); | 3564 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); |
| 3566 | int max; | 3565 | int max; |
| 3567 | unsigned int chunk_sectors = mddev->chunk_size >> 9; | 3566 | unsigned int chunk_sectors = mddev->chunk_sectors; |
| 3568 | unsigned int bio_sectors = bvm->bi_size >> 9; | 3567 | unsigned int bio_sectors = bvm->bi_size >> 9; |
| 3569 | 3568 | ||
| 3570 | if ((bvm->bi_rw & 1) == WRITE) | 3569 | if ((bvm->bi_rw & 1) == WRITE) |
| 3571 | return biovec->bv_len; /* always allow writes to be mergeable */ | 3570 | return biovec->bv_len; /* always allow writes to be mergeable */ |
| 3572 | 3571 | ||
| 3573 | if (mddev->new_chunk < mddev->chunk_size) | 3572 | if (mddev->new_chunk_sectors < mddev->chunk_sectors) |
| 3574 | chunk_sectors = mddev->new_chunk >> 9; | 3573 | chunk_sectors = mddev->new_chunk_sectors; |
| 3575 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; | 3574 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; |
| 3576 | if (max < 0) max = 0; | 3575 | if (max < 0) max = 0; |
| 3577 | if (max <= biovec->bv_len && bio_sectors == 0) | 3576 | if (max <= biovec->bv_len && bio_sectors == 0) |
| @@ -3584,11 +3583,11 @@ static int raid5_mergeable_bvec(struct request_queue *q, | |||
| 3584 | static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) | 3583 | static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) |
| 3585 | { | 3584 | { |
| 3586 | sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); | 3585 | sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); |
| 3587 | unsigned int chunk_sectors = mddev->chunk_size >> 9; | 3586 | unsigned int chunk_sectors = mddev->chunk_sectors; |
| 3588 | unsigned int bio_sectors = bio->bi_size >> 9; | 3587 | unsigned int bio_sectors = bio->bi_size >> 9; |
| 3589 | 3588 | ||
| 3590 | if (mddev->new_chunk < mddev->chunk_size) | 3589 | if (mddev->new_chunk_sectors < mddev->chunk_sectors) |
| 3591 | chunk_sectors = mddev->new_chunk >> 9; | 3590 | chunk_sectors = mddev->new_chunk_sectors; |
| 3592 | return chunk_sectors >= | 3591 | return chunk_sectors >= |
| 3593 | ((sector & (chunk_sectors - 1)) + bio_sectors); | 3592 | ((sector & (chunk_sectors - 1)) + bio_sectors); |
| 3594 | } | 3593 | } |
| @@ -3652,7 +3651,7 @@ static void raid5_align_endio(struct bio *bi, int error) | |||
| 3652 | bio_put(bi); | 3651 | bio_put(bi); |
| 3653 | 3652 | ||
| 3654 | mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; | 3653 | mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; |
| 3655 | conf = mddev_to_conf(mddev); | 3654 | conf = mddev->private; |
| 3656 | rdev = (void*)raid_bi->bi_next; | 3655 | rdev = (void*)raid_bi->bi_next; |
| 3657 | raid_bi->bi_next = NULL; | 3656 | raid_bi->bi_next = NULL; |
| 3658 | 3657 | ||
| @@ -3675,10 +3674,10 @@ static int bio_fits_rdev(struct bio *bi) | |||
| 3675 | { | 3674 | { |
| 3676 | struct request_queue *q = bdev_get_queue(bi->bi_bdev); | 3675 | struct request_queue *q = bdev_get_queue(bi->bi_bdev); |
| 3677 | 3676 | ||
| 3678 | if ((bi->bi_size>>9) > q->max_sectors) | 3677 | if ((bi->bi_size>>9) > queue_max_sectors(q)) |
| 3679 | return 0; | 3678 | return 0; |
| 3680 | blk_recount_segments(q, bi); | 3679 | blk_recount_segments(q, bi); |
| 3681 | if (bi->bi_phys_segments > q->max_phys_segments) | 3680 | if (bi->bi_phys_segments > queue_max_phys_segments(q)) |
| 3682 | return 0; | 3681 | return 0; |
| 3683 | 3682 | ||
| 3684 | if (q->merge_bvec_fn) | 3683 | if (q->merge_bvec_fn) |
| @@ -3694,7 +3693,7 @@ static int bio_fits_rdev(struct bio *bi) | |||
| 3694 | static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) | 3693 | static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) |
| 3695 | { | 3694 | { |
| 3696 | mddev_t *mddev = q->queuedata; | 3695 | mddev_t *mddev = q->queuedata; |
| 3697 | raid5_conf_t *conf = mddev_to_conf(mddev); | 3696 | raid5_conf_t *conf = mddev->private; |
| 3698 | unsigned int dd_idx; | 3697 | unsigned int dd_idx; |
| 3699 | struct bio* align_bi; | 3698 | struct bio* align_bi; |
| 3700 | mdk_rdev_t *rdev; | 3699 | mdk_rdev_t *rdev; |
| @@ -3811,7 +3810,7 @@ static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf) | |||
| 3811 | static int make_request(struct request_queue *q, struct bio * bi) | 3810 | static int make_request(struct request_queue *q, struct bio * bi) |
| 3812 | { | 3811 | { |
| 3813 | mddev_t *mddev = q->queuedata; | 3812 | mddev_t *mddev = q->queuedata; |
| 3814 | raid5_conf_t *conf = mddev_to_conf(mddev); | 3813 | raid5_conf_t *conf = mddev->private; |
| 3815 | int dd_idx; | 3814 | int dd_idx; |
| 3816 | sector_t new_sector; | 3815 | sector_t new_sector; |
| 3817 | sector_t logical_sector, last_sector; | 3816 | sector_t logical_sector, last_sector; |
| @@ -3908,6 +3907,7 @@ static int make_request(struct request_queue *q, struct bio * bi) | |||
| 3908 | spin_unlock_irq(&conf->device_lock); | 3907 | spin_unlock_irq(&conf->device_lock); |
| 3909 | if (must_retry) { | 3908 | if (must_retry) { |
| 3910 | release_stripe(sh); | 3909 | release_stripe(sh); |
| 3910 | schedule(); | ||
| 3911 | goto retry; | 3911 | goto retry; |
| 3912 | } | 3912 | } |
| 3913 | } | 3913 | } |
| @@ -4003,10 +4003,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
| 4003 | * If old and new chunk sizes differ, we need to process the | 4003 | * If old and new chunk sizes differ, we need to process the |
| 4004 | * largest of these | 4004 | * largest of these |
| 4005 | */ | 4005 | */ |
| 4006 | if (mddev->new_chunk > mddev->chunk_size) | 4006 | if (mddev->new_chunk_sectors > mddev->chunk_sectors) |
| 4007 | reshape_sectors = mddev->new_chunk / 512; | 4007 | reshape_sectors = mddev->new_chunk_sectors; |
| 4008 | else | 4008 | else |
| 4009 | reshape_sectors = mddev->chunk_size / 512; | 4009 | reshape_sectors = mddev->chunk_sectors; |
| 4010 | 4010 | ||
| 4011 | /* we update the metadata when there is more than 3Meg | 4011 | /* we update the metadata when there is more than 3Meg |
| 4012 | * in the block range (that is rather arbitrary, should | 4012 | * in the block range (that is rather arbitrary, should |
| @@ -4129,7 +4129,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
| 4129 | 1, &dd_idx, NULL); | 4129 | 1, &dd_idx, NULL); |
| 4130 | last_sector = | 4130 | last_sector = |
| 4131 | raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) | 4131 | raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) |
| 4132 | *(new_data_disks) - 1), | 4132 | * new_data_disks - 1), |
| 4133 | 1, &dd_idx, NULL); | 4133 | 1, &dd_idx, NULL); |
| 4134 | if (last_sector >= mddev->dev_sectors) | 4134 | if (last_sector >= mddev->dev_sectors) |
| 4135 | last_sector = mddev->dev_sectors - 1; | 4135 | last_sector = mddev->dev_sectors - 1; |
| @@ -4158,7 +4158,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
| 4158 | wait_event(conf->wait_for_overlap, | 4158 | wait_event(conf->wait_for_overlap, |
| 4159 | atomic_read(&conf->reshape_stripes) == 0); | 4159 | atomic_read(&conf->reshape_stripes) == 0); |
| 4160 | mddev->reshape_position = conf->reshape_progress; | 4160 | mddev->reshape_position = conf->reshape_progress; |
| 4161 | mddev->curr_resync_completed = mddev->curr_resync; | 4161 | mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors; |
| 4162 | conf->reshape_checkpoint = jiffies; | 4162 | conf->reshape_checkpoint = jiffies; |
| 4163 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 4163 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
| 4164 | md_wakeup_thread(mddev->thread); | 4164 | md_wakeup_thread(mddev->thread); |
| @@ -4371,7 +4371,7 @@ static void synchronize_stripe_processing(struct list_head *domain) | |||
| 4371 | static void raid5d(mddev_t *mddev) | 4371 | static void raid5d(mddev_t *mddev) |
| 4372 | { | 4372 | { |
| 4373 | struct stripe_head *sh; | 4373 | struct stripe_head *sh; |
| 4374 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4374 | raid5_conf_t *conf = mddev->private; |
| 4375 | int handled; | 4375 | int handled; |
| 4376 | LIST_HEAD(raid_domain); | 4376 | LIST_HEAD(raid_domain); |
| 4377 | 4377 | ||
| @@ -4428,7 +4428,7 @@ static void raid5d(mddev_t *mddev) | |||
| 4428 | static ssize_t | 4428 | static ssize_t |
| 4429 | raid5_show_stripe_cache_size(mddev_t *mddev, char *page) | 4429 | raid5_show_stripe_cache_size(mddev_t *mddev, char *page) |
| 4430 | { | 4430 | { |
| 4431 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4431 | raid5_conf_t *conf = mddev->private; |
| 4432 | if (conf) | 4432 | if (conf) |
| 4433 | return sprintf(page, "%d\n", conf->max_nr_stripes); | 4433 | return sprintf(page, "%d\n", conf->max_nr_stripes); |
| 4434 | else | 4434 | else |
| @@ -4438,7 +4438,7 @@ raid5_show_stripe_cache_size(mddev_t *mddev, char *page) | |||
| 4438 | static ssize_t | 4438 | static ssize_t |
| 4439 | raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) | 4439 | raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) |
| 4440 | { | 4440 | { |
| 4441 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4441 | raid5_conf_t *conf = mddev->private; |
| 4442 | unsigned long new; | 4442 | unsigned long new; |
| 4443 | int err; | 4443 | int err; |
| 4444 | 4444 | ||
| @@ -4476,7 +4476,7 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, | |||
| 4476 | static ssize_t | 4476 | static ssize_t |
| 4477 | raid5_show_preread_threshold(mddev_t *mddev, char *page) | 4477 | raid5_show_preread_threshold(mddev_t *mddev, char *page) |
| 4478 | { | 4478 | { |
| 4479 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4479 | raid5_conf_t *conf = mddev->private; |
| 4480 | if (conf) | 4480 | if (conf) |
| 4481 | return sprintf(page, "%d\n", conf->bypass_threshold); | 4481 | return sprintf(page, "%d\n", conf->bypass_threshold); |
| 4482 | else | 4482 | else |
| @@ -4486,7 +4486,7 @@ raid5_show_preread_threshold(mddev_t *mddev, char *page) | |||
| 4486 | static ssize_t | 4486 | static ssize_t |
| 4487 | raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len) | 4487 | raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len) |
| 4488 | { | 4488 | { |
| 4489 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4489 | raid5_conf_t *conf = mddev->private; |
| 4490 | unsigned long new; | 4490 | unsigned long new; |
| 4491 | if (len >= PAGE_SIZE) | 4491 | if (len >= PAGE_SIZE) |
| 4492 | return -EINVAL; | 4492 | return -EINVAL; |
| @@ -4510,7 +4510,7 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, | |||
| 4510 | static ssize_t | 4510 | static ssize_t |
| 4511 | stripe_cache_active_show(mddev_t *mddev, char *page) | 4511 | stripe_cache_active_show(mddev_t *mddev, char *page) |
| 4512 | { | 4512 | { |
| 4513 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4513 | raid5_conf_t *conf = mddev->private; |
| 4514 | if (conf) | 4514 | if (conf) |
| 4515 | return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); | 4515 | return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); |
| 4516 | else | 4516 | else |
| @@ -4534,7 +4534,7 @@ static struct attribute_group raid5_attrs_group = { | |||
| 4534 | static sector_t | 4534 | static sector_t |
| 4535 | raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) | 4535 | raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) |
| 4536 | { | 4536 | { |
| 4537 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4537 | raid5_conf_t *conf = mddev->private; |
| 4538 | 4538 | ||
| 4539 | if (!sectors) | 4539 | if (!sectors) |
| 4540 | sectors = mddev->dev_sectors; | 4540 | sectors = mddev->dev_sectors; |
| @@ -4546,8 +4546,8 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) | |||
| 4546 | raid_disks = conf->previous_raid_disks; | 4546 | raid_disks = conf->previous_raid_disks; |
| 4547 | } | 4547 | } |
| 4548 | 4548 | ||
| 4549 | sectors &= ~((sector_t)mddev->chunk_size/512 - 1); | 4549 | sectors &= ~((sector_t)mddev->chunk_sectors - 1); |
| 4550 | sectors &= ~((sector_t)mddev->new_chunk/512 - 1); | 4550 | sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); |
| 4551 | return sectors * (raid_disks - conf->max_degraded); | 4551 | return sectors * (raid_disks - conf->max_degraded); |
| 4552 | } | 4552 | } |
| 4553 | 4553 | ||
| @@ -4691,9 +4691,11 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
| 4691 | return ERR_PTR(-EINVAL); | 4691 | return ERR_PTR(-EINVAL); |
| 4692 | } | 4692 | } |
| 4693 | 4693 | ||
| 4694 | if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE) { | 4694 | if (!mddev->new_chunk_sectors || |
| 4695 | (mddev->new_chunk_sectors << 9) % PAGE_SIZE || | ||
| 4696 | !is_power_of_2(mddev->new_chunk_sectors)) { | ||
| 4695 | printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", | 4697 | printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", |
| 4696 | mddev->new_chunk, mdname(mddev)); | 4698 | mddev->new_chunk_sectors << 9, mdname(mddev)); |
| 4697 | return ERR_PTR(-EINVAL); | 4699 | return ERR_PTR(-EINVAL); |
| 4698 | } | 4700 | } |
| 4699 | 4701 | ||
| @@ -4756,7 +4758,8 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
| 4756 | conf->fullsync = 1; | 4758 | conf->fullsync = 1; |
| 4757 | } | 4759 | } |
| 4758 | 4760 | ||
| 4759 | conf->chunk_size = mddev->new_chunk; | 4761 | conf->chunk_sectors = mddev->new_chunk_sectors; |
| 4762 | conf->level = mddev->new_level; | ||
| 4760 | if (conf->level == 6) | 4763 | if (conf->level == 6) |
| 4761 | conf->max_degraded = 2; | 4764 | conf->max_degraded = 2; |
| 4762 | else | 4765 | else |
| @@ -4765,7 +4768,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
| 4765 | conf->max_nr_stripes = NR_STRIPES; | 4768 | conf->max_nr_stripes = NR_STRIPES; |
| 4766 | conf->reshape_progress = mddev->reshape_position; | 4769 | conf->reshape_progress = mddev->reshape_position; |
| 4767 | if (conf->reshape_progress != MaxSector) { | 4770 | if (conf->reshape_progress != MaxSector) { |
| 4768 | conf->prev_chunk = mddev->chunk_size; | 4771 | conf->prev_chunk_sectors = mddev->chunk_sectors; |
| 4769 | conf->prev_algo = mddev->layout; | 4772 | conf->prev_algo = mddev->layout; |
| 4770 | } | 4773 | } |
| 4771 | 4774 | ||
| @@ -4803,6 +4806,10 @@ static int run(mddev_t *mddev) | |||
| 4803 | int working_disks = 0; | 4806 | int working_disks = 0; |
| 4804 | mdk_rdev_t *rdev; | 4807 | mdk_rdev_t *rdev; |
| 4805 | 4808 | ||
| 4809 | if (mddev->recovery_cp != MaxSector) | ||
| 4810 | printk(KERN_NOTICE "raid5: %s is not clean" | ||
| 4811 | " -- starting background reconstruction\n", | ||
| 4812 | mdname(mddev)); | ||
| 4806 | if (mddev->reshape_position != MaxSector) { | 4813 | if (mddev->reshape_position != MaxSector) { |
| 4807 | /* Check that we can continue the reshape. | 4814 | /* Check that we can continue the reshape. |
| 4808 | * Currently only disks can change, it must | 4815 | * Currently only disks can change, it must |
| @@ -4825,7 +4832,7 @@ static int run(mddev_t *mddev) | |||
| 4825 | * geometry. | 4832 | * geometry. |
| 4826 | */ | 4833 | */ |
| 4827 | here_new = mddev->reshape_position; | 4834 | here_new = mddev->reshape_position; |
| 4828 | if (sector_div(here_new, (mddev->new_chunk>>9)* | 4835 | if (sector_div(here_new, mddev->new_chunk_sectors * |
| 4829 | (mddev->raid_disks - max_degraded))) { | 4836 | (mddev->raid_disks - max_degraded))) { |
| 4830 | printk(KERN_ERR "raid5: reshape_position not " | 4837 | printk(KERN_ERR "raid5: reshape_position not " |
| 4831 | "on a stripe boundary\n"); | 4838 | "on a stripe boundary\n"); |
| @@ -4833,7 +4840,7 @@ static int run(mddev_t *mddev) | |||
| 4833 | } | 4840 | } |
| 4834 | /* here_new is the stripe we will write to */ | 4841 | /* here_new is the stripe we will write to */ |
| 4835 | here_old = mddev->reshape_position; | 4842 | here_old = mddev->reshape_position; |
| 4836 | sector_div(here_old, (mddev->chunk_size>>9)* | 4843 | sector_div(here_old, mddev->chunk_sectors * |
| 4837 | (old_disks-max_degraded)); | 4844 | (old_disks-max_degraded)); |
| 4838 | /* here_old is the first stripe that we might need to read | 4845 | /* here_old is the first stripe that we might need to read |
| 4839 | * from */ | 4846 | * from */ |
| @@ -4848,7 +4855,7 @@ static int run(mddev_t *mddev) | |||
| 4848 | } else { | 4855 | } else { |
| 4849 | BUG_ON(mddev->level != mddev->new_level); | 4856 | BUG_ON(mddev->level != mddev->new_level); |
| 4850 | BUG_ON(mddev->layout != mddev->new_layout); | 4857 | BUG_ON(mddev->layout != mddev->new_layout); |
| 4851 | BUG_ON(mddev->chunk_size != mddev->new_chunk); | 4858 | BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); |
| 4852 | BUG_ON(mddev->delta_disks != 0); | 4859 | BUG_ON(mddev->delta_disks != 0); |
| 4853 | } | 4860 | } |
| 4854 | 4861 | ||
| @@ -4882,7 +4889,7 @@ static int run(mddev_t *mddev) | |||
| 4882 | } | 4889 | } |
| 4883 | 4890 | ||
| 4884 | /* device size must be a multiple of chunk size */ | 4891 | /* device size must be a multiple of chunk size */ |
| 4885 | mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1); | 4892 | mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); |
| 4886 | mddev->resync_max_sectors = mddev->dev_sectors; | 4893 | mddev->resync_max_sectors = mddev->dev_sectors; |
| 4887 | 4894 | ||
| 4888 | if (mddev->degraded > 0 && | 4895 | if (mddev->degraded > 0 && |
| @@ -4931,7 +4938,7 @@ static int run(mddev_t *mddev) | |||
| 4931 | { | 4938 | { |
| 4932 | int data_disks = conf->previous_raid_disks - conf->max_degraded; | 4939 | int data_disks = conf->previous_raid_disks - conf->max_degraded; |
| 4933 | int stripe = data_disks * | 4940 | int stripe = data_disks * |
| 4934 | (mddev->chunk_size / PAGE_SIZE); | 4941 | ((mddev->chunk_sectors << 9) / PAGE_SIZE); |
| 4935 | if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) | 4942 | if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) |
| 4936 | mddev->queue->backing_dev_info.ra_pages = 2 * stripe; | 4943 | mddev->queue->backing_dev_info.ra_pages = 2 * stripe; |
| 4937 | } | 4944 | } |
| @@ -5021,7 +5028,8 @@ static void status(struct seq_file *seq, mddev_t *mddev) | |||
| 5021 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; | 5028 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; |
| 5022 | int i; | 5029 | int i; |
| 5023 | 5030 | ||
| 5024 | seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout); | 5031 | seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, |
| 5032 | mddev->chunk_sectors / 2, mddev->layout); | ||
| 5025 | seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); | 5033 | seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); |
| 5026 | for (i = 0; i < conf->raid_disks; i++) | 5034 | for (i = 0; i < conf->raid_disks; i++) |
| 5027 | seq_printf (seq, "%s", | 5035 | seq_printf (seq, "%s", |
| @@ -5169,7 +5177,7 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) | |||
| 5169 | * any io in the removed space completes, but it hardly seems | 5177 | * any io in the removed space completes, but it hardly seems |
| 5170 | * worth it. | 5178 | * worth it. |
| 5171 | */ | 5179 | */ |
| 5172 | sectors &= ~((sector_t)mddev->chunk_size/512 - 1); | 5180 | sectors &= ~((sector_t)mddev->chunk_sectors - 1); |
| 5173 | md_set_array_sectors(mddev, raid5_size(mddev, sectors, | 5181 | md_set_array_sectors(mddev, raid5_size(mddev, sectors, |
| 5174 | mddev->raid_disks)); | 5182 | mddev->raid_disks)); |
| 5175 | if (mddev->array_sectors > | 5183 | if (mddev->array_sectors > |
| @@ -5186,14 +5194,37 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) | |||
| 5186 | return 0; | 5194 | return 0; |
| 5187 | } | 5195 | } |
| 5188 | 5196 | ||
| 5189 | static int raid5_check_reshape(mddev_t *mddev) | 5197 | static int check_stripe_cache(mddev_t *mddev) |
| 5198 | { | ||
| 5199 | /* Can only proceed if there are plenty of stripe_heads. | ||
| 5200 | * We need a minimum of one full stripe,, and for sensible progress | ||
| 5201 | * it is best to have about 4 times that. | ||
| 5202 | * If we require 4 times, then the default 256 4K stripe_heads will | ||
| 5203 | * allow for chunk sizes up to 256K, which is probably OK. | ||
| 5204 | * If the chunk size is greater, user-space should request more | ||
| 5205 | * stripe_heads first. | ||
| 5206 | */ | ||
| 5207 | raid5_conf_t *conf = mddev->private; | ||
| 5208 | if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 | ||
| 5209 | > conf->max_nr_stripes || | ||
| 5210 | ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 | ||
| 5211 | > conf->max_nr_stripes) { | ||
| 5212 | printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", | ||
| 5213 | ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) | ||
| 5214 | / STRIPE_SIZE)*4); | ||
| 5215 | return 0; | ||
| 5216 | } | ||
| 5217 | return 1; | ||
| 5218 | } | ||
| 5219 | |||
| 5220 | static int check_reshape(mddev_t *mddev) | ||
| 5190 | { | 5221 | { |
| 5191 | raid5_conf_t *conf = mddev_to_conf(mddev); | 5222 | raid5_conf_t *conf = mddev->private; |
| 5192 | 5223 | ||
| 5193 | if (mddev->delta_disks == 0 && | 5224 | if (mddev->delta_disks == 0 && |
| 5194 | mddev->new_layout == mddev->layout && | 5225 | mddev->new_layout == mddev->layout && |
| 5195 | mddev->new_chunk == mddev->chunk_size) | 5226 | mddev->new_chunk_sectors == mddev->chunk_sectors) |
| 5196 | return -EINVAL; /* nothing to do */ | 5227 | return 0; /* nothing to do */ |
| 5197 | if (mddev->bitmap) | 5228 | if (mddev->bitmap) |
| 5198 | /* Cannot grow a bitmap yet */ | 5229 | /* Cannot grow a bitmap yet */ |
| 5199 | return -EBUSY; | 5230 | return -EBUSY; |
| @@ -5212,28 +5243,15 @@ static int raid5_check_reshape(mddev_t *mddev) | |||
| 5212 | return -EINVAL; | 5243 | return -EINVAL; |
| 5213 | } | 5244 | } |
| 5214 | 5245 | ||
| 5215 | /* Can only proceed if there are plenty of stripe_heads. | 5246 | if (!check_stripe_cache(mddev)) |
| 5216 | * We need a minimum of one full stripe,, and for sensible progress | ||
| 5217 | * it is best to have about 4 times that. | ||
| 5218 | * If we require 4 times, then the default 256 4K stripe_heads will | ||
| 5219 | * allow for chunk sizes up to 256K, which is probably OK. | ||
| 5220 | * If the chunk size is greater, user-space should request more | ||
| 5221 | * stripe_heads first. | ||
| 5222 | */ | ||
| 5223 | if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes || | ||
| 5224 | (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { | ||
| 5225 | printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", | ||
| 5226 | (max(mddev->chunk_size, mddev->new_chunk) | ||
| 5227 | / STRIPE_SIZE)*4); | ||
| 5228 | return -ENOSPC; | 5247 | return -ENOSPC; |
| 5229 | } | ||
| 5230 | 5248 | ||
| 5231 | return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); | 5249 | return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); |
| 5232 | } | 5250 | } |
| 5233 | 5251 | ||
| 5234 | static int raid5_start_reshape(mddev_t *mddev) | 5252 | static int raid5_start_reshape(mddev_t *mddev) |
| 5235 | { | 5253 | { |
| 5236 | raid5_conf_t *conf = mddev_to_conf(mddev); | 5254 | raid5_conf_t *conf = mddev->private; |
| 5237 | mdk_rdev_t *rdev; | 5255 | mdk_rdev_t *rdev; |
| 5238 | int spares = 0; | 5256 | int spares = 0; |
| 5239 | int added_devices = 0; | 5257 | int added_devices = 0; |
| @@ -5242,6 +5260,9 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
| 5242 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | 5260 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
| 5243 | return -EBUSY; | 5261 | return -EBUSY; |
| 5244 | 5262 | ||
| 5263 | if (!check_stripe_cache(mddev)) | ||
| 5264 | return -ENOSPC; | ||
| 5265 | |||
| 5245 | list_for_each_entry(rdev, &mddev->disks, same_set) | 5266 | list_for_each_entry(rdev, &mddev->disks, same_set) |
| 5246 | if (rdev->raid_disk < 0 && | 5267 | if (rdev->raid_disk < 0 && |
| 5247 | !test_bit(Faulty, &rdev->flags)) | 5268 | !test_bit(Faulty, &rdev->flags)) |
| @@ -5268,8 +5289,8 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
| 5268 | spin_lock_irq(&conf->device_lock); | 5289 | spin_lock_irq(&conf->device_lock); |
| 5269 | conf->previous_raid_disks = conf->raid_disks; | 5290 | conf->previous_raid_disks = conf->raid_disks; |
| 5270 | conf->raid_disks += mddev->delta_disks; | 5291 | conf->raid_disks += mddev->delta_disks; |
| 5271 | conf->prev_chunk = conf->chunk_size; | 5292 | conf->prev_chunk_sectors = conf->chunk_sectors; |
| 5272 | conf->chunk_size = mddev->new_chunk; | 5293 | conf->chunk_sectors = mddev->new_chunk_sectors; |
| 5273 | conf->prev_algo = conf->algorithm; | 5294 | conf->prev_algo = conf->algorithm; |
| 5274 | conf->algorithm = mddev->new_layout; | 5295 | conf->algorithm = mddev->new_layout; |
| 5275 | if (mddev->delta_disks < 0) | 5296 | if (mddev->delta_disks < 0) |
| @@ -5351,7 +5372,7 @@ static void end_reshape(raid5_conf_t *conf) | |||
| 5351 | */ | 5372 | */ |
| 5352 | { | 5373 | { |
| 5353 | int data_disks = conf->raid_disks - conf->max_degraded; | 5374 | int data_disks = conf->raid_disks - conf->max_degraded; |
| 5354 | int stripe = data_disks * (conf->chunk_size | 5375 | int stripe = data_disks * ((conf->chunk_sectors << 9) |
| 5355 | / PAGE_SIZE); | 5376 | / PAGE_SIZE); |
| 5356 | if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) | 5377 | if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) |
| 5357 | conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; | 5378 | conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; |
| @@ -5365,7 +5386,7 @@ static void end_reshape(raid5_conf_t *conf) | |||
| 5365 | static void raid5_finish_reshape(mddev_t *mddev) | 5386 | static void raid5_finish_reshape(mddev_t *mddev) |
| 5366 | { | 5387 | { |
| 5367 | struct block_device *bdev; | 5388 | struct block_device *bdev; |
| 5368 | raid5_conf_t *conf = mddev_to_conf(mddev); | 5389 | raid5_conf_t *conf = mddev->private; |
| 5369 | 5390 | ||
| 5370 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | 5391 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { |
| 5371 | 5392 | ||
| @@ -5396,7 +5417,7 @@ static void raid5_finish_reshape(mddev_t *mddev) | |||
| 5396 | raid5_remove_disk(mddev, d); | 5417 | raid5_remove_disk(mddev, d); |
| 5397 | } | 5418 | } |
| 5398 | mddev->layout = conf->algorithm; | 5419 | mddev->layout = conf->algorithm; |
| 5399 | mddev->chunk_size = conf->chunk_size; | 5420 | mddev->chunk_sectors = conf->chunk_sectors; |
| 5400 | mddev->reshape_position = MaxSector; | 5421 | mddev->reshape_position = MaxSector; |
| 5401 | mddev->delta_disks = 0; | 5422 | mddev->delta_disks = 0; |
| 5402 | } | 5423 | } |
| @@ -5404,7 +5425,7 @@ static void raid5_finish_reshape(mddev_t *mddev) | |||
| 5404 | 5425 | ||
| 5405 | static void raid5_quiesce(mddev_t *mddev, int state) | 5426 | static void raid5_quiesce(mddev_t *mddev, int state) |
| 5406 | { | 5427 | { |
| 5407 | raid5_conf_t *conf = mddev_to_conf(mddev); | 5428 | raid5_conf_t *conf = mddev->private; |
| 5408 | 5429 | ||
| 5409 | switch(state) { | 5430 | switch(state) { |
| 5410 | case 2: /* resume for a suspend */ | 5431 | case 2: /* resume for a suspend */ |
| @@ -5454,7 +5475,7 @@ static void *raid5_takeover_raid1(mddev_t *mddev) | |||
| 5454 | 5475 | ||
| 5455 | mddev->new_level = 5; | 5476 | mddev->new_level = 5; |
| 5456 | mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; | 5477 | mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; |
| 5457 | mddev->new_chunk = chunksect << 9; | 5478 | mddev->new_chunk_sectors = chunksect; |
| 5458 | 5479 | ||
| 5459 | return setup_conf(mddev); | 5480 | return setup_conf(mddev); |
| 5460 | } | 5481 | } |
| @@ -5493,24 +5514,24 @@ static void *raid5_takeover_raid6(mddev_t *mddev) | |||
| 5493 | } | 5514 | } |
| 5494 | 5515 | ||
| 5495 | 5516 | ||
| 5496 | static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) | 5517 | static int raid5_check_reshape(mddev_t *mddev) |
| 5497 | { | 5518 | { |
| 5498 | /* For a 2-drive array, the layout and chunk size can be changed | 5519 | /* For a 2-drive array, the layout and chunk size can be changed |
| 5499 | * immediately as not restriping is needed. | 5520 | * immediately as not restriping is needed. |
| 5500 | * For larger arrays we record the new value - after validation | 5521 | * For larger arrays we record the new value - after validation |
| 5501 | * to be used by a reshape pass. | 5522 | * to be used by a reshape pass. |
| 5502 | */ | 5523 | */ |
| 5503 | raid5_conf_t *conf = mddev_to_conf(mddev); | 5524 | raid5_conf_t *conf = mddev->private; |
| 5525 | int new_chunk = mddev->new_chunk_sectors; | ||
| 5504 | 5526 | ||
| 5505 | if (new_layout >= 0 && !algorithm_valid_raid5(new_layout)) | 5527 | if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) |
| 5506 | return -EINVAL; | 5528 | return -EINVAL; |
| 5507 | if (new_chunk > 0) { | 5529 | if (new_chunk > 0) { |
| 5508 | if (new_chunk & (new_chunk-1)) | 5530 | if (!is_power_of_2(new_chunk)) |
| 5509 | /* not a power of 2 */ | ||
| 5510 | return -EINVAL; | 5531 | return -EINVAL; |
| 5511 | if (new_chunk < PAGE_SIZE) | 5532 | if (new_chunk < (PAGE_SIZE>>9)) |
| 5512 | return -EINVAL; | 5533 | return -EINVAL; |
| 5513 | if (mddev->array_sectors & ((new_chunk>>9)-1)) | 5534 | if (mddev->array_sectors & (new_chunk-1)) |
| 5514 | /* not factor of array size */ | 5535 | /* not factor of array size */ |
| 5515 | return -EINVAL; | 5536 | return -EINVAL; |
| 5516 | } | 5537 | } |
| @@ -5518,49 +5539,39 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) | |||
| 5518 | /* They look valid */ | 5539 | /* They look valid */ |
| 5519 | 5540 | ||
| 5520 | if (mddev->raid_disks == 2) { | 5541 | if (mddev->raid_disks == 2) { |
| 5521 | 5542 | /* can make the change immediately */ | |
| 5522 | if (new_layout >= 0) { | 5543 | if (mddev->new_layout >= 0) { |
| 5523 | conf->algorithm = new_layout; | 5544 | conf->algorithm = mddev->new_layout; |
| 5524 | mddev->layout = mddev->new_layout = new_layout; | 5545 | mddev->layout = mddev->new_layout; |
| 5525 | } | 5546 | } |
| 5526 | if (new_chunk > 0) { | 5547 | if (new_chunk > 0) { |
| 5527 | conf->chunk_size = new_chunk; | 5548 | conf->chunk_sectors = new_chunk ; |
| 5528 | mddev->chunk_size = mddev->new_chunk = new_chunk; | 5549 | mddev->chunk_sectors = new_chunk; |
| 5529 | } | 5550 | } |
| 5530 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 5551 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
| 5531 | md_wakeup_thread(mddev->thread); | 5552 | md_wakeup_thread(mddev->thread); |
| 5532 | } else { | ||
| 5533 | if (new_layout >= 0) | ||
| 5534 | mddev->new_layout = new_layout; | ||
| 5535 | if (new_chunk > 0) | ||
| 5536 | mddev->new_chunk = new_chunk; | ||
| 5537 | } | 5553 | } |
| 5538 | return 0; | 5554 | return check_reshape(mddev); |
| 5539 | } | 5555 | } |
| 5540 | 5556 | ||
| 5541 | static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk) | 5557 | static int raid6_check_reshape(mddev_t *mddev) |
| 5542 | { | 5558 | { |
| 5543 | if (new_layout >= 0 && !algorithm_valid_raid6(new_layout)) | 5559 | int new_chunk = mddev->new_chunk_sectors; |
| 5560 | |||
| 5561 | if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) | ||
| 5544 | return -EINVAL; | 5562 | return -EINVAL; |
| 5545 | if (new_chunk > 0) { | 5563 | if (new_chunk > 0) { |
| 5546 | if (new_chunk & (new_chunk-1)) | 5564 | if (!is_power_of_2(new_chunk)) |
| 5547 | /* not a power of 2 */ | ||
| 5548 | return -EINVAL; | 5565 | return -EINVAL; |
| 5549 | if (new_chunk < PAGE_SIZE) | 5566 | if (new_chunk < (PAGE_SIZE >> 9)) |
| 5550 | return -EINVAL; | 5567 | return -EINVAL; |
| 5551 | if (mddev->array_sectors & ((new_chunk>>9)-1)) | 5568 | if (mddev->array_sectors & (new_chunk-1)) |
| 5552 | /* not factor of array size */ | 5569 | /* not factor of array size */ |
| 5553 | return -EINVAL; | 5570 | return -EINVAL; |
| 5554 | } | 5571 | } |
| 5555 | 5572 | ||
| 5556 | /* They look valid */ | 5573 | /* They look valid */ |
| 5557 | 5574 | return check_reshape(mddev); | |
| 5558 | if (new_layout >= 0) | ||
| 5559 | mddev->new_layout = new_layout; | ||
| 5560 | if (new_chunk > 0) | ||
| 5561 | mddev->new_chunk = new_chunk; | ||
| 5562 | |||
| 5563 | return 0; | ||
| 5564 | } | 5575 | } |
| 5565 | 5576 | ||
| 5566 | static void *raid5_takeover(mddev_t *mddev) | 5577 | static void *raid5_takeover(mddev_t *mddev) |
| @@ -5570,8 +5581,6 @@ static void *raid5_takeover(mddev_t *mddev) | |||
| 5570 | * raid1 - if there are two drives. We need to know the chunk size | 5581 | * raid1 - if there are two drives. We need to know the chunk size |
| 5571 | * raid4 - trivial - just use a raid4 layout. | 5582 | * raid4 - trivial - just use a raid4 layout. |
| 5572 | * raid6 - Providing it is a *_6 layout | 5583 | * raid6 - Providing it is a *_6 layout |
| 5573 | * | ||
| 5574 | * For now, just do raid1 | ||
| 5575 | */ | 5584 | */ |
| 5576 | 5585 | ||
| 5577 | if (mddev->level == 1) | 5586 | if (mddev->level == 1) |
| @@ -5653,12 +5662,11 @@ static struct mdk_personality raid6_personality = | |||
| 5653 | .sync_request = sync_request, | 5662 | .sync_request = sync_request, |
| 5654 | .resize = raid5_resize, | 5663 | .resize = raid5_resize, |
| 5655 | .size = raid5_size, | 5664 | .size = raid5_size, |
| 5656 | .check_reshape = raid5_check_reshape, | 5665 | .check_reshape = raid6_check_reshape, |
| 5657 | .start_reshape = raid5_start_reshape, | 5666 | .start_reshape = raid5_start_reshape, |
| 5658 | .finish_reshape = raid5_finish_reshape, | 5667 | .finish_reshape = raid5_finish_reshape, |
| 5659 | .quiesce = raid5_quiesce, | 5668 | .quiesce = raid5_quiesce, |
| 5660 | .takeover = raid6_takeover, | 5669 | .takeover = raid6_takeover, |
| 5661 | .reconfig = raid6_reconfig, | ||
| 5662 | }; | 5670 | }; |
| 5663 | static struct mdk_personality raid5_personality = | 5671 | static struct mdk_personality raid5_personality = |
| 5664 | { | 5672 | { |
| @@ -5681,7 +5689,6 @@ static struct mdk_personality raid5_personality = | |||
| 5681 | .finish_reshape = raid5_finish_reshape, | 5689 | .finish_reshape = raid5_finish_reshape, |
| 5682 | .quiesce = raid5_quiesce, | 5690 | .quiesce = raid5_quiesce, |
| 5683 | .takeover = raid5_takeover, | 5691 | .takeover = raid5_takeover, |
| 5684 | .reconfig = raid5_reconfig, | ||
| 5685 | }; | 5692 | }; |
| 5686 | 5693 | ||
| 5687 | static struct mdk_personality raid4_personality = | 5694 | static struct mdk_personality raid4_personality = |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 116d0b44b2a9..2390e0e83daf 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
| @@ -337,7 +337,8 @@ struct raid5_private_data { | |||
| 337 | struct hlist_head *stripe_hashtbl; | 337 | struct hlist_head *stripe_hashtbl; |
| 338 | mddev_t *mddev; | 338 | mddev_t *mddev; |
| 339 | struct disk_info *spare; | 339 | struct disk_info *spare; |
| 340 | int chunk_size, level, algorithm; | 340 | int chunk_sectors; |
| 341 | int level, algorithm; | ||
| 341 | int max_degraded; | 342 | int max_degraded; |
| 342 | int raid_disks; | 343 | int raid_disks; |
| 343 | int max_nr_stripes; | 344 | int max_nr_stripes; |
| @@ -353,7 +354,8 @@ struct raid5_private_data { | |||
| 353 | */ | 354 | */ |
| 354 | sector_t reshape_safe; | 355 | sector_t reshape_safe; |
| 355 | int previous_raid_disks; | 356 | int previous_raid_disks; |
| 356 | int prev_chunk, prev_algo; | 357 | int prev_chunk_sectors; |
| 358 | int prev_algo; | ||
| 357 | short generation; /* increments with every reshape */ | 359 | short generation; /* increments with every reshape */ |
| 358 | unsigned long reshape_checkpoint; /* Time we last updated | 360 | unsigned long reshape_checkpoint; /* Time we last updated |
| 359 | * metadata */ | 361 | * metadata */ |
| @@ -424,8 +426,6 @@ struct raid5_private_data { | |||
| 424 | 426 | ||
| 425 | typedef struct raid5_private_data raid5_conf_t; | 427 | typedef struct raid5_private_data raid5_conf_t; |
| 426 | 428 | ||
| 427 | #define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private) | ||
| 428 | |||
| 429 | /* | 429 | /* |
| 430 | * Our supported algorithms | 430 | * Our supported algorithms |
| 431 | */ | 431 | */ |
