aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig30
-rw-r--r--drivers/md/Makefile5
-rw-r--r--drivers/md/bitmap.c4
-rw-r--r--drivers/md/dm-crypt.c19
-rw-r--r--drivers/md/dm-delay.c26
-rw-r--r--drivers/md/dm-exception-store.c4
-rw-r--r--drivers/md/dm-exception-store.h4
-rw-r--r--drivers/md/dm-io.c14
-rw-r--r--drivers/md/dm-ioctl.c28
-rw-r--r--drivers/md/dm-linear.c15
-rw-r--r--drivers/md/dm-log-userspace-base.c696
-rw-r--r--drivers/md/dm-log-userspace-transfer.c276
-rw-r--r--drivers/md/dm-log-userspace-transfer.h18
-rw-r--r--drivers/md/dm-log.c8
-rw-r--r--drivers/md/dm-mpath.c334
-rw-r--r--drivers/md/dm-path-selector.h8
-rw-r--r--drivers/md/dm-queue-length.c263
-rw-r--r--drivers/md/dm-raid1.c17
-rw-r--r--drivers/md/dm-region-hash.c2
-rw-r--r--drivers/md/dm-round-robin.c2
-rw-r--r--drivers/md/dm-service-time.c339
-rw-r--r--drivers/md/dm-snap-persistent.c4
-rw-r--r--drivers/md/dm-snap.c11
-rw-r--r--drivers/md/dm-stripe.c33
-rw-r--r--drivers/md/dm-sysfs.c9
-rw-r--r--drivers/md/dm-table.c463
-rw-r--r--drivers/md/dm.c1142
-rw-r--r--drivers/md/dm.h35
-rw-r--r--drivers/md/faulty.c21
-rw-r--r--drivers/md/linear.c220
-rw-r--r--drivers/md/linear.h12
-rw-r--r--drivers/md/md.c198
-rw-r--r--drivers/md/md.h14
-rw-r--r--drivers/md/multipath.c27
-rw-r--r--drivers/md/multipath.h6
-rw-r--r--drivers/md/raid0.c405
-rw-r--r--drivers/md/raid0.h10
-rw-r--r--drivers/md/raid1.c50
-rw-r--r--drivers/md/raid1.h6
-rw-r--r--drivers/md/raid10.c70
-rw-r--r--drivers/md/raid10.h6
-rw-r--r--drivers/md/raid5.c223
-rw-r--r--drivers/md/raid5.h8
43 files changed, 3987 insertions, 1098 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 09c0c6e49ab5..2158377a1359 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -257,6 +257,17 @@ config DM_MIRROR
257 Allow volume managers to mirror logical volumes, also 257 Allow volume managers to mirror logical volumes, also
258 needed for live data migration tools such as 'pvmove'. 258 needed for live data migration tools such as 'pvmove'.
259 259
260config DM_LOG_USERSPACE
261 tristate "Mirror userspace logging (EXPERIMENTAL)"
262 depends on DM_MIRROR && EXPERIMENTAL && NET
263 select CONNECTOR
264 ---help---
265 The userspace logging module provides a mechanism for
266 relaying the dm-dirty-log API to userspace. Log designs
267 which are more suited to userspace implementation (e.g.
268 shared storage logs) or experimental logs can be implemented
269 by leveraging this framework.
270
260config DM_ZERO 271config DM_ZERO
261 tristate "Zero target" 272 tristate "Zero target"
262 depends on BLK_DEV_DM 273 depends on BLK_DEV_DM
@@ -275,6 +286,25 @@ config DM_MULTIPATH
275 ---help--- 286 ---help---
276 Allow volume managers to support multipath hardware. 287 Allow volume managers to support multipath hardware.
277 288
289config DM_MULTIPATH_QL
290 tristate "I/O Path Selector based on the number of in-flight I/Os"
291 depends on DM_MULTIPATH
292 ---help---
293 This path selector is a dynamic load balancer which selects
294 the path with the least number of in-flight I/Os.
295
296 If unsure, say N.
297
298config DM_MULTIPATH_ST
299 tristate "I/O Path Selector based on the service time"
300 depends on DM_MULTIPATH
301 ---help---
302 This path selector is a dynamic load balancer which selects
303 the path expected to complete the incoming I/O in the shortest
304 time.
305
306 If unsure, say N.
307
278config DM_DELAY 308config DM_DELAY
279 tristate "I/O delaying target (EXPERIMENTAL)" 309 tristate "I/O delaying target (EXPERIMENTAL)"
280 depends on BLK_DEV_DM && EXPERIMENTAL 310 depends on BLK_DEV_DM && EXPERIMENTAL
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 45cc5951d928..1dc4185bd781 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -8,6 +8,8 @@ dm-multipath-y += dm-path-selector.o dm-mpath.o
8dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ 8dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \
9 dm-snap-persistent.o 9 dm-snap-persistent.o
10dm-mirror-y += dm-raid1.o 10dm-mirror-y += dm-raid1.o
11dm-log-userspace-y \
12 += dm-log-userspace-base.o dm-log-userspace-transfer.o
11md-mod-y += md.o bitmap.o 13md-mod-y += md.o bitmap.o
12raid456-y += raid5.o 14raid456-y += raid5.o
13raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ 15raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \
@@ -36,8 +38,11 @@ obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
36obj-$(CONFIG_DM_CRYPT) += dm-crypt.o 38obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
37obj-$(CONFIG_DM_DELAY) += dm-delay.o 39obj-$(CONFIG_DM_DELAY) += dm-delay.o
38obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o 40obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
41obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
42obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o
39obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o 43obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
40obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o 44obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
45obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
41obj-$(CONFIG_DM_ZERO) += dm-zero.o 46obj-$(CONFIG_DM_ZERO) += dm-zero.o
42 47
43quiet_cmd_unroll = UNROLL $@ 48quiet_cmd_unroll = UNROLL $@
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 56df1cee8fb3..3319c2fec28e 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -232,7 +232,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset,
232 target = rdev->sb_start + offset + index * (PAGE_SIZE/512); 232 target = rdev->sb_start + offset + index * (PAGE_SIZE/512);
233 233
234 if (sync_page_io(rdev->bdev, target, 234 if (sync_page_io(rdev->bdev, target,
235 roundup(size, bdev_hardsect_size(rdev->bdev)), 235 roundup(size, bdev_logical_block_size(rdev->bdev)),
236 page, READ)) { 236 page, READ)) {
237 page->index = index; 237 page->index = index;
238 attach_page_buffers(page, NULL); /* so that free_buffer will 238 attach_page_buffers(page, NULL); /* so that free_buffer will
@@ -287,7 +287,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
287 int size = PAGE_SIZE; 287 int size = PAGE_SIZE;
288 if (page->index == bitmap->file_pages-1) 288 if (page->index == bitmap->file_pages-1)
289 size = roundup(bitmap->last_page_size, 289 size = roundup(bitmap->last_page_size,
290 bdev_hardsect_size(rdev->bdev)); 290 bdev_logical_block_size(rdev->bdev));
291 /* Just make sure we aren't corrupting data or 291 /* Just make sure we aren't corrupting data or
292 * metadata 292 * metadata
293 */ 293 */
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 53394e863c74..9933eb861c71 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1132,6 +1132,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1132 goto bad_crypt_queue; 1132 goto bad_crypt_queue;
1133 } 1133 }
1134 1134
1135 ti->num_flush_requests = 1;
1135 ti->private = cc; 1136 ti->private = cc;
1136 return 0; 1137 return 0;
1137 1138
@@ -1189,6 +1190,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
1189 union map_info *map_context) 1190 union map_info *map_context)
1190{ 1191{
1191 struct dm_crypt_io *io; 1192 struct dm_crypt_io *io;
1193 struct crypt_config *cc;
1194
1195 if (unlikely(bio_empty_barrier(bio))) {
1196 cc = ti->private;
1197 bio->bi_bdev = cc->dev->bdev;
1198 return DM_MAPIO_REMAPPED;
1199 }
1192 1200
1193 io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin); 1201 io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin);
1194 1202
@@ -1305,9 +1313,17 @@ static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
1305 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 1313 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
1306} 1314}
1307 1315
1316static int crypt_iterate_devices(struct dm_target *ti,
1317 iterate_devices_callout_fn fn, void *data)
1318{
1319 struct crypt_config *cc = ti->private;
1320
1321 return fn(ti, cc->dev, cc->start, data);
1322}
1323
1308static struct target_type crypt_target = { 1324static struct target_type crypt_target = {
1309 .name = "crypt", 1325 .name = "crypt",
1310 .version= {1, 6, 0}, 1326 .version = {1, 7, 0},
1311 .module = THIS_MODULE, 1327 .module = THIS_MODULE,
1312 .ctr = crypt_ctr, 1328 .ctr = crypt_ctr,
1313 .dtr = crypt_dtr, 1329 .dtr = crypt_dtr,
@@ -1318,6 +1334,7 @@ static struct target_type crypt_target = {
1318 .resume = crypt_resume, 1334 .resume = crypt_resume,
1319 .message = crypt_message, 1335 .message = crypt_message,
1320 .merge = crypt_merge, 1336 .merge = crypt_merge,
1337 .iterate_devices = crypt_iterate_devices,
1321}; 1338};
1322 1339
1323static int __init dm_crypt_init(void) 1340static int __init dm_crypt_init(void)
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 559dbb52bc85..4e5b843cd4d7 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -197,6 +197,7 @@ out:
197 mutex_init(&dc->timer_lock); 197 mutex_init(&dc->timer_lock);
198 atomic_set(&dc->may_delay, 1); 198 atomic_set(&dc->may_delay, 1);
199 199
200 ti->num_flush_requests = 1;
200 ti->private = dc; 201 ti->private = dc;
201 return 0; 202 return 0;
202 203
@@ -278,8 +279,9 @@ static int delay_map(struct dm_target *ti, struct bio *bio,
278 279
279 if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { 280 if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) {
280 bio->bi_bdev = dc->dev_write->bdev; 281 bio->bi_bdev = dc->dev_write->bdev;
281 bio->bi_sector = dc->start_write + 282 if (bio_sectors(bio))
282 (bio->bi_sector - ti->begin); 283 bio->bi_sector = dc->start_write +
284 (bio->bi_sector - ti->begin);
283 285
284 return delay_bio(dc, dc->write_delay, bio); 286 return delay_bio(dc, dc->write_delay, bio);
285 } 287 }
@@ -316,9 +318,26 @@ static int delay_status(struct dm_target *ti, status_type_t type,
316 return 0; 318 return 0;
317} 319}
318 320
321static int delay_iterate_devices(struct dm_target *ti,
322 iterate_devices_callout_fn fn, void *data)
323{
324 struct delay_c *dc = ti->private;
325 int ret = 0;
326
327 ret = fn(ti, dc->dev_read, dc->start_read, data);
328 if (ret)
329 goto out;
330
331 if (dc->dev_write)
332 ret = fn(ti, dc->dev_write, dc->start_write, data);
333
334out:
335 return ret;
336}
337
319static struct target_type delay_target = { 338static struct target_type delay_target = {
320 .name = "delay", 339 .name = "delay",
321 .version = {1, 0, 2}, 340 .version = {1, 1, 0},
322 .module = THIS_MODULE, 341 .module = THIS_MODULE,
323 .ctr = delay_ctr, 342 .ctr = delay_ctr,
324 .dtr = delay_dtr, 343 .dtr = delay_dtr,
@@ -326,6 +345,7 @@ static struct target_type delay_target = {
326 .presuspend = delay_presuspend, 345 .presuspend = delay_presuspend,
327 .resume = delay_resume, 346 .resume = delay_resume,
328 .status = delay_status, 347 .status = delay_status,
348 .iterate_devices = delay_iterate_devices,
329}; 349};
330 350
331static int __init dm_delay_init(void) 351static int __init dm_delay_init(void)
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index a2e26c242141..c3ae51584b12 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -178,7 +178,7 @@ static int set_chunk_size(struct dm_exception_store *store,
178 } 178 }
179 179
180 /* Validate the chunk size against the device block size */ 180 /* Validate the chunk size against the device block size */
181 if (chunk_size_ulong % (bdev_hardsect_size(store->cow->bdev) >> 9)) { 181 if (chunk_size_ulong % (bdev_logical_block_size(store->cow->bdev) >> 9)) {
182 *error = "Chunk size is not a multiple of device blocksize"; 182 *error = "Chunk size is not a multiple of device blocksize";
183 return -EINVAL; 183 return -EINVAL;
184 } 184 }
@@ -216,7 +216,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
216 return -EINVAL; 216 return -EINVAL;
217 } 217 }
218 218
219 type = get_type(argv[1]); 219 type = get_type(&persistent);
220 if (!type) { 220 if (!type) {
221 ti->error = "Exception store type not recognised"; 221 ti->error = "Exception store type not recognised";
222 r = -EINVAL; 222 r = -EINVAL;
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 0a2e6e7f67b3..2442c8c07898 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -111,7 +111,7 @@ struct dm_exception_store {
111/* 111/*
112 * Funtions to manipulate consecutive chunks 112 * Funtions to manipulate consecutive chunks
113 */ 113 */
114# if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) 114# if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
115# define DM_CHUNK_CONSECUTIVE_BITS 8 115# define DM_CHUNK_CONSECUTIVE_BITS 8
116# define DM_CHUNK_NUMBER_BITS 56 116# define DM_CHUNK_NUMBER_BITS 56
117 117
@@ -156,7 +156,7 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
156 */ 156 */
157static inline sector_t get_dev_size(struct block_device *bdev) 157static inline sector_t get_dev_size(struct block_device *bdev)
158{ 158{
159 return bdev->bd_inode->i_size >> SECTOR_SHIFT; 159 return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
160} 160}
161 161
162static inline chunk_t sector_to_chunk(struct dm_exception_store *store, 162static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index e73aabd61cd7..3a2e6a2f8bdd 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -22,6 +22,7 @@ struct dm_io_client {
22/* FIXME: can we shrink this ? */ 22/* FIXME: can we shrink this ? */
23struct io { 23struct io {
24 unsigned long error_bits; 24 unsigned long error_bits;
25 unsigned long eopnotsupp_bits;
25 atomic_t count; 26 atomic_t count;
26 struct task_struct *sleeper; 27 struct task_struct *sleeper;
27 struct dm_io_client *client; 28 struct dm_io_client *client;
@@ -107,8 +108,11 @@ static inline unsigned bio_get_region(struct bio *bio)
107 *---------------------------------------------------------------*/ 108 *---------------------------------------------------------------*/
108static void dec_count(struct io *io, unsigned int region, int error) 109static void dec_count(struct io *io, unsigned int region, int error)
109{ 110{
110 if (error) 111 if (error) {
111 set_bit(region, &io->error_bits); 112 set_bit(region, &io->error_bits);
113 if (error == -EOPNOTSUPP)
114 set_bit(region, &io->eopnotsupp_bits);
115 }
112 116
113 if (atomic_dec_and_test(&io->count)) { 117 if (atomic_dec_and_test(&io->count)) {
114 if (io->sleeper) 118 if (io->sleeper)
@@ -360,7 +364,9 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
360 return -EIO; 364 return -EIO;
361 } 365 }
362 366
367retry:
363 io.error_bits = 0; 368 io.error_bits = 0;
369 io.eopnotsupp_bits = 0;
364 atomic_set(&io.count, 1); /* see dispatch_io() */ 370 atomic_set(&io.count, 1); /* see dispatch_io() */
365 io.sleeper = current; 371 io.sleeper = current;
366 io.client = client; 372 io.client = client;
@@ -377,6 +383,11 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
377 } 383 }
378 set_current_state(TASK_RUNNING); 384 set_current_state(TASK_RUNNING);
379 385
386 if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
387 rw &= ~(1 << BIO_RW_BARRIER);
388 goto retry;
389 }
390
380 if (error_bits) 391 if (error_bits)
381 *error_bits = io.error_bits; 392 *error_bits = io.error_bits;
382 393
@@ -397,6 +408,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
397 408
398 io = mempool_alloc(client->pool, GFP_NOIO); 409 io = mempool_alloc(client->pool, GFP_NOIO);
399 io->error_bits = 0; 410 io->error_bits = 0;
411 io->eopnotsupp_bits = 0;
400 atomic_set(&io->count, 1); /* see dispatch_io() */ 412 atomic_set(&io->count, 1); /* see dispatch_io() */
401 io->sleeper = NULL; 413 io->sleeper = NULL;
402 io->client = client; 414 io->client = client;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 823ceba6efa8..7f77f18fcafa 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -276,7 +276,7 @@ retry:
276 up_write(&_hash_lock); 276 up_write(&_hash_lock);
277} 277}
278 278
279static int dm_hash_rename(const char *old, const char *new) 279static int dm_hash_rename(uint32_t cookie, const char *old, const char *new)
280{ 280{
281 char *new_name, *old_name; 281 char *new_name, *old_name;
282 struct hash_cell *hc; 282 struct hash_cell *hc;
@@ -333,7 +333,7 @@ static int dm_hash_rename(const char *old, const char *new)
333 dm_table_put(table); 333 dm_table_put(table);
334 } 334 }
335 335
336 dm_kobject_uevent(hc->md); 336 dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie);
337 337
338 dm_put(hc->md); 338 dm_put(hc->md);
339 up_write(&_hash_lock); 339 up_write(&_hash_lock);
@@ -680,6 +680,9 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
680 680
681 __hash_remove(hc); 681 __hash_remove(hc);
682 up_write(&_hash_lock); 682 up_write(&_hash_lock);
683
684 dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr);
685
683 dm_put(md); 686 dm_put(md);
684 param->data_size = 0; 687 param->data_size = 0;
685 return 0; 688 return 0;
@@ -715,7 +718,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
715 return r; 718 return r;
716 719
717 param->data_size = 0; 720 param->data_size = 0;
718 return dm_hash_rename(param->name, new_name); 721 return dm_hash_rename(param->event_nr, param->name, new_name);
719} 722}
720 723
721static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) 724static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
@@ -842,8 +845,11 @@ static int do_resume(struct dm_ioctl *param)
842 if (dm_suspended(md)) 845 if (dm_suspended(md))
843 r = dm_resume(md); 846 r = dm_resume(md);
844 847
845 if (!r) 848
849 if (!r) {
850 dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr);
846 r = __dev_status(md, param); 851 r = __dev_status(md, param);
852 }
847 853
848 dm_put(md); 854 dm_put(md);
849 return r; 855 return r;
@@ -1044,6 +1050,12 @@ static int populate_table(struct dm_table *table,
1044 next = spec->next; 1050 next = spec->next;
1045 } 1051 }
1046 1052
1053 r = dm_table_set_type(table);
1054 if (r) {
1055 DMWARN("unable to set table type");
1056 return r;
1057 }
1058
1047 return dm_table_complete(table); 1059 return dm_table_complete(table);
1048} 1060}
1049 1061
@@ -1089,6 +1101,13 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
1089 goto out; 1101 goto out;
1090 } 1102 }
1091 1103
1104 r = dm_table_alloc_md_mempools(t);
1105 if (r) {
1106 DMWARN("unable to allocate mempools for this table");
1107 dm_table_destroy(t);
1108 goto out;
1109 }
1110
1092 down_write(&_hash_lock); 1111 down_write(&_hash_lock);
1093 hc = dm_get_mdptr(md); 1112 hc = dm_get_mdptr(md);
1094 if (!hc || hc->md != md) { 1113 if (!hc || hc->md != md) {
@@ -1513,6 +1532,7 @@ static const struct file_operations _ctl_fops = {
1513static struct miscdevice _dm_misc = { 1532static struct miscdevice _dm_misc = {
1514 .minor = MISC_DYNAMIC_MINOR, 1533 .minor = MISC_DYNAMIC_MINOR,
1515 .name = DM_NAME, 1534 .name = DM_NAME,
1535 .devnode = "mapper/control",
1516 .fops = &_ctl_fops 1536 .fops = &_ctl_fops
1517}; 1537};
1518 1538
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 79fb53e51c70..9184b6deb868 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -53,6 +53,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
53 goto bad; 53 goto bad;
54 } 54 }
55 55
56 ti->num_flush_requests = 1;
56 ti->private = lc; 57 ti->private = lc;
57 return 0; 58 return 0;
58 59
@@ -81,7 +82,8 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio)
81 struct linear_c *lc = ti->private; 82 struct linear_c *lc = ti->private;
82 83
83 bio->bi_bdev = lc->dev->bdev; 84 bio->bi_bdev = lc->dev->bdev;
84 bio->bi_sector = linear_map_sector(ti, bio->bi_sector); 85 if (bio_sectors(bio))
86 bio->bi_sector = linear_map_sector(ti, bio->bi_sector);
85} 87}
86 88
87static int linear_map(struct dm_target *ti, struct bio *bio, 89static int linear_map(struct dm_target *ti, struct bio *bio,
@@ -132,9 +134,17 @@ static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
132 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 134 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
133} 135}
134 136
137static int linear_iterate_devices(struct dm_target *ti,
138 iterate_devices_callout_fn fn, void *data)
139{
140 struct linear_c *lc = ti->private;
141
142 return fn(ti, lc->dev, lc->start, data);
143}
144
135static struct target_type linear_target = { 145static struct target_type linear_target = {
136 .name = "linear", 146 .name = "linear",
137 .version= {1, 0, 3}, 147 .version = {1, 1, 0},
138 .module = THIS_MODULE, 148 .module = THIS_MODULE,
139 .ctr = linear_ctr, 149 .ctr = linear_ctr,
140 .dtr = linear_dtr, 150 .dtr = linear_dtr,
@@ -142,6 +152,7 @@ static struct target_type linear_target = {
142 .status = linear_status, 152 .status = linear_status,
143 .ioctl = linear_ioctl, 153 .ioctl = linear_ioctl,
144 .merge = linear_merge, 154 .merge = linear_merge,
155 .iterate_devices = linear_iterate_devices,
145}; 156};
146 157
147int __init dm_linear_init(void) 158int __init dm_linear_init(void)
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
new file mode 100644
index 000000000000..e69b96560997
--- /dev/null
+++ b/drivers/md/dm-log-userspace-base.c
@@ -0,0 +1,696 @@
1/*
2 * Copyright (C) 2006-2009 Red Hat, Inc.
3 *
4 * This file is released under the LGPL.
5 */
6
7#include <linux/bio.h>
8#include <linux/dm-dirty-log.h>
9#include <linux/device-mapper.h>
10#include <linux/dm-log-userspace.h>
11
12#include "dm-log-userspace-transfer.h"
13
14struct flush_entry {
15 int type;
16 region_t region;
17 struct list_head list;
18};
19
20struct log_c {
21 struct dm_target *ti;
22 uint32_t region_size;
23 region_t region_count;
24 char uuid[DM_UUID_LEN];
25
26 char *usr_argv_str;
27 uint32_t usr_argc;
28
29 /*
30 * in_sync_hint gets set when doing is_remote_recovering. It
31 * represents the first region that needs recovery. IOW, the
32 * first zero bit of sync_bits. This can be useful for to limit
33 * traffic for calls like is_remote_recovering and get_resync_work,
34 * but be take care in its use for anything else.
35 */
36 uint64_t in_sync_hint;
37
38 spinlock_t flush_lock;
39 struct list_head flush_list; /* only for clear and mark requests */
40};
41
42static mempool_t *flush_entry_pool;
43
44static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data)
45{
46 return kmalloc(sizeof(struct flush_entry), gfp_mask);
47}
48
49static void flush_entry_free(void *element, void *pool_data)
50{
51 kfree(element);
52}
53
54static int userspace_do_request(struct log_c *lc, const char *uuid,
55 int request_type, char *data, size_t data_size,
56 char *rdata, size_t *rdata_size)
57{
58 int r;
59
60 /*
61 * If the server isn't there, -ESRCH is returned,
62 * and we must keep trying until the server is
63 * restored.
64 */
65retry:
66 r = dm_consult_userspace(uuid, request_type, data,
67 data_size, rdata, rdata_size);
68
69 if (r != -ESRCH)
70 return r;
71
72 DMERR(" Userspace log server not found.");
73 while (1) {
74 set_current_state(TASK_INTERRUPTIBLE);
75 schedule_timeout(2*HZ);
76 DMWARN("Attempting to contact userspace log server...");
77 r = dm_consult_userspace(uuid, DM_ULOG_CTR, lc->usr_argv_str,
78 strlen(lc->usr_argv_str) + 1,
79 NULL, NULL);
80 if (!r)
81 break;
82 }
83 DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete");
84 r = dm_consult_userspace(uuid, DM_ULOG_RESUME, NULL,
85 0, NULL, NULL);
86 if (!r)
87 goto retry;
88
89 DMERR("Error trying to resume userspace log: %d", r);
90
91 return -ESRCH;
92}
93
94static int build_constructor_string(struct dm_target *ti,
95 unsigned argc, char **argv,
96 char **ctr_str)
97{
98 int i, str_size;
99 char *str = NULL;
100
101 *ctr_str = NULL;
102
103 for (i = 0, str_size = 0; i < argc; i++)
104 str_size += strlen(argv[i]) + 1; /* +1 for space between args */
105
106 str_size += 20; /* Max number of chars in a printed u64 number */
107
108 str = kzalloc(str_size, GFP_KERNEL);
109 if (!str) {
110 DMWARN("Unable to allocate memory for constructor string");
111 return -ENOMEM;
112 }
113
114 for (i = 0, str_size = 0; i < argc; i++)
115 str_size += sprintf(str + str_size, "%s ", argv[i]);
116 str_size += sprintf(str + str_size, "%llu",
117 (unsigned long long)ti->len);
118
119 *ctr_str = str;
120 return str_size;
121}
122
123/*
124 * userspace_ctr
125 *
126 * argv contains:
127 * <UUID> <other args>
128 * Where 'other args' is the userspace implementation specific log
129 * arguments. An example might be:
130 * <UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync]
131 *
132 * So, this module will strip off the <UUID> for identification purposes
133 * when communicating with userspace about a log; but will pass on everything
134 * else.
135 */
136static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
137 unsigned argc, char **argv)
138{
139 int r = 0;
140 int str_size;
141 char *ctr_str = NULL;
142 struct log_c *lc = NULL;
143 uint64_t rdata;
144 size_t rdata_size = sizeof(rdata);
145
146 if (argc < 3) {
147 DMWARN("Too few arguments to userspace dirty log");
148 return -EINVAL;
149 }
150
151 lc = kmalloc(sizeof(*lc), GFP_KERNEL);
152 if (!lc) {
153 DMWARN("Unable to allocate userspace log context.");
154 return -ENOMEM;
155 }
156
157 lc->ti = ti;
158
159 if (strlen(argv[0]) > (DM_UUID_LEN - 1)) {
160 DMWARN("UUID argument too long.");
161 kfree(lc);
162 return -EINVAL;
163 }
164
165 strncpy(lc->uuid, argv[0], DM_UUID_LEN);
166 spin_lock_init(&lc->flush_lock);
167 INIT_LIST_HEAD(&lc->flush_list);
168
169 str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
170 if (str_size < 0) {
171 kfree(lc);
172 return str_size;
173 }
174
175 /* Send table string */
176 r = dm_consult_userspace(lc->uuid, DM_ULOG_CTR,
177 ctr_str, str_size, NULL, NULL);
178
179 if (r == -ESRCH) {
180 DMERR("Userspace log server not found");
181 goto out;
182 }
183
184 /* Since the region size does not change, get it now */
185 rdata_size = sizeof(rdata);
186 r = dm_consult_userspace(lc->uuid, DM_ULOG_GET_REGION_SIZE,
187 NULL, 0, (char *)&rdata, &rdata_size);
188
189 if (r) {
190 DMERR("Failed to get region size of dirty log");
191 goto out;
192 }
193
194 lc->region_size = (uint32_t)rdata;
195 lc->region_count = dm_sector_div_up(ti->len, lc->region_size);
196
197out:
198 if (r) {
199 kfree(lc);
200 kfree(ctr_str);
201 } else {
202 lc->usr_argv_str = ctr_str;
203 lc->usr_argc = argc;
204 log->context = lc;
205 }
206
207 return r;
208}
209
210static void userspace_dtr(struct dm_dirty_log *log)
211{
212 int r;
213 struct log_c *lc = log->context;
214
215 r = dm_consult_userspace(lc->uuid, DM_ULOG_DTR,
216 NULL, 0,
217 NULL, NULL);
218
219 kfree(lc->usr_argv_str);
220 kfree(lc);
221
222 return;
223}
224
225static int userspace_presuspend(struct dm_dirty_log *log)
226{
227 int r;
228 struct log_c *lc = log->context;
229
230 r = dm_consult_userspace(lc->uuid, DM_ULOG_PRESUSPEND,
231 NULL, 0,
232 NULL, NULL);
233
234 return r;
235}
236
237static int userspace_postsuspend(struct dm_dirty_log *log)
238{
239 int r;
240 struct log_c *lc = log->context;
241
242 r = dm_consult_userspace(lc->uuid, DM_ULOG_POSTSUSPEND,
243 NULL, 0,
244 NULL, NULL);
245
246 return r;
247}
248
249static int userspace_resume(struct dm_dirty_log *log)
250{
251 int r;
252 struct log_c *lc = log->context;
253
254 lc->in_sync_hint = 0;
255 r = dm_consult_userspace(lc->uuid, DM_ULOG_RESUME,
256 NULL, 0,
257 NULL, NULL);
258
259 return r;
260}
261
262static uint32_t userspace_get_region_size(struct dm_dirty_log *log)
263{
264 struct log_c *lc = log->context;
265
266 return lc->region_size;
267}
268
269/*
270 * userspace_is_clean
271 *
272 * Check whether a region is clean. If there is any sort of
273 * failure when consulting the server, we return not clean.
274 *
275 * Returns: 1 if clean, 0 otherwise
276 */
277static int userspace_is_clean(struct dm_dirty_log *log, region_t region)
278{
279 int r;
280 uint64_t region64 = (uint64_t)region;
281 int64_t is_clean;
282 size_t rdata_size;
283 struct log_c *lc = log->context;
284
285 rdata_size = sizeof(is_clean);
286 r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN,
287 (char *)&region64, sizeof(region64),
288 (char *)&is_clean, &rdata_size);
289
290 return (r) ? 0 : (int)is_clean;
291}
292
293/*
294 * userspace_in_sync
295 *
296 * Check if the region is in-sync. If there is any sort
297 * of failure when consulting the server, we assume that
298 * the region is not in sync.
299 *
300 * If 'can_block' is set, return immediately
301 *
302 * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK
303 */
304static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
305 int can_block)
306{
307 int r;
308 uint64_t region64 = region;
309 int64_t in_sync;
310 size_t rdata_size;
311 struct log_c *lc = log->context;
312
313 /*
314 * We can never respond directly - even if in_sync_hint is
315 * set. This is because another machine could see a device
316 * failure and mark the region out-of-sync. If we don't go
317 * to userspace to ask, we might think the region is in-sync
318 * and allow a read to pick up data that is stale. (This is
319 * very unlikely if a device actually fails; but it is very
320 * likely if a connection to one device from one machine fails.)
321 *
322 * There still might be a problem if the mirror caches the region
323 * state as in-sync... but then this call would not be made. So,
324 * that is a mirror problem.
325 */
326 if (!can_block)
327 return -EWOULDBLOCK;
328
329 rdata_size = sizeof(in_sync);
330 r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC,
331 (char *)&region64, sizeof(region64),
332 (char *)&in_sync, &rdata_size);
333 return (r) ? 0 : (int)in_sync;
334}
335
336/*
337 * userspace_flush
338 *
339 * This function is ok to block.
340 * The flush happens in two stages. First, it sends all
341 * clear/mark requests that are on the list. Then it
342 * tells the server to commit them. This gives the
343 * server a chance to optimise the commit, instead of
344 * doing it for every request.
345 *
346 * Additionally, we could implement another thread that
347 * sends the requests up to the server - reducing the
348 * load on flush. Then the flush would have less in
349 * the list and be responsible for the finishing commit.
350 *
351 * Returns: 0 on success, < 0 on failure
352 */
353static int userspace_flush(struct dm_dirty_log *log)
354{
355 int r = 0;
356 unsigned long flags;
357 struct log_c *lc = log->context;
358 LIST_HEAD(flush_list);
359 struct flush_entry *fe, *tmp_fe;
360
361 spin_lock_irqsave(&lc->flush_lock, flags);
362 list_splice_init(&lc->flush_list, &flush_list);
363 spin_unlock_irqrestore(&lc->flush_lock, flags);
364
365 if (list_empty(&flush_list))
366 return 0;
367
368 /*
369 * FIXME: Count up requests, group request types,
370 * allocate memory to stick all requests in and
371 * send to server in one go. Failing the allocation,
372 * do it one by one.
373 */
374
375 list_for_each_entry(fe, &flush_list, list) {
376 r = userspace_do_request(lc, lc->uuid, fe->type,
377 (char *)&fe->region,
378 sizeof(fe->region),
379 NULL, NULL);
380 if (r)
381 goto fail;
382 }
383
384 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
385 NULL, 0, NULL, NULL);
386
387fail:
388 /*
389 * We can safely remove these entries, even if failure.
390 * Calling code will receive an error and will know that
391 * the log facility has failed.
392 */
393 list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) {
394 list_del(&fe->list);
395 mempool_free(fe, flush_entry_pool);
396 }
397
398 if (r)
399 dm_table_event(lc->ti->table);
400
401 return r;
402}
403
404/*
405 * userspace_mark_region
406 *
407 * This function should avoid blocking unless absolutely required.
408 * (Memory allocation is valid for blocking.)
409 */
410static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
411{
412 unsigned long flags;
413 struct log_c *lc = log->context;
414 struct flush_entry *fe;
415
416 /* Wait for an allocation, but _never_ fail */
417 fe = mempool_alloc(flush_entry_pool, GFP_NOIO);
418 BUG_ON(!fe);
419
420 spin_lock_irqsave(&lc->flush_lock, flags);
421 fe->type = DM_ULOG_MARK_REGION;
422 fe->region = region;
423 list_add(&fe->list, &lc->flush_list);
424 spin_unlock_irqrestore(&lc->flush_lock, flags);
425
426 return;
427}
428
429/*
430 * userspace_clear_region
431 *
432 * This function must not block.
433 * So, the alloc can't block. In the worst case, it is ok to
434 * fail. It would simply mean we can't clear the region.
435 * Does nothing to current sync context, but does mean
436 * the region will be re-sync'ed on a reload of the mirror
437 * even though it is in-sync.
438 */
439static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
440{
441 unsigned long flags;
442 struct log_c *lc = log->context;
443 struct flush_entry *fe;
444
445 /*
446 * If we fail to allocate, we skip the clearing of
447 * the region. This doesn't hurt us in any way, except
448 * to cause the region to be resync'ed when the
449 * device is activated next time.
450 */
451 fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC);
452 if (!fe) {
453 DMERR("Failed to allocate memory to clear region.");
454 return;
455 }
456
457 spin_lock_irqsave(&lc->flush_lock, flags);
458 fe->type = DM_ULOG_CLEAR_REGION;
459 fe->region = region;
460 list_add(&fe->list, &lc->flush_list);
461 spin_unlock_irqrestore(&lc->flush_lock, flags);
462
463 return;
464}
465
466/*
467 * userspace_get_resync_work
468 *
469 * Get a region that needs recovery. It is valid to return
470 * an error for this function.
471 *
472 * Returns: 1 if region filled, 0 if no work, <0 on error
473 */
474static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
475{
476 int r;
477 size_t rdata_size;
478 struct log_c *lc = log->context;
479 struct {
480 int64_t i; /* 64-bit for mix arch compatibility */
481 region_t r;
482 } pkg;
483
484 if (lc->in_sync_hint >= lc->region_count)
485 return 0;
486
487 rdata_size = sizeof(pkg);
488 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK,
489 NULL, 0,
490 (char *)&pkg, &rdata_size);
491
492 *region = pkg.r;
493 return (r) ? r : (int)pkg.i;
494}
495
496/*
497 * userspace_set_region_sync
498 *
499 * Set the sync status of a given region. This function
500 * must not fail.
501 */
502static void userspace_set_region_sync(struct dm_dirty_log *log,
503 region_t region, int in_sync)
504{
505 int r;
506 struct log_c *lc = log->context;
507 struct {
508 region_t r;
509 int64_t i;
510 } pkg;
511
512 pkg.r = region;
513 pkg.i = (int64_t)in_sync;
514
515 r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
516 (char *)&pkg, sizeof(pkg),
517 NULL, NULL);
518
519 /*
520 * It would be nice to be able to report failures.
521 * However, it is easy emough to detect and resolve.
522 */
523 return;
524}
525
526/*
527 * userspace_get_sync_count
528 *
529 * If there is any sort of failure when consulting the server,
530 * we assume that the sync count is zero.
531 *
532 * Returns: sync count on success, 0 on failure
533 */
534static region_t userspace_get_sync_count(struct dm_dirty_log *log)
535{
536 int r;
537 size_t rdata_size;
538 uint64_t sync_count;
539 struct log_c *lc = log->context;
540
541 rdata_size = sizeof(sync_count);
542 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT,
543 NULL, 0,
544 (char *)&sync_count, &rdata_size);
545
546 if (r)
547 return 0;
548
549 if (sync_count >= lc->region_count)
550 lc->in_sync_hint = lc->region_count;
551
552 return (region_t)sync_count;
553}
554
555/*
556 * userspace_status
557 *
558 * Returns: amount of space consumed
559 */
560static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
561 char *result, unsigned maxlen)
562{
563 int r = 0;
564 size_t sz = (size_t)maxlen;
565 struct log_c *lc = log->context;
566
567 switch (status_type) {
568 case STATUSTYPE_INFO:
569 r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO,
570 NULL, 0,
571 result, &sz);
572
573 if (r) {
574 sz = 0;
575 DMEMIT("%s 1 COM_FAILURE", log->type->name);
576 }
577 break;
578 case STATUSTYPE_TABLE:
579 sz = 0;
580 DMEMIT("%s %u %s %s", log->type->name, lc->usr_argc + 1,
581 lc->uuid, lc->usr_argv_str);
582 break;
583 }
584 return (r) ? 0 : (int)sz;
585}
586
587/*
588 * userspace_is_remote_recovering
589 *
590 * Returns: 1 if region recovering, 0 otherwise
591 */
592static int userspace_is_remote_recovering(struct dm_dirty_log *log,
593 region_t region)
594{
595 int r;
596 uint64_t region64 = region;
597 struct log_c *lc = log->context;
598 static unsigned long long limit;
599 struct {
600 int64_t is_recovering;
601 uint64_t in_sync_hint;
602 } pkg;
603 size_t rdata_size = sizeof(pkg);
604
605 /*
606 * Once the mirror has been reported to be in-sync,
607 * it will never again ask for recovery work. So,
608 * we can safely say there is not a remote machine
609 * recovering if the device is in-sync. (in_sync_hint
610 * must be reset at resume time.)
611 */
612 if (region < lc->in_sync_hint)
613 return 0;
614 else if (jiffies < limit)
615 return 1;
616
617 limit = jiffies + (HZ / 4);
618 r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING,
619 (char *)&region64, sizeof(region64),
620 (char *)&pkg, &rdata_size);
621 if (r)
622 return 1;
623
624 lc->in_sync_hint = pkg.in_sync_hint;
625
626 return (int)pkg.is_recovering;
627}
628
629static struct dm_dirty_log_type _userspace_type = {
630 .name = "userspace",
631 .module = THIS_MODULE,
632 .ctr = userspace_ctr,
633 .dtr = userspace_dtr,
634 .presuspend = userspace_presuspend,
635 .postsuspend = userspace_postsuspend,
636 .resume = userspace_resume,
637 .get_region_size = userspace_get_region_size,
638 .is_clean = userspace_is_clean,
639 .in_sync = userspace_in_sync,
640 .flush = userspace_flush,
641 .mark_region = userspace_mark_region,
642 .clear_region = userspace_clear_region,
643 .get_resync_work = userspace_get_resync_work,
644 .set_region_sync = userspace_set_region_sync,
645 .get_sync_count = userspace_get_sync_count,
646 .status = userspace_status,
647 .is_remote_recovering = userspace_is_remote_recovering,
648};
649
650static int __init userspace_dirty_log_init(void)
651{
652 int r = 0;
653
654 flush_entry_pool = mempool_create(100, flush_entry_alloc,
655 flush_entry_free, NULL);
656
657 if (!flush_entry_pool) {
658 DMWARN("Unable to create flush_entry_pool: No memory.");
659 return -ENOMEM;
660 }
661
662 r = dm_ulog_tfr_init();
663 if (r) {
664 DMWARN("Unable to initialize userspace log communications");
665 mempool_destroy(flush_entry_pool);
666 return r;
667 }
668
669 r = dm_dirty_log_type_register(&_userspace_type);
670 if (r) {
671 DMWARN("Couldn't register userspace dirty log type");
672 dm_ulog_tfr_exit();
673 mempool_destroy(flush_entry_pool);
674 return r;
675 }
676
677 DMINFO("version 1.0.0 loaded");
678 return 0;
679}
680
681static void __exit userspace_dirty_log_exit(void)
682{
683 dm_dirty_log_type_unregister(&_userspace_type);
684 dm_ulog_tfr_exit();
685 mempool_destroy(flush_entry_pool);
686
687 DMINFO("version 1.0.0 unloaded");
688 return;
689}
690
691module_init(userspace_dirty_log_init);
692module_exit(userspace_dirty_log_exit);
693
694MODULE_DESCRIPTION(DM_NAME " userspace dirty log link");
695MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>");
696MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
new file mode 100644
index 000000000000..0ca1ee768a1f
--- /dev/null
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -0,0 +1,276 @@
1/*
2 * Copyright (C) 2006-2009 Red Hat, Inc.
3 *
4 * This file is released under the LGPL.
5 */
6
7#include <linux/kernel.h>
8#include <linux/module.h>
9#include <net/sock.h>
10#include <linux/workqueue.h>
11#include <linux/connector.h>
12#include <linux/device-mapper.h>
13#include <linux/dm-log-userspace.h>
14
15#include "dm-log-userspace-transfer.h"
16
17static uint32_t dm_ulog_seq;
18
19/*
20 * Netlink/Connector is an unreliable protocol. How long should
21 * we wait for a response before assuming it was lost and retrying?
22 * (If we do receive a response after this time, it will be discarded
23 * and the response to the resent request will be waited for.
24 */
25#define DM_ULOG_RETRY_TIMEOUT (15 * HZ)
26
27/*
28 * Pre-allocated space for speed
29 */
30#define DM_ULOG_PREALLOCED_SIZE 512
31static struct cn_msg *prealloced_cn_msg;
32static struct dm_ulog_request *prealloced_ulog_tfr;
33
34static struct cb_id ulog_cn_id = {
35 .idx = CN_IDX_DM,
36 .val = CN_VAL_DM_USERSPACE_LOG
37};
38
39static DEFINE_MUTEX(dm_ulog_lock);
40
41struct receiving_pkg {
42 struct list_head list;
43 struct completion complete;
44
45 uint32_t seq;
46
47 int error;
48 size_t *data_size;
49 char *data;
50};
51
52static DEFINE_SPINLOCK(receiving_list_lock);
53static struct list_head receiving_list;
54
55static int dm_ulog_sendto_server(struct dm_ulog_request *tfr)
56{
57 int r;
58 struct cn_msg *msg = prealloced_cn_msg;
59
60 memset(msg, 0, sizeof(struct cn_msg));
61
62 msg->id.idx = ulog_cn_id.idx;
63 msg->id.val = ulog_cn_id.val;
64 msg->ack = 0;
65 msg->seq = tfr->seq;
66 msg->len = sizeof(struct dm_ulog_request) + tfr->data_size;
67
68 r = cn_netlink_send(msg, 0, gfp_any());
69
70 return r;
71}
72
73/*
74 * Parameters for this function can be either msg or tfr, but not
75 * both. This function fills in the reply for a waiting request.
76 * If just msg is given, then the reply is simply an ACK from userspace
77 * that the request was received.
78 *
79 * Returns: 0 on success, -ENOENT on failure
80 */
81static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr)
82{
83 uint32_t rtn_seq = (msg) ? msg->seq : (tfr) ? tfr->seq : 0;
84 struct receiving_pkg *pkg;
85
86 /*
87 * The 'receiving_pkg' entries in this list are statically
88 * allocated on the stack in 'dm_consult_userspace'.
89 * Each process that is waiting for a reply from the user
90 * space server will have an entry in this list.
91 *
92 * We are safe to do it this way because the stack space
93 * is unique to each process, but still addressable by
94 * other processes.
95 */
96 list_for_each_entry(pkg, &receiving_list, list) {
97 if (rtn_seq != pkg->seq)
98 continue;
99
100 if (msg) {
101 pkg->error = -msg->ack;
102 /*
103 * If we are trying again, we will need to know our
104 * storage capacity. Otherwise, along with the
105 * error code, we make explicit that we have no data.
106 */
107 if (pkg->error != -EAGAIN)
108 *(pkg->data_size) = 0;
109 } else if (tfr->data_size > *(pkg->data_size)) {
110 DMERR("Insufficient space to receive package [%u] "
111 "(%u vs %lu)", tfr->request_type,
112 tfr->data_size, *(pkg->data_size));
113
114 *(pkg->data_size) = 0;
115 pkg->error = -ENOSPC;
116 } else {
117 pkg->error = tfr->error;
118 memcpy(pkg->data, tfr->data, tfr->data_size);
119 *(pkg->data_size) = tfr->data_size;
120 }
121 complete(&pkg->complete);
122 return 0;
123 }
124
125 return -ENOENT;
126}
127
128/*
129 * This is the connector callback that delivers data
130 * that was sent from userspace.
131 */
132static void cn_ulog_callback(void *data)
133{
134 struct cn_msg *msg = (struct cn_msg *)data;
135 struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1);
136
137 spin_lock(&receiving_list_lock);
138 if (msg->len == 0)
139 fill_pkg(msg, NULL);
140 else if (msg->len < sizeof(*tfr))
141 DMERR("Incomplete message received (expected %u, got %u): [%u]",
142 (unsigned)sizeof(*tfr), msg->len, msg->seq);
143 else
144 fill_pkg(NULL, tfr);
145 spin_unlock(&receiving_list_lock);
146}
147
148/**
149 * dm_consult_userspace
150 * @uuid: log's uuid (must be DM_UUID_LEN in size)
151 * @request_type: found in include/linux/dm-log-userspace.h
152 * @data: data to tx to the server
153 * @data_size: size of data in bytes
154 * @rdata: place to put return data from server
155 * @rdata_size: value-result (amount of space given/amount of space used)
156 *
157 * rdata_size is undefined on failure.
158 *
159 * Memory used to communicate with userspace is zero'ed
160 * before populating to ensure that no unwanted bits leak
161 * from kernel space to user-space. All userspace log communications
162 * between kernel and user space go through this function.
163 *
164 * Returns: 0 on success, -EXXX on failure
165 **/
166int dm_consult_userspace(const char *uuid, int request_type,
167 char *data, size_t data_size,
168 char *rdata, size_t *rdata_size)
169{
170 int r = 0;
171 size_t dummy = 0;
172 int overhead_size =
173 sizeof(struct dm_ulog_request *) + sizeof(struct cn_msg);
174 struct dm_ulog_request *tfr = prealloced_ulog_tfr;
175 struct receiving_pkg pkg;
176
177 if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) {
178 DMINFO("Size of tfr exceeds preallocated size");
179 return -EINVAL;
180 }
181
182 if (!rdata_size)
183 rdata_size = &dummy;
184resend:
185 /*
186 * We serialize the sending of requests so we can
187 * use the preallocated space.
188 */
189 mutex_lock(&dm_ulog_lock);
190
191 memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - overhead_size);
192 memcpy(tfr->uuid, uuid, DM_UUID_LEN);
193 tfr->seq = dm_ulog_seq++;
194
195 /*
196 * Must be valid request type (all other bits set to
197 * zero). This reserves other bits for possible future
198 * use.
199 */
200 tfr->request_type = request_type & DM_ULOG_REQUEST_MASK;
201
202 tfr->data_size = data_size;
203 if (data && data_size)
204 memcpy(tfr->data, data, data_size);
205
206 memset(&pkg, 0, sizeof(pkg));
207 init_completion(&pkg.complete);
208 pkg.seq = tfr->seq;
209 pkg.data_size = rdata_size;
210 pkg.data = rdata;
211 spin_lock(&receiving_list_lock);
212 list_add(&(pkg.list), &receiving_list);
213 spin_unlock(&receiving_list_lock);
214
215 r = dm_ulog_sendto_server(tfr);
216
217 mutex_unlock(&dm_ulog_lock);
218
219 if (r) {
220 DMERR("Unable to send log request [%u] to userspace: %d",
221 request_type, r);
222 spin_lock(&receiving_list_lock);
223 list_del_init(&(pkg.list));
224 spin_unlock(&receiving_list_lock);
225
226 goto out;
227 }
228
229 r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT);
230 spin_lock(&receiving_list_lock);
231 list_del_init(&(pkg.list));
232 spin_unlock(&receiving_list_lock);
233 if (!r) {
234 DMWARN("[%s] Request timed out: [%u/%u] - retrying",
235 (strlen(uuid) > 8) ?
236 (uuid + (strlen(uuid) - 8)) : (uuid),
237 request_type, pkg.seq);
238 goto resend;
239 }
240
241 r = pkg.error;
242 if (r == -EAGAIN)
243 goto resend;
244
245out:
246 return r;
247}
248
249int dm_ulog_tfr_init(void)
250{
251 int r;
252 void *prealloced;
253
254 INIT_LIST_HEAD(&receiving_list);
255
256 prealloced = kmalloc(DM_ULOG_PREALLOCED_SIZE, GFP_KERNEL);
257 if (!prealloced)
258 return -ENOMEM;
259
260 prealloced_cn_msg = prealloced;
261 prealloced_ulog_tfr = prealloced + sizeof(struct cn_msg);
262
263 r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback);
264 if (r) {
265 cn_del_callback(&ulog_cn_id);
266 return r;
267 }
268
269 return 0;
270}
271
272void dm_ulog_tfr_exit(void)
273{
274 cn_del_callback(&ulog_cn_id);
275 kfree(prealloced_cn_msg);
276}
diff --git a/drivers/md/dm-log-userspace-transfer.h b/drivers/md/dm-log-userspace-transfer.h
new file mode 100644
index 000000000000..c26d8e4e2710
--- /dev/null
+++ b/drivers/md/dm-log-userspace-transfer.h
@@ -0,0 +1,18 @@
1/*
2 * Copyright (C) 2006-2009 Red Hat, Inc.
3 *
4 * This file is released under the LGPL.
5 */
6
7#ifndef __DM_LOG_USERSPACE_TRANSFER_H__
8#define __DM_LOG_USERSPACE_TRANSFER_H__
9
10#define DM_MSG_PREFIX "dm-log-userspace"
11
12int dm_ulog_tfr_init(void);
13void dm_ulog_tfr_exit(void);
14int dm_consult_userspace(const char *uuid, int request_type,
15 char *data, size_t data_size,
16 char *rdata, size_t *rdata_size);
17
18#endif /* __DM_LOG_USERSPACE_TRANSFER_H__ */
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index be233bc4d917..9443896ede07 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -412,10 +412,12 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
412 /* 412 /*
413 * Buffer holds both header and bitset. 413 * Buffer holds both header and bitset.
414 */ 414 */
415 buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + 415 buf_size =
416 bitset_size, ti->limits.hardsect_size); 416 dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + bitset_size,
417 bdev_logical_block_size(lc->header_location.
418 bdev));
417 419
418 if (buf_size > dev->bdev->bd_inode->i_size) { 420 if (buf_size > i_size_read(dev->bdev->bd_inode)) {
419 DMWARN("log device %s too small: need %llu bytes", 421 DMWARN("log device %s too small: need %llu bytes",
420 dev->name, (unsigned long long)buf_size); 422 dev->name, (unsigned long long)buf_size);
421 kfree(lc); 423 kfree(lc);
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 6a386ab4f7eb..c70604a20897 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -8,7 +8,6 @@
8#include <linux/device-mapper.h> 8#include <linux/device-mapper.h>
9 9
10#include "dm-path-selector.h" 10#include "dm-path-selector.h"
11#include "dm-bio-record.h"
12#include "dm-uevent.h" 11#include "dm-uevent.h"
13 12
14#include <linux/ctype.h> 13#include <linux/ctype.h>
@@ -35,6 +34,7 @@ struct pgpath {
35 34
36 struct dm_path path; 35 struct dm_path path;
37 struct work_struct deactivate_path; 36 struct work_struct deactivate_path;
37 struct work_struct activate_path;
38}; 38};
39 39
40#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) 40#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
@@ -64,8 +64,6 @@ struct multipath {
64 spinlock_t lock; 64 spinlock_t lock;
65 65
66 const char *hw_handler_name; 66 const char *hw_handler_name;
67 struct work_struct activate_path;
68 struct pgpath *pgpath_to_activate;
69 unsigned nr_priority_groups; 67 unsigned nr_priority_groups;
70 struct list_head priority_groups; 68 struct list_head priority_groups;
71 unsigned pg_init_required; /* pg_init needs calling? */ 69 unsigned pg_init_required; /* pg_init needs calling? */
@@ -84,7 +82,7 @@ struct multipath {
84 unsigned pg_init_count; /* Number of times pg_init called */ 82 unsigned pg_init_count; /* Number of times pg_init called */
85 83
86 struct work_struct process_queued_ios; 84 struct work_struct process_queued_ios;
87 struct bio_list queued_ios; 85 struct list_head queued_ios;
88 unsigned queue_size; 86 unsigned queue_size;
89 87
90 struct work_struct trigger_event; 88 struct work_struct trigger_event;
@@ -101,7 +99,7 @@ struct multipath {
101 */ 99 */
102struct dm_mpath_io { 100struct dm_mpath_io {
103 struct pgpath *pgpath; 101 struct pgpath *pgpath;
104 struct dm_bio_details details; 102 size_t nr_bytes;
105}; 103};
106 104
107typedef int (*action_fn) (struct pgpath *pgpath); 105typedef int (*action_fn) (struct pgpath *pgpath);
@@ -128,6 +126,7 @@ static struct pgpath *alloc_pgpath(void)
128 if (pgpath) { 126 if (pgpath) {
129 pgpath->is_active = 1; 127 pgpath->is_active = 1;
130 INIT_WORK(&pgpath->deactivate_path, deactivate_path); 128 INIT_WORK(&pgpath->deactivate_path, deactivate_path);
129 INIT_WORK(&pgpath->activate_path, activate_path);
131 } 130 }
132 131
133 return pgpath; 132 return pgpath;
@@ -160,7 +159,6 @@ static struct priority_group *alloc_priority_group(void)
160 159
161static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) 160static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
162{ 161{
163 unsigned long flags;
164 struct pgpath *pgpath, *tmp; 162 struct pgpath *pgpath, *tmp;
165 struct multipath *m = ti->private; 163 struct multipath *m = ti->private;
166 164
@@ -169,10 +167,6 @@ static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
169 if (m->hw_handler_name) 167 if (m->hw_handler_name)
170 scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev)); 168 scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev));
171 dm_put_device(ti, pgpath->path.dev); 169 dm_put_device(ti, pgpath->path.dev);
172 spin_lock_irqsave(&m->lock, flags);
173 if (m->pgpath_to_activate == pgpath)
174 m->pgpath_to_activate = NULL;
175 spin_unlock_irqrestore(&m->lock, flags);
176 free_pgpath(pgpath); 170 free_pgpath(pgpath);
177 } 171 }
178} 172}
@@ -198,11 +192,11 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
198 m = kzalloc(sizeof(*m), GFP_KERNEL); 192 m = kzalloc(sizeof(*m), GFP_KERNEL);
199 if (m) { 193 if (m) {
200 INIT_LIST_HEAD(&m->priority_groups); 194 INIT_LIST_HEAD(&m->priority_groups);
195 INIT_LIST_HEAD(&m->queued_ios);
201 spin_lock_init(&m->lock); 196 spin_lock_init(&m->lock);
202 m->queue_io = 1; 197 m->queue_io = 1;
203 INIT_WORK(&m->process_queued_ios, process_queued_ios); 198 INIT_WORK(&m->process_queued_ios, process_queued_ios);
204 INIT_WORK(&m->trigger_event, trigger_event); 199 INIT_WORK(&m->trigger_event, trigger_event);
205 INIT_WORK(&m->activate_path, activate_path);
206 m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); 200 m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
207 if (!m->mpio_pool) { 201 if (!m->mpio_pool) {
208 kfree(m); 202 kfree(m);
@@ -250,11 +244,12 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
250 m->pg_init_count = 0; 244 m->pg_init_count = 0;
251} 245}
252 246
253static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) 247static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
248 size_t nr_bytes)
254{ 249{
255 struct dm_path *path; 250 struct dm_path *path;
256 251
257 path = pg->ps.type->select_path(&pg->ps, &m->repeat_count); 252 path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes);
258 if (!path) 253 if (!path)
259 return -ENXIO; 254 return -ENXIO;
260 255
@@ -266,7 +261,7 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg)
266 return 0; 261 return 0;
267} 262}
268 263
269static void __choose_pgpath(struct multipath *m) 264static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
270{ 265{
271 struct priority_group *pg; 266 struct priority_group *pg;
272 unsigned bypassed = 1; 267 unsigned bypassed = 1;
@@ -278,12 +273,12 @@ static void __choose_pgpath(struct multipath *m)
278 if (m->next_pg) { 273 if (m->next_pg) {
279 pg = m->next_pg; 274 pg = m->next_pg;
280 m->next_pg = NULL; 275 m->next_pg = NULL;
281 if (!__choose_path_in_pg(m, pg)) 276 if (!__choose_path_in_pg(m, pg, nr_bytes))
282 return; 277 return;
283 } 278 }
284 279
285 /* Don't change PG until it has no remaining paths */ 280 /* Don't change PG until it has no remaining paths */
286 if (m->current_pg && !__choose_path_in_pg(m, m->current_pg)) 281 if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes))
287 return; 282 return;
288 283
289 /* 284 /*
@@ -295,7 +290,7 @@ static void __choose_pgpath(struct multipath *m)
295 list_for_each_entry(pg, &m->priority_groups, list) { 290 list_for_each_entry(pg, &m->priority_groups, list) {
296 if (pg->bypassed == bypassed) 291 if (pg->bypassed == bypassed)
297 continue; 292 continue;
298 if (!__choose_path_in_pg(m, pg)) 293 if (!__choose_path_in_pg(m, pg, nr_bytes))
299 return; 294 return;
300 } 295 }
301 } while (bypassed--); 296 } while (bypassed--);
@@ -322,19 +317,21 @@ static int __must_push_back(struct multipath *m)
322 dm_noflush_suspending(m->ti)); 317 dm_noflush_suspending(m->ti));
323} 318}
324 319
325static int map_io(struct multipath *m, struct bio *bio, 320static int map_io(struct multipath *m, struct request *clone,
326 struct dm_mpath_io *mpio, unsigned was_queued) 321 struct dm_mpath_io *mpio, unsigned was_queued)
327{ 322{
328 int r = DM_MAPIO_REMAPPED; 323 int r = DM_MAPIO_REMAPPED;
324 size_t nr_bytes = blk_rq_bytes(clone);
329 unsigned long flags; 325 unsigned long flags;
330 struct pgpath *pgpath; 326 struct pgpath *pgpath;
327 struct block_device *bdev;
331 328
332 spin_lock_irqsave(&m->lock, flags); 329 spin_lock_irqsave(&m->lock, flags);
333 330
334 /* Do we need to select a new pgpath? */ 331 /* Do we need to select a new pgpath? */
335 if (!m->current_pgpath || 332 if (!m->current_pgpath ||
336 (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) 333 (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
337 __choose_pgpath(m); 334 __choose_pgpath(m, nr_bytes);
338 335
339 pgpath = m->current_pgpath; 336 pgpath = m->current_pgpath;
340 337
@@ -344,21 +341,28 @@ static int map_io(struct multipath *m, struct bio *bio,
344 if ((pgpath && m->queue_io) || 341 if ((pgpath && m->queue_io) ||
345 (!pgpath && m->queue_if_no_path)) { 342 (!pgpath && m->queue_if_no_path)) {
346 /* Queue for the daemon to resubmit */ 343 /* Queue for the daemon to resubmit */
347 bio_list_add(&m->queued_ios, bio); 344 list_add_tail(&clone->queuelist, &m->queued_ios);
348 m->queue_size++; 345 m->queue_size++;
349 if ((m->pg_init_required && !m->pg_init_in_progress) || 346 if ((m->pg_init_required && !m->pg_init_in_progress) ||
350 !m->queue_io) 347 !m->queue_io)
351 queue_work(kmultipathd, &m->process_queued_ios); 348 queue_work(kmultipathd, &m->process_queued_ios);
352 pgpath = NULL; 349 pgpath = NULL;
353 r = DM_MAPIO_SUBMITTED; 350 r = DM_MAPIO_SUBMITTED;
354 } else if (pgpath) 351 } else if (pgpath) {
355 bio->bi_bdev = pgpath->path.dev->bdev; 352 bdev = pgpath->path.dev->bdev;
356 else if (__must_push_back(m)) 353 clone->q = bdev_get_queue(bdev);
354 clone->rq_disk = bdev->bd_disk;
355 } else if (__must_push_back(m))
357 r = DM_MAPIO_REQUEUE; 356 r = DM_MAPIO_REQUEUE;
358 else 357 else
359 r = -EIO; /* Failed */ 358 r = -EIO; /* Failed */
360 359
361 mpio->pgpath = pgpath; 360 mpio->pgpath = pgpath;
361 mpio->nr_bytes = nr_bytes;
362
363 if (r == DM_MAPIO_REMAPPED && pgpath->pg->ps.type->start_io)
364 pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path,
365 nr_bytes);
362 366
363 spin_unlock_irqrestore(&m->lock, flags); 367 spin_unlock_irqrestore(&m->lock, flags);
364 368
@@ -396,30 +400,31 @@ static void dispatch_queued_ios(struct multipath *m)
396{ 400{
397 int r; 401 int r;
398 unsigned long flags; 402 unsigned long flags;
399 struct bio *bio = NULL, *next;
400 struct dm_mpath_io *mpio; 403 struct dm_mpath_io *mpio;
401 union map_info *info; 404 union map_info *info;
405 struct request *clone, *n;
406 LIST_HEAD(cl);
402 407
403 spin_lock_irqsave(&m->lock, flags); 408 spin_lock_irqsave(&m->lock, flags);
404 bio = bio_list_get(&m->queued_ios); 409 list_splice_init(&m->queued_ios, &cl);
405 spin_unlock_irqrestore(&m->lock, flags); 410 spin_unlock_irqrestore(&m->lock, flags);
406 411
407 while (bio) { 412 list_for_each_entry_safe(clone, n, &cl, queuelist) {
408 next = bio->bi_next; 413 list_del_init(&clone->queuelist);
409 bio->bi_next = NULL;
410 414
411 info = dm_get_mapinfo(bio); 415 info = dm_get_rq_mapinfo(clone);
412 mpio = info->ptr; 416 mpio = info->ptr;
413 417
414 r = map_io(m, bio, mpio, 1); 418 r = map_io(m, clone, mpio, 1);
415 if (r < 0) 419 if (r < 0) {
416 bio_endio(bio, r); 420 mempool_free(mpio, m->mpio_pool);
417 else if (r == DM_MAPIO_REMAPPED) 421 dm_kill_unmapped_request(clone, r);
418 generic_make_request(bio); 422 } else if (r == DM_MAPIO_REMAPPED)
419 else if (r == DM_MAPIO_REQUEUE) 423 dm_dispatch_request(clone);
420 bio_endio(bio, -EIO); 424 else if (r == DM_MAPIO_REQUEUE) {
421 425 mempool_free(mpio, m->mpio_pool);
422 bio = next; 426 dm_requeue_unmapped_request(clone);
427 }
423 } 428 }
424} 429}
425 430
@@ -427,8 +432,8 @@ static void process_queued_ios(struct work_struct *work)
427{ 432{
428 struct multipath *m = 433 struct multipath *m =
429 container_of(work, struct multipath, process_queued_ios); 434 container_of(work, struct multipath, process_queued_ios);
430 struct pgpath *pgpath = NULL; 435 struct pgpath *pgpath = NULL, *tmp;
431 unsigned init_required = 0, must_queue = 1; 436 unsigned must_queue = 1;
432 unsigned long flags; 437 unsigned long flags;
433 438
434 spin_lock_irqsave(&m->lock, flags); 439 spin_lock_irqsave(&m->lock, flags);
@@ -437,7 +442,7 @@ static void process_queued_ios(struct work_struct *work)
437 goto out; 442 goto out;
438 443
439 if (!m->current_pgpath) 444 if (!m->current_pgpath)
440 __choose_pgpath(m); 445 __choose_pgpath(m, 0);
441 446
442 pgpath = m->current_pgpath; 447 pgpath = m->current_pgpath;
443 448
@@ -446,19 +451,15 @@ static void process_queued_ios(struct work_struct *work)
446 must_queue = 0; 451 must_queue = 0;
447 452
448 if (m->pg_init_required && !m->pg_init_in_progress && pgpath) { 453 if (m->pg_init_required && !m->pg_init_in_progress && pgpath) {
449 m->pgpath_to_activate = pgpath;
450 m->pg_init_count++; 454 m->pg_init_count++;
451 m->pg_init_required = 0; 455 m->pg_init_required = 0;
452 m->pg_init_in_progress = 1; 456 list_for_each_entry(tmp, &pgpath->pg->pgpaths, list) {
453 init_required = 1; 457 if (queue_work(kmpath_handlerd, &tmp->activate_path))
458 m->pg_init_in_progress++;
459 }
454 } 460 }
455
456out: 461out:
457 spin_unlock_irqrestore(&m->lock, flags); 462 spin_unlock_irqrestore(&m->lock, flags);
458
459 if (init_required)
460 queue_work(kmpath_handlerd, &m->activate_path);
461
462 if (!must_queue) 463 if (!must_queue)
463 dispatch_queued_ios(m); 464 dispatch_queued_ios(m);
464} 465}
@@ -553,6 +554,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
553 return -EINVAL; 554 return -EINVAL;
554 } 555 }
555 556
557 if (ps_argc > as->argc) {
558 dm_put_path_selector(pst);
559 ti->error = "not enough arguments for path selector";
560 return -EINVAL;
561 }
562
556 r = pst->create(&pg->ps, ps_argc, as->argv); 563 r = pst->create(&pg->ps, ps_argc, as->argv);
557 if (r) { 564 if (r) {
558 dm_put_path_selector(pst); 565 dm_put_path_selector(pst);
@@ -591,9 +598,20 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
591 } 598 }
592 599
593 if (m->hw_handler_name) { 600 if (m->hw_handler_name) {
594 r = scsi_dh_attach(bdev_get_queue(p->path.dev->bdev), 601 struct request_queue *q = bdev_get_queue(p->path.dev->bdev);
595 m->hw_handler_name); 602
603 r = scsi_dh_attach(q, m->hw_handler_name);
604 if (r == -EBUSY) {
605 /*
606 * Already attached to different hw_handler,
607 * try to reattach with correct one.
608 */
609 scsi_dh_detach(q);
610 r = scsi_dh_attach(q, m->hw_handler_name);
611 }
612
596 if (r < 0) { 613 if (r < 0) {
614 ti->error = "error attaching hardware handler";
597 dm_put_device(ti, p->path.dev); 615 dm_put_device(ti, p->path.dev);
598 goto bad; 616 goto bad;
599 } 617 }
@@ -699,6 +717,11 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m)
699 if (!hw_argc) 717 if (!hw_argc)
700 return 0; 718 return 0;
701 719
720 if (hw_argc > as->argc) {
721 ti->error = "not enough arguments for hardware handler";
722 return -EINVAL;
723 }
724
702 m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL); 725 m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL);
703 request_module("scsi_dh_%s", m->hw_handler_name); 726 request_module("scsi_dh_%s", m->hw_handler_name);
704 if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { 727 if (scsi_dh_handler_exist(m->hw_handler_name) == 0) {
@@ -823,6 +846,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
823 goto bad; 846 goto bad;
824 } 847 }
825 848
849 ti->num_flush_requests = 1;
850
826 return 0; 851 return 0;
827 852
828 bad: 853 bad:
@@ -836,25 +861,29 @@ static void multipath_dtr(struct dm_target *ti)
836 861
837 flush_workqueue(kmpath_handlerd); 862 flush_workqueue(kmpath_handlerd);
838 flush_workqueue(kmultipathd); 863 flush_workqueue(kmultipathd);
864 flush_scheduled_work();
839 free_multipath(m); 865 free_multipath(m);
840} 866}
841 867
842/* 868/*
843 * Map bios, recording original fields for later in case we have to resubmit 869 * Map cloned requests
844 */ 870 */
845static int multipath_map(struct dm_target *ti, struct bio *bio, 871static int multipath_map(struct dm_target *ti, struct request *clone,
846 union map_info *map_context) 872 union map_info *map_context)
847{ 873{
848 int r; 874 int r;
849 struct dm_mpath_io *mpio; 875 struct dm_mpath_io *mpio;
850 struct multipath *m = (struct multipath *) ti->private; 876 struct multipath *m = (struct multipath *) ti->private;
851 877
852 mpio = mempool_alloc(m->mpio_pool, GFP_NOIO); 878 mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
853 dm_bio_record(&mpio->details, bio); 879 if (!mpio)
880 /* ENOMEM, requeue */
881 return DM_MAPIO_REQUEUE;
882 memset(mpio, 0, sizeof(*mpio));
854 883
855 map_context->ptr = mpio; 884 map_context->ptr = mpio;
856 bio->bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT); 885 clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
857 r = map_io(m, bio, mpio, 0); 886 r = map_io(m, clone, mpio, 0);
858 if (r < 0 || r == DM_MAPIO_REQUEUE) 887 if (r < 0 || r == DM_MAPIO_REQUEUE)
859 mempool_free(mpio, m->mpio_pool); 888 mempool_free(mpio, m->mpio_pool);
860 889
@@ -924,9 +953,13 @@ static int reinstate_path(struct pgpath *pgpath)
924 953
925 pgpath->is_active = 1; 954 pgpath->is_active = 1;
926 955
927 m->current_pgpath = NULL; 956 if (!m->nr_valid_paths++ && m->queue_size) {
928 if (!m->nr_valid_paths++ && m->queue_size) 957 m->current_pgpath = NULL;
929 queue_work(kmultipathd, &m->process_queued_ios); 958 queue_work(kmultipathd, &m->process_queued_ios);
959 } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
960 if (queue_work(kmpath_handlerd, &pgpath->activate_path))
961 m->pg_init_in_progress++;
962 }
930 963
931 dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti, 964 dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
932 pgpath->path.dev->name, m->nr_valid_paths); 965 pgpath->path.dev->name, m->nr_valid_paths);
@@ -1102,87 +1135,70 @@ static void pg_init_done(struct dm_path *path, int errors)
1102 1135
1103 spin_lock_irqsave(&m->lock, flags); 1136 spin_lock_irqsave(&m->lock, flags);
1104 if (errors) { 1137 if (errors) {
1105 DMERR("Could not failover device. Error %d.", errors); 1138 if (pgpath == m->current_pgpath) {
1106 m->current_pgpath = NULL; 1139 DMERR("Could not failover device. Error %d.", errors);
1107 m->current_pg = NULL; 1140 m->current_pgpath = NULL;
1141 m->current_pg = NULL;
1142 }
1108 } else if (!m->pg_init_required) { 1143 } else if (!m->pg_init_required) {
1109 m->queue_io = 0; 1144 m->queue_io = 0;
1110 pg->bypassed = 0; 1145 pg->bypassed = 0;
1111 } 1146 }
1112 1147
1113 m->pg_init_in_progress = 0; 1148 m->pg_init_in_progress--;
1114 queue_work(kmultipathd, &m->process_queued_ios); 1149 if (!m->pg_init_in_progress)
1150 queue_work(kmultipathd, &m->process_queued_ios);
1115 spin_unlock_irqrestore(&m->lock, flags); 1151 spin_unlock_irqrestore(&m->lock, flags);
1116} 1152}
1117 1153
1118static void activate_path(struct work_struct *work) 1154static void activate_path(struct work_struct *work)
1119{ 1155{
1120 int ret; 1156 int ret;
1121 struct multipath *m = 1157 struct pgpath *pgpath =
1122 container_of(work, struct multipath, activate_path); 1158 container_of(work, struct pgpath, activate_path);
1123 struct dm_path *path;
1124 unsigned long flags;
1125 1159
1126 spin_lock_irqsave(&m->lock, flags); 1160 ret = scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev));
1127 path = &m->pgpath_to_activate->path; 1161 pg_init_done(&pgpath->path, ret);
1128 m->pgpath_to_activate = NULL;
1129 spin_unlock_irqrestore(&m->lock, flags);
1130 if (!path)
1131 return;
1132 ret = scsi_dh_activate(bdev_get_queue(path->dev->bdev));
1133 pg_init_done(path, ret);
1134} 1162}
1135 1163
1136/* 1164/*
1137 * end_io handling 1165 * end_io handling
1138 */ 1166 */
1139static int do_end_io(struct multipath *m, struct bio *bio, 1167static int do_end_io(struct multipath *m, struct request *clone,
1140 int error, struct dm_mpath_io *mpio) 1168 int error, struct dm_mpath_io *mpio)
1141{ 1169{
1170 /*
1171 * We don't queue any clone request inside the multipath target
1172 * during end I/O handling, since those clone requests don't have
1173 * bio clones. If we queue them inside the multipath target,
1174 * we need to make bio clones, that requires memory allocation.
1175 * (See drivers/md/dm.c:end_clone_bio() about why the clone requests
1176 * don't have bio clones.)
1177 * Instead of queueing the clone request here, we queue the original
1178 * request into dm core, which will remake a clone request and
1179 * clone bios for it and resubmit it later.
1180 */
1181 int r = DM_ENDIO_REQUEUE;
1142 unsigned long flags; 1182 unsigned long flags;
1143 1183
1144 if (!error) 1184 if (!error && !clone->errors)
1145 return 0; /* I/O complete */ 1185 return 0; /* I/O complete */
1146 1186
1147 if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
1148 return error;
1149
1150 if (error == -EOPNOTSUPP) 1187 if (error == -EOPNOTSUPP)
1151 return error; 1188 return error;
1152 1189
1153 spin_lock_irqsave(&m->lock, flags);
1154 if (!m->nr_valid_paths) {
1155 if (__must_push_back(m)) {
1156 spin_unlock_irqrestore(&m->lock, flags);
1157 return DM_ENDIO_REQUEUE;
1158 } else if (!m->queue_if_no_path) {
1159 spin_unlock_irqrestore(&m->lock, flags);
1160 return -EIO;
1161 } else {
1162 spin_unlock_irqrestore(&m->lock, flags);
1163 goto requeue;
1164 }
1165 }
1166 spin_unlock_irqrestore(&m->lock, flags);
1167
1168 if (mpio->pgpath) 1190 if (mpio->pgpath)
1169 fail_path(mpio->pgpath); 1191 fail_path(mpio->pgpath);
1170 1192
1171 requeue:
1172 dm_bio_restore(&mpio->details, bio);
1173
1174 /* queue for the daemon to resubmit or fail */
1175 spin_lock_irqsave(&m->lock, flags); 1193 spin_lock_irqsave(&m->lock, flags);
1176 bio_list_add(&m->queued_ios, bio); 1194 if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m))
1177 m->queue_size++; 1195 r = -EIO;
1178 if (!m->queue_io)
1179 queue_work(kmultipathd, &m->process_queued_ios);
1180 spin_unlock_irqrestore(&m->lock, flags); 1196 spin_unlock_irqrestore(&m->lock, flags);
1181 1197
1182 return DM_ENDIO_INCOMPLETE; /* io not complete */ 1198 return r;
1183} 1199}
1184 1200
1185static int multipath_end_io(struct dm_target *ti, struct bio *bio, 1201static int multipath_end_io(struct dm_target *ti, struct request *clone,
1186 int error, union map_info *map_context) 1202 int error, union map_info *map_context)
1187{ 1203{
1188 struct multipath *m = ti->private; 1204 struct multipath *m = ti->private;
@@ -1191,14 +1207,13 @@ static int multipath_end_io(struct dm_target *ti, struct bio *bio,
1191 struct path_selector *ps; 1207 struct path_selector *ps;
1192 int r; 1208 int r;
1193 1209
1194 r = do_end_io(m, bio, error, mpio); 1210 r = do_end_io(m, clone, error, mpio);
1195 if (pgpath) { 1211 if (pgpath) {
1196 ps = &pgpath->pg->ps; 1212 ps = &pgpath->pg->ps;
1197 if (ps->type->end_io) 1213 if (ps->type->end_io)
1198 ps->type->end_io(ps, &pgpath->path); 1214 ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
1199 } 1215 }
1200 if (r != DM_ENDIO_INCOMPLETE) 1216 mempool_free(mpio, m->mpio_pool);
1201 mempool_free(mpio, m->mpio_pool);
1202 1217
1203 return r; 1218 return r;
1204} 1219}
@@ -1411,7 +1426,7 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
1411 spin_lock_irqsave(&m->lock, flags); 1426 spin_lock_irqsave(&m->lock, flags);
1412 1427
1413 if (!m->current_pgpath) 1428 if (!m->current_pgpath)
1414 __choose_pgpath(m); 1429 __choose_pgpath(m, 0);
1415 1430
1416 if (m->current_pgpath) { 1431 if (m->current_pgpath) {
1417 bdev = m->current_pgpath->path.dev->bdev; 1432 bdev = m->current_pgpath->path.dev->bdev;
@@ -1428,22 +1443,113 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
1428 return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); 1443 return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
1429} 1444}
1430 1445
1446static int multipath_iterate_devices(struct dm_target *ti,
1447 iterate_devices_callout_fn fn, void *data)
1448{
1449 struct multipath *m = ti->private;
1450 struct priority_group *pg;
1451 struct pgpath *p;
1452 int ret = 0;
1453
1454 list_for_each_entry(pg, &m->priority_groups, list) {
1455 list_for_each_entry(p, &pg->pgpaths, list) {
1456 ret = fn(ti, p->path.dev, ti->begin, data);
1457 if (ret)
1458 goto out;
1459 }
1460 }
1461
1462out:
1463 return ret;
1464}
1465
1466static int __pgpath_busy(struct pgpath *pgpath)
1467{
1468 struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
1469
1470 return dm_underlying_device_busy(q);
1471}
1472
1473/*
1474 * We return "busy", only when we can map I/Os but underlying devices
1475 * are busy (so even if we map I/Os now, the I/Os will wait on
1476 * the underlying queue).
1477 * In other words, if we want to kill I/Os or queue them inside us
1478 * due to map unavailability, we don't return "busy". Otherwise,
1479 * dm core won't give us the I/Os and we can't do what we want.
1480 */
1481static int multipath_busy(struct dm_target *ti)
1482{
1483 int busy = 0, has_active = 0;
1484 struct multipath *m = ti->private;
1485 struct priority_group *pg;
1486 struct pgpath *pgpath;
1487 unsigned long flags;
1488
1489 spin_lock_irqsave(&m->lock, flags);
1490
1491 /* Guess which priority_group will be used at next mapping time */
1492 if (unlikely(!m->current_pgpath && m->next_pg))
1493 pg = m->next_pg;
1494 else if (likely(m->current_pg))
1495 pg = m->current_pg;
1496 else
1497 /*
1498 * We don't know which pg will be used at next mapping time.
1499 * We don't call __choose_pgpath() here to avoid to trigger
1500 * pg_init just by busy checking.
1501 * So we don't know whether underlying devices we will be using
1502 * at next mapping time are busy or not. Just try mapping.
1503 */
1504 goto out;
1505
1506 /*
1507 * If there is one non-busy active path at least, the path selector
1508 * will be able to select it. So we consider such a pg as not busy.
1509 */
1510 busy = 1;
1511 list_for_each_entry(pgpath, &pg->pgpaths, list)
1512 if (pgpath->is_active) {
1513 has_active = 1;
1514
1515 if (!__pgpath_busy(pgpath)) {
1516 busy = 0;
1517 break;
1518 }
1519 }
1520
1521 if (!has_active)
1522 /*
1523 * No active path in this pg, so this pg won't be used and
1524 * the current_pg will be changed at next mapping time.
1525 * We need to try mapping to determine it.
1526 */
1527 busy = 0;
1528
1529out:
1530 spin_unlock_irqrestore(&m->lock, flags);
1531
1532 return busy;
1533}
1534
1431/*----------------------------------------------------------------- 1535/*-----------------------------------------------------------------
1432 * Module setup 1536 * Module setup
1433 *---------------------------------------------------------------*/ 1537 *---------------------------------------------------------------*/
1434static struct target_type multipath_target = { 1538static struct target_type multipath_target = {
1435 .name = "multipath", 1539 .name = "multipath",
1436 .version = {1, 0, 5}, 1540 .version = {1, 1, 0},
1437 .module = THIS_MODULE, 1541 .module = THIS_MODULE,
1438 .ctr = multipath_ctr, 1542 .ctr = multipath_ctr,
1439 .dtr = multipath_dtr, 1543 .dtr = multipath_dtr,
1440 .map = multipath_map, 1544 .map_rq = multipath_map,
1441 .end_io = multipath_end_io, 1545 .rq_end_io = multipath_end_io,
1442 .presuspend = multipath_presuspend, 1546 .presuspend = multipath_presuspend,
1443 .resume = multipath_resume, 1547 .resume = multipath_resume,
1444 .status = multipath_status, 1548 .status = multipath_status,
1445 .message = multipath_message, 1549 .message = multipath_message,
1446 .ioctl = multipath_ioctl, 1550 .ioctl = multipath_ioctl,
1551 .iterate_devices = multipath_iterate_devices,
1552 .busy = multipath_busy,
1447}; 1553};
1448 1554
1449static int __init dm_multipath_init(void) 1555static int __init dm_multipath_init(void)
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h
index 27357b85d73d..e7d1fa8b0459 100644
--- a/drivers/md/dm-path-selector.h
+++ b/drivers/md/dm-path-selector.h
@@ -56,7 +56,8 @@ struct path_selector_type {
56 * the path fails. 56 * the path fails.
57 */ 57 */
58 struct dm_path *(*select_path) (struct path_selector *ps, 58 struct dm_path *(*select_path) (struct path_selector *ps,
59 unsigned *repeat_count); 59 unsigned *repeat_count,
60 size_t nr_bytes);
60 61
61 /* 62 /*
62 * Notify the selector that a path has failed. 63 * Notify the selector that a path has failed.
@@ -75,7 +76,10 @@ struct path_selector_type {
75 int (*status) (struct path_selector *ps, struct dm_path *path, 76 int (*status) (struct path_selector *ps, struct dm_path *path,
76 status_type_t type, char *result, unsigned int maxlen); 77 status_type_t type, char *result, unsigned int maxlen);
77 78
78 int (*end_io) (struct path_selector *ps, struct dm_path *path); 79 int (*start_io) (struct path_selector *ps, struct dm_path *path,
80 size_t nr_bytes);
81 int (*end_io) (struct path_selector *ps, struct dm_path *path,
82 size_t nr_bytes);
79}; 83};
80 84
81/* Register a path selector */ 85/* Register a path selector */
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
new file mode 100644
index 000000000000..f92b6cea9d9c
--- /dev/null
+++ b/drivers/md/dm-queue-length.c
@@ -0,0 +1,263 @@
1/*
2 * Copyright (C) 2004-2005 IBM Corp. All Rights Reserved.
3 * Copyright (C) 2006-2009 NEC Corporation.
4 *
5 * dm-queue-length.c
6 *
7 * Module Author: Stefan Bader, IBM
8 * Modified by: Kiyoshi Ueda, NEC
9 *
10 * This file is released under the GPL.
11 *
12 * queue-length path selector - choose a path with the least number of
13 * in-flight I/Os.
14 */
15
16#include "dm.h"
17#include "dm-path-selector.h"
18
19#include <linux/slab.h>
20#include <linux/ctype.h>
21#include <linux/errno.h>
22#include <linux/module.h>
23#include <asm/atomic.h>
24
25#define DM_MSG_PREFIX "multipath queue-length"
26#define QL_MIN_IO 128
27#define QL_VERSION "0.1.0"
28
29struct selector {
30 struct list_head valid_paths;
31 struct list_head failed_paths;
32};
33
34struct path_info {
35 struct list_head list;
36 struct dm_path *path;
37 unsigned repeat_count;
38 atomic_t qlen; /* the number of in-flight I/Os */
39};
40
41static struct selector *alloc_selector(void)
42{
43 struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
44
45 if (s) {
46 INIT_LIST_HEAD(&s->valid_paths);
47 INIT_LIST_HEAD(&s->failed_paths);
48 }
49
50 return s;
51}
52
53static int ql_create(struct path_selector *ps, unsigned argc, char **argv)
54{
55 struct selector *s = alloc_selector();
56
57 if (!s)
58 return -ENOMEM;
59
60 ps->context = s;
61 return 0;
62}
63
64static void ql_free_paths(struct list_head *paths)
65{
66 struct path_info *pi, *next;
67
68 list_for_each_entry_safe(pi, next, paths, list) {
69 list_del(&pi->list);
70 kfree(pi);
71 }
72}
73
74static void ql_destroy(struct path_selector *ps)
75{
76 struct selector *s = ps->context;
77
78 ql_free_paths(&s->valid_paths);
79 ql_free_paths(&s->failed_paths);
80 kfree(s);
81 ps->context = NULL;
82}
83
84static int ql_status(struct path_selector *ps, struct dm_path *path,
85 status_type_t type, char *result, unsigned maxlen)
86{
87 unsigned sz = 0;
88 struct path_info *pi;
89
90 /* When called with NULL path, return selector status/args. */
91 if (!path)
92 DMEMIT("0 ");
93 else {
94 pi = path->pscontext;
95
96 switch (type) {
97 case STATUSTYPE_INFO:
98 DMEMIT("%d ", atomic_read(&pi->qlen));
99 break;
100 case STATUSTYPE_TABLE:
101 DMEMIT("%u ", pi->repeat_count);
102 break;
103 }
104 }
105
106 return sz;
107}
108
109static int ql_add_path(struct path_selector *ps, struct dm_path *path,
110 int argc, char **argv, char **error)
111{
112 struct selector *s = ps->context;
113 struct path_info *pi;
114 unsigned repeat_count = QL_MIN_IO;
115
116 /*
117 * Arguments: [<repeat_count>]
118 * <repeat_count>: The number of I/Os before switching path.
119 * If not given, default (QL_MIN_IO) is used.
120 */
121 if (argc > 1) {
122 *error = "queue-length ps: incorrect number of arguments";
123 return -EINVAL;
124 }
125
126 if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
127 *error = "queue-length ps: invalid repeat count";
128 return -EINVAL;
129 }
130
131 /* Allocate the path information structure */
132 pi = kmalloc(sizeof(*pi), GFP_KERNEL);
133 if (!pi) {
134 *error = "queue-length ps: Error allocating path information";
135 return -ENOMEM;
136 }
137
138 pi->path = path;
139 pi->repeat_count = repeat_count;
140 atomic_set(&pi->qlen, 0);
141
142 path->pscontext = pi;
143
144 list_add_tail(&pi->list, &s->valid_paths);
145
146 return 0;
147}
148
149static void ql_fail_path(struct path_selector *ps, struct dm_path *path)
150{
151 struct selector *s = ps->context;
152 struct path_info *pi = path->pscontext;
153
154 list_move(&pi->list, &s->failed_paths);
155}
156
157static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path)
158{
159 struct selector *s = ps->context;
160 struct path_info *pi = path->pscontext;
161
162 list_move_tail(&pi->list, &s->valid_paths);
163
164 return 0;
165}
166
167/*
168 * Select a path having the minimum number of in-flight I/Os
169 */
170static struct dm_path *ql_select_path(struct path_selector *ps,
171 unsigned *repeat_count, size_t nr_bytes)
172{
173 struct selector *s = ps->context;
174 struct path_info *pi = NULL, *best = NULL;
175
176 if (list_empty(&s->valid_paths))
177 return NULL;
178
179 /* Change preferred (first in list) path to evenly balance. */
180 list_move_tail(s->valid_paths.next, &s->valid_paths);
181
182 list_for_each_entry(pi, &s->valid_paths, list) {
183 if (!best ||
184 (atomic_read(&pi->qlen) < atomic_read(&best->qlen)))
185 best = pi;
186
187 if (!atomic_read(&best->qlen))
188 break;
189 }
190
191 if (!best)
192 return NULL;
193
194 *repeat_count = best->repeat_count;
195
196 return best->path;
197}
198
199static int ql_start_io(struct path_selector *ps, struct dm_path *path,
200 size_t nr_bytes)
201{
202 struct path_info *pi = path->pscontext;
203
204 atomic_inc(&pi->qlen);
205
206 return 0;
207}
208
209static int ql_end_io(struct path_selector *ps, struct dm_path *path,
210 size_t nr_bytes)
211{
212 struct path_info *pi = path->pscontext;
213
214 atomic_dec(&pi->qlen);
215
216 return 0;
217}
218
219static struct path_selector_type ql_ps = {
220 .name = "queue-length",
221 .module = THIS_MODULE,
222 .table_args = 1,
223 .info_args = 1,
224 .create = ql_create,
225 .destroy = ql_destroy,
226 .status = ql_status,
227 .add_path = ql_add_path,
228 .fail_path = ql_fail_path,
229 .reinstate_path = ql_reinstate_path,
230 .select_path = ql_select_path,
231 .start_io = ql_start_io,
232 .end_io = ql_end_io,
233};
234
235static int __init dm_ql_init(void)
236{
237 int r = dm_register_path_selector(&ql_ps);
238
239 if (r < 0)
240 DMERR("register failed %d", r);
241
242 DMINFO("version " QL_VERSION " loaded");
243
244 return r;
245}
246
247static void __exit dm_ql_exit(void)
248{
249 int r = dm_unregister_path_selector(&ql_ps);
250
251 if (r < 0)
252 DMERR("unregister failed %d", r);
253}
254
255module_init(dm_ql_init);
256module_exit(dm_ql_exit);
257
258MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>");
259MODULE_DESCRIPTION(
260 "(C) Copyright IBM Corp. 2004,2005 All Rights Reserved.\n"
261 DM_NAME " path selector to balance the number of in-flight I/Os"
262);
263MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 076fbb4e967a..ce8868c768cc 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1283,9 +1283,23 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
1283 return 0; 1283 return 0;
1284} 1284}
1285 1285
1286static int mirror_iterate_devices(struct dm_target *ti,
1287 iterate_devices_callout_fn fn, void *data)
1288{
1289 struct mirror_set *ms = ti->private;
1290 int ret = 0;
1291 unsigned i;
1292
1293 for (i = 0; !ret && i < ms->nr_mirrors; i++)
1294 ret = fn(ti, ms->mirror[i].dev,
1295 ms->mirror[i].offset, data);
1296
1297 return ret;
1298}
1299
1286static struct target_type mirror_target = { 1300static struct target_type mirror_target = {
1287 .name = "mirror", 1301 .name = "mirror",
1288 .version = {1, 0, 20}, 1302 .version = {1, 12, 0},
1289 .module = THIS_MODULE, 1303 .module = THIS_MODULE,
1290 .ctr = mirror_ctr, 1304 .ctr = mirror_ctr,
1291 .dtr = mirror_dtr, 1305 .dtr = mirror_dtr,
@@ -1295,6 +1309,7 @@ static struct target_type mirror_target = {
1295 .postsuspend = mirror_postsuspend, 1309 .postsuspend = mirror_postsuspend,
1296 .resume = mirror_resume, 1310 .resume = mirror_resume,
1297 .status = mirror_status, 1311 .status = mirror_status,
1312 .iterate_devices = mirror_iterate_devices,
1298}; 1313};
1299 1314
1300static int __init dm_mirror_init(void) 1315static int __init dm_mirror_init(void)
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 7b899be0b087..36dbe29f2fd6 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -283,7 +283,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
283 283
284 nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); 284 nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
285 if (unlikely(!nreg)) 285 if (unlikely(!nreg))
286 nreg = kmalloc(sizeof(*nreg), GFP_NOIO); 286 nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL);
287 287
288 nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? 288 nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
289 DM_RH_CLEAN : DM_RH_NOSYNC; 289 DM_RH_CLEAN : DM_RH_NOSYNC;
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index cdfbf65b28cb..24752f449bef 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -161,7 +161,7 @@ static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p)
161} 161}
162 162
163static struct dm_path *rr_select_path(struct path_selector *ps, 163static struct dm_path *rr_select_path(struct path_selector *ps,
164 unsigned *repeat_count) 164 unsigned *repeat_count, size_t nr_bytes)
165{ 165{
166 struct selector *s = (struct selector *) ps->context; 166 struct selector *s = (struct selector *) ps->context;
167 struct path_info *pi = NULL; 167 struct path_info *pi = NULL;
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
new file mode 100644
index 000000000000..cfa668f46c40
--- /dev/null
+++ b/drivers/md/dm-service-time.c
@@ -0,0 +1,339 @@
1/*
2 * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved.
3 *
4 * Module Author: Kiyoshi Ueda
5 *
6 * This file is released under the GPL.
7 *
8 * Throughput oriented path selector.
9 */
10
11#include "dm.h"
12#include "dm-path-selector.h"
13
14#define DM_MSG_PREFIX "multipath service-time"
15#define ST_MIN_IO 1
16#define ST_MAX_RELATIVE_THROUGHPUT 100
17#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7
18#define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT)
19#define ST_VERSION "0.2.0"
20
21struct selector {
22 struct list_head valid_paths;
23 struct list_head failed_paths;
24};
25
26struct path_info {
27 struct list_head list;
28 struct dm_path *path;
29 unsigned repeat_count;
30 unsigned relative_throughput;
31 atomic_t in_flight_size; /* Total size of in-flight I/Os */
32};
33
34static struct selector *alloc_selector(void)
35{
36 struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
37
38 if (s) {
39 INIT_LIST_HEAD(&s->valid_paths);
40 INIT_LIST_HEAD(&s->failed_paths);
41 }
42
43 return s;
44}
45
46static int st_create(struct path_selector *ps, unsigned argc, char **argv)
47{
48 struct selector *s = alloc_selector();
49
50 if (!s)
51 return -ENOMEM;
52
53 ps->context = s;
54 return 0;
55}
56
57static void free_paths(struct list_head *paths)
58{
59 struct path_info *pi, *next;
60
61 list_for_each_entry_safe(pi, next, paths, list) {
62 list_del(&pi->list);
63 kfree(pi);
64 }
65}
66
67static void st_destroy(struct path_selector *ps)
68{
69 struct selector *s = ps->context;
70
71 free_paths(&s->valid_paths);
72 free_paths(&s->failed_paths);
73 kfree(s);
74 ps->context = NULL;
75}
76
77static int st_status(struct path_selector *ps, struct dm_path *path,
78 status_type_t type, char *result, unsigned maxlen)
79{
80 unsigned sz = 0;
81 struct path_info *pi;
82
83 if (!path)
84 DMEMIT("0 ");
85 else {
86 pi = path->pscontext;
87
88 switch (type) {
89 case STATUSTYPE_INFO:
90 DMEMIT("%d %u ", atomic_read(&pi->in_flight_size),
91 pi->relative_throughput);
92 break;
93 case STATUSTYPE_TABLE:
94 DMEMIT("%u %u ", pi->repeat_count,
95 pi->relative_throughput);
96 break;
97 }
98 }
99
100 return sz;
101}
102
103static int st_add_path(struct path_selector *ps, struct dm_path *path,
104 int argc, char **argv, char **error)
105{
106 struct selector *s = ps->context;
107 struct path_info *pi;
108 unsigned repeat_count = ST_MIN_IO;
109 unsigned relative_throughput = 1;
110
111 /*
112 * Arguments: [<repeat_count> [<relative_throughput>]]
113 * <repeat_count>: The number of I/Os before switching path.
114 * If not given, default (ST_MIN_IO) is used.
115 * <relative_throughput>: The relative throughput value of
116 * the path among all paths in the path-group.
117 * The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT>
118 * If not given, minimum value '1' is used.
119 * If '0' is given, the path isn't selected while
120 * other paths having a positive value are
121 * available.
122 */
123 if (argc > 2) {
124 *error = "service-time ps: incorrect number of arguments";
125 return -EINVAL;
126 }
127
128 if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
129 *error = "service-time ps: invalid repeat count";
130 return -EINVAL;
131 }
132
133 if ((argc == 2) &&
134 (sscanf(argv[1], "%u", &relative_throughput) != 1 ||
135 relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
136 *error = "service-time ps: invalid relative_throughput value";
137 return -EINVAL;
138 }
139
140 /* allocate the path */
141 pi = kmalloc(sizeof(*pi), GFP_KERNEL);
142 if (!pi) {
143 *error = "service-time ps: Error allocating path context";
144 return -ENOMEM;
145 }
146
147 pi->path = path;
148 pi->repeat_count = repeat_count;
149 pi->relative_throughput = relative_throughput;
150 atomic_set(&pi->in_flight_size, 0);
151
152 path->pscontext = pi;
153
154 list_add_tail(&pi->list, &s->valid_paths);
155
156 return 0;
157}
158
159static void st_fail_path(struct path_selector *ps, struct dm_path *path)
160{
161 struct selector *s = ps->context;
162 struct path_info *pi = path->pscontext;
163
164 list_move(&pi->list, &s->failed_paths);
165}
166
167static int st_reinstate_path(struct path_selector *ps, struct dm_path *path)
168{
169 struct selector *s = ps->context;
170 struct path_info *pi = path->pscontext;
171
172 list_move_tail(&pi->list, &s->valid_paths);
173
174 return 0;
175}
176
177/*
178 * Compare the estimated service time of 2 paths, pi1 and pi2,
179 * for the incoming I/O.
180 *
181 * Returns:
182 * < 0 : pi1 is better
183 * 0 : no difference between pi1 and pi2
184 * > 0 : pi2 is better
185 *
186 * Description:
187 * Basically, the service time is estimated by:
188 * ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput'
189 * To reduce the calculation, some optimizations are made.
190 * (See comments inline)
191 */
192static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
193 size_t incoming)
194{
195 size_t sz1, sz2, st1, st2;
196
197 sz1 = atomic_read(&pi1->in_flight_size);
198 sz2 = atomic_read(&pi2->in_flight_size);
199
200 /*
201 * Case 1: Both have same throughput value. Choose less loaded path.
202 */
203 if (pi1->relative_throughput == pi2->relative_throughput)
204 return sz1 - sz2;
205
206 /*
207 * Case 2a: Both have same load. Choose higher throughput path.
208 * Case 2b: One path has no throughput value. Choose the other one.
209 */
210 if (sz1 == sz2 ||
211 !pi1->relative_throughput || !pi2->relative_throughput)
212 return pi2->relative_throughput - pi1->relative_throughput;
213
214 /*
215 * Case 3: Calculate service time. Choose faster path.
216 * Service time using pi1:
217 * st1 = (sz1 + incoming) / pi1->relative_throughput
218 * Service time using pi2:
219 * st2 = (sz2 + incoming) / pi2->relative_throughput
220 *
221 * To avoid the division, transform the expression to use
222 * multiplication.
223 * Because ->relative_throughput > 0 here, if st1 < st2,
224 * the expressions below are the same meaning:
225 * (sz1 + incoming) / pi1->relative_throughput <
226 * (sz2 + incoming) / pi2->relative_throughput
227 * (sz1 + incoming) * pi2->relative_throughput <
228 * (sz2 + incoming) * pi1->relative_throughput
229 * So use the later one.
230 */
231 sz1 += incoming;
232 sz2 += incoming;
233 if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE ||
234 sz2 >= ST_MAX_INFLIGHT_SIZE)) {
235 /*
236 * Size may be too big for multiplying pi->relative_throughput
237 * and overflow.
238 * To avoid the overflow and mis-selection, shift down both.
239 */
240 sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
241 sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
242 }
243 st1 = sz1 * pi2->relative_throughput;
244 st2 = sz2 * pi1->relative_throughput;
245 if (st1 != st2)
246 return st1 - st2;
247
248 /*
249 * Case 4: Service time is equal. Choose higher throughput path.
250 */
251 return pi2->relative_throughput - pi1->relative_throughput;
252}
253
254static struct dm_path *st_select_path(struct path_selector *ps,
255 unsigned *repeat_count, size_t nr_bytes)
256{
257 struct selector *s = ps->context;
258 struct path_info *pi = NULL, *best = NULL;
259
260 if (list_empty(&s->valid_paths))
261 return NULL;
262
263 /* Change preferred (first in list) path to evenly balance. */
264 list_move_tail(s->valid_paths.next, &s->valid_paths);
265
266 list_for_each_entry(pi, &s->valid_paths, list)
267 if (!best || (st_compare_load(pi, best, nr_bytes) < 0))
268 best = pi;
269
270 if (!best)
271 return NULL;
272
273 *repeat_count = best->repeat_count;
274
275 return best->path;
276}
277
278static int st_start_io(struct path_selector *ps, struct dm_path *path,
279 size_t nr_bytes)
280{
281 struct path_info *pi = path->pscontext;
282
283 atomic_add(nr_bytes, &pi->in_flight_size);
284
285 return 0;
286}
287
288static int st_end_io(struct path_selector *ps, struct dm_path *path,
289 size_t nr_bytes)
290{
291 struct path_info *pi = path->pscontext;
292
293 atomic_sub(nr_bytes, &pi->in_flight_size);
294
295 return 0;
296}
297
298static struct path_selector_type st_ps = {
299 .name = "service-time",
300 .module = THIS_MODULE,
301 .table_args = 2,
302 .info_args = 2,
303 .create = st_create,
304 .destroy = st_destroy,
305 .status = st_status,
306 .add_path = st_add_path,
307 .fail_path = st_fail_path,
308 .reinstate_path = st_reinstate_path,
309 .select_path = st_select_path,
310 .start_io = st_start_io,
311 .end_io = st_end_io,
312};
313
314static int __init dm_st_init(void)
315{
316 int r = dm_register_path_selector(&st_ps);
317
318 if (r < 0)
319 DMERR("register failed %d", r);
320
321 DMINFO("version " ST_VERSION " loaded");
322
323 return r;
324}
325
326static void __exit dm_st_exit(void)
327{
328 int r = dm_unregister_path_selector(&st_ps);
329
330 if (r < 0)
331 DMERR("unregister failed %d", r);
332}
333
334module_init(dm_st_init);
335module_exit(dm_st_exit);
336
337MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector");
338MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>");
339MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index e75c6dd76a9a..6e3fe4f14934 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -282,7 +282,7 @@ static int read_header(struct pstore *ps, int *new_snapshot)
282 */ 282 */
283 if (!ps->store->chunk_size) { 283 if (!ps->store->chunk_size) {
284 ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, 284 ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
285 bdev_hardsect_size(ps->store->cow->bdev) >> 9); 285 bdev_logical_block_size(ps->store->cow->bdev) >> 9);
286 ps->store->chunk_mask = ps->store->chunk_size - 1; 286 ps->store->chunk_mask = ps->store->chunk_size - 1;
287 ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; 287 ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1;
288 chunk_size_supplied = 0; 288 chunk_size_supplied = 0;
@@ -636,7 +636,7 @@ static void persistent_commit_exception(struct dm_exception_store *store,
636 /* 636 /*
637 * Commit exceptions to disk. 637 * Commit exceptions to disk.
638 */ 638 */
639 if (ps->valid && area_io(ps, WRITE)) 639 if (ps->valid && area_io(ps, WRITE_BARRIER))
640 ps->valid = 0; 640 ps->valid = 0;
641 641
642 /* 642 /*
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index d73f17fc7778..d573165cd2b7 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -678,6 +678,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
678 678
679 ti->private = s; 679 ti->private = s;
680 ti->split_io = s->store->chunk_size; 680 ti->split_io = s->store->chunk_size;
681 ti->num_flush_requests = 1;
681 682
682 return 0; 683 return 0;
683 684
@@ -1030,6 +1031,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
1030 chunk_t chunk; 1031 chunk_t chunk;
1031 struct dm_snap_pending_exception *pe = NULL; 1032 struct dm_snap_pending_exception *pe = NULL;
1032 1033
1034 if (unlikely(bio_empty_barrier(bio))) {
1035 bio->bi_bdev = s->store->cow->bdev;
1036 return DM_MAPIO_REMAPPED;
1037 }
1038
1033 chunk = sector_to_chunk(s->store, bio->bi_sector); 1039 chunk = sector_to_chunk(s->store, bio->bi_sector);
1034 1040
1035 /* Full snapshots are not usable */ 1041 /* Full snapshots are not usable */
@@ -1338,6 +1344,8 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1338 } 1344 }
1339 1345
1340 ti->private = dev; 1346 ti->private = dev;
1347 ti->num_flush_requests = 1;
1348
1341 return 0; 1349 return 0;
1342} 1350}
1343 1351
@@ -1353,6 +1361,9 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
1353 struct dm_dev *dev = ti->private; 1361 struct dm_dev *dev = ti->private;
1354 bio->bi_bdev = dev->bdev; 1362 bio->bi_bdev = dev->bdev;
1355 1363
1364 if (unlikely(bio_empty_barrier(bio)))
1365 return DM_MAPIO_REMAPPED;
1366
1356 /* Only tell snapshots if this is a write */ 1367 /* Only tell snapshots if this is a write */
1357 return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; 1368 return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED;
1358} 1369}
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 41569bc60abc..b240e85ae39a 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -167,6 +167,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
167 sc->stripes = stripes; 167 sc->stripes = stripes;
168 sc->stripe_width = width; 168 sc->stripe_width = width;
169 ti->split_io = chunk_size; 169 ti->split_io = chunk_size;
170 ti->num_flush_requests = stripes;
170 171
171 sc->chunk_mask = ((sector_t) chunk_size) - 1; 172 sc->chunk_mask = ((sector_t) chunk_size) - 1;
172 for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) 173 for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++)
@@ -211,10 +212,18 @@ static int stripe_map(struct dm_target *ti, struct bio *bio,
211 union map_info *map_context) 212 union map_info *map_context)
212{ 213{
213 struct stripe_c *sc = (struct stripe_c *) ti->private; 214 struct stripe_c *sc = (struct stripe_c *) ti->private;
215 sector_t offset, chunk;
216 uint32_t stripe;
214 217
215 sector_t offset = bio->bi_sector - ti->begin; 218 if (unlikely(bio_empty_barrier(bio))) {
216 sector_t chunk = offset >> sc->chunk_shift; 219 BUG_ON(map_context->flush_request >= sc->stripes);
217 uint32_t stripe = sector_div(chunk, sc->stripes); 220 bio->bi_bdev = sc->stripe[map_context->flush_request].dev->bdev;
221 return DM_MAPIO_REMAPPED;
222 }
223
224 offset = bio->bi_sector - ti->begin;
225 chunk = offset >> sc->chunk_shift;
226 stripe = sector_div(chunk, sc->stripes);
218 227
219 bio->bi_bdev = sc->stripe[stripe].dev->bdev; 228 bio->bi_bdev = sc->stripe[stripe].dev->bdev;
220 bio->bi_sector = sc->stripe[stripe].physical_start + 229 bio->bi_sector = sc->stripe[stripe].physical_start +
@@ -304,15 +313,31 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
304 return error; 313 return error;
305} 314}
306 315
316static int stripe_iterate_devices(struct dm_target *ti,
317 iterate_devices_callout_fn fn, void *data)
318{
319 struct stripe_c *sc = ti->private;
320 int ret = 0;
321 unsigned i = 0;
322
323 do
324 ret = fn(ti, sc->stripe[i].dev,
325 sc->stripe[i].physical_start, data);
326 while (!ret && ++i < sc->stripes);
327
328 return ret;
329}
330
307static struct target_type stripe_target = { 331static struct target_type stripe_target = {
308 .name = "striped", 332 .name = "striped",
309 .version = {1, 1, 0}, 333 .version = {1, 2, 0},
310 .module = THIS_MODULE, 334 .module = THIS_MODULE,
311 .ctr = stripe_ctr, 335 .ctr = stripe_ctr,
312 .dtr = stripe_dtr, 336 .dtr = stripe_dtr,
313 .map = stripe_map, 337 .map = stripe_map,
314 .end_io = stripe_end_io, 338 .end_io = stripe_end_io,
315 .status = stripe_status, 339 .status = stripe_status,
340 .iterate_devices = stripe_iterate_devices,
316}; 341};
317 342
318int __init dm_stripe_init(void) 343int __init dm_stripe_init(void)
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index a2a45e6c7c8b..4b045903a4e2 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -57,12 +57,21 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf)
57 return strlen(buf); 57 return strlen(buf);
58} 58}
59 59
60static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
61{
62 sprintf(buf, "%d\n", dm_suspended(md));
63
64 return strlen(buf);
65}
66
60static DM_ATTR_RO(name); 67static DM_ATTR_RO(name);
61static DM_ATTR_RO(uuid); 68static DM_ATTR_RO(uuid);
69static DM_ATTR_RO(suspended);
62 70
63static struct attribute *dm_attrs[] = { 71static struct attribute *dm_attrs[] = {
64 &dm_attr_name.attr, 72 &dm_attr_name.attr,
65 &dm_attr_uuid.attr, 73 &dm_attr_uuid.attr,
74 &dm_attr_suspended.attr,
66 NULL, 75 NULL,
67}; 76};
68 77
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 429b50b975d5..4899ebe767c8 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -41,6 +41,7 @@
41struct dm_table { 41struct dm_table {
42 struct mapped_device *md; 42 struct mapped_device *md;
43 atomic_t holders; 43 atomic_t holders;
44 unsigned type;
44 45
45 /* btree table */ 46 /* btree table */
46 unsigned int depth; 47 unsigned int depth;
@@ -62,15 +63,11 @@ struct dm_table {
62 /* a list of devices used by this table */ 63 /* a list of devices used by this table */
63 struct list_head devices; 64 struct list_head devices;
64 65
65 /*
66 * These are optimistic limits taken from all the
67 * targets, some targets will need smaller limits.
68 */
69 struct io_restrictions limits;
70
71 /* events get handed up using this callback */ 66 /* events get handed up using this callback */
72 void (*event_fn)(void *); 67 void (*event_fn)(void *);
73 void *event_context; 68 void *event_context;
69
70 struct dm_md_mempools *mempools;
74}; 71};
75 72
76/* 73/*
@@ -89,42 +86,6 @@ static unsigned int int_log(unsigned int n, unsigned int base)
89} 86}
90 87
91/* 88/*
92 * Returns the minimum that is _not_ zero, unless both are zero.
93 */
94#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
95
96/*
97 * Combine two io_restrictions, always taking the lower value.
98 */
99static void combine_restrictions_low(struct io_restrictions *lhs,
100 struct io_restrictions *rhs)
101{
102 lhs->max_sectors =
103 min_not_zero(lhs->max_sectors, rhs->max_sectors);
104
105 lhs->max_phys_segments =
106 min_not_zero(lhs->max_phys_segments, rhs->max_phys_segments);
107
108 lhs->max_hw_segments =
109 min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments);
110
111 lhs->hardsect_size = max(lhs->hardsect_size, rhs->hardsect_size);
112
113 lhs->max_segment_size =
114 min_not_zero(lhs->max_segment_size, rhs->max_segment_size);
115
116 lhs->max_hw_sectors =
117 min_not_zero(lhs->max_hw_sectors, rhs->max_hw_sectors);
118
119 lhs->seg_boundary_mask =
120 min_not_zero(lhs->seg_boundary_mask, rhs->seg_boundary_mask);
121
122 lhs->bounce_pfn = min_not_zero(lhs->bounce_pfn, rhs->bounce_pfn);
123
124 lhs->no_cluster |= rhs->no_cluster;
125}
126
127/*
128 * Calculate the index of the child node of the n'th node k'th key. 89 * Calculate the index of the child node of the n'th node k'th key.
129 */ 90 */
130static inline unsigned int get_child(unsigned int n, unsigned int k) 91static inline unsigned int get_child(unsigned int n, unsigned int k)
@@ -266,6 +227,8 @@ static void free_devices(struct list_head *devices)
266 list_for_each_safe(tmp, next, devices) { 227 list_for_each_safe(tmp, next, devices) {
267 struct dm_dev_internal *dd = 228 struct dm_dev_internal *dd =
268 list_entry(tmp, struct dm_dev_internal, list); 229 list_entry(tmp, struct dm_dev_internal, list);
230 DMWARN("dm_table_destroy: dm_put_device call missing for %s",
231 dd->dm_dev.name);
269 kfree(dd); 232 kfree(dd);
270 } 233 }
271} 234}
@@ -295,12 +258,10 @@ void dm_table_destroy(struct dm_table *t)
295 vfree(t->highs); 258 vfree(t->highs);
296 259
297 /* free the device list */ 260 /* free the device list */
298 if (t->devices.next != &t->devices) { 261 if (t->devices.next != &t->devices)
299 DMWARN("devices still present during destroy: "
300 "dm_table_remove_device calls missing");
301
302 free_devices(&t->devices); 262 free_devices(&t->devices);
303 } 263
264 dm_free_md_mempools(t->mempools);
304 265
305 kfree(t); 266 kfree(t);
306} 267}
@@ -384,15 +345,48 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
384/* 345/*
385 * If possible, this checks an area of a destination device is valid. 346 * If possible, this checks an area of a destination device is valid.
386 */ 347 */
387static int check_device_area(struct dm_dev_internal *dd, sector_t start, 348static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev,
388 sector_t len) 349 sector_t start, void *data)
389{ 350{
390 sector_t dev_size = dd->dm_dev.bdev->bd_inode->i_size >> SECTOR_SHIFT; 351 struct queue_limits *limits = data;
352 struct block_device *bdev = dev->bdev;
353 sector_t dev_size =
354 i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
355 unsigned short logical_block_size_sectors =
356 limits->logical_block_size >> SECTOR_SHIFT;
357 char b[BDEVNAME_SIZE];
391 358
392 if (!dev_size) 359 if (!dev_size)
393 return 1; 360 return 1;
394 361
395 return ((start < dev_size) && (len <= (dev_size - start))); 362 if ((start >= dev_size) || (start + ti->len > dev_size)) {
363 DMWARN("%s: %s too small for target",
364 dm_device_name(ti->table->md), bdevname(bdev, b));
365 return 0;
366 }
367
368 if (logical_block_size_sectors <= 1)
369 return 1;
370
371 if (start & (logical_block_size_sectors - 1)) {
372 DMWARN("%s: start=%llu not aligned to h/w "
373 "logical block size %hu of %s",
374 dm_device_name(ti->table->md),
375 (unsigned long long)start,
376 limits->logical_block_size, bdevname(bdev, b));
377 return 0;
378 }
379
380 if (ti->len & (logical_block_size_sectors - 1)) {
381 DMWARN("%s: len=%llu not aligned to h/w "
382 "logical block size %hu of %s",
383 dm_device_name(ti->table->md),
384 (unsigned long long)ti->len,
385 limits->logical_block_size, bdevname(bdev, b));
386 return 0;
387 }
388
389 return 1;
396} 390}
397 391
398/* 392/*
@@ -478,38 +472,32 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
478 } 472 }
479 atomic_inc(&dd->count); 473 atomic_inc(&dd->count);
480 474
481 if (!check_device_area(dd, start, len)) {
482 DMWARN("device %s too small for target", path);
483 dm_put_device(ti, &dd->dm_dev);
484 return -EINVAL;
485 }
486
487 *result = &dd->dm_dev; 475 *result = &dd->dm_dev;
488
489 return 0; 476 return 0;
490} 477}
491 478
492void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) 479/*
480 * Returns the minimum that is _not_ zero, unless both are zero.
481 */
482#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
483
484int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
485 sector_t start, void *data)
493{ 486{
487 struct queue_limits *limits = data;
488 struct block_device *bdev = dev->bdev;
494 struct request_queue *q = bdev_get_queue(bdev); 489 struct request_queue *q = bdev_get_queue(bdev);
495 struct io_restrictions *rs = &ti->limits;
496 char b[BDEVNAME_SIZE]; 490 char b[BDEVNAME_SIZE];
497 491
498 if (unlikely(!q)) { 492 if (unlikely(!q)) {
499 DMWARN("%s: Cannot set limits for nonexistent device %s", 493 DMWARN("%s: Cannot set limits for nonexistent device %s",
500 dm_device_name(ti->table->md), bdevname(bdev, b)); 494 dm_device_name(ti->table->md), bdevname(bdev, b));
501 return; 495 return 0;
502 } 496 }
503 497
504 /* 498 if (blk_stack_limits(limits, &q->limits, start) < 0)
505 * Combine the device limits low. 499 DMWARN("%s: target device %s is misaligned",
506 * 500 dm_device_name(ti->table->md), bdevname(bdev, b));
507 * FIXME: if we move an io_restriction struct
508 * into q this would just be a call to
509 * combine_restrictions_low()
510 */
511 rs->max_sectors =
512 min_not_zero(rs->max_sectors, q->max_sectors);
513 501
514 /* 502 /*
515 * Check if merge fn is supported. 503 * Check if merge fn is supported.
@@ -518,47 +506,21 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
518 */ 506 */
519 507
520 if (q->merge_bvec_fn && !ti->type->merge) 508 if (q->merge_bvec_fn && !ti->type->merge)
521 rs->max_sectors = 509 limits->max_sectors =
522 min_not_zero(rs->max_sectors, 510 min_not_zero(limits->max_sectors,
523 (unsigned int) (PAGE_SIZE >> 9)); 511 (unsigned int) (PAGE_SIZE >> 9));
524 512 return 0;
525 rs->max_phys_segments =
526 min_not_zero(rs->max_phys_segments,
527 q->max_phys_segments);
528
529 rs->max_hw_segments =
530 min_not_zero(rs->max_hw_segments, q->max_hw_segments);
531
532 rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size);
533
534 rs->max_segment_size =
535 min_not_zero(rs->max_segment_size, q->max_segment_size);
536
537 rs->max_hw_sectors =
538 min_not_zero(rs->max_hw_sectors, q->max_hw_sectors);
539
540 rs->seg_boundary_mask =
541 min_not_zero(rs->seg_boundary_mask,
542 q->seg_boundary_mask);
543
544 rs->bounce_pfn = min_not_zero(rs->bounce_pfn, q->bounce_pfn);
545
546 rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
547} 513}
548EXPORT_SYMBOL_GPL(dm_set_device_limits); 514EXPORT_SYMBOL_GPL(dm_set_device_limits);
549 515
550int dm_get_device(struct dm_target *ti, const char *path, sector_t start, 516int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
551 sector_t len, fmode_t mode, struct dm_dev **result) 517 sector_t len, fmode_t mode, struct dm_dev **result)
552{ 518{
553 int r = __table_get_device(ti->table, ti, path, 519 return __table_get_device(ti->table, ti, path,
554 start, len, mode, result); 520 start, len, mode, result);
555
556 if (!r)
557 dm_set_device_limits(ti, (*result)->bdev);
558
559 return r;
560} 521}
561 522
523
562/* 524/*
563 * Decrement a devices use count and remove it if necessary. 525 * Decrement a devices use count and remove it if necessary.
564 */ 526 */
@@ -673,24 +635,78 @@ int dm_split_args(int *argc, char ***argvp, char *input)
673 return 0; 635 return 0;
674} 636}
675 637
676static void check_for_valid_limits(struct io_restrictions *rs) 638/*
639 * Impose necessary and sufficient conditions on a devices's table such
640 * that any incoming bio which respects its logical_block_size can be
641 * processed successfully. If it falls across the boundary between
642 * two or more targets, the size of each piece it gets split into must
643 * be compatible with the logical_block_size of the target processing it.
644 */
645static int validate_hardware_logical_block_alignment(struct dm_table *table,
646 struct queue_limits *limits)
677{ 647{
678 if (!rs->max_sectors) 648 /*
679 rs->max_sectors = SAFE_MAX_SECTORS; 649 * This function uses arithmetic modulo the logical_block_size
680 if (!rs->max_hw_sectors) 650 * (in units of 512-byte sectors).
681 rs->max_hw_sectors = SAFE_MAX_SECTORS; 651 */
682 if (!rs->max_phys_segments) 652 unsigned short device_logical_block_size_sects =
683 rs->max_phys_segments = MAX_PHYS_SEGMENTS; 653 limits->logical_block_size >> SECTOR_SHIFT;
684 if (!rs->max_hw_segments) 654
685 rs->max_hw_segments = MAX_HW_SEGMENTS; 655 /*
686 if (!rs->hardsect_size) 656 * Offset of the start of the next table entry, mod logical_block_size.
687 rs->hardsect_size = 1 << SECTOR_SHIFT; 657 */
688 if (!rs->max_segment_size) 658 unsigned short next_target_start = 0;
689 rs->max_segment_size = MAX_SEGMENT_SIZE; 659
690 if (!rs->seg_boundary_mask) 660 /*
691 rs->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; 661 * Given an aligned bio that extends beyond the end of a
692 if (!rs->bounce_pfn) 662 * target, how many sectors must the next target handle?
693 rs->bounce_pfn = -1; 663 */
664 unsigned short remaining = 0;
665
666 struct dm_target *uninitialized_var(ti);
667 struct queue_limits ti_limits;
668 unsigned i = 0;
669
670 /*
671 * Check each entry in the table in turn.
672 */
673 while (i < dm_table_get_num_targets(table)) {
674 ti = dm_table_get_target(table, i++);
675
676 blk_set_default_limits(&ti_limits);
677
678 /* combine all target devices' limits */
679 if (ti->type->iterate_devices)
680 ti->type->iterate_devices(ti, dm_set_device_limits,
681 &ti_limits);
682
683 /*
684 * If the remaining sectors fall entirely within this
685 * table entry are they compatible with its logical_block_size?
686 */
687 if (remaining < ti->len &&
688 remaining & ((ti_limits.logical_block_size >>
689 SECTOR_SHIFT) - 1))
690 break; /* Error */
691
692 next_target_start =
693 (unsigned short) ((next_target_start + ti->len) &
694 (device_logical_block_size_sects - 1));
695 remaining = next_target_start ?
696 device_logical_block_size_sects - next_target_start : 0;
697 }
698
699 if (remaining) {
700 DMWARN("%s: table line %u (start sect %llu len %llu) "
701 "not aligned to h/w logical block size %hu",
702 dm_device_name(table->md), i,
703 (unsigned long long) ti->begin,
704 (unsigned long long) ti->len,
705 limits->logical_block_size);
706 return -EINVAL;
707 }
708
709 return 0;
694} 710}
695 711
696int dm_table_add_target(struct dm_table *t, const char *type, 712int dm_table_add_target(struct dm_table *t, const char *type,
@@ -745,9 +761,6 @@ int dm_table_add_target(struct dm_table *t, const char *type,
745 761
746 t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; 762 t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
747 763
748 /* FIXME: the plan is to combine high here and then have
749 * the merge fn apply the target level restrictions. */
750 combine_restrictions_low(&t->limits, &tgt->limits);
751 return 0; 764 return 0;
752 765
753 bad: 766 bad:
@@ -756,6 +769,104 @@ int dm_table_add_target(struct dm_table *t, const char *type,
756 return r; 769 return r;
757} 770}
758 771
772int dm_table_set_type(struct dm_table *t)
773{
774 unsigned i;
775 unsigned bio_based = 0, request_based = 0;
776 struct dm_target *tgt;
777 struct dm_dev_internal *dd;
778 struct list_head *devices;
779
780 for (i = 0; i < t->num_targets; i++) {
781 tgt = t->targets + i;
782 if (dm_target_request_based(tgt))
783 request_based = 1;
784 else
785 bio_based = 1;
786
787 if (bio_based && request_based) {
788 DMWARN("Inconsistent table: different target types"
789 " can't be mixed up");
790 return -EINVAL;
791 }
792 }
793
794 if (bio_based) {
795 /* We must use this table as bio-based */
796 t->type = DM_TYPE_BIO_BASED;
797 return 0;
798 }
799
800 BUG_ON(!request_based); /* No targets in this table */
801
802 /* Non-request-stackable devices can't be used for request-based dm */
803 devices = dm_table_get_devices(t);
804 list_for_each_entry(dd, devices, list) {
805 if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev.bdev))) {
806 DMWARN("table load rejected: including"
807 " non-request-stackable devices");
808 return -EINVAL;
809 }
810 }
811
812 /*
813 * Request-based dm supports only tables that have a single target now.
814 * To support multiple targets, request splitting support is needed,
815 * and that needs lots of changes in the block-layer.
816 * (e.g. request completion process for partial completion.)
817 */
818 if (t->num_targets > 1) {
819 DMWARN("Request-based dm doesn't support multiple targets yet");
820 return -EINVAL;
821 }
822
823 t->type = DM_TYPE_REQUEST_BASED;
824
825 return 0;
826}
827
828unsigned dm_table_get_type(struct dm_table *t)
829{
830 return t->type;
831}
832
833bool dm_table_bio_based(struct dm_table *t)
834{
835 return dm_table_get_type(t) == DM_TYPE_BIO_BASED;
836}
837
838bool dm_table_request_based(struct dm_table *t)
839{
840 return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
841}
842
843int dm_table_alloc_md_mempools(struct dm_table *t)
844{
845 unsigned type = dm_table_get_type(t);
846
847 if (unlikely(type == DM_TYPE_NONE)) {
848 DMWARN("no table type is set, can't allocate mempools");
849 return -EINVAL;
850 }
851
852 t->mempools = dm_alloc_md_mempools(type);
853 if (!t->mempools)
854 return -ENOMEM;
855
856 return 0;
857}
858
859void dm_table_free_md_mempools(struct dm_table *t)
860{
861 dm_free_md_mempools(t->mempools);
862 t->mempools = NULL;
863}
864
865struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t)
866{
867 return t->mempools;
868}
869
759static int setup_indexes(struct dm_table *t) 870static int setup_indexes(struct dm_table *t)
760{ 871{
761 int i; 872 int i;
@@ -790,8 +901,6 @@ int dm_table_complete(struct dm_table *t)
790 int r = 0; 901 int r = 0;
791 unsigned int leaf_nodes; 902 unsigned int leaf_nodes;
792 903
793 check_for_valid_limits(&t->limits);
794
795 /* how many indexes will the btree have ? */ 904 /* how many indexes will the btree have ? */
796 leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); 905 leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
797 t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); 906 t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
@@ -867,6 +976,57 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
867} 976}
868 977
869/* 978/*
979 * Establish the new table's queue_limits and validate them.
980 */
981int dm_calculate_queue_limits(struct dm_table *table,
982 struct queue_limits *limits)
983{
984 struct dm_target *uninitialized_var(ti);
985 struct queue_limits ti_limits;
986 unsigned i = 0;
987
988 blk_set_default_limits(limits);
989
990 while (i < dm_table_get_num_targets(table)) {
991 blk_set_default_limits(&ti_limits);
992
993 ti = dm_table_get_target(table, i++);
994
995 if (!ti->type->iterate_devices)
996 goto combine_limits;
997
998 /*
999 * Combine queue limits of all the devices this target uses.
1000 */
1001 ti->type->iterate_devices(ti, dm_set_device_limits,
1002 &ti_limits);
1003
1004 /*
1005 * Check each device area is consistent with the target's
1006 * overall queue limits.
1007 */
1008 if (!ti->type->iterate_devices(ti, device_area_is_valid,
1009 &ti_limits))
1010 return -EINVAL;
1011
1012combine_limits:
1013 /*
1014 * Merge this target's queue limits into the overall limits
1015 * for the table.
1016 */
1017 if (blk_stack_limits(limits, &ti_limits, 0) < 0)
1018 DMWARN("%s: target device "
1019 "(start sect %llu len %llu) "
1020 "is misaligned",
1021 dm_device_name(table->md),
1022 (unsigned long long) ti->begin,
1023 (unsigned long long) ti->len);
1024 }
1025
1026 return validate_hardware_logical_block_alignment(table, limits);
1027}
1028
1029/*
870 * Set the integrity profile for this device if all devices used have 1030 * Set the integrity profile for this device if all devices used have
871 * matching profiles. 1031 * matching profiles.
872 */ 1032 */
@@ -905,27 +1065,42 @@ no_integrity:
905 return; 1065 return;
906} 1066}
907 1067
908void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) 1068void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
1069 struct queue_limits *limits)
909{ 1070{
910 /* 1071 /*
911 * Make sure we obey the optimistic sub devices 1072 * Each target device in the table has a data area that should normally
912 * restrictions. 1073 * be aligned such that the DM device's alignment_offset is 0.
1074 * FIXME: Propagate alignment_offsets up the stack and warn of
1075 * sub-optimal or inconsistent settings.
1076 */
1077 limits->alignment_offset = 0;
1078 limits->misaligned = 0;
1079
1080 /*
1081 * Copy table's limits to the DM device's request_queue
913 */ 1082 */
914 blk_queue_max_sectors(q, t->limits.max_sectors); 1083 q->limits = *limits;
915 q->max_phys_segments = t->limits.max_phys_segments; 1084
916 q->max_hw_segments = t->limits.max_hw_segments; 1085 if (limits->no_cluster)
917 q->hardsect_size = t->limits.hardsect_size;
918 q->max_segment_size = t->limits.max_segment_size;
919 q->max_hw_sectors = t->limits.max_hw_sectors;
920 q->seg_boundary_mask = t->limits.seg_boundary_mask;
921 q->bounce_pfn = t->limits.bounce_pfn;
922
923 if (t->limits.no_cluster)
924 queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); 1086 queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q);
925 else 1087 else
926 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); 1088 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q);
927 1089
928 dm_table_set_integrity(t); 1090 dm_table_set_integrity(t);
1091
1092 /*
1093 * QUEUE_FLAG_STACKABLE must be set after all queue settings are
1094 * visible to other CPUs because, once the flag is set, incoming bios
1095 * are processed by request-based dm, which refers to the queue
1096 * settings.
1097 * Until the flag set, bios are passed to bio-based dm and queued to
1098 * md->deferred where queue settings are not needed yet.
1099 * Those bios are passed to request-based dm at the resume time.
1100 */
1101 smp_mb();
1102 if (dm_table_request_based(t))
1103 queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q);
929} 1104}
930 1105
931unsigned int dm_table_get_num_targets(struct dm_table *t) 1106unsigned int dm_table_get_num_targets(struct dm_table *t)
@@ -1021,6 +1196,20 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
1021 return r; 1196 return r;
1022} 1197}
1023 1198
1199int dm_table_any_busy_target(struct dm_table *t)
1200{
1201 unsigned i;
1202 struct dm_target *ti;
1203
1204 for (i = 0; i < t->num_targets; i++) {
1205 ti = t->targets + i;
1206 if (ti->type->busy && ti->type->busy(ti))
1207 return 1;
1208 }
1209
1210 return 0;
1211}
1212
1024void dm_table_unplug_all(struct dm_table *t) 1213void dm_table_unplug_all(struct dm_table *t)
1025{ 1214{
1026 struct dm_dev_internal *dd; 1215 struct dm_dev_internal *dd;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 424f7b048c30..3c6d4ee8921d 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -19,11 +19,18 @@
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/idr.h> 20#include <linux/idr.h>
21#include <linux/hdreg.h> 21#include <linux/hdreg.h>
22#include <linux/blktrace_api.h> 22
23#include <trace/block.h> 23#include <trace/events/block.h>
24 24
25#define DM_MSG_PREFIX "core" 25#define DM_MSG_PREFIX "core"
26 26
27/*
28 * Cookies are numeric values sent with CHANGE and REMOVE
29 * uevents while resuming, removing or renaming the device.
30 */
31#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
32#define DM_COOKIE_LENGTH 24
33
27static const char *_name = DM_NAME; 34static const char *_name = DM_NAME;
28 35
29static unsigned int major = 0; 36static unsigned int major = 0;
@@ -53,8 +60,6 @@ struct dm_target_io {
53 union map_info info; 60 union map_info info;
54}; 61};
55 62
56DEFINE_TRACE(block_bio_complete);
57
58/* 63/*
59 * For request-based dm. 64 * For request-based dm.
60 * One of these is allocated per request. 65 * One of these is allocated per request.
@@ -73,7 +78,7 @@ struct dm_rq_target_io {
73 */ 78 */
74struct dm_rq_clone_bio_info { 79struct dm_rq_clone_bio_info {
75 struct bio *orig; 80 struct bio *orig;
76 struct request *rq; 81 struct dm_rq_target_io *tio;
77}; 82};
78 83
79union map_info *dm_get_mapinfo(struct bio *bio) 84union map_info *dm_get_mapinfo(struct bio *bio)
@@ -83,6 +88,14 @@ union map_info *dm_get_mapinfo(struct bio *bio)
83 return NULL; 88 return NULL;
84} 89}
85 90
91union map_info *dm_get_rq_mapinfo(struct request *rq)
92{
93 if (rq && rq->end_io_data)
94 return &((struct dm_rq_target_io *)rq->end_io_data)->info;
95 return NULL;
96}
97EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
98
86#define MINOR_ALLOCED ((void *)-1) 99#define MINOR_ALLOCED ((void *)-1)
87 100
88/* 101/*
@@ -159,13 +172,31 @@ struct mapped_device {
159 * freeze/thaw support require holding onto a super block 172 * freeze/thaw support require holding onto a super block
160 */ 173 */
161 struct super_block *frozen_sb; 174 struct super_block *frozen_sb;
162 struct block_device *suspended_bdev; 175 struct block_device *bdev;
163 176
164 /* forced geometry settings */ 177 /* forced geometry settings */
165 struct hd_geometry geometry; 178 struct hd_geometry geometry;
166 179
180 /* marker of flush suspend for request-based dm */
181 struct request suspend_rq;
182
183 /* For saving the address of __make_request for request based dm */
184 make_request_fn *saved_make_request_fn;
185
167 /* sysfs handle */ 186 /* sysfs handle */
168 struct kobject kobj; 187 struct kobject kobj;
188
189 /* zero-length barrier that will be cloned and submitted to targets */
190 struct bio barrier_bio;
191};
192
193/*
194 * For mempools pre-allocation at the table loading time.
195 */
196struct dm_md_mempools {
197 mempool_t *io_pool;
198 mempool_t *tio_pool;
199 struct bio_set *bs;
169}; 200};
170 201
171#define MIN_IOS 256 202#define MIN_IOS 256
@@ -393,14 +424,29 @@ static void free_io(struct mapped_device *md, struct dm_io *io)
393 mempool_free(io, md->io_pool); 424 mempool_free(io, md->io_pool);
394} 425}
395 426
396static struct dm_target_io *alloc_tio(struct mapped_device *md) 427static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
428{
429 mempool_free(tio, md->tio_pool);
430}
431
432static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md)
397{ 433{
398 return mempool_alloc(md->tio_pool, GFP_NOIO); 434 return mempool_alloc(md->tio_pool, GFP_ATOMIC);
399} 435}
400 436
401static void free_tio(struct mapped_device *md, struct dm_target_io *tio) 437static void free_rq_tio(struct dm_rq_target_io *tio)
402{ 438{
403 mempool_free(tio, md->tio_pool); 439 mempool_free(tio, tio->md->tio_pool);
440}
441
442static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
443{
444 return mempool_alloc(md->io_pool, GFP_ATOMIC);
445}
446
447static void free_bio_info(struct dm_rq_clone_bio_info *info)
448{
449 mempool_free(info, info->tio->md->io_pool);
404} 450}
405 451
406static void start_io_acct(struct dm_io *io) 452static void start_io_acct(struct dm_io *io)
@@ -466,12 +512,13 @@ static void queue_io(struct mapped_device *md, struct bio *bio)
466struct dm_table *dm_get_table(struct mapped_device *md) 512struct dm_table *dm_get_table(struct mapped_device *md)
467{ 513{
468 struct dm_table *t; 514 struct dm_table *t;
515 unsigned long flags;
469 516
470 read_lock(&md->map_lock); 517 read_lock_irqsave(&md->map_lock, flags);
471 t = md->map; 518 t = md->map;
472 if (t) 519 if (t)
473 dm_table_get(t); 520 dm_table_get(t);
474 read_unlock(&md->map_lock); 521 read_unlock_irqrestore(&md->map_lock, flags);
475 522
476 return t; 523 return t;
477} 524}
@@ -538,9 +585,11 @@ static void dec_pending(struct dm_io *io, int error)
538 * Target requested pushing back the I/O. 585 * Target requested pushing back the I/O.
539 */ 586 */
540 spin_lock_irqsave(&md->deferred_lock, flags); 587 spin_lock_irqsave(&md->deferred_lock, flags);
541 if (__noflush_suspending(md)) 588 if (__noflush_suspending(md)) {
542 bio_list_add_head(&md->deferred, io->bio); 589 if (!bio_barrier(io->bio))
543 else 590 bio_list_add_head(&md->deferred,
591 io->bio);
592 } else
544 /* noflush suspend was interrupted. */ 593 /* noflush suspend was interrupted. */
545 io->error = -EIO; 594 io->error = -EIO;
546 spin_unlock_irqrestore(&md->deferred_lock, flags); 595 spin_unlock_irqrestore(&md->deferred_lock, flags);
@@ -555,7 +604,8 @@ static void dec_pending(struct dm_io *io, int error)
555 * a per-device variable for error reporting. 604 * a per-device variable for error reporting.
556 * Note that you can't touch the bio after end_io_acct 605 * Note that you can't touch the bio after end_io_acct
557 */ 606 */
558 md->barrier_error = io_error; 607 if (!md->barrier_error && io_error != -EOPNOTSUPP)
608 md->barrier_error = io_error;
559 end_io_acct(io); 609 end_io_acct(io);
560 } else { 610 } else {
561 end_io_acct(io); 611 end_io_acct(io);
@@ -609,6 +659,262 @@ static void clone_endio(struct bio *bio, int error)
609 dec_pending(io, error); 659 dec_pending(io, error);
610} 660}
611 661
662/*
663 * Partial completion handling for request-based dm
664 */
665static void end_clone_bio(struct bio *clone, int error)
666{
667 struct dm_rq_clone_bio_info *info = clone->bi_private;
668 struct dm_rq_target_io *tio = info->tio;
669 struct bio *bio = info->orig;
670 unsigned int nr_bytes = info->orig->bi_size;
671
672 bio_put(clone);
673
674 if (tio->error)
675 /*
676 * An error has already been detected on the request.
677 * Once error occurred, just let clone->end_io() handle
678 * the remainder.
679 */
680 return;
681 else if (error) {
682 /*
683 * Don't notice the error to the upper layer yet.
684 * The error handling decision is made by the target driver,
685 * when the request is completed.
686 */
687 tio->error = error;
688 return;
689 }
690
691 /*
692 * I/O for the bio successfully completed.
693 * Notice the data completion to the upper layer.
694 */
695
696 /*
697 * bios are processed from the head of the list.
698 * So the completing bio should always be rq->bio.
699 * If it's not, something wrong is happening.
700 */
701 if (tio->orig->bio != bio)
702 DMERR("bio completion is going in the middle of the request");
703
704 /*
705 * Update the original request.
706 * Do not use blk_end_request() here, because it may complete
707 * the original request before the clone, and break the ordering.
708 */
709 blk_update_request(tio->orig, 0, nr_bytes);
710}
711
712/*
713 * Don't touch any member of the md after calling this function because
714 * the md may be freed in dm_put() at the end of this function.
715 * Or do dm_get() before calling this function and dm_put() later.
716 */
717static void rq_completed(struct mapped_device *md, int run_queue)
718{
719 int wakeup_waiters = 0;
720 struct request_queue *q = md->queue;
721 unsigned long flags;
722
723 spin_lock_irqsave(q->queue_lock, flags);
724 if (!queue_in_flight(q))
725 wakeup_waiters = 1;
726 spin_unlock_irqrestore(q->queue_lock, flags);
727
728 /* nudge anyone waiting on suspend queue */
729 if (wakeup_waiters)
730 wake_up(&md->wait);
731
732 if (run_queue)
733 blk_run_queue(q);
734
735 /*
736 * dm_put() must be at the end of this function. See the comment above
737 */
738 dm_put(md);
739}
740
741static void dm_unprep_request(struct request *rq)
742{
743 struct request *clone = rq->special;
744 struct dm_rq_target_io *tio = clone->end_io_data;
745
746 rq->special = NULL;
747 rq->cmd_flags &= ~REQ_DONTPREP;
748
749 blk_rq_unprep_clone(clone);
750 free_rq_tio(tio);
751}
752
753/*
754 * Requeue the original request of a clone.
755 */
756void dm_requeue_unmapped_request(struct request *clone)
757{
758 struct dm_rq_target_io *tio = clone->end_io_data;
759 struct mapped_device *md = tio->md;
760 struct request *rq = tio->orig;
761 struct request_queue *q = rq->q;
762 unsigned long flags;
763
764 dm_unprep_request(rq);
765
766 spin_lock_irqsave(q->queue_lock, flags);
767 if (elv_queue_empty(q))
768 blk_plug_device(q);
769 blk_requeue_request(q, rq);
770 spin_unlock_irqrestore(q->queue_lock, flags);
771
772 rq_completed(md, 0);
773}
774EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
775
776static void __stop_queue(struct request_queue *q)
777{
778 blk_stop_queue(q);
779}
780
781static void stop_queue(struct request_queue *q)
782{
783 unsigned long flags;
784
785 spin_lock_irqsave(q->queue_lock, flags);
786 __stop_queue(q);
787 spin_unlock_irqrestore(q->queue_lock, flags);
788}
789
790static void __start_queue(struct request_queue *q)
791{
792 if (blk_queue_stopped(q))
793 blk_start_queue(q);
794}
795
796static void start_queue(struct request_queue *q)
797{
798 unsigned long flags;
799
800 spin_lock_irqsave(q->queue_lock, flags);
801 __start_queue(q);
802 spin_unlock_irqrestore(q->queue_lock, flags);
803}
804
805/*
806 * Complete the clone and the original request.
807 * Must be called without queue lock.
808 */
809static void dm_end_request(struct request *clone, int error)
810{
811 struct dm_rq_target_io *tio = clone->end_io_data;
812 struct mapped_device *md = tio->md;
813 struct request *rq = tio->orig;
814
815 if (blk_pc_request(rq)) {
816 rq->errors = clone->errors;
817 rq->resid_len = clone->resid_len;
818
819 if (rq->sense)
820 /*
821 * We are using the sense buffer of the original
822 * request.
823 * So setting the length of the sense data is enough.
824 */
825 rq->sense_len = clone->sense_len;
826 }
827
828 BUG_ON(clone->bio);
829 free_rq_tio(tio);
830
831 blk_end_request_all(rq, error);
832
833 rq_completed(md, 1);
834}
835
836/*
837 * Request completion handler for request-based dm
838 */
839static void dm_softirq_done(struct request *rq)
840{
841 struct request *clone = rq->completion_data;
842 struct dm_rq_target_io *tio = clone->end_io_data;
843 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
844 int error = tio->error;
845
846 if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io)
847 error = rq_end_io(tio->ti, clone, error, &tio->info);
848
849 if (error <= 0)
850 /* The target wants to complete the I/O */
851 dm_end_request(clone, error);
852 else if (error == DM_ENDIO_INCOMPLETE)
853 /* The target will handle the I/O */
854 return;
855 else if (error == DM_ENDIO_REQUEUE)
856 /* The target wants to requeue the I/O */
857 dm_requeue_unmapped_request(clone);
858 else {
859 DMWARN("unimplemented target endio return value: %d", error);
860 BUG();
861 }
862}
863
864/*
865 * Complete the clone and the original request with the error status
866 * through softirq context.
867 */
868static void dm_complete_request(struct request *clone, int error)
869{
870 struct dm_rq_target_io *tio = clone->end_io_data;
871 struct request *rq = tio->orig;
872
873 tio->error = error;
874 rq->completion_data = clone;
875 blk_complete_request(rq);
876}
877
878/*
879 * Complete the not-mapped clone and the original request with the error status
880 * through softirq context.
881 * Target's rq_end_io() function isn't called.
882 * This may be used when the target's map_rq() function fails.
883 */
884void dm_kill_unmapped_request(struct request *clone, int error)
885{
886 struct dm_rq_target_io *tio = clone->end_io_data;
887 struct request *rq = tio->orig;
888
889 rq->cmd_flags |= REQ_FAILED;
890 dm_complete_request(clone, error);
891}
892EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
893
894/*
895 * Called with the queue lock held
896 */
897static void end_clone_request(struct request *clone, int error)
898{
899 /*
900 * For just cleaning up the information of the queue in which
901 * the clone was dispatched.
902 * The clone is *NOT* freed actually here because it is alloced from
903 * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
904 */
905 __blk_put_request(clone->q, clone);
906
907 /*
908 * Actual request completion is done in a softirq context which doesn't
909 * hold the queue lock. Otherwise, deadlock could occur because:
910 * - another request may be submitted by the upper level driver
911 * of the stacking during the completion
912 * - the submission which requires queue lock may be done
913 * against this queue
914 */
915 dm_complete_request(clone, error);
916}
917
612static sector_t max_io_len(struct mapped_device *md, 918static sector_t max_io_len(struct mapped_device *md,
613 sector_t sector, struct dm_target *ti) 919 sector_t sector, struct dm_target *ti)
614{ 920{
@@ -636,11 +942,6 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
636 sector_t sector; 942 sector_t sector;
637 struct mapped_device *md; 943 struct mapped_device *md;
638 944
639 /*
640 * Sanity checks.
641 */
642 BUG_ON(!clone->bi_size);
643
644 clone->bi_end_io = clone_endio; 945 clone->bi_end_io = clone_endio;
645 clone->bi_private = tio; 946 clone->bi_private = tio;
646 947
@@ -656,8 +957,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
656 /* the bio has been remapped so dispatch it */ 957 /* the bio has been remapped so dispatch it */
657 958
658 trace_block_remap(bdev_get_queue(clone->bi_bdev), clone, 959 trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
659 tio->io->bio->bi_bdev->bd_dev, 960 tio->io->bio->bi_bdev->bd_dev, sector);
660 clone->bi_sector, sector);
661 961
662 generic_make_request(clone); 962 generic_make_request(clone);
663 } else if (r < 0 || r == DM_MAPIO_REQUEUE) { 963 } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
@@ -755,6 +1055,48 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
755 return clone; 1055 return clone;
756} 1056}
757 1057
1058static struct dm_target_io *alloc_tio(struct clone_info *ci,
1059 struct dm_target *ti)
1060{
1061 struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);
1062
1063 tio->io = ci->io;
1064 tio->ti = ti;
1065 memset(&tio->info, 0, sizeof(tio->info));
1066
1067 return tio;
1068}
1069
1070static void __flush_target(struct clone_info *ci, struct dm_target *ti,
1071 unsigned flush_nr)
1072{
1073 struct dm_target_io *tio = alloc_tio(ci, ti);
1074 struct bio *clone;
1075
1076 tio->info.flush_request = flush_nr;
1077
1078 clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
1079 __bio_clone(clone, ci->bio);
1080 clone->bi_destructor = dm_bio_destructor;
1081
1082 __map_bio(ti, clone, tio);
1083}
1084
1085static int __clone_and_map_empty_barrier(struct clone_info *ci)
1086{
1087 unsigned target_nr = 0, flush_nr;
1088 struct dm_target *ti;
1089
1090 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1091 for (flush_nr = 0; flush_nr < ti->num_flush_requests;
1092 flush_nr++)
1093 __flush_target(ci, ti, flush_nr);
1094
1095 ci->sector_count = 0;
1096
1097 return 0;
1098}
1099
758static int __clone_and_map(struct clone_info *ci) 1100static int __clone_and_map(struct clone_info *ci)
759{ 1101{
760 struct bio *clone, *bio = ci->bio; 1102 struct bio *clone, *bio = ci->bio;
@@ -762,6 +1104,9 @@ static int __clone_and_map(struct clone_info *ci)
762 sector_t len = 0, max; 1104 sector_t len = 0, max;
763 struct dm_target_io *tio; 1105 struct dm_target_io *tio;
764 1106
1107 if (unlikely(bio_empty_barrier(bio)))
1108 return __clone_and_map_empty_barrier(ci);
1109
765 ti = dm_table_find_target(ci->map, ci->sector); 1110 ti = dm_table_find_target(ci->map, ci->sector);
766 if (!dm_target_is_valid(ti)) 1111 if (!dm_target_is_valid(ti))
767 return -EIO; 1112 return -EIO;
@@ -771,10 +1116,7 @@ static int __clone_and_map(struct clone_info *ci)
771 /* 1116 /*
772 * Allocate a target io object. 1117 * Allocate a target io object.
773 */ 1118 */
774 tio = alloc_tio(ci->md); 1119 tio = alloc_tio(ci, ti);
775 tio->io = ci->io;
776 tio->ti = ti;
777 memset(&tio->info, 0, sizeof(tio->info));
778 1120
779 if (ci->sector_count <= max) { 1121 if (ci->sector_count <= max) {
780 /* 1122 /*
@@ -830,10 +1172,7 @@ static int __clone_and_map(struct clone_info *ci)
830 1172
831 max = max_io_len(ci->md, ci->sector, ti); 1173 max = max_io_len(ci->md, ci->sector, ti);
832 1174
833 tio = alloc_tio(ci->md); 1175 tio = alloc_tio(ci, ti);
834 tio->io = ci->io;
835 tio->ti = ti;
836 memset(&tio->info, 0, sizeof(tio->info));
837 } 1176 }
838 1177
839 len = min(remaining, max); 1178 len = min(remaining, max);
@@ -868,7 +1207,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
868 if (!bio_barrier(bio)) 1207 if (!bio_barrier(bio))
869 bio_io_error(bio); 1208 bio_io_error(bio);
870 else 1209 else
871 md->barrier_error = -EIO; 1210 if (!md->barrier_error)
1211 md->barrier_error = -EIO;
872 return; 1212 return;
873 } 1213 }
874 1214
@@ -881,6 +1221,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
881 ci.io->md = md; 1221 ci.io->md = md;
882 ci.sector = bio->bi_sector; 1222 ci.sector = bio->bi_sector;
883 ci.sector_count = bio_sectors(bio); 1223 ci.sector_count = bio_sectors(bio);
1224 if (unlikely(bio_empty_barrier(bio)))
1225 ci.sector_count = 1;
884 ci.idx = bio->bi_idx; 1226 ci.idx = bio->bi_idx;
885 1227
886 start_io_acct(ci.io); 1228 start_io_acct(ci.io);
@@ -928,6 +1270,16 @@ static int dm_merge_bvec(struct request_queue *q,
928 */ 1270 */
929 if (max_size && ti->type->merge) 1271 if (max_size && ti->type->merge)
930 max_size = ti->type->merge(ti, bvm, biovec, max_size); 1272 max_size = ti->type->merge(ti, bvm, biovec, max_size);
1273 /*
1274 * If the target doesn't support merge method and some of the devices
1275 * provided their merge_bvec method (we know this by looking at
1276 * queue_max_hw_sectors), then we can't allow bios with multiple vector
1277 * entries. So always set max_size to 0, and the code below allows
1278 * just one page.
1279 */
1280 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
1281
1282 max_size = 0;
931 1283
932out_table: 1284out_table:
933 dm_table_put(map); 1285 dm_table_put(map);
@@ -946,7 +1298,7 @@ out:
946 * The request function that just remaps the bio built up by 1298 * The request function that just remaps the bio built up by
947 * dm_merge_bvec. 1299 * dm_merge_bvec.
948 */ 1300 */
949static int dm_request(struct request_queue *q, struct bio *bio) 1301static int _dm_request(struct request_queue *q, struct bio *bio)
950{ 1302{
951 int rw = bio_data_dir(bio); 1303 int rw = bio_data_dir(bio);
952 struct mapped_device *md = q->queuedata; 1304 struct mapped_device *md = q->queuedata;
@@ -983,12 +1335,274 @@ static int dm_request(struct request_queue *q, struct bio *bio)
983 return 0; 1335 return 0;
984} 1336}
985 1337
1338static int dm_make_request(struct request_queue *q, struct bio *bio)
1339{
1340 struct mapped_device *md = q->queuedata;
1341
1342 if (unlikely(bio_barrier(bio))) {
1343 bio_endio(bio, -EOPNOTSUPP);
1344 return 0;
1345 }
1346
1347 return md->saved_make_request_fn(q, bio); /* call __make_request() */
1348}
1349
1350static int dm_request_based(struct mapped_device *md)
1351{
1352 return blk_queue_stackable(md->queue);
1353}
1354
1355static int dm_request(struct request_queue *q, struct bio *bio)
1356{
1357 struct mapped_device *md = q->queuedata;
1358
1359 if (dm_request_based(md))
1360 return dm_make_request(q, bio);
1361
1362 return _dm_request(q, bio);
1363}
1364
1365void dm_dispatch_request(struct request *rq)
1366{
1367 int r;
1368
1369 if (blk_queue_io_stat(rq->q))
1370 rq->cmd_flags |= REQ_IO_STAT;
1371
1372 rq->start_time = jiffies;
1373 r = blk_insert_cloned_request(rq->q, rq);
1374 if (r)
1375 dm_complete_request(rq, r);
1376}
1377EXPORT_SYMBOL_GPL(dm_dispatch_request);
1378
1379static void dm_rq_bio_destructor(struct bio *bio)
1380{
1381 struct dm_rq_clone_bio_info *info = bio->bi_private;
1382 struct mapped_device *md = info->tio->md;
1383
1384 free_bio_info(info);
1385 bio_free(bio, md->bs);
1386}
1387
1388static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1389 void *data)
1390{
1391 struct dm_rq_target_io *tio = data;
1392 struct mapped_device *md = tio->md;
1393 struct dm_rq_clone_bio_info *info = alloc_bio_info(md);
1394
1395 if (!info)
1396 return -ENOMEM;
1397
1398 info->orig = bio_orig;
1399 info->tio = tio;
1400 bio->bi_end_io = end_clone_bio;
1401 bio->bi_private = info;
1402 bio->bi_destructor = dm_rq_bio_destructor;
1403
1404 return 0;
1405}
1406
1407static int setup_clone(struct request *clone, struct request *rq,
1408 struct dm_rq_target_io *tio)
1409{
1410 int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
1411 dm_rq_bio_constructor, tio);
1412
1413 if (r)
1414 return r;
1415
1416 clone->cmd = rq->cmd;
1417 clone->cmd_len = rq->cmd_len;
1418 clone->sense = rq->sense;
1419 clone->buffer = rq->buffer;
1420 clone->end_io = end_clone_request;
1421 clone->end_io_data = tio;
1422
1423 return 0;
1424}
1425
1426static int dm_rq_flush_suspending(struct mapped_device *md)
1427{
1428 return !md->suspend_rq.special;
1429}
1430
1431/*
1432 * Called with the queue lock held.
1433 */
1434static int dm_prep_fn(struct request_queue *q, struct request *rq)
1435{
1436 struct mapped_device *md = q->queuedata;
1437 struct dm_rq_target_io *tio;
1438 struct request *clone;
1439
1440 if (unlikely(rq == &md->suspend_rq)) {
1441 if (dm_rq_flush_suspending(md))
1442 return BLKPREP_OK;
1443 else
1444 /* The flush suspend was interrupted */
1445 return BLKPREP_KILL;
1446 }
1447
1448 if (unlikely(rq->special)) {
1449 DMWARN("Already has something in rq->special.");
1450 return BLKPREP_KILL;
1451 }
1452
1453 tio = alloc_rq_tio(md); /* Only one for each original request */
1454 if (!tio)
1455 /* -ENOMEM */
1456 return BLKPREP_DEFER;
1457
1458 tio->md = md;
1459 tio->ti = NULL;
1460 tio->orig = rq;
1461 tio->error = 0;
1462 memset(&tio->info, 0, sizeof(tio->info));
1463
1464 clone = &tio->clone;
1465 if (setup_clone(clone, rq, tio)) {
1466 /* -ENOMEM */
1467 free_rq_tio(tio);
1468 return BLKPREP_DEFER;
1469 }
1470
1471 rq->special = clone;
1472 rq->cmd_flags |= REQ_DONTPREP;
1473
1474 return BLKPREP_OK;
1475}
1476
1477static void map_request(struct dm_target *ti, struct request *rq,
1478 struct mapped_device *md)
1479{
1480 int r;
1481 struct request *clone = rq->special;
1482 struct dm_rq_target_io *tio = clone->end_io_data;
1483
1484 /*
1485 * Hold the md reference here for the in-flight I/O.
1486 * We can't rely on the reference count by device opener,
1487 * because the device may be closed during the request completion
1488 * when all bios are completed.
1489 * See the comment in rq_completed() too.
1490 */
1491 dm_get(md);
1492
1493 tio->ti = ti;
1494 r = ti->type->map_rq(ti, clone, &tio->info);
1495 switch (r) {
1496 case DM_MAPIO_SUBMITTED:
1497 /* The target has taken the I/O to submit by itself later */
1498 break;
1499 case DM_MAPIO_REMAPPED:
1500 /* The target has remapped the I/O so dispatch it */
1501 dm_dispatch_request(clone);
1502 break;
1503 case DM_MAPIO_REQUEUE:
1504 /* The target wants to requeue the I/O */
1505 dm_requeue_unmapped_request(clone);
1506 break;
1507 default:
1508 if (r > 0) {
1509 DMWARN("unimplemented target map return value: %d", r);
1510 BUG();
1511 }
1512
1513 /* The target wants to complete the I/O */
1514 dm_kill_unmapped_request(clone, r);
1515 break;
1516 }
1517}
1518
1519/*
1520 * q->request_fn for request-based dm.
1521 * Called with the queue lock held.
1522 */
1523static void dm_request_fn(struct request_queue *q)
1524{
1525 struct mapped_device *md = q->queuedata;
1526 struct dm_table *map = dm_get_table(md);
1527 struct dm_target *ti;
1528 struct request *rq;
1529
1530 /*
1531 * For noflush suspend, check blk_queue_stopped() to immediately
1532 * quit I/O dispatching.
1533 */
1534 while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
1535 rq = blk_peek_request(q);
1536 if (!rq)
1537 goto plug_and_out;
1538
1539 if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend maker */
1540 if (queue_in_flight(q))
1541 /* Not quiet yet. Wait more */
1542 goto plug_and_out;
1543
1544 /* This device should be quiet now */
1545 __stop_queue(q);
1546 blk_start_request(rq);
1547 __blk_end_request_all(rq, 0);
1548 wake_up(&md->wait);
1549 goto out;
1550 }
1551
1552 ti = dm_table_find_target(map, blk_rq_pos(rq));
1553 if (ti->type->busy && ti->type->busy(ti))
1554 goto plug_and_out;
1555
1556 blk_start_request(rq);
1557 spin_unlock(q->queue_lock);
1558 map_request(ti, rq, md);
1559 spin_lock_irq(q->queue_lock);
1560 }
1561
1562 goto out;
1563
1564plug_and_out:
1565 if (!elv_queue_empty(q))
1566 /* Some requests still remain, retry later */
1567 blk_plug_device(q);
1568
1569out:
1570 dm_table_put(map);
1571
1572 return;
1573}
1574
1575int dm_underlying_device_busy(struct request_queue *q)
1576{
1577 return blk_lld_busy(q);
1578}
1579EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
1580
1581static int dm_lld_busy(struct request_queue *q)
1582{
1583 int r;
1584 struct mapped_device *md = q->queuedata;
1585 struct dm_table *map = dm_get_table(md);
1586
1587 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
1588 r = 1;
1589 else
1590 r = dm_table_any_busy_target(map);
1591
1592 dm_table_put(map);
1593
1594 return r;
1595}
1596
986static void dm_unplug_all(struct request_queue *q) 1597static void dm_unplug_all(struct request_queue *q)
987{ 1598{
988 struct mapped_device *md = q->queuedata; 1599 struct mapped_device *md = q->queuedata;
989 struct dm_table *map = dm_get_table(md); 1600 struct dm_table *map = dm_get_table(md);
990 1601
991 if (map) { 1602 if (map) {
1603 if (dm_request_based(md))
1604 generic_unplug_device(q);
1605
992 dm_table_unplug_all(map); 1606 dm_table_unplug_all(map);
993 dm_table_put(map); 1607 dm_table_put(map);
994 } 1608 }
@@ -1003,7 +1617,16 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
1003 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 1617 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1004 map = dm_get_table(md); 1618 map = dm_get_table(md);
1005 if (map) { 1619 if (map) {
1006 r = dm_table_any_congested(map, bdi_bits); 1620 /*
1621 * Request-based dm cares about only own queue for
1622 * the query about congestion status of request_queue
1623 */
1624 if (dm_request_based(md))
1625 r = md->queue->backing_dev_info.state &
1626 bdi_bits;
1627 else
1628 r = dm_table_any_congested(map, bdi_bits);
1629
1007 dm_table_put(map); 1630 dm_table_put(map);
1008 } 1631 }
1009 } 1632 }
@@ -1126,30 +1749,32 @@ static struct mapped_device *alloc_dev(int minor)
1126 INIT_LIST_HEAD(&md->uevent_list); 1749 INIT_LIST_HEAD(&md->uevent_list);
1127 spin_lock_init(&md->uevent_lock); 1750 spin_lock_init(&md->uevent_lock);
1128 1751
1129 md->queue = blk_alloc_queue(GFP_KERNEL); 1752 md->queue = blk_init_queue(dm_request_fn, NULL);
1130 if (!md->queue) 1753 if (!md->queue)
1131 goto bad_queue; 1754 goto bad_queue;
1132 1755
1756 /*
1757 * Request-based dm devices cannot be stacked on top of bio-based dm
1758 * devices. The type of this dm device has not been decided yet,
1759 * although we initialized the queue using blk_init_queue().
1760 * The type is decided at the first table loading time.
1761 * To prevent problematic device stacking, clear the queue flag
1762 * for request stacking support until then.
1763 *
1764 * This queue is new, so no concurrency on the queue_flags.
1765 */
1766 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
1767 md->saved_make_request_fn = md->queue->make_request_fn;
1133 md->queue->queuedata = md; 1768 md->queue->queuedata = md;
1134 md->queue->backing_dev_info.congested_fn = dm_any_congested; 1769 md->queue->backing_dev_info.congested_fn = dm_any_congested;
1135 md->queue->backing_dev_info.congested_data = md; 1770 md->queue->backing_dev_info.congested_data = md;
1136 blk_queue_make_request(md->queue, dm_request); 1771 blk_queue_make_request(md->queue, dm_request);
1137 blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL);
1138 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 1772 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1139 md->queue->unplug_fn = dm_unplug_all; 1773 md->queue->unplug_fn = dm_unplug_all;
1140 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 1774 blk_queue_merge_bvec(md->queue, dm_merge_bvec);
1141 1775 blk_queue_softirq_done(md->queue, dm_softirq_done);
1142 md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); 1776 blk_queue_prep_rq(md->queue, dm_prep_fn);
1143 if (!md->io_pool) 1777 blk_queue_lld_busy(md->queue, dm_lld_busy);
1144 goto bad_io_pool;
1145
1146 md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
1147 if (!md->tio_pool)
1148 goto bad_tio_pool;
1149
1150 md->bs = bioset_create(16, 0);
1151 if (!md->bs)
1152 goto bad_no_bioset;
1153 1778
1154 md->disk = alloc_disk(1); 1779 md->disk = alloc_disk(1);
1155 if (!md->disk) 1780 if (!md->disk)
@@ -1173,6 +1798,10 @@ static struct mapped_device *alloc_dev(int minor)
1173 if (!md->wq) 1798 if (!md->wq)
1174 goto bad_thread; 1799 goto bad_thread;
1175 1800
1801 md->bdev = bdget_disk(md->disk, 0);
1802 if (!md->bdev)
1803 goto bad_bdev;
1804
1176 /* Populate the mapping, nobody knows we exist yet */ 1805 /* Populate the mapping, nobody knows we exist yet */
1177 spin_lock(&_minor_lock); 1806 spin_lock(&_minor_lock);
1178 old_md = idr_replace(&_minor_idr, md, minor); 1807 old_md = idr_replace(&_minor_idr, md, minor);
@@ -1182,15 +1811,11 @@ static struct mapped_device *alloc_dev(int minor)
1182 1811
1183 return md; 1812 return md;
1184 1813
1814bad_bdev:
1815 destroy_workqueue(md->wq);
1185bad_thread: 1816bad_thread:
1186 put_disk(md->disk); 1817 put_disk(md->disk);
1187bad_disk: 1818bad_disk:
1188 bioset_free(md->bs);
1189bad_no_bioset:
1190 mempool_destroy(md->tio_pool);
1191bad_tio_pool:
1192 mempool_destroy(md->io_pool);
1193bad_io_pool:
1194 blk_cleanup_queue(md->queue); 1819 blk_cleanup_queue(md->queue);
1195bad_queue: 1820bad_queue:
1196 free_minor(minor); 1821 free_minor(minor);
@@ -1207,14 +1832,15 @@ static void free_dev(struct mapped_device *md)
1207{ 1832{
1208 int minor = MINOR(disk_devt(md->disk)); 1833 int minor = MINOR(disk_devt(md->disk));
1209 1834
1210 if (md->suspended_bdev) { 1835 unlock_fs(md);
1211 unlock_fs(md); 1836 bdput(md->bdev);
1212 bdput(md->suspended_bdev);
1213 }
1214 destroy_workqueue(md->wq); 1837 destroy_workqueue(md->wq);
1215 mempool_destroy(md->tio_pool); 1838 if (md->tio_pool)
1216 mempool_destroy(md->io_pool); 1839 mempool_destroy(md->tio_pool);
1217 bioset_free(md->bs); 1840 if (md->io_pool)
1841 mempool_destroy(md->io_pool);
1842 if (md->bs)
1843 bioset_free(md->bs);
1218 blk_integrity_unregister(md->disk); 1844 blk_integrity_unregister(md->disk);
1219 del_gendisk(md->disk); 1845 del_gendisk(md->disk);
1220 free_minor(minor); 1846 free_minor(minor);
@@ -1229,6 +1855,29 @@ static void free_dev(struct mapped_device *md)
1229 kfree(md); 1855 kfree(md);
1230} 1856}
1231 1857
1858static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
1859{
1860 struct dm_md_mempools *p;
1861
1862 if (md->io_pool && md->tio_pool && md->bs)
1863 /* the md already has necessary mempools */
1864 goto out;
1865
1866 p = dm_table_get_md_mempools(t);
1867 BUG_ON(!p || md->io_pool || md->tio_pool || md->bs);
1868
1869 md->io_pool = p->io_pool;
1870 p->io_pool = NULL;
1871 md->tio_pool = p->tio_pool;
1872 p->tio_pool = NULL;
1873 md->bs = p->bs;
1874 p->bs = NULL;
1875
1876out:
1877 /* mempool bind completed, now no need any mempools in the table */
1878 dm_table_free_md_mempools(t);
1879}
1880
1232/* 1881/*
1233 * Bind a table to the device. 1882 * Bind a table to the device.
1234 */ 1883 */
@@ -1252,15 +1901,17 @@ static void __set_size(struct mapped_device *md, sector_t size)
1252{ 1901{
1253 set_capacity(md->disk, size); 1902 set_capacity(md->disk, size);
1254 1903
1255 mutex_lock(&md->suspended_bdev->bd_inode->i_mutex); 1904 mutex_lock(&md->bdev->bd_inode->i_mutex);
1256 i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 1905 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
1257 mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex); 1906 mutex_unlock(&md->bdev->bd_inode->i_mutex);
1258} 1907}
1259 1908
1260static int __bind(struct mapped_device *md, struct dm_table *t) 1909static int __bind(struct mapped_device *md, struct dm_table *t,
1910 struct queue_limits *limits)
1261{ 1911{
1262 struct request_queue *q = md->queue; 1912 struct request_queue *q = md->queue;
1263 sector_t size; 1913 sector_t size;
1914 unsigned long flags;
1264 1915
1265 size = dm_table_get_size(t); 1916 size = dm_table_get_size(t);
1266 1917
@@ -1270,8 +1921,7 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
1270 if (size != get_capacity(md->disk)) 1921 if (size != get_capacity(md->disk))
1271 memset(&md->geometry, 0, sizeof(md->geometry)); 1922 memset(&md->geometry, 0, sizeof(md->geometry));
1272 1923
1273 if (md->suspended_bdev) 1924 __set_size(md, size);
1274 __set_size(md, size);
1275 1925
1276 if (!size) { 1926 if (!size) {
1277 dm_table_destroy(t); 1927 dm_table_destroy(t);
@@ -1280,10 +1930,22 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
1280 1930
1281 dm_table_event_callback(t, event_callback, md); 1931 dm_table_event_callback(t, event_callback, md);
1282 1932
1283 write_lock(&md->map_lock); 1933 /*
1934 * The queue hasn't been stopped yet, if the old table type wasn't
1935 * for request-based during suspension. So stop it to prevent
1936 * I/O mapping before resume.
1937 * This must be done before setting the queue restrictions,
1938 * because request-based dm may be run just after the setting.
1939 */
1940 if (dm_table_request_based(t) && !blk_queue_stopped(q))
1941 stop_queue(q);
1942
1943 __bind_mempools(md, t);
1944
1945 write_lock_irqsave(&md->map_lock, flags);
1284 md->map = t; 1946 md->map = t;
1285 dm_table_set_restrictions(t, q); 1947 dm_table_set_restrictions(t, q, limits);
1286 write_unlock(&md->map_lock); 1948 write_unlock_irqrestore(&md->map_lock, flags);
1287 1949
1288 return 0; 1950 return 0;
1289} 1951}
@@ -1291,14 +1953,15 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
1291static void __unbind(struct mapped_device *md) 1953static void __unbind(struct mapped_device *md)
1292{ 1954{
1293 struct dm_table *map = md->map; 1955 struct dm_table *map = md->map;
1956 unsigned long flags;
1294 1957
1295 if (!map) 1958 if (!map)
1296 return; 1959 return;
1297 1960
1298 dm_table_event_callback(map, NULL, NULL); 1961 dm_table_event_callback(map, NULL, NULL);
1299 write_lock(&md->map_lock); 1962 write_lock_irqsave(&md->map_lock, flags);
1300 md->map = NULL; 1963 md->map = NULL;
1301 write_unlock(&md->map_lock); 1964 write_unlock_irqrestore(&md->map_lock, flags);
1302 dm_table_destroy(map); 1965 dm_table_destroy(map);
1303} 1966}
1304 1967
@@ -1402,6 +2065,8 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
1402{ 2065{
1403 int r = 0; 2066 int r = 0;
1404 DECLARE_WAITQUEUE(wait, current); 2067 DECLARE_WAITQUEUE(wait, current);
2068 struct request_queue *q = md->queue;
2069 unsigned long flags;
1405 2070
1406 dm_unplug_all(md->queue); 2071 dm_unplug_all(md->queue);
1407 2072
@@ -1411,7 +2076,14 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
1411 set_current_state(interruptible); 2076 set_current_state(interruptible);
1412 2077
1413 smp_mb(); 2078 smp_mb();
1414 if (!atomic_read(&md->pending)) 2079 if (dm_request_based(md)) {
2080 spin_lock_irqsave(q->queue_lock, flags);
2081 if (!queue_in_flight(q) && blk_queue_stopped(q)) {
2082 spin_unlock_irqrestore(q->queue_lock, flags);
2083 break;
2084 }
2085 spin_unlock_irqrestore(q->queue_lock, flags);
2086 } else if (!atomic_read(&md->pending))
1415 break; 2087 break;
1416 2088
1417 if (interruptible == TASK_INTERRUPTIBLE && 2089 if (interruptible == TASK_INTERRUPTIBLE &&
@@ -1429,34 +2101,36 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
1429 return r; 2101 return r;
1430} 2102}
1431 2103
1432static int dm_flush(struct mapped_device *md) 2104static void dm_flush(struct mapped_device *md)
1433{ 2105{
1434 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 2106 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
1435 return 0; 2107
2108 bio_init(&md->barrier_bio);
2109 md->barrier_bio.bi_bdev = md->bdev;
2110 md->barrier_bio.bi_rw = WRITE_BARRIER;
2111 __split_and_process_bio(md, &md->barrier_bio);
2112
2113 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
1436} 2114}
1437 2115
1438static void process_barrier(struct mapped_device *md, struct bio *bio) 2116static void process_barrier(struct mapped_device *md, struct bio *bio)
1439{ 2117{
1440 int error = dm_flush(md); 2118 md->barrier_error = 0;
1441 2119
1442 if (unlikely(error)) { 2120 dm_flush(md);
1443 bio_endio(bio, error);
1444 return;
1445 }
1446 if (bio_empty_barrier(bio)) {
1447 bio_endio(bio, 0);
1448 return;
1449 }
1450
1451 __split_and_process_bio(md, bio);
1452 2121
1453 error = dm_flush(md); 2122 if (!bio_empty_barrier(bio)) {
1454 2123 __split_and_process_bio(md, bio);
1455 if (!error && md->barrier_error) 2124 dm_flush(md);
1456 error = md->barrier_error; 2125 }
1457 2126
1458 if (md->barrier_error != DM_ENDIO_REQUEUE) 2127 if (md->barrier_error != DM_ENDIO_REQUEUE)
1459 bio_endio(bio, error); 2128 bio_endio(bio, md->barrier_error);
2129 else {
2130 spin_lock_irq(&md->deferred_lock);
2131 bio_list_add_head(&md->deferred, bio);
2132 spin_unlock_irq(&md->deferred_lock);
2133 }
1460} 2134}
1461 2135
1462/* 2136/*
@@ -1482,10 +2156,14 @@ static void dm_wq_work(struct work_struct *work)
1482 2156
1483 up_write(&md->io_lock); 2157 up_write(&md->io_lock);
1484 2158
1485 if (bio_barrier(c)) 2159 if (dm_request_based(md))
1486 process_barrier(md, c); 2160 generic_make_request(c);
1487 else 2161 else {
1488 __split_and_process_bio(md, c); 2162 if (bio_barrier(c))
2163 process_barrier(md, c);
2164 else
2165 __split_and_process_bio(md, c);
2166 }
1489 2167
1490 down_write(&md->io_lock); 2168 down_write(&md->io_lock);
1491 } 2169 }
@@ -1505,6 +2183,7 @@ static void dm_queue_flush(struct mapped_device *md)
1505 */ 2183 */
1506int dm_swap_table(struct mapped_device *md, struct dm_table *table) 2184int dm_swap_table(struct mapped_device *md, struct dm_table *table)
1507{ 2185{
2186 struct queue_limits limits;
1508 int r = -EINVAL; 2187 int r = -EINVAL;
1509 2188
1510 mutex_lock(&md->suspend_lock); 2189 mutex_lock(&md->suspend_lock);
@@ -1513,19 +2192,96 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
1513 if (!dm_suspended(md)) 2192 if (!dm_suspended(md))
1514 goto out; 2193 goto out;
1515 2194
1516 /* without bdev, the device size cannot be changed */ 2195 r = dm_calculate_queue_limits(table, &limits);
1517 if (!md->suspended_bdev) 2196 if (r)
1518 if (get_capacity(md->disk) != dm_table_get_size(table)) 2197 goto out;
1519 goto out; 2198
2199 /* cannot change the device type, once a table is bound */
2200 if (md->map &&
2201 (dm_table_get_type(md->map) != dm_table_get_type(table))) {
2202 DMWARN("can't change the device type after a table is bound");
2203 goto out;
2204 }
2205
2206 /*
2207 * It is enought that blk_queue_ordered() is called only once when
2208 * the first bio-based table is bound.
2209 *
2210 * This setting should be moved to alloc_dev() when request-based dm
2211 * supports barrier.
2212 */
2213 if (!md->map && dm_table_bio_based(table))
2214 blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL);
1520 2215
1521 __unbind(md); 2216 __unbind(md);
1522 r = __bind(md, table); 2217 r = __bind(md, table, &limits);
1523 2218
1524out: 2219out:
1525 mutex_unlock(&md->suspend_lock); 2220 mutex_unlock(&md->suspend_lock);
1526 return r; 2221 return r;
1527} 2222}
1528 2223
2224static void dm_rq_invalidate_suspend_marker(struct mapped_device *md)
2225{
2226 md->suspend_rq.special = (void *)0x1;
2227}
2228
2229static void dm_rq_abort_suspend(struct mapped_device *md, int noflush)
2230{
2231 struct request_queue *q = md->queue;
2232 unsigned long flags;
2233
2234 spin_lock_irqsave(q->queue_lock, flags);
2235 if (!noflush)
2236 dm_rq_invalidate_suspend_marker(md);
2237 __start_queue(q);
2238 spin_unlock_irqrestore(q->queue_lock, flags);
2239}
2240
2241static void dm_rq_start_suspend(struct mapped_device *md, int noflush)
2242{
2243 struct request *rq = &md->suspend_rq;
2244 struct request_queue *q = md->queue;
2245
2246 if (noflush)
2247 stop_queue(q);
2248 else {
2249 blk_rq_init(q, rq);
2250 blk_insert_request(q, rq, 0, NULL);
2251 }
2252}
2253
2254static int dm_rq_suspend_available(struct mapped_device *md, int noflush)
2255{
2256 int r = 1;
2257 struct request *rq = &md->suspend_rq;
2258 struct request_queue *q = md->queue;
2259 unsigned long flags;
2260
2261 if (noflush)
2262 return r;
2263
2264 /* The marker must be protected by queue lock if it is in use */
2265 spin_lock_irqsave(q->queue_lock, flags);
2266 if (unlikely(rq->ref_count)) {
2267 /*
2268 * This can happen, when the previous flush suspend was
2269 * interrupted, the marker is still in the queue and
2270 * this flush suspend has been invoked, because we don't
2271 * remove the marker at the time of suspend interruption.
2272 * We have only one marker per mapped_device, so we can't
2273 * start another flush suspend while it is in use.
2274 */
2275 BUG_ON(!rq->special); /* The marker should be invalidated */
2276 DMWARN("Invalidating the previous flush suspend is still in"
2277 " progress. Please retry later.");
2278 r = 0;
2279 }
2280 spin_unlock_irqrestore(q->queue_lock, flags);
2281
2282 return r;
2283}
2284
1529/* 2285/*
1530 * Functions to lock and unlock any filesystem running on the 2286 * Functions to lock and unlock any filesystem running on the
1531 * device. 2287 * device.
@@ -1536,7 +2292,7 @@ static int lock_fs(struct mapped_device *md)
1536 2292
1537 WARN_ON(md->frozen_sb); 2293 WARN_ON(md->frozen_sb);
1538 2294
1539 md->frozen_sb = freeze_bdev(md->suspended_bdev); 2295 md->frozen_sb = freeze_bdev(md->bdev);
1540 if (IS_ERR(md->frozen_sb)) { 2296 if (IS_ERR(md->frozen_sb)) {
1541 r = PTR_ERR(md->frozen_sb); 2297 r = PTR_ERR(md->frozen_sb);
1542 md->frozen_sb = NULL; 2298 md->frozen_sb = NULL;
@@ -1545,9 +2301,6 @@ static int lock_fs(struct mapped_device *md)
1545 2301
1546 set_bit(DMF_FROZEN, &md->flags); 2302 set_bit(DMF_FROZEN, &md->flags);
1547 2303
1548 /* don't bdput right now, we don't want the bdev
1549 * to go away while it is locked.
1550 */
1551 return 0; 2304 return 0;
1552} 2305}
1553 2306
@@ -1556,7 +2309,7 @@ static void unlock_fs(struct mapped_device *md)
1556 if (!test_bit(DMF_FROZEN, &md->flags)) 2309 if (!test_bit(DMF_FROZEN, &md->flags))
1557 return; 2310 return;
1558 2311
1559 thaw_bdev(md->suspended_bdev, md->frozen_sb); 2312 thaw_bdev(md->bdev, md->frozen_sb);
1560 md->frozen_sb = NULL; 2313 md->frozen_sb = NULL;
1561 clear_bit(DMF_FROZEN, &md->flags); 2314 clear_bit(DMF_FROZEN, &md->flags);
1562} 2315}
@@ -1568,6 +2321,53 @@ static void unlock_fs(struct mapped_device *md)
1568 * dm_bind_table, dm_suspend must be called to flush any in 2321 * dm_bind_table, dm_suspend must be called to flush any in
1569 * flight bios and ensure that any further io gets deferred. 2322 * flight bios and ensure that any further io gets deferred.
1570 */ 2323 */
2324/*
2325 * Suspend mechanism in request-based dm.
2326 *
2327 * After the suspend starts, further incoming requests are kept in
2328 * the request_queue and deferred.
2329 * Remaining requests in the request_queue at the start of suspend are flushed
2330 * if it is flush suspend.
2331 * The suspend completes when the following conditions have been satisfied,
2332 * so wait for it:
2333 * 1. q->in_flight is 0 (which means no in_flight request)
2334 * 2. queue has been stopped (which means no request dispatching)
2335 *
2336 *
2337 * Noflush suspend
2338 * ---------------
2339 * Noflush suspend doesn't need to dispatch remaining requests.
2340 * So stop the queue immediately. Then, wait for all in_flight requests
2341 * to be completed or requeued.
2342 *
2343 * To abort noflush suspend, start the queue.
2344 *
2345 *
2346 * Flush suspend
2347 * -------------
2348 * Flush suspend needs to dispatch remaining requests. So stop the queue
2349 * after the remaining requests are completed. (Requeued request must be also
2350 * re-dispatched and completed. Until then, we can't stop the queue.)
2351 *
2352 * During flushing the remaining requests, further incoming requests are also
2353 * inserted to the same queue. To distinguish which requests are to be
2354 * flushed, we insert a marker request to the queue at the time of starting
2355 * flush suspend, like a barrier.
2356 * The dispatching is blocked when the marker is found on the top of the queue.
2357 * And the queue is stopped when all in_flight requests are completed, since
2358 * that means the remaining requests are completely flushed.
2359 * Then, the marker is removed from the queue.
2360 *
2361 * To abort flush suspend, we also need to take care of the marker, not only
2362 * starting the queue.
2363 * We don't remove the marker forcibly from the queue since it's against
2364 * the block-layer manner. Instead, we put a invalidated mark on the marker.
2365 * When the invalidated marker is found on the top of the queue, it is
2366 * immediately removed from the queue, so it doesn't block dispatching.
2367 * Because we have only one marker per mapped_device, we can't start another
2368 * flush suspend until the invalidated marker is removed from the queue.
2369 * So fail and return with -EBUSY in such a case.
2370 */
1571int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 2371int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
1572{ 2372{
1573 struct dm_table *map = NULL; 2373 struct dm_table *map = NULL;
@@ -1582,6 +2382,11 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
1582 goto out_unlock; 2382 goto out_unlock;
1583 } 2383 }
1584 2384
2385 if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) {
2386 r = -EBUSY;
2387 goto out_unlock;
2388 }
2389
1585 map = dm_get_table(md); 2390 map = dm_get_table(md);
1586 2391
1587 /* 2392 /*
@@ -1594,24 +2399,14 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
1594 /* This does not get reverted if there's an error later. */ 2399 /* This does not get reverted if there's an error later. */
1595 dm_table_presuspend_targets(map); 2400 dm_table_presuspend_targets(map);
1596 2401
1597 /* bdget() can stall if the pending I/Os are not flushed */ 2402 /*
1598 if (!noflush) { 2403 * Flush I/O to the device. noflush supersedes do_lockfs,
1599 md->suspended_bdev = bdget_disk(md->disk, 0); 2404 * because lock_fs() needs to flush I/Os.
1600 if (!md->suspended_bdev) { 2405 */
1601 DMWARN("bdget failed in dm_suspend"); 2406 if (!noflush && do_lockfs) {
1602 r = -ENOMEM; 2407 r = lock_fs(md);
2408 if (r)
1603 goto out; 2409 goto out;
1604 }
1605
1606 /*
1607 * Flush I/O to the device. noflush supersedes do_lockfs,
1608 * because lock_fs() needs to flush I/Os.
1609 */
1610 if (do_lockfs) {
1611 r = lock_fs(md);
1612 if (r)
1613 goto out;
1614 }
1615 } 2410 }
1616 2411
1617 /* 2412 /*
@@ -1637,6 +2432,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
1637 2432
1638 flush_workqueue(md->wq); 2433 flush_workqueue(md->wq);
1639 2434
2435 if (dm_request_based(md))
2436 dm_rq_start_suspend(md, noflush);
2437
1640 /* 2438 /*
1641 * At this point no more requests are entering target request routines. 2439 * At this point no more requests are entering target request routines.
1642 * We call dm_wait_for_completion to wait for all existing requests 2440 * We call dm_wait_for_completion to wait for all existing requests
@@ -1653,6 +2451,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
1653 if (r < 0) { 2451 if (r < 0) {
1654 dm_queue_flush(md); 2452 dm_queue_flush(md);
1655 2453
2454 if (dm_request_based(md))
2455 dm_rq_abort_suspend(md, noflush);
2456
1656 unlock_fs(md); 2457 unlock_fs(md);
1657 goto out; /* pushback list is already flushed, so skip flush */ 2458 goto out; /* pushback list is already flushed, so skip flush */
1658 } 2459 }
@@ -1668,11 +2469,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
1668 set_bit(DMF_SUSPENDED, &md->flags); 2469 set_bit(DMF_SUSPENDED, &md->flags);
1669 2470
1670out: 2471out:
1671 if (r && md->suspended_bdev) {
1672 bdput(md->suspended_bdev);
1673 md->suspended_bdev = NULL;
1674 }
1675
1676 dm_table_put(map); 2472 dm_table_put(map);
1677 2473
1678out_unlock: 2474out_unlock:
@@ -1699,21 +2495,20 @@ int dm_resume(struct mapped_device *md)
1699 2495
1700 dm_queue_flush(md); 2496 dm_queue_flush(md);
1701 2497
1702 unlock_fs(md); 2498 /*
2499 * Flushing deferred I/Os must be done after targets are resumed
2500 * so that mapping of targets can work correctly.
2501 * Request-based dm is queueing the deferred I/Os in its request_queue.
2502 */
2503 if (dm_request_based(md))
2504 start_queue(md->queue);
1703 2505
1704 if (md->suspended_bdev) { 2506 unlock_fs(md);
1705 bdput(md->suspended_bdev);
1706 md->suspended_bdev = NULL;
1707 }
1708 2507
1709 clear_bit(DMF_SUSPENDED, &md->flags); 2508 clear_bit(DMF_SUSPENDED, &md->flags);
1710 2509
1711 dm_table_unplug_all(map); 2510 dm_table_unplug_all(map);
1712
1713 dm_kobject_uevent(md);
1714
1715 r = 0; 2511 r = 0;
1716
1717out: 2512out:
1718 dm_table_put(map); 2513 dm_table_put(map);
1719 mutex_unlock(&md->suspend_lock); 2514 mutex_unlock(&md->suspend_lock);
@@ -1724,9 +2519,19 @@ out:
1724/*----------------------------------------------------------------- 2519/*-----------------------------------------------------------------
1725 * Event notification. 2520 * Event notification.
1726 *---------------------------------------------------------------*/ 2521 *---------------------------------------------------------------*/
1727void dm_kobject_uevent(struct mapped_device *md) 2522void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
1728{ 2523 unsigned cookie)
1729 kobject_uevent(&disk_to_dev(md->disk)->kobj, KOBJ_CHANGE); 2524{
2525 char udev_cookie[DM_COOKIE_LENGTH];
2526 char *envp[] = { udev_cookie, NULL };
2527
2528 if (!cookie)
2529 kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2530 else {
2531 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2532 DM_COOKIE_ENV_VAR_NAME, cookie);
2533 kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp);
2534 }
1730} 2535}
1731 2536
1732uint32_t dm_next_uevent_seq(struct mapped_device *md) 2537uint32_t dm_next_uevent_seq(struct mapped_device *md)
@@ -1780,6 +2585,10 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
1780 if (&md->kobj != kobj) 2585 if (&md->kobj != kobj)
1781 return NULL; 2586 return NULL;
1782 2587
2588 if (test_bit(DMF_FREEING, &md->flags) ||
2589 test_bit(DMF_DELETING, &md->flags))
2590 return NULL;
2591
1783 dm_get(md); 2592 dm_get(md);
1784 return md; 2593 return md;
1785} 2594}
@@ -1800,6 +2609,61 @@ int dm_noflush_suspending(struct dm_target *ti)
1800} 2609}
1801EXPORT_SYMBOL_GPL(dm_noflush_suspending); 2610EXPORT_SYMBOL_GPL(dm_noflush_suspending);
1802 2611
2612struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
2613{
2614 struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
2615
2616 if (!pools)
2617 return NULL;
2618
2619 pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
2620 mempool_create_slab_pool(MIN_IOS, _io_cache) :
2621 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
2622 if (!pools->io_pool)
2623 goto free_pools_and_out;
2624
2625 pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
2626 mempool_create_slab_pool(MIN_IOS, _tio_cache) :
2627 mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
2628 if (!pools->tio_pool)
2629 goto free_io_pool_and_out;
2630
2631 pools->bs = (type == DM_TYPE_BIO_BASED) ?
2632 bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
2633 if (!pools->bs)
2634 goto free_tio_pool_and_out;
2635
2636 return pools;
2637
2638free_tio_pool_and_out:
2639 mempool_destroy(pools->tio_pool);
2640
2641free_io_pool_and_out:
2642 mempool_destroy(pools->io_pool);
2643
2644free_pools_and_out:
2645 kfree(pools);
2646
2647 return NULL;
2648}
2649
2650void dm_free_md_mempools(struct dm_md_mempools *pools)
2651{
2652 if (!pools)
2653 return;
2654
2655 if (pools->io_pool)
2656 mempool_destroy(pools->io_pool);
2657
2658 if (pools->tio_pool)
2659 mempool_destroy(pools->tio_pool);
2660
2661 if (pools->bs)
2662 bioset_free(pools->bs);
2663
2664 kfree(pools);
2665}
2666
1803static struct block_device_operations dm_blk_dops = { 2667static struct block_device_operations dm_blk_dops = {
1804 .open = dm_blk_open, 2668 .open = dm_blk_open,
1805 .release = dm_blk_close, 2669 .release = dm_blk_close,
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index a31506d93e91..23278ae80f08 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -23,6 +23,13 @@
23#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) 23#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1)
24 24
25/* 25/*
26 * Type of table and mapped_device's mempool
27 */
28#define DM_TYPE_NONE 0
29#define DM_TYPE_BIO_BASED 1
30#define DM_TYPE_REQUEST_BASED 2
31
32/*
26 * List of devices that a metadevice uses and should open/close. 33 * List of devices that a metadevice uses and should open/close.
27 */ 34 */
28struct dm_dev_internal { 35struct dm_dev_internal {
@@ -32,6 +39,7 @@ struct dm_dev_internal {
32}; 39};
33 40
34struct dm_table; 41struct dm_table;
42struct dm_md_mempools;
35 43
36/*----------------------------------------------------------------- 44/*-----------------------------------------------------------------
37 * Internal table functions. 45 * Internal table functions.
@@ -41,18 +49,34 @@ void dm_table_event_callback(struct dm_table *t,
41 void (*fn)(void *), void *context); 49 void (*fn)(void *), void *context);
42struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); 50struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
43struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); 51struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
44void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q); 52int dm_calculate_queue_limits(struct dm_table *table,
53 struct queue_limits *limits);
54void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
55 struct queue_limits *limits);
45struct list_head *dm_table_get_devices(struct dm_table *t); 56struct list_head *dm_table_get_devices(struct dm_table *t);
46void dm_table_presuspend_targets(struct dm_table *t); 57void dm_table_presuspend_targets(struct dm_table *t);
47void dm_table_postsuspend_targets(struct dm_table *t); 58void dm_table_postsuspend_targets(struct dm_table *t);
48int dm_table_resume_targets(struct dm_table *t); 59int dm_table_resume_targets(struct dm_table *t);
49int dm_table_any_congested(struct dm_table *t, int bdi_bits); 60int dm_table_any_congested(struct dm_table *t, int bdi_bits);
61int dm_table_any_busy_target(struct dm_table *t);
62int dm_table_set_type(struct dm_table *t);
63unsigned dm_table_get_type(struct dm_table *t);
64bool dm_table_bio_based(struct dm_table *t);
65bool dm_table_request_based(struct dm_table *t);
66int dm_table_alloc_md_mempools(struct dm_table *t);
67void dm_table_free_md_mempools(struct dm_table *t);
68struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
50 69
51/* 70/*
52 * To check the return value from dm_table_find_target(). 71 * To check the return value from dm_table_find_target().
53 */ 72 */
54#define dm_target_is_valid(t) ((t)->table) 73#define dm_target_is_valid(t) ((t)->table)
55 74
75/*
76 * To check whether the target type is request-based or not (bio-based).
77 */
78#define dm_target_request_based(t) ((t)->type->map_rq != NULL)
79
56/*----------------------------------------------------------------- 80/*-----------------------------------------------------------------
57 * A registry of target types. 81 * A registry of target types.
58 *---------------------------------------------------------------*/ 82 *---------------------------------------------------------------*/
@@ -92,9 +116,16 @@ void dm_stripe_exit(void);
92int dm_open_count(struct mapped_device *md); 116int dm_open_count(struct mapped_device *md);
93int dm_lock_for_deletion(struct mapped_device *md); 117int dm_lock_for_deletion(struct mapped_device *md);
94 118
95void dm_kobject_uevent(struct mapped_device *md); 119void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
120 unsigned cookie);
96 121
97int dm_kcopyd_init(void); 122int dm_kcopyd_init(void);
98void dm_kcopyd_exit(void); 123void dm_kcopyd_exit(void);
99 124
125/*
126 * Mempool operations
127 */
128struct dm_md_mempools *dm_alloc_md_mempools(unsigned type);
129void dm_free_md_mempools(struct dm_md_mempools *pools);
130
100#endif 131#endif
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 8695809b24b0..87d88dbb667f 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -255,14 +255,14 @@ static void status(struct seq_file *seq, mddev_t *mddev)
255} 255}
256 256
257 257
258static int reconfig(mddev_t *mddev, int layout, int chunk_size) 258static int reshape(mddev_t *mddev)
259{ 259{
260 int mode = layout & ModeMask; 260 int mode = mddev->new_layout & ModeMask;
261 int count = layout >> ModeShift; 261 int count = mddev->new_layout >> ModeShift;
262 conf_t *conf = mddev->private; 262 conf_t *conf = mddev->private;
263 263
264 if (chunk_size != -1) 264 if (mddev->new_layout < 0)
265 return -EINVAL; 265 return 0;
266 266
267 /* new layout */ 267 /* new layout */
268 if (mode == ClearFaults) 268 if (mode == ClearFaults)
@@ -279,6 +279,7 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size)
279 atomic_set(&conf->counters[mode], count); 279 atomic_set(&conf->counters[mode], count);
280 } else 280 } else
281 return -EINVAL; 281 return -EINVAL;
282 mddev->new_layout = -1;
282 mddev->layout = -1; /* makes sure further changes come through */ 283 mddev->layout = -1; /* makes sure further changes come through */
283 return 0; 284 return 0;
284} 285}
@@ -298,8 +299,12 @@ static int run(mddev_t *mddev)
298{ 299{
299 mdk_rdev_t *rdev; 300 mdk_rdev_t *rdev;
300 int i; 301 int i;
302 conf_t *conf;
303
304 if (md_check_no_bitmap(mddev))
305 return -EINVAL;
301 306
302 conf_t *conf = kmalloc(sizeof(*conf), GFP_KERNEL); 307 conf = kmalloc(sizeof(*conf), GFP_KERNEL);
303 if (!conf) 308 if (!conf)
304 return -ENOMEM; 309 return -ENOMEM;
305 310
@@ -315,7 +320,7 @@ static int run(mddev_t *mddev)
315 md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); 320 md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
316 mddev->private = conf; 321 mddev->private = conf;
317 322
318 reconfig(mddev, mddev->layout, -1); 323 reshape(mddev);
319 324
320 return 0; 325 return 0;
321} 326}
@@ -338,7 +343,7 @@ static struct mdk_personality faulty_personality =
338 .run = run, 343 .run = run,
339 .stop = stop, 344 .stop = stop,
340 .status = status, 345 .status = status,
341 .reconfig = reconfig, 346 .check_reshape = reshape,
342 .size = faulty_size, 347 .size = faulty_size,
343}; 348};
344 349
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 7a36e38393a1..15c8b7b25a9b 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -27,19 +27,27 @@
27 */ 27 */
28static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) 28static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
29{ 29{
30 dev_info_t *hash; 30 int lo, mid, hi;
31 linear_conf_t *conf = mddev_to_conf(mddev); 31 linear_conf_t *conf;
32 sector_t idx = sector >> conf->sector_shift; 32
33 lo = 0;
34 hi = mddev->raid_disks - 1;
35 conf = rcu_dereference(mddev->private);
33 36
34 /* 37 /*
35 * sector_div(a,b) returns the remainer and sets a to a/b 38 * Binary Search
36 */ 39 */
37 (void)sector_div(idx, conf->spacing);
38 hash = conf->hash_table[idx];
39 40
40 while (sector >= hash->num_sectors + hash->start_sector) 41 while (hi > lo) {
41 hash++; 42
42 return hash; 43 mid = (hi + lo) / 2;
44 if (sector < conf->disks[mid].end_sector)
45 hi = mid;
46 else
47 lo = mid + 1;
48 }
49
50 return conf->disks + lo;
43} 51}
44 52
45/** 53/**
@@ -59,8 +67,10 @@ static int linear_mergeable_bvec(struct request_queue *q,
59 unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; 67 unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
60 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 68 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
61 69
70 rcu_read_lock();
62 dev0 = which_dev(mddev, sector); 71 dev0 = which_dev(mddev, sector);
63 maxsectors = dev0->num_sectors - (sector - dev0->start_sector); 72 maxsectors = dev0->end_sector - sector;
73 rcu_read_unlock();
64 74
65 if (maxsectors < bio_sectors) 75 if (maxsectors < bio_sectors)
66 maxsectors = 0; 76 maxsectors = 0;
@@ -79,46 +89,57 @@ static int linear_mergeable_bvec(struct request_queue *q,
79static void linear_unplug(struct request_queue *q) 89static void linear_unplug(struct request_queue *q)
80{ 90{
81 mddev_t *mddev = q->queuedata; 91 mddev_t *mddev = q->queuedata;
82 linear_conf_t *conf = mddev_to_conf(mddev); 92 linear_conf_t *conf;
83 int i; 93 int i;
84 94
95 rcu_read_lock();
96 conf = rcu_dereference(mddev->private);
97
85 for (i=0; i < mddev->raid_disks; i++) { 98 for (i=0; i < mddev->raid_disks; i++) {
86 struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev); 99 struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev);
87 blk_unplug(r_queue); 100 blk_unplug(r_queue);
88 } 101 }
102 rcu_read_unlock();
89} 103}
90 104
91static int linear_congested(void *data, int bits) 105static int linear_congested(void *data, int bits)
92{ 106{
93 mddev_t *mddev = data; 107 mddev_t *mddev = data;
94 linear_conf_t *conf = mddev_to_conf(mddev); 108 linear_conf_t *conf;
95 int i, ret = 0; 109 int i, ret = 0;
96 110
111 rcu_read_lock();
112 conf = rcu_dereference(mddev->private);
113
97 for (i = 0; i < mddev->raid_disks && !ret ; i++) { 114 for (i = 0; i < mddev->raid_disks && !ret ; i++) {
98 struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); 115 struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev);
99 ret |= bdi_congested(&q->backing_dev_info, bits); 116 ret |= bdi_congested(&q->backing_dev_info, bits);
100 } 117 }
118
119 rcu_read_unlock();
101 return ret; 120 return ret;
102} 121}
103 122
104static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks) 123static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks)
105{ 124{
106 linear_conf_t *conf = mddev_to_conf(mddev); 125 linear_conf_t *conf;
126 sector_t array_sectors;
107 127
128 rcu_read_lock();
129 conf = rcu_dereference(mddev->private);
108 WARN_ONCE(sectors || raid_disks, 130 WARN_ONCE(sectors || raid_disks,
109 "%s does not support generic reshape\n", __func__); 131 "%s does not support generic reshape\n", __func__);
132 array_sectors = conf->array_sectors;
133 rcu_read_unlock();
110 134
111 return conf->array_sectors; 135 return array_sectors;
112} 136}
113 137
114static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) 138static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
115{ 139{
116 linear_conf_t *conf; 140 linear_conf_t *conf;
117 dev_info_t **table;
118 mdk_rdev_t *rdev; 141 mdk_rdev_t *rdev;
119 int i, nb_zone, cnt; 142 int i, cnt;
120 sector_t min_sectors;
121 sector_t curr_sector;
122 143
123 conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t), 144 conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t),
124 GFP_KERNEL); 145 GFP_KERNEL);
@@ -131,6 +152,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
131 list_for_each_entry(rdev, &mddev->disks, same_set) { 152 list_for_each_entry(rdev, &mddev->disks, same_set) {
132 int j = rdev->raid_disk; 153 int j = rdev->raid_disk;
133 dev_info_t *disk = conf->disks + j; 154 dev_info_t *disk = conf->disks + j;
155 sector_t sectors;
134 156
135 if (j < 0 || j >= raid_disks || disk->rdev) { 157 if (j < 0 || j >= raid_disks || disk->rdev) {
136 printk("linear: disk numbering problem. Aborting!\n"); 158 printk("linear: disk numbering problem. Aborting!\n");
@@ -138,6 +160,11 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
138 } 160 }
139 161
140 disk->rdev = rdev; 162 disk->rdev = rdev;
163 if (mddev->chunk_sectors) {
164 sectors = rdev->sectors;
165 sector_div(sectors, mddev->chunk_sectors);
166 rdev->sectors = sectors * mddev->chunk_sectors;
167 }
141 168
142 blk_queue_stack_limits(mddev->queue, 169 blk_queue_stack_limits(mddev->queue,
143 rdev->bdev->bd_disk->queue); 170 rdev->bdev->bd_disk->queue);
@@ -146,105 +173,27 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
146 * a one page request is never in violation. 173 * a one page request is never in violation.
147 */ 174 */
148 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 175 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
149 mddev->queue->max_sectors > (PAGE_SIZE>>9)) 176 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
150 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 177 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
151 178
152 disk->num_sectors = rdev->sectors;
153 conf->array_sectors += rdev->sectors; 179 conf->array_sectors += rdev->sectors;
154
155 cnt++; 180 cnt++;
181
156 } 182 }
157 if (cnt != raid_disks) { 183 if (cnt != raid_disks) {
158 printk("linear: not enough drives present. Aborting!\n"); 184 printk("linear: not enough drives present. Aborting!\n");
159 goto out; 185 goto out;
160 } 186 }
161 187
162 min_sectors = conf->array_sectors;
163 sector_div(min_sectors, PAGE_SIZE/sizeof(struct dev_info *));
164 if (min_sectors == 0)
165 min_sectors = 1;
166
167 /* min_sectors is the minimum spacing that will fit the hash
168 * table in one PAGE. This may be much smaller than needed.
169 * We find the smallest non-terminal set of consecutive devices
170 * that is larger than min_sectors and use the size of that as
171 * the actual spacing
172 */
173 conf->spacing = conf->array_sectors;
174 for (i=0; i < cnt-1 ; i++) {
175 sector_t tmp = 0;
176 int j;
177 for (j = i; j < cnt - 1 && tmp < min_sectors; j++)
178 tmp += conf->disks[j].num_sectors;
179 if (tmp >= min_sectors && tmp < conf->spacing)
180 conf->spacing = tmp;
181 }
182
183 /* spacing may be too large for sector_div to work with,
184 * so we might need to pre-shift
185 */
186 conf->sector_shift = 0;
187 if (sizeof(sector_t) > sizeof(u32)) {
188 sector_t space = conf->spacing;
189 while (space > (sector_t)(~(u32)0)) {
190 space >>= 1;
191 conf->sector_shift++;
192 }
193 }
194 /* 188 /*
195 * This code was restructured to work around a gcc-2.95.3 internal 189 * Here we calculate the device offsets.
196 * compiler error. Alter it with care.
197 */ 190 */
198 { 191 conf->disks[0].end_sector = conf->disks[0].rdev->sectors;
199 sector_t sz;
200 unsigned round;
201 unsigned long base;
202
203 sz = conf->array_sectors >> conf->sector_shift;
204 sz += 1; /* force round-up */
205 base = conf->spacing >> conf->sector_shift;
206 round = sector_div(sz, base);
207 nb_zone = sz + (round ? 1 : 0);
208 }
209 BUG_ON(nb_zone > PAGE_SIZE / sizeof(struct dev_info *));
210
211 conf->hash_table = kmalloc (sizeof (struct dev_info *) * nb_zone,
212 GFP_KERNEL);
213 if (!conf->hash_table)
214 goto out;
215 192
216 /*
217 * Here we generate the linear hash table
218 * First calculate the device offsets.
219 */
220 conf->disks[0].start_sector = 0;
221 for (i = 1; i < raid_disks; i++) 193 for (i = 1; i < raid_disks; i++)
222 conf->disks[i].start_sector = 194 conf->disks[i].end_sector =
223 conf->disks[i-1].start_sector + 195 conf->disks[i-1].end_sector +
224 conf->disks[i-1].num_sectors; 196 conf->disks[i].rdev->sectors;
225
226 table = conf->hash_table;
227 i = 0;
228 for (curr_sector = 0;
229 curr_sector < conf->array_sectors;
230 curr_sector += conf->spacing) {
231
232 while (i < raid_disks-1 &&
233 curr_sector >= conf->disks[i+1].start_sector)
234 i++;
235
236 *table ++ = conf->disks + i;
237 }
238
239 if (conf->sector_shift) {
240 conf->spacing >>= conf->sector_shift;
241 /* round spacing up so that when we divide by it,
242 * we err on the side of "too-low", which is safest.
243 */
244 conf->spacing++;
245 }
246
247 BUG_ON(table - conf->hash_table > nb_zone);
248 197
249 return conf; 198 return conf;
250 199
@@ -257,6 +206,8 @@ static int linear_run (mddev_t *mddev)
257{ 206{
258 linear_conf_t *conf; 207 linear_conf_t *conf;
259 208
209 if (md_check_no_bitmap(mddev))
210 return -EINVAL;
260 mddev->queue->queue_lock = &mddev->queue->__queue_lock; 211 mddev->queue->queue_lock = &mddev->queue->__queue_lock;
261 conf = linear_conf(mddev, mddev->raid_disks); 212 conf = linear_conf(mddev, mddev->raid_disks);
262 213
@@ -272,6 +223,12 @@ static int linear_run (mddev_t *mddev)
272 return 0; 223 return 0;
273} 224}
274 225
226static void free_conf(struct rcu_head *head)
227{
228 linear_conf_t *conf = container_of(head, linear_conf_t, rcu);
229 kfree(conf);
230}
231
275static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) 232static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
276{ 233{
277 /* Adding a drive to a linear array allows the array to grow. 234 /* Adding a drive to a linear array allows the array to grow.
@@ -282,7 +239,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
282 * The current one is never freed until the array is stopped. 239 * The current one is never freed until the array is stopped.
283 * This avoids races. 240 * This avoids races.
284 */ 241 */
285 linear_conf_t *newconf; 242 linear_conf_t *newconf, *oldconf;
286 243
287 if (rdev->saved_raid_disk != mddev->raid_disks) 244 if (rdev->saved_raid_disk != mddev->raid_disks)
288 return -EINVAL; 245 return -EINVAL;
@@ -294,25 +251,29 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
294 if (!newconf) 251 if (!newconf)
295 return -ENOMEM; 252 return -ENOMEM;
296 253
297 newconf->prev = mddev_to_conf(mddev); 254 oldconf = rcu_dereference(mddev->private);
298 mddev->private = newconf;
299 mddev->raid_disks++; 255 mddev->raid_disks++;
256 rcu_assign_pointer(mddev->private, newconf);
300 md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); 257 md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
301 set_capacity(mddev->gendisk, mddev->array_sectors); 258 set_capacity(mddev->gendisk, mddev->array_sectors);
259 call_rcu(&oldconf->rcu, free_conf);
302 return 0; 260 return 0;
303} 261}
304 262
305static int linear_stop (mddev_t *mddev) 263static int linear_stop (mddev_t *mddev)
306{ 264{
307 linear_conf_t *conf = mddev_to_conf(mddev); 265 linear_conf_t *conf = mddev->private;
308 266
267 /*
268 * We do not require rcu protection here since
269 * we hold reconfig_mutex for both linear_add and
270 * linear_stop, so they cannot race.
271 * We should make sure any old 'conf's are properly
272 * freed though.
273 */
274 rcu_barrier();
309 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 275 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
310 do { 276 kfree(conf);
311 linear_conf_t *t = conf->prev;
312 kfree(conf->hash_table);
313 kfree(conf);
314 conf = t;
315 } while (conf);
316 277
317 return 0; 278 return 0;
318} 279}
@@ -322,6 +283,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
322 const int rw = bio_data_dir(bio); 283 const int rw = bio_data_dir(bio);
323 mddev_t *mddev = q->queuedata; 284 mddev_t *mddev = q->queuedata;
324 dev_info_t *tmp_dev; 285 dev_info_t *tmp_dev;
286 sector_t start_sector;
325 int cpu; 287 int cpu;
326 288
327 if (unlikely(bio_barrier(bio))) { 289 if (unlikely(bio_barrier(bio))) {
@@ -335,33 +297,36 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
335 bio_sectors(bio)); 297 bio_sectors(bio));
336 part_stat_unlock(); 298 part_stat_unlock();
337 299
300 rcu_read_lock();
338 tmp_dev = which_dev(mddev, bio->bi_sector); 301 tmp_dev = which_dev(mddev, bio->bi_sector);
339 302 start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
340 if (unlikely(bio->bi_sector >= (tmp_dev->num_sectors + 303
341 tmp_dev->start_sector) 304
342 || (bio->bi_sector < 305 if (unlikely(bio->bi_sector >= (tmp_dev->end_sector)
343 tmp_dev->start_sector))) { 306 || (bio->bi_sector < start_sector))) {
344 char b[BDEVNAME_SIZE]; 307 char b[BDEVNAME_SIZE];
345 308
346 printk("linear_make_request: Sector %llu out of bounds on " 309 printk("linear_make_request: Sector %llu out of bounds on "
347 "dev %s: %llu sectors, offset %llu\n", 310 "dev %s: %llu sectors, offset %llu\n",
348 (unsigned long long)bio->bi_sector, 311 (unsigned long long)bio->bi_sector,
349 bdevname(tmp_dev->rdev->bdev, b), 312 bdevname(tmp_dev->rdev->bdev, b),
350 (unsigned long long)tmp_dev->num_sectors, 313 (unsigned long long)tmp_dev->rdev->sectors,
351 (unsigned long long)tmp_dev->start_sector); 314 (unsigned long long)start_sector);
315 rcu_read_unlock();
352 bio_io_error(bio); 316 bio_io_error(bio);
353 return 0; 317 return 0;
354 } 318 }
355 if (unlikely(bio->bi_sector + (bio->bi_size >> 9) > 319 if (unlikely(bio->bi_sector + (bio->bi_size >> 9) >
356 tmp_dev->start_sector + tmp_dev->num_sectors)) { 320 tmp_dev->end_sector)) {
357 /* This bio crosses a device boundary, so we have to 321 /* This bio crosses a device boundary, so we have to
358 * split it. 322 * split it.
359 */ 323 */
360 struct bio_pair *bp; 324 struct bio_pair *bp;
325 sector_t end_sector = tmp_dev->end_sector;
326
327 rcu_read_unlock();
361 328
362 bp = bio_split(bio, 329 bp = bio_split(bio, end_sector - bio->bi_sector);
363 tmp_dev->start_sector + tmp_dev->num_sectors
364 - bio->bi_sector);
365 330
366 if (linear_make_request(q, &bp->bio1)) 331 if (linear_make_request(q, &bp->bio1))
367 generic_make_request(&bp->bio1); 332 generic_make_request(&bp->bio1);
@@ -372,8 +337,9 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
372 } 337 }
373 338
374 bio->bi_bdev = tmp_dev->rdev->bdev; 339 bio->bi_bdev = tmp_dev->rdev->bdev;
375 bio->bi_sector = bio->bi_sector - tmp_dev->start_sector 340 bio->bi_sector = bio->bi_sector - start_sector
376 + tmp_dev->rdev->data_offset; 341 + tmp_dev->rdev->data_offset;
342 rcu_read_unlock();
377 343
378 return 1; 344 return 1;
379} 345}
@@ -381,7 +347,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
381static void linear_status (struct seq_file *seq, mddev_t *mddev) 347static void linear_status (struct seq_file *seq, mddev_t *mddev)
382{ 348{
383 349
384 seq_printf(seq, " %dk rounding", mddev->chunk_size/1024); 350 seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
385} 351}
386 352
387 353
diff --git a/drivers/md/linear.h b/drivers/md/linear.h
index bf8179587f95..0ce29b61605a 100644
--- a/drivers/md/linear.h
+++ b/drivers/md/linear.h
@@ -3,27 +3,19 @@
3 3
4struct dev_info { 4struct dev_info {
5 mdk_rdev_t *rdev; 5 mdk_rdev_t *rdev;
6 sector_t num_sectors; 6 sector_t end_sector;
7 sector_t start_sector;
8}; 7};
9 8
10typedef struct dev_info dev_info_t; 9typedef struct dev_info dev_info_t;
11 10
12struct linear_private_data 11struct linear_private_data
13{ 12{
14 struct linear_private_data *prev; /* earlier version */
15 dev_info_t **hash_table;
16 sector_t spacing;
17 sector_t array_sectors; 13 sector_t array_sectors;
18 int sector_shift; /* shift before dividing
19 * by spacing
20 */
21 dev_info_t disks[0]; 14 dev_info_t disks[0];
15 struct rcu_head rcu;
22}; 16};
23 17
24 18
25typedef struct linear_private_data linear_conf_t; 19typedef struct linear_private_data linear_conf_t;
26 20
27#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private)
28
29#endif 21#endif
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 641b211fe3fe..09be637d52cb 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -440,15 +440,6 @@ static inline sector_t calc_dev_sboffset(struct block_device *bdev)
440 return MD_NEW_SIZE_SECTORS(num_sectors); 440 return MD_NEW_SIZE_SECTORS(num_sectors);
441} 441}
442 442
443static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size)
444{
445 sector_t num_sectors = rdev->sb_start;
446
447 if (chunk_size)
448 num_sectors &= ~((sector_t)chunk_size/512 - 1);
449 return num_sectors;
450}
451
452static int alloc_disk_sb(mdk_rdev_t * rdev) 443static int alloc_disk_sb(mdk_rdev_t * rdev)
453{ 444{
454 if (rdev->sb_page) 445 if (rdev->sb_page)
@@ -745,6 +736,24 @@ struct super_type {
745}; 736};
746 737
747/* 738/*
739 * Check that the given mddev has no bitmap.
740 *
741 * This function is called from the run method of all personalities that do not
742 * support bitmaps. It prints an error message and returns non-zero if mddev
743 * has a bitmap. Otherwise, it returns 0.
744 *
745 */
746int md_check_no_bitmap(mddev_t *mddev)
747{
748 if (!mddev->bitmap_file && !mddev->bitmap_offset)
749 return 0;
750 printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
751 mdname(mddev), mddev->pers->name);
752 return 1;
753}
754EXPORT_SYMBOL(md_check_no_bitmap);
755
756/*
748 * load_super for 0.90.0 757 * load_super for 0.90.0
749 */ 758 */
750static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 759static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
@@ -797,17 +806,6 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
797 rdev->data_offset = 0; 806 rdev->data_offset = 0;
798 rdev->sb_size = MD_SB_BYTES; 807 rdev->sb_size = MD_SB_BYTES;
799 808
800 if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) {
801 if (sb->level != 1 && sb->level != 4
802 && sb->level != 5 && sb->level != 6
803 && sb->level != 10) {
804 /* FIXME use a better test */
805 printk(KERN_WARNING
806 "md: bitmaps not supported for this level.\n");
807 goto abort;
808 }
809 }
810
811 if (sb->level == LEVEL_MULTIPATH) 809 if (sb->level == LEVEL_MULTIPATH)
812 rdev->desc_nr = -1; 810 rdev->desc_nr = -1;
813 else 811 else
@@ -836,7 +834,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
836 else 834 else
837 ret = 0; 835 ret = 0;
838 } 836 }
839 rdev->sectors = calc_num_sectors(rdev, sb->chunk_size); 837 rdev->sectors = rdev->sb_start;
840 838
841 if (rdev->sectors < sb->size * 2 && sb->level > 1) 839 if (rdev->sectors < sb->size * 2 && sb->level > 1)
842 /* "this cannot possibly happen" ... */ 840 /* "this cannot possibly happen" ... */
@@ -866,7 +864,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
866 mddev->minor_version = sb->minor_version; 864 mddev->minor_version = sb->minor_version;
867 mddev->patch_version = sb->patch_version; 865 mddev->patch_version = sb->patch_version;
868 mddev->external = 0; 866 mddev->external = 0;
869 mddev->chunk_size = sb->chunk_size; 867 mddev->chunk_sectors = sb->chunk_size >> 9;
870 mddev->ctime = sb->ctime; 868 mddev->ctime = sb->ctime;
871 mddev->utime = sb->utime; 869 mddev->utime = sb->utime;
872 mddev->level = sb->level; 870 mddev->level = sb->level;
@@ -883,13 +881,13 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
883 mddev->delta_disks = sb->delta_disks; 881 mddev->delta_disks = sb->delta_disks;
884 mddev->new_level = sb->new_level; 882 mddev->new_level = sb->new_level;
885 mddev->new_layout = sb->new_layout; 883 mddev->new_layout = sb->new_layout;
886 mddev->new_chunk = sb->new_chunk; 884 mddev->new_chunk_sectors = sb->new_chunk >> 9;
887 } else { 885 } else {
888 mddev->reshape_position = MaxSector; 886 mddev->reshape_position = MaxSector;
889 mddev->delta_disks = 0; 887 mddev->delta_disks = 0;
890 mddev->new_level = mddev->level; 888 mddev->new_level = mddev->level;
891 mddev->new_layout = mddev->layout; 889 mddev->new_layout = mddev->layout;
892 mddev->new_chunk = mddev->chunk_size; 890 mddev->new_chunk_sectors = mddev->chunk_sectors;
893 } 891 }
894 892
895 if (sb->state & (1<<MD_SB_CLEAN)) 893 if (sb->state & (1<<MD_SB_CLEAN))
@@ -1004,7 +1002,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1004 sb->new_level = mddev->new_level; 1002 sb->new_level = mddev->new_level;
1005 sb->delta_disks = mddev->delta_disks; 1003 sb->delta_disks = mddev->delta_disks;
1006 sb->new_layout = mddev->new_layout; 1004 sb->new_layout = mddev->new_layout;
1007 sb->new_chunk = mddev->new_chunk; 1005 sb->new_chunk = mddev->new_chunk_sectors << 9;
1008 } 1006 }
1009 mddev->minor_version = sb->minor_version; 1007 mddev->minor_version = sb->minor_version;
1010 if (mddev->in_sync) 1008 if (mddev->in_sync)
@@ -1018,7 +1016,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1018 sb->recovery_cp = 0; 1016 sb->recovery_cp = 0;
1019 1017
1020 sb->layout = mddev->layout; 1018 sb->layout = mddev->layout;
1021 sb->chunk_size = mddev->chunk_size; 1019 sb->chunk_size = mddev->chunk_sectors << 9;
1022 1020
1023 if (mddev->bitmap && mddev->bitmap_file == NULL) 1021 if (mddev->bitmap && mddev->bitmap_file == NULL)
1024 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1022 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
@@ -1185,24 +1183,13 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1185 bdevname(rdev->bdev,b)); 1183 bdevname(rdev->bdev,b));
1186 return -EINVAL; 1184 return -EINVAL;
1187 } 1185 }
1188 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) {
1189 if (sb->level != cpu_to_le32(1) &&
1190 sb->level != cpu_to_le32(4) &&
1191 sb->level != cpu_to_le32(5) &&
1192 sb->level != cpu_to_le32(6) &&
1193 sb->level != cpu_to_le32(10)) {
1194 printk(KERN_WARNING
1195 "md: bitmaps not supported for this level.\n");
1196 return -EINVAL;
1197 }
1198 }
1199 1186
1200 rdev->preferred_minor = 0xffff; 1187 rdev->preferred_minor = 0xffff;
1201 rdev->data_offset = le64_to_cpu(sb->data_offset); 1188 rdev->data_offset = le64_to_cpu(sb->data_offset);
1202 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1189 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1203 1190
1204 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1191 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1205 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; 1192 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1206 if (rdev->sb_size & bmask) 1193 if (rdev->sb_size & bmask)
1207 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1194 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1208 1195
@@ -1248,9 +1235,6 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1248 if (rdev->sectors < le64_to_cpu(sb->data_size)) 1235 if (rdev->sectors < le64_to_cpu(sb->data_size))
1249 return -EINVAL; 1236 return -EINVAL;
1250 rdev->sectors = le64_to_cpu(sb->data_size); 1237 rdev->sectors = le64_to_cpu(sb->data_size);
1251 if (le32_to_cpu(sb->chunksize))
1252 rdev->sectors &= ~((sector_t)le32_to_cpu(sb->chunksize) - 1);
1253
1254 if (le64_to_cpu(sb->size) > rdev->sectors) 1238 if (le64_to_cpu(sb->size) > rdev->sectors)
1255 return -EINVAL; 1239 return -EINVAL;
1256 return ret; 1240 return ret;
@@ -1271,7 +1255,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1271 mddev->major_version = 1; 1255 mddev->major_version = 1;
1272 mddev->patch_version = 0; 1256 mddev->patch_version = 0;
1273 mddev->external = 0; 1257 mddev->external = 0;
1274 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; 1258 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1275 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 1259 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1276 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 1260 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1277 mddev->level = le32_to_cpu(sb->level); 1261 mddev->level = le32_to_cpu(sb->level);
@@ -1297,13 +1281,13 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1297 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 1281 mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1298 mddev->new_level = le32_to_cpu(sb->new_level); 1282 mddev->new_level = le32_to_cpu(sb->new_level);
1299 mddev->new_layout = le32_to_cpu(sb->new_layout); 1283 mddev->new_layout = le32_to_cpu(sb->new_layout);
1300 mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9; 1284 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1301 } else { 1285 } else {
1302 mddev->reshape_position = MaxSector; 1286 mddev->reshape_position = MaxSector;
1303 mddev->delta_disks = 0; 1287 mddev->delta_disks = 0;
1304 mddev->new_level = mddev->level; 1288 mddev->new_level = mddev->level;
1305 mddev->new_layout = mddev->layout; 1289 mddev->new_layout = mddev->layout;
1306 mddev->new_chunk = mddev->chunk_size; 1290 mddev->new_chunk_sectors = mddev->chunk_sectors;
1307 } 1291 }
1308 1292
1309 } else if (mddev->pers == NULL) { 1293 } else if (mddev->pers == NULL) {
@@ -1375,7 +1359,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1375 1359
1376 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 1360 sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1377 sb->size = cpu_to_le64(mddev->dev_sectors); 1361 sb->size = cpu_to_le64(mddev->dev_sectors);
1378 sb->chunksize = cpu_to_le32(mddev->chunk_size >> 9); 1362 sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1379 sb->level = cpu_to_le32(mddev->level); 1363 sb->level = cpu_to_le32(mddev->level);
1380 sb->layout = cpu_to_le32(mddev->layout); 1364 sb->layout = cpu_to_le32(mddev->layout);
1381 1365
@@ -1402,7 +1386,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1402 sb->new_layout = cpu_to_le32(mddev->new_layout); 1386 sb->new_layout = cpu_to_le32(mddev->new_layout);
1403 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1387 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1404 sb->new_level = cpu_to_le32(mddev->new_level); 1388 sb->new_level = cpu_to_le32(mddev->new_level);
1405 sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); 1389 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1406 } 1390 }
1407 1391
1408 max_dev = 0; 1392 max_dev = 0;
@@ -1897,6 +1881,7 @@ static void md_update_sb(mddev_t * mddev, int force_change)
1897 int sync_req; 1881 int sync_req;
1898 int nospares = 0; 1882 int nospares = 0;
1899 1883
1884 mddev->utime = get_seconds();
1900 if (mddev->external) 1885 if (mddev->external)
1901 return; 1886 return;
1902repeat: 1887repeat:
@@ -1926,7 +1911,6 @@ repeat:
1926 nospares = 0; 1911 nospares = 0;
1927 1912
1928 sync_req = mddev->in_sync; 1913 sync_req = mddev->in_sync;
1929 mddev->utime = get_seconds();
1930 1914
1931 /* If this is just a dirty<->clean transition, and the array is clean 1915 /* If this is just a dirty<->clean transition, and the array is clean
1932 * and 'events' is odd, we can roll back to the previous clean state */ 1916 * and 'events' is odd, we can roll back to the previous clean state */
@@ -2597,15 +2581,6 @@ static void analyze_sbs(mddev_t * mddev)
2597 clear_bit(In_sync, &rdev->flags); 2581 clear_bit(In_sync, &rdev->flags);
2598 } 2582 }
2599 } 2583 }
2600
2601
2602
2603 if (mddev->recovery_cp != MaxSector &&
2604 mddev->level >= 1)
2605 printk(KERN_ERR "md: %s: raid array is not clean"
2606 " -- starting background reconstruction\n",
2607 mdname(mddev));
2608
2609} 2584}
2610 2585
2611static void md_safemode_timeout(unsigned long data); 2586static void md_safemode_timeout(unsigned long data);
@@ -2746,7 +2721,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
2746 if (IS_ERR(priv)) { 2721 if (IS_ERR(priv)) {
2747 mddev->new_level = mddev->level; 2722 mddev->new_level = mddev->level;
2748 mddev->new_layout = mddev->layout; 2723 mddev->new_layout = mddev->layout;
2749 mddev->new_chunk = mddev->chunk_size; 2724 mddev->new_chunk_sectors = mddev->chunk_sectors;
2750 mddev->raid_disks -= mddev->delta_disks; 2725 mddev->raid_disks -= mddev->delta_disks;
2751 mddev->delta_disks = 0; 2726 mddev->delta_disks = 0;
2752 module_put(pers->owner); 2727 module_put(pers->owner);
@@ -2764,7 +2739,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
2764 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 2739 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
2765 mddev->level = mddev->new_level; 2740 mddev->level = mddev->new_level;
2766 mddev->layout = mddev->new_layout; 2741 mddev->layout = mddev->new_layout;
2767 mddev->chunk_size = mddev->new_chunk; 2742 mddev->chunk_sectors = mddev->new_chunk_sectors;
2768 mddev->delta_disks = 0; 2743 mddev->delta_disks = 0;
2769 pers->run(mddev); 2744 pers->run(mddev);
2770 mddev_resume(mddev); 2745 mddev_resume(mddev);
@@ -2800,11 +2775,14 @@ layout_store(mddev_t *mddev, const char *buf, size_t len)
2800 2775
2801 if (mddev->pers) { 2776 if (mddev->pers) {
2802 int err; 2777 int err;
2803 if (mddev->pers->reconfig == NULL) 2778 if (mddev->pers->check_reshape == NULL)
2804 return -EBUSY; 2779 return -EBUSY;
2805 err = mddev->pers->reconfig(mddev, n, -1); 2780 mddev->new_layout = n;
2806 if (err) 2781 err = mddev->pers->check_reshape(mddev);
2782 if (err) {
2783 mddev->new_layout = mddev->layout;
2807 return err; 2784 return err;
2785 }
2808 } else { 2786 } else {
2809 mddev->new_layout = n; 2787 mddev->new_layout = n;
2810 if (mddev->reshape_position == MaxSector) 2788 if (mddev->reshape_position == MaxSector)
@@ -2857,10 +2835,11 @@ static ssize_t
2857chunk_size_show(mddev_t *mddev, char *page) 2835chunk_size_show(mddev_t *mddev, char *page)
2858{ 2836{
2859 if (mddev->reshape_position != MaxSector && 2837 if (mddev->reshape_position != MaxSector &&
2860 mddev->chunk_size != mddev->new_chunk) 2838 mddev->chunk_sectors != mddev->new_chunk_sectors)
2861 return sprintf(page, "%d (%d)\n", mddev->new_chunk, 2839 return sprintf(page, "%d (%d)\n",
2862 mddev->chunk_size); 2840 mddev->new_chunk_sectors << 9,
2863 return sprintf(page, "%d\n", mddev->chunk_size); 2841 mddev->chunk_sectors << 9);
2842 return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
2864} 2843}
2865 2844
2866static ssize_t 2845static ssize_t
@@ -2874,15 +2853,18 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
2874 2853
2875 if (mddev->pers) { 2854 if (mddev->pers) {
2876 int err; 2855 int err;
2877 if (mddev->pers->reconfig == NULL) 2856 if (mddev->pers->check_reshape == NULL)
2878 return -EBUSY; 2857 return -EBUSY;
2879 err = mddev->pers->reconfig(mddev, -1, n); 2858 mddev->new_chunk_sectors = n >> 9;
2880 if (err) 2859 err = mddev->pers->check_reshape(mddev);
2860 if (err) {
2861 mddev->new_chunk_sectors = mddev->chunk_sectors;
2881 return err; 2862 return err;
2863 }
2882 } else { 2864 } else {
2883 mddev->new_chunk = n; 2865 mddev->new_chunk_sectors = n >> 9;
2884 if (mddev->reshape_position == MaxSector) 2866 if (mddev->reshape_position == MaxSector)
2885 mddev->chunk_size = n; 2867 mddev->chunk_sectors = n >> 9;
2886 } 2868 }
2887 return len; 2869 return len;
2888} 2870}
@@ -3527,8 +3509,9 @@ min_sync_store(mddev_t *mddev, const char *buf, size_t len)
3527 return -EBUSY; 3509 return -EBUSY;
3528 3510
3529 /* Must be a multiple of chunk_size */ 3511 /* Must be a multiple of chunk_size */
3530 if (mddev->chunk_size) { 3512 if (mddev->chunk_sectors) {
3531 if (min & (sector_t)((mddev->chunk_size>>9)-1)) 3513 sector_t temp = min;
3514 if (sector_div(temp, mddev->chunk_sectors))
3532 return -EINVAL; 3515 return -EINVAL;
3533 } 3516 }
3534 mddev->resync_min = min; 3517 mddev->resync_min = min;
@@ -3564,8 +3547,9 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len)
3564 return -EBUSY; 3547 return -EBUSY;
3565 3548
3566 /* Must be a multiple of chunk_size */ 3549 /* Must be a multiple of chunk_size */
3567 if (mddev->chunk_size) { 3550 if (mddev->chunk_sectors) {
3568 if (max & (sector_t)((mddev->chunk_size>>9)-1)) 3551 sector_t temp = max;
3552 if (sector_div(temp, mddev->chunk_sectors))
3569 return -EINVAL; 3553 return -EINVAL;
3570 } 3554 }
3571 mddev->resync_max = max; 3555 mddev->resync_max = max;
@@ -3656,7 +3640,7 @@ reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
3656 mddev->delta_disks = 0; 3640 mddev->delta_disks = 0;
3657 mddev->new_level = mddev->level; 3641 mddev->new_level = mddev->level;
3658 mddev->new_layout = mddev->layout; 3642 mddev->new_layout = mddev->layout;
3659 mddev->new_chunk = mddev->chunk_size; 3643 mddev->new_chunk_sectors = mddev->chunk_sectors;
3660 return len; 3644 return len;
3661} 3645}
3662 3646
@@ -3976,11 +3960,9 @@ static int start_dirty_degraded;
3976static int do_md_run(mddev_t * mddev) 3960static int do_md_run(mddev_t * mddev)
3977{ 3961{
3978 int err; 3962 int err;
3979 int chunk_size;
3980 mdk_rdev_t *rdev; 3963 mdk_rdev_t *rdev;
3981 struct gendisk *disk; 3964 struct gendisk *disk;
3982 struct mdk_personality *pers; 3965 struct mdk_personality *pers;
3983 char b[BDEVNAME_SIZE];
3984 3966
3985 if (list_empty(&mddev->disks)) 3967 if (list_empty(&mddev->disks))
3986 /* cannot run an array with no devices.. */ 3968 /* cannot run an array with no devices.. */
@@ -3998,38 +3980,6 @@ static int do_md_run(mddev_t * mddev)
3998 analyze_sbs(mddev); 3980 analyze_sbs(mddev);
3999 } 3981 }
4000 3982
4001 chunk_size = mddev->chunk_size;
4002
4003 if (chunk_size) {
4004 if (chunk_size > MAX_CHUNK_SIZE) {
4005 printk(KERN_ERR "too big chunk_size: %d > %d\n",
4006 chunk_size, MAX_CHUNK_SIZE);
4007 return -EINVAL;
4008 }
4009 /*
4010 * chunk-size has to be a power of 2
4011 */
4012 if ( (1 << ffz(~chunk_size)) != chunk_size) {
4013 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size);
4014 return -EINVAL;
4015 }
4016
4017 /* devices must have minimum size of one chunk */
4018 list_for_each_entry(rdev, &mddev->disks, same_set) {
4019 if (test_bit(Faulty, &rdev->flags))
4020 continue;
4021 if (rdev->sectors < chunk_size / 512) {
4022 printk(KERN_WARNING
4023 "md: Dev %s smaller than chunk_size:"
4024 " %llu < %d\n",
4025 bdevname(rdev->bdev,b),
4026 (unsigned long long)rdev->sectors,
4027 chunk_size / 512);
4028 return -EINVAL;
4029 }
4030 }
4031 }
4032
4033 if (mddev->level != LEVEL_NONE) 3983 if (mddev->level != LEVEL_NONE)
4034 request_module("md-level-%d", mddev->level); 3984 request_module("md-level-%d", mddev->level);
4035 else if (mddev->clevel[0]) 3985 else if (mddev->clevel[0])
@@ -4405,7 +4355,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4405 mddev->flags = 0; 4355 mddev->flags = 0;
4406 mddev->ro = 0; 4356 mddev->ro = 0;
4407 mddev->metadata_type[0] = 0; 4357 mddev->metadata_type[0] = 0;
4408 mddev->chunk_size = 0; 4358 mddev->chunk_sectors = 0;
4409 mddev->ctime = mddev->utime = 0; 4359 mddev->ctime = mddev->utime = 0;
4410 mddev->layout = 0; 4360 mddev->layout = 0;
4411 mddev->max_disks = 0; 4361 mddev->max_disks = 0;
@@ -4413,7 +4363,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4413 mddev->delta_disks = 0; 4363 mddev->delta_disks = 0;
4414 mddev->new_level = LEVEL_NONE; 4364 mddev->new_level = LEVEL_NONE;
4415 mddev->new_layout = 0; 4365 mddev->new_layout = 0;
4416 mddev->new_chunk = 0; 4366 mddev->new_chunk_sectors = 0;
4417 mddev->curr_resync = 0; 4367 mddev->curr_resync = 0;
4418 mddev->resync_mismatches = 0; 4368 mddev->resync_mismatches = 0;
4419 mddev->suspend_lo = mddev->suspend_hi = 0; 4369 mddev->suspend_lo = mddev->suspend_hi = 0;
@@ -4618,7 +4568,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
4618 info.spare_disks = spare; 4568 info.spare_disks = spare;
4619 4569
4620 info.layout = mddev->layout; 4570 info.layout = mddev->layout;
4621 info.chunk_size = mddev->chunk_size; 4571 info.chunk_size = mddev->chunk_sectors << 9;
4622 4572
4623 if (copy_to_user(arg, &info, sizeof(info))) 4573 if (copy_to_user(arg, &info, sizeof(info)))
4624 return -EFAULT; 4574 return -EFAULT;
@@ -4843,7 +4793,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
4843 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 4793 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4844 } else 4794 } else
4845 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 4795 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
4846 rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); 4796 rdev->sectors = rdev->sb_start;
4847 4797
4848 err = bind_rdev_to_array(rdev, mddev); 4798 err = bind_rdev_to_array(rdev, mddev);
4849 if (err) { 4799 if (err) {
@@ -4913,7 +4863,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
4913 else 4863 else
4914 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 4864 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4915 4865
4916 rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); 4866 rdev->sectors = rdev->sb_start;
4917 4867
4918 if (test_bit(Faulty, &rdev->flags)) { 4868 if (test_bit(Faulty, &rdev->flags)) {
4919 printk(KERN_WARNING 4869 printk(KERN_WARNING
@@ -5062,7 +5012,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
5062 mddev->external = 0; 5012 mddev->external = 0;
5063 5013
5064 mddev->layout = info->layout; 5014 mddev->layout = info->layout;
5065 mddev->chunk_size = info->chunk_size; 5015 mddev->chunk_sectors = info->chunk_size >> 9;
5066 5016
5067 mddev->max_disks = MD_SB_DISKS; 5017 mddev->max_disks = MD_SB_DISKS;
5068 5018
@@ -5081,7 +5031,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
5081 get_random_bytes(mddev->uuid, 16); 5031 get_random_bytes(mddev->uuid, 16);
5082 5032
5083 mddev->new_level = mddev->level; 5033 mddev->new_level = mddev->level;
5084 mddev->new_chunk = mddev->chunk_size; 5034 mddev->new_chunk_sectors = mddev->chunk_sectors;
5085 mddev->new_layout = mddev->layout; 5035 mddev->new_layout = mddev->layout;
5086 mddev->delta_disks = 0; 5036 mddev->delta_disks = 0;
5087 5037
@@ -5191,7 +5141,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
5191 mddev->level != info->level || 5141 mddev->level != info->level ||
5192/* mddev->layout != info->layout || */ 5142/* mddev->layout != info->layout || */
5193 !mddev->persistent != info->not_persistent|| 5143 !mddev->persistent != info->not_persistent||
5194 mddev->chunk_size != info->chunk_size || 5144 mddev->chunk_sectors != info->chunk_size >> 9 ||
5195 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 5145 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
5196 ((state^info->state) & 0xfffffe00) 5146 ((state^info->state) & 0xfffffe00)
5197 ) 5147 )
@@ -5215,10 +5165,15 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
5215 * we don't need to do anything at the md level, the 5165 * we don't need to do anything at the md level, the
5216 * personality will take care of it all. 5166 * personality will take care of it all.
5217 */ 5167 */
5218 if (mddev->pers->reconfig == NULL) 5168 if (mddev->pers->check_reshape == NULL)
5219 return -EINVAL; 5169 return -EINVAL;
5220 else 5170 else {
5221 return mddev->pers->reconfig(mddev, info->layout, -1); 5171 mddev->new_layout = info->layout;
5172 rv = mddev->pers->check_reshape(mddev);
5173 if (rv)
5174 mddev->new_layout = mddev->layout;
5175 return rv;
5176 }
5222 } 5177 }
5223 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 5178 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
5224 rv = update_size(mddev, (sector_t)info->size * 2); 5179 rv = update_size(mddev, (sector_t)info->size * 2);
@@ -6717,7 +6672,8 @@ void md_check_recovery(mddev_t *mddev)
6717 */ 6672 */
6718 6673
6719 if (mddev->reshape_position != MaxSector) { 6674 if (mddev->reshape_position != MaxSector) {
6720 if (mddev->pers->check_reshape(mddev) != 0) 6675 if (mddev->pers->check_reshape == NULL ||
6676 mddev->pers->check_reshape(mddev) != 0)
6721 /* Cannot proceed */ 6677 /* Cannot proceed */
6722 goto unlock; 6678 goto unlock;
6723 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 6679 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 8227ab909d44..9430a110db93 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -30,13 +30,6 @@ typedef struct mddev_s mddev_t;
30typedef struct mdk_rdev_s mdk_rdev_t; 30typedef struct mdk_rdev_s mdk_rdev_t;
31 31
32/* 32/*
33 * options passed in raidrun:
34 */
35
36/* Currently this must fit in an 'int' */
37#define MAX_CHUNK_SIZE (1<<30)
38
39/*
40 * MD's 'extended' device 33 * MD's 'extended' device
41 */ 34 */
42struct mdk_rdev_s 35struct mdk_rdev_s
@@ -145,7 +138,7 @@ struct mddev_s
145 int external; /* metadata is 138 int external; /* metadata is
146 * managed externally */ 139 * managed externally */
147 char metadata_type[17]; /* externally set*/ 140 char metadata_type[17]; /* externally set*/
148 int chunk_size; 141 int chunk_sectors;
149 time_t ctime, utime; 142 time_t ctime, utime;
150 int level, layout; 143 int level, layout;
151 char clevel[16]; 144 char clevel[16];
@@ -166,7 +159,8 @@ struct mddev_s
166 * If reshape_position is MaxSector, then no reshape is happening (yet). 159 * If reshape_position is MaxSector, then no reshape is happening (yet).
167 */ 160 */
168 sector_t reshape_position; 161 sector_t reshape_position;
169 int delta_disks, new_level, new_layout, new_chunk; 162 int delta_disks, new_level, new_layout;
163 int new_chunk_sectors;
170 164
171 struct mdk_thread_s *thread; /* management thread */ 165 struct mdk_thread_s *thread; /* management thread */
172 struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ 166 struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */
@@ -325,7 +319,6 @@ struct mdk_personality
325 int (*check_reshape) (mddev_t *mddev); 319 int (*check_reshape) (mddev_t *mddev);
326 int (*start_reshape) (mddev_t *mddev); 320 int (*start_reshape) (mddev_t *mddev);
327 void (*finish_reshape) (mddev_t *mddev); 321 void (*finish_reshape) (mddev_t *mddev);
328 int (*reconfig) (mddev_t *mddev, int layout, int chunk_size);
329 /* quiesce moves between quiescence states 322 /* quiesce moves between quiescence states
330 * 0 - fully active 323 * 0 - fully active
331 * 1 - no new requests allowed 324 * 1 - no new requests allowed
@@ -437,5 +430,6 @@ extern void md_new_event(mddev_t *mddev);
437extern int md_allow_write(mddev_t *mddev); 430extern int md_allow_write(mddev_t *mddev);
438extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); 431extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
439extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); 432extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
433extern int md_check_no_bitmap(mddev_t *mddev);
440 434
441#endif /* _MD_MD_H */ 435#endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 41ced0cbe823..cbe368fa6598 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -58,7 +58,7 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
58{ 58{
59 unsigned long flags; 59 unsigned long flags;
60 mddev_t *mddev = mp_bh->mddev; 60 mddev_t *mddev = mp_bh->mddev;
61 multipath_conf_t *conf = mddev_to_conf(mddev); 61 multipath_conf_t *conf = mddev->private;
62 62
63 spin_lock_irqsave(&conf->device_lock, flags); 63 spin_lock_irqsave(&conf->device_lock, flags);
64 list_add(&mp_bh->retry_list, &conf->retry_list); 64 list_add(&mp_bh->retry_list, &conf->retry_list);
@@ -75,7 +75,7 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
75static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) 75static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
76{ 76{
77 struct bio *bio = mp_bh->master_bio; 77 struct bio *bio = mp_bh->master_bio;
78 multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); 78 multipath_conf_t *conf = mp_bh->mddev->private;
79 79
80 bio_endio(bio, err); 80 bio_endio(bio, err);
81 mempool_free(mp_bh, conf->pool); 81 mempool_free(mp_bh, conf->pool);
@@ -85,7 +85,7 @@ static void multipath_end_request(struct bio *bio, int error)
85{ 85{
86 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 86 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
87 struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); 87 struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private);
88 multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); 88 multipath_conf_t *conf = mp_bh->mddev->private;
89 mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; 89 mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev;
90 90
91 if (uptodate) 91 if (uptodate)
@@ -107,7 +107,7 @@ static void multipath_end_request(struct bio *bio, int error)
107 107
108static void unplug_slaves(mddev_t *mddev) 108static void unplug_slaves(mddev_t *mddev)
109{ 109{
110 multipath_conf_t *conf = mddev_to_conf(mddev); 110 multipath_conf_t *conf = mddev->private;
111 int i; 111 int i;
112 112
113 rcu_read_lock(); 113 rcu_read_lock();
@@ -138,7 +138,7 @@ static void multipath_unplug(struct request_queue *q)
138static int multipath_make_request (struct request_queue *q, struct bio * bio) 138static int multipath_make_request (struct request_queue *q, struct bio * bio)
139{ 139{
140 mddev_t *mddev = q->queuedata; 140 mddev_t *mddev = q->queuedata;
141 multipath_conf_t *conf = mddev_to_conf(mddev); 141 multipath_conf_t *conf = mddev->private;
142 struct multipath_bh * mp_bh; 142 struct multipath_bh * mp_bh;
143 struct multipath_info *multipath; 143 struct multipath_info *multipath;
144 const int rw = bio_data_dir(bio); 144 const int rw = bio_data_dir(bio);
@@ -180,7 +180,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
180 180
181static void multipath_status (struct seq_file *seq, mddev_t *mddev) 181static void multipath_status (struct seq_file *seq, mddev_t *mddev)
182{ 182{
183 multipath_conf_t *conf = mddev_to_conf(mddev); 183 multipath_conf_t *conf = mddev->private;
184 int i; 184 int i;
185 185
186 seq_printf (seq, " [%d/%d] [", conf->raid_disks, 186 seq_printf (seq, " [%d/%d] [", conf->raid_disks,
@@ -195,7 +195,7 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev)
195static int multipath_congested(void *data, int bits) 195static int multipath_congested(void *data, int bits)
196{ 196{
197 mddev_t *mddev = data; 197 mddev_t *mddev = data;
198 multipath_conf_t *conf = mddev_to_conf(mddev); 198 multipath_conf_t *conf = mddev->private;
199 int i, ret = 0; 199 int i, ret = 0;
200 200
201 rcu_read_lock(); 201 rcu_read_lock();
@@ -220,7 +220,7 @@ static int multipath_congested(void *data, int bits)
220 */ 220 */
221static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) 221static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
222{ 222{
223 multipath_conf_t *conf = mddev_to_conf(mddev); 223 multipath_conf_t *conf = mddev->private;
224 224
225 if (conf->working_disks <= 1) { 225 if (conf->working_disks <= 1) {
226 /* 226 /*
@@ -303,7 +303,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
303 * merge_bvec_fn will be involved in multipath.) 303 * merge_bvec_fn will be involved in multipath.)
304 */ 304 */
305 if (q->merge_bvec_fn && 305 if (q->merge_bvec_fn &&
306 mddev->queue->max_sectors > (PAGE_SIZE>>9)) 306 queue_max_sectors(q) > (PAGE_SIZE>>9))
307 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 307 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
308 308
309 conf->working_disks++; 309 conf->working_disks++;
@@ -367,7 +367,7 @@ static void multipathd (mddev_t *mddev)
367 struct multipath_bh *mp_bh; 367 struct multipath_bh *mp_bh;
368 struct bio *bio; 368 struct bio *bio;
369 unsigned long flags; 369 unsigned long flags;
370 multipath_conf_t *conf = mddev_to_conf(mddev); 370 multipath_conf_t *conf = mddev->private;
371 struct list_head *head = &conf->retry_list; 371 struct list_head *head = &conf->retry_list;
372 372
373 md_check_recovery(mddev); 373 md_check_recovery(mddev);
@@ -421,6 +421,9 @@ static int multipath_run (mddev_t *mddev)
421 struct multipath_info *disk; 421 struct multipath_info *disk;
422 mdk_rdev_t *rdev; 422 mdk_rdev_t *rdev;
423 423
424 if (md_check_no_bitmap(mddev))
425 return -EINVAL;
426
424 if (mddev->level != LEVEL_MULTIPATH) { 427 if (mddev->level != LEVEL_MULTIPATH) {
425 printk("multipath: %s: raid level not set to multipath IO (%d)\n", 428 printk("multipath: %s: raid level not set to multipath IO (%d)\n",
426 mdname(mddev), mddev->level); 429 mdname(mddev), mddev->level);
@@ -467,7 +470,7 @@ static int multipath_run (mddev_t *mddev)
467 * violating it, not that we ever expect a device with 470 * violating it, not that we ever expect a device with
468 * a merge_bvec_fn to be involved in multipath */ 471 * a merge_bvec_fn to be involved in multipath */
469 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 472 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
470 mddev->queue->max_sectors > (PAGE_SIZE>>9)) 473 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
471 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 474 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
472 475
473 if (!test_bit(Faulty, &rdev->flags)) 476 if (!test_bit(Faulty, &rdev->flags))
@@ -531,7 +534,7 @@ out:
531 534
532static int multipath_stop (mddev_t *mddev) 535static int multipath_stop (mddev_t *mddev)
533{ 536{
534 multipath_conf_t *conf = mddev_to_conf(mddev); 537 multipath_conf_t *conf = mddev->private;
535 538
536 md_unregister_thread(mddev->thread); 539 md_unregister_thread(mddev->thread);
537 mddev->thread = NULL; 540 mddev->thread = NULL;
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h
index 6fa70b400cda..d1c2a8d78395 100644
--- a/drivers/md/multipath.h
+++ b/drivers/md/multipath.h
@@ -19,12 +19,6 @@ struct multipath_private_data {
19typedef struct multipath_private_data multipath_conf_t; 19typedef struct multipath_private_data multipath_conf_t;
20 20
21/* 21/*
22 * this is the only point in the RAID code where we violate
23 * C type safety. mddev->private is an 'opaque' pointer.
24 */
25#define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private)
26
27/*
28 * this is our 'private' 'collective' MULTIPATH buffer head. 22 * this is our 'private' 'collective' MULTIPATH buffer head.
29 * it contains information about what kind of IO operations were started 23 * it contains information about what kind of IO operations were started
30 * for this MULTIPATH operation, and about their status: 24 * for this MULTIPATH operation, and about their status:
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index c08d7559be55..ab4a489d8695 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -26,8 +26,8 @@
26static void raid0_unplug(struct request_queue *q) 26static void raid0_unplug(struct request_queue *q)
27{ 27{
28 mddev_t *mddev = q->queuedata; 28 mddev_t *mddev = q->queuedata;
29 raid0_conf_t *conf = mddev_to_conf(mddev); 29 raid0_conf_t *conf = mddev->private;
30 mdk_rdev_t **devlist = conf->strip_zone[0].dev; 30 mdk_rdev_t **devlist = conf->devlist;
31 int i; 31 int i;
32 32
33 for (i=0; i<mddev->raid_disks; i++) { 33 for (i=0; i<mddev->raid_disks; i++) {
@@ -40,8 +40,8 @@ static void raid0_unplug(struct request_queue *q)
40static int raid0_congested(void *data, int bits) 40static int raid0_congested(void *data, int bits)
41{ 41{
42 mddev_t *mddev = data; 42 mddev_t *mddev = data;
43 raid0_conf_t *conf = mddev_to_conf(mddev); 43 raid0_conf_t *conf = mddev->private;
44 mdk_rdev_t **devlist = conf->strip_zone[0].dev; 44 mdk_rdev_t **devlist = conf->devlist;
45 int i, ret = 0; 45 int i, ret = 0;
46 46
47 for (i = 0; i < mddev->raid_disks && !ret ; i++) { 47 for (i = 0; i < mddev->raid_disks && !ret ; i++) {
@@ -52,27 +52,60 @@ static int raid0_congested(void *data, int bits)
52 return ret; 52 return ret;
53} 53}
54 54
55/*
56 * inform the user of the raid configuration
57*/
58static void dump_zones(mddev_t *mddev)
59{
60 int j, k, h;
61 sector_t zone_size = 0;
62 sector_t zone_start = 0;
63 char b[BDEVNAME_SIZE];
64 raid0_conf_t *conf = mddev->private;
65 printk(KERN_INFO "******* %s configuration *********\n",
66 mdname(mddev));
67 h = 0;
68 for (j = 0; j < conf->nr_strip_zones; j++) {
69 printk(KERN_INFO "zone%d=[", j);
70 for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
71 printk("%s/",
72 bdevname(conf->devlist[j*mddev->raid_disks
73 + k]->bdev, b));
74 printk("]\n");
75
76 zone_size = conf->strip_zone[j].zone_end - zone_start;
77 printk(KERN_INFO " zone offset=%llukb "
78 "device offset=%llukb size=%llukb\n",
79 (unsigned long long)zone_start>>1,
80 (unsigned long long)conf->strip_zone[j].dev_start>>1,
81 (unsigned long long)zone_size>>1);
82 zone_start = conf->strip_zone[j].zone_end;
83 }
84 printk(KERN_INFO "**********************************\n\n");
85}
55 86
56static int create_strip_zones (mddev_t *mddev) 87static int create_strip_zones(mddev_t *mddev)
57{ 88{
58 int i, c, j; 89 int i, c, j, err;
59 sector_t current_start, curr_zone_start; 90 sector_t curr_zone_end, sectors;
60 sector_t min_spacing; 91 mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev, **dev;
61 raid0_conf_t *conf = mddev_to_conf(mddev);
62 mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
63 struct strip_zone *zone; 92 struct strip_zone *zone;
64 int cnt; 93 int cnt;
65 char b[BDEVNAME_SIZE]; 94 char b[BDEVNAME_SIZE];
66 95 raid0_conf_t *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
67 /* 96
68 * The number of 'same size groups' 97 if (!conf)
69 */ 98 return -ENOMEM;
70 conf->nr_strip_zones = 0;
71
72 list_for_each_entry(rdev1, &mddev->disks, same_set) { 99 list_for_each_entry(rdev1, &mddev->disks, same_set) {
73 printk(KERN_INFO "raid0: looking at %s\n", 100 printk(KERN_INFO "raid0: looking at %s\n",
74 bdevname(rdev1->bdev,b)); 101 bdevname(rdev1->bdev,b));
75 c = 0; 102 c = 0;
103
104 /* round size to chunk_size */
105 sectors = rdev1->sectors;
106 sector_div(sectors, mddev->chunk_sectors);
107 rdev1->sectors = sectors * mddev->chunk_sectors;
108
76 list_for_each_entry(rdev2, &mddev->disks, same_set) { 109 list_for_each_entry(rdev2, &mddev->disks, same_set) {
77 printk(KERN_INFO "raid0: comparing %s(%llu)", 110 printk(KERN_INFO "raid0: comparing %s(%llu)",
78 bdevname(rdev1->bdev,b), 111 bdevname(rdev1->bdev,b),
@@ -103,16 +136,16 @@ static int create_strip_zones (mddev_t *mddev)
103 } 136 }
104 } 137 }
105 printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones); 138 printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones);
106 139 err = -ENOMEM;
107 conf->strip_zone = kzalloc(sizeof(struct strip_zone)* 140 conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
108 conf->nr_strip_zones, GFP_KERNEL); 141 conf->nr_strip_zones, GFP_KERNEL);
109 if (!conf->strip_zone) 142 if (!conf->strip_zone)
110 return 1; 143 goto abort;
111 conf->devlist = kzalloc(sizeof(mdk_rdev_t*)* 144 conf->devlist = kzalloc(sizeof(mdk_rdev_t*)*
112 conf->nr_strip_zones*mddev->raid_disks, 145 conf->nr_strip_zones*mddev->raid_disks,
113 GFP_KERNEL); 146 GFP_KERNEL);
114 if (!conf->devlist) 147 if (!conf->devlist)
115 return 1; 148 goto abort;
116 149
117 /* The first zone must contain all devices, so here we check that 150 /* The first zone must contain all devices, so here we check that
118 * there is a proper alignment of slots to devices and find them all 151 * there is a proper alignment of slots to devices and find them all
@@ -120,7 +153,8 @@ static int create_strip_zones (mddev_t *mddev)
120 zone = &conf->strip_zone[0]; 153 zone = &conf->strip_zone[0];
121 cnt = 0; 154 cnt = 0;
122 smallest = NULL; 155 smallest = NULL;
123 zone->dev = conf->devlist; 156 dev = conf->devlist;
157 err = -EINVAL;
124 list_for_each_entry(rdev1, &mddev->disks, same_set) { 158 list_for_each_entry(rdev1, &mddev->disks, same_set) {
125 int j = rdev1->raid_disk; 159 int j = rdev1->raid_disk;
126 160
@@ -129,12 +163,12 @@ static int create_strip_zones (mddev_t *mddev)
129 "aborting!\n", j); 163 "aborting!\n", j);
130 goto abort; 164 goto abort;
131 } 165 }
132 if (zone->dev[j]) { 166 if (dev[j]) {
133 printk(KERN_ERR "raid0: multiple devices for %d - " 167 printk(KERN_ERR "raid0: multiple devices for %d - "
134 "aborting!\n", j); 168 "aborting!\n", j);
135 goto abort; 169 goto abort;
136 } 170 }
137 zone->dev[j] = rdev1; 171 dev[j] = rdev1;
138 172
139 blk_queue_stack_limits(mddev->queue, 173 blk_queue_stack_limits(mddev->queue,
140 rdev1->bdev->bd_disk->queue); 174 rdev1->bdev->bd_disk->queue);
@@ -144,7 +178,7 @@ static int create_strip_zones (mddev_t *mddev)
144 */ 178 */
145 179
146 if (rdev1->bdev->bd_disk->queue->merge_bvec_fn && 180 if (rdev1->bdev->bd_disk->queue->merge_bvec_fn &&
147 mddev->queue->max_sectors > (PAGE_SIZE>>9)) 181 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
148 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 182 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
149 183
150 if (!smallest || (rdev1->sectors < smallest->sectors)) 184 if (!smallest || (rdev1->sectors < smallest->sectors))
@@ -157,34 +191,32 @@ static int create_strip_zones (mddev_t *mddev)
157 goto abort; 191 goto abort;
158 } 192 }
159 zone->nb_dev = cnt; 193 zone->nb_dev = cnt;
160 zone->sectors = smallest->sectors * cnt; 194 zone->zone_end = smallest->sectors * cnt;
161 zone->zone_start = 0;
162 195
163 current_start = smallest->sectors; 196 curr_zone_end = zone->zone_end;
164 curr_zone_start = zone->sectors;
165 197
166 /* now do the other zones */ 198 /* now do the other zones */
167 for (i = 1; i < conf->nr_strip_zones; i++) 199 for (i = 1; i < conf->nr_strip_zones; i++)
168 { 200 {
169 zone = conf->strip_zone + i; 201 zone = conf->strip_zone + i;
170 zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks; 202 dev = conf->devlist + i * mddev->raid_disks;
171 203
172 printk(KERN_INFO "raid0: zone %d\n", i); 204 printk(KERN_INFO "raid0: zone %d\n", i);
173 zone->dev_start = current_start; 205 zone->dev_start = smallest->sectors;
174 smallest = NULL; 206 smallest = NULL;
175 c = 0; 207 c = 0;
176 208
177 for (j=0; j<cnt; j++) { 209 for (j=0; j<cnt; j++) {
178 char b[BDEVNAME_SIZE]; 210 char b[BDEVNAME_SIZE];
179 rdev = conf->strip_zone[0].dev[j]; 211 rdev = conf->devlist[j];
180 printk(KERN_INFO "raid0: checking %s ...", 212 printk(KERN_INFO "raid0: checking %s ...",
181 bdevname(rdev->bdev, b)); 213 bdevname(rdev->bdev, b));
182 if (rdev->sectors <= current_start) { 214 if (rdev->sectors <= zone->dev_start) {
183 printk(KERN_INFO " nope.\n"); 215 printk(KERN_INFO " nope.\n");
184 continue; 216 continue;
185 } 217 }
186 printk(KERN_INFO " contained as device %d\n", c); 218 printk(KERN_INFO " contained as device %d\n", c);
187 zone->dev[c] = rdev; 219 dev[c] = rdev;
188 c++; 220 c++;
189 if (!smallest || rdev->sectors < smallest->sectors) { 221 if (!smallest || rdev->sectors < smallest->sectors) {
190 smallest = rdev; 222 smallest = rdev;
@@ -194,47 +226,39 @@ static int create_strip_zones (mddev_t *mddev)
194 } 226 }
195 227
196 zone->nb_dev = c; 228 zone->nb_dev = c;
197 zone->sectors = (smallest->sectors - current_start) * c; 229 sectors = (smallest->sectors - zone->dev_start) * c;
198 printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", 230 printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n",
199 zone->nb_dev, (unsigned long long)zone->sectors); 231 zone->nb_dev, (unsigned long long)sectors);
200 232
201 zone->zone_start = curr_zone_start; 233 curr_zone_end += sectors;
202 curr_zone_start += zone->sectors; 234 zone->zone_end = curr_zone_end;
203 235
204 current_start = smallest->sectors;
205 printk(KERN_INFO "raid0: current zone start: %llu\n", 236 printk(KERN_INFO "raid0: current zone start: %llu\n",
206 (unsigned long long)current_start); 237 (unsigned long long)smallest->sectors);
207 }
208
209 /* Now find appropriate hash spacing.
210 * We want a number which causes most hash entries to cover
211 * at most two strips, but the hash table must be at most
212 * 1 PAGE. We choose the smallest strip, or contiguous collection
213 * of strips, that has big enough size. We never consider the last
214 * strip though as it's size has no bearing on the efficacy of the hash
215 * table.
216 */
217 conf->spacing = curr_zone_start;
218 min_spacing = curr_zone_start;
219 sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*));
220 for (i=0; i < conf->nr_strip_zones-1; i++) {
221 sector_t s = 0;
222 for (j = i; j < conf->nr_strip_zones - 1 &&
223 s < min_spacing; j++)
224 s += conf->strip_zone[j].sectors;
225 if (s >= min_spacing && s < conf->spacing)
226 conf->spacing = s;
227 } 238 }
228
229 mddev->queue->unplug_fn = raid0_unplug; 239 mddev->queue->unplug_fn = raid0_unplug;
230
231 mddev->queue->backing_dev_info.congested_fn = raid0_congested; 240 mddev->queue->backing_dev_info.congested_fn = raid0_congested;
232 mddev->queue->backing_dev_info.congested_data = mddev; 241 mddev->queue->backing_dev_info.congested_data = mddev;
233 242
243 /*
244 * now since we have the hard sector sizes, we can make sure
245 * chunk size is a multiple of that sector size
246 */
247 if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) {
248 printk(KERN_ERR "%s chunk_size of %d not valid\n",
249 mdname(mddev),
250 mddev->chunk_sectors << 9);
251 goto abort;
252 }
234 printk(KERN_INFO "raid0: done.\n"); 253 printk(KERN_INFO "raid0: done.\n");
254 mddev->private = conf;
235 return 0; 255 return 0;
236 abort: 256abort:
237 return 1; 257 kfree(conf->strip_zone);
258 kfree(conf->devlist);
259 kfree(conf);
260 mddev->private = NULL;
261 return err;
238} 262}
239 263
240/** 264/**
@@ -252,10 +276,15 @@ static int raid0_mergeable_bvec(struct request_queue *q,
252 mddev_t *mddev = q->queuedata; 276 mddev_t *mddev = q->queuedata;
253 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 277 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
254 int max; 278 int max;
255 unsigned int chunk_sectors = mddev->chunk_size >> 9; 279 unsigned int chunk_sectors = mddev->chunk_sectors;
256 unsigned int bio_sectors = bvm->bi_size >> 9; 280 unsigned int bio_sectors = bvm->bi_size >> 9;
257 281
258 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 282 if (is_power_of_2(chunk_sectors))
283 max = (chunk_sectors - ((sector & (chunk_sectors-1))
284 + bio_sectors)) << 9;
285 else
286 max = (chunk_sectors - (sector_div(sector, chunk_sectors)
287 + bio_sectors)) << 9;
259 if (max < 0) max = 0; /* bio_add cannot handle a negative return */ 288 if (max < 0) max = 0; /* bio_add cannot handle a negative return */
260 if (max <= biovec->bv_len && bio_sectors == 0) 289 if (max <= biovec->bv_len && bio_sectors == 0)
261 return biovec->bv_len; 290 return biovec->bv_len;
@@ -277,84 +306,28 @@ static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
277 return array_sectors; 306 return array_sectors;
278} 307}
279 308
280static int raid0_run (mddev_t *mddev) 309static int raid0_run(mddev_t *mddev)
281{ 310{
282 unsigned cur=0, i=0, nb_zone; 311 int ret;
283 s64 sectors;
284 raid0_conf_t *conf;
285 312
286 if (mddev->chunk_size == 0) { 313 if (mddev->chunk_sectors == 0) {
287 printk(KERN_ERR "md/raid0: non-zero chunk size required.\n"); 314 printk(KERN_ERR "md/raid0: chunk size must be set.\n");
288 return -EINVAL; 315 return -EINVAL;
289 } 316 }
290 printk(KERN_INFO "%s: setting max_sectors to %d, segment boundary to %d\n", 317 if (md_check_no_bitmap(mddev))
291 mdname(mddev), 318 return -EINVAL;
292 mddev->chunk_size >> 9, 319 blk_queue_max_sectors(mddev->queue, mddev->chunk_sectors);
293 (mddev->chunk_size>>1)-1);
294 blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9);
295 blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1);
296 mddev->queue->queue_lock = &mddev->queue->__queue_lock; 320 mddev->queue->queue_lock = &mddev->queue->__queue_lock;
297 321
298 conf = kmalloc(sizeof (raid0_conf_t), GFP_KERNEL); 322 ret = create_strip_zones(mddev);
299 if (!conf) 323 if (ret < 0)
300 goto out; 324 return ret;
301 mddev->private = (void *)conf;
302
303 conf->strip_zone = NULL;
304 conf->devlist = NULL;
305 if (create_strip_zones (mddev))
306 goto out_free_conf;
307 325
308 /* calculate array device size */ 326 /* calculate array device size */
309 md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); 327 md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
310 328
311 printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", 329 printk(KERN_INFO "raid0 : md_size is %llu sectors.\n",
312 (unsigned long long)mddev->array_sectors); 330 (unsigned long long)mddev->array_sectors);
313 printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n",
314 (unsigned long long)conf->spacing);
315 {
316 sector_t s = raid0_size(mddev, 0, 0);
317 sector_t space = conf->spacing;
318 int round;
319 conf->sector_shift = 0;
320 if (sizeof(sector_t) > sizeof(u32)) {
321 /*shift down space and s so that sector_div will work */
322 while (space > (sector_t) (~(u32)0)) {
323 s >>= 1;
324 space >>= 1;
325 s += 1; /* force round-up */
326 conf->sector_shift++;
327 }
328 }
329 round = sector_div(s, (u32)space) ? 1 : 0;
330 nb_zone = s + round;
331 }
332 printk(KERN_INFO "raid0 : nb_zone is %d.\n", nb_zone);
333
334 printk(KERN_INFO "raid0 : Allocating %zu bytes for hash.\n",
335 nb_zone*sizeof(struct strip_zone*));
336 conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL);
337 if (!conf->hash_table)
338 goto out_free_conf;
339 sectors = conf->strip_zone[cur].sectors;
340
341 conf->hash_table[0] = conf->strip_zone + cur;
342 for (i=1; i< nb_zone; i++) {
343 while (sectors <= conf->spacing) {
344 cur++;
345 sectors += conf->strip_zone[cur].sectors;
346 }
347 sectors -= conf->spacing;
348 conf->hash_table[i] = conf->strip_zone + cur;
349 }
350 if (conf->sector_shift) {
351 conf->spacing >>= conf->sector_shift;
352 /* round spacing up so when we divide by it, we
353 * err on the side of too-low, which is safest
354 */
355 conf->spacing++;
356 }
357
358 /* calculate the max read-ahead size. 331 /* calculate the max read-ahead size.
359 * For read-ahead of large files to be effective, we need to 332 * For read-ahead of large files to be effective, we need to
360 * readahead at least twice a whole stripe. i.e. number of devices 333 * readahead at least twice a whole stripe. i.e. number of devices
@@ -365,48 +338,107 @@ static int raid0_run (mddev_t *mddev)
365 * chunksize should be used in that case. 338 * chunksize should be used in that case.
366 */ 339 */
367 { 340 {
368 int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_SIZE; 341 int stripe = mddev->raid_disks *
342 (mddev->chunk_sectors << 9) / PAGE_SIZE;
369 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) 343 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
370 mddev->queue->backing_dev_info.ra_pages = 2* stripe; 344 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
371 } 345 }
372 346
373
374 blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); 347 blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
348 dump_zones(mddev);
375 return 0; 349 return 0;
350}
376 351
377out_free_conf: 352static int raid0_stop(mddev_t *mddev)
353{
354 raid0_conf_t *conf = mddev->private;
355
356 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
378 kfree(conf->strip_zone); 357 kfree(conf->strip_zone);
379 kfree(conf->devlist); 358 kfree(conf->devlist);
380 kfree(conf); 359 kfree(conf);
381 mddev->private = NULL; 360 mddev->private = NULL;
382out: 361 return 0;
383 return -ENOMEM;
384} 362}
385 363
386static int raid0_stop (mddev_t *mddev) 364/* Find the zone which holds a particular offset
365 * Update *sectorp to be an offset in that zone
366 */
367static struct strip_zone *find_zone(struct raid0_private_data *conf,
368 sector_t *sectorp)
387{ 369{
388 raid0_conf_t *conf = mddev_to_conf(mddev); 370 int i;
371 struct strip_zone *z = conf->strip_zone;
372 sector_t sector = *sectorp;
373
374 for (i = 0; i < conf->nr_strip_zones; i++)
375 if (sector < z[i].zone_end) {
376 if (i)
377 *sectorp = sector - z[i-1].zone_end;
378 return z + i;
379 }
380 BUG();
381}
389 382
390 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 383/*
391 kfree(conf->hash_table); 384 * remaps the bio to the target device. we separate two flows.
392 conf->hash_table = NULL; 385 * power 2 flow and a general flow for the sake of perfromance
393 kfree(conf->strip_zone); 386*/
394 conf->strip_zone = NULL; 387static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone,
395 kfree(conf); 388 sector_t sector, sector_t *sector_offset)
396 mddev->private = NULL; 389{
390 unsigned int sect_in_chunk;
391 sector_t chunk;
392 raid0_conf_t *conf = mddev->private;
393 unsigned int chunk_sects = mddev->chunk_sectors;
394
395 if (is_power_of_2(chunk_sects)) {
396 int chunksect_bits = ffz(~chunk_sects);
397 /* find the sector offset inside the chunk */
398 sect_in_chunk = sector & (chunk_sects - 1);
399 sector >>= chunksect_bits;
400 /* chunk in zone */
401 chunk = *sector_offset;
402 /* quotient is the chunk in real device*/
403 sector_div(chunk, zone->nb_dev << chunksect_bits);
404 } else{
405 sect_in_chunk = sector_div(sector, chunk_sects);
406 chunk = *sector_offset;
407 sector_div(chunk, chunk_sects * zone->nb_dev);
408 }
409 /*
410 * position the bio over the real device
411 * real sector = chunk in device + starting of zone
412 * + the position in the chunk
413 */
414 *sector_offset = (chunk * chunk_sects) + sect_in_chunk;
415 return conf->devlist[(zone - conf->strip_zone)*mddev->raid_disks
416 + sector_div(sector, zone->nb_dev)];
417}
397 418
398 return 0; 419/*
420 * Is io distribute over 1 or more chunks ?
421*/
422static inline int is_io_in_chunk_boundary(mddev_t *mddev,
423 unsigned int chunk_sects, struct bio *bio)
424{
425 if (likely(is_power_of_2(chunk_sects))) {
426 return chunk_sects >= ((bio->bi_sector & (chunk_sects-1))
427 + (bio->bi_size >> 9));
428 } else{
429 sector_t sector = bio->bi_sector;
430 return chunk_sects >= (sector_div(sector, chunk_sects)
431 + (bio->bi_size >> 9));
432 }
399} 433}
400 434
401static int raid0_make_request (struct request_queue *q, struct bio *bio) 435static int raid0_make_request(struct request_queue *q, struct bio *bio)
402{ 436{
403 mddev_t *mddev = q->queuedata; 437 mddev_t *mddev = q->queuedata;
404 unsigned int sect_in_chunk, chunksect_bits, chunk_sects; 438 unsigned int chunk_sects;
405 raid0_conf_t *conf = mddev_to_conf(mddev); 439 sector_t sector_offset;
406 struct strip_zone *zone; 440 struct strip_zone *zone;
407 mdk_rdev_t *tmp_dev; 441 mdk_rdev_t *tmp_dev;
408 sector_t chunk;
409 sector_t sector, rsect;
410 const int rw = bio_data_dir(bio); 442 const int rw = bio_data_dir(bio);
411 int cpu; 443 int cpu;
412 444
@@ -421,11 +453,9 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
421 bio_sectors(bio)); 453 bio_sectors(bio));
422 part_stat_unlock(); 454 part_stat_unlock();
423 455
424 chunk_sects = mddev->chunk_size >> 9; 456 chunk_sects = mddev->chunk_sectors;
425 chunksect_bits = ffz(~chunk_sects); 457 if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) {
426 sector = bio->bi_sector; 458 sector_t sector = bio->bi_sector;
427
428 if (unlikely(chunk_sects < (bio->bi_sector & (chunk_sects - 1)) + (bio->bi_size >> 9))) {
429 struct bio_pair *bp; 459 struct bio_pair *bp;
430 /* Sanity check -- queue functions should prevent this happening */ 460 /* Sanity check -- queue functions should prevent this happening */
431 if (bio->bi_vcnt != 1 || 461 if (bio->bi_vcnt != 1 ||
@@ -434,7 +464,12 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
434 /* This is a one page bio that upper layers 464 /* This is a one page bio that upper layers
435 * refuse to split for us, so we need to split it. 465 * refuse to split for us, so we need to split it.
436 */ 466 */
437 bp = bio_split(bio, chunk_sects - (bio->bi_sector & (chunk_sects - 1))); 467 if (likely(is_power_of_2(chunk_sects)))
468 bp = bio_split(bio, chunk_sects - (sector &
469 (chunk_sects-1)));
470 else
471 bp = bio_split(bio, chunk_sects -
472 sector_div(sector, chunk_sects));
438 if (raid0_make_request(q, &bp->bio1)) 473 if (raid0_make_request(q, &bp->bio1))
439 generic_make_request(&bp->bio1); 474 generic_make_request(&bp->bio1);
440 if (raid0_make_request(q, &bp->bio2)) 475 if (raid0_make_request(q, &bp->bio2))
@@ -443,34 +478,14 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
443 bio_pair_release(bp); 478 bio_pair_release(bp);
444 return 0; 479 return 0;
445 } 480 }
446
447
448 {
449 sector_t x = sector >> conf->sector_shift;
450 sector_div(x, (u32)conf->spacing);
451 zone = conf->hash_table[x];
452 }
453 481
454 while (sector >= zone->zone_start + zone->sectors) 482 sector_offset = bio->bi_sector;
455 zone++; 483 zone = find_zone(mddev->private, &sector_offset);
456 484 tmp_dev = map_sector(mddev, zone, bio->bi_sector,
457 sect_in_chunk = bio->bi_sector & (chunk_sects - 1); 485 &sector_offset);
458
459
460 {
461 sector_t x = (sector - zone->zone_start) >> chunksect_bits;
462
463 sector_div(x, zone->nb_dev);
464 chunk = x;
465
466 x = sector >> chunksect_bits;
467 tmp_dev = zone->dev[sector_div(x, zone->nb_dev)];
468 }
469 rsect = (chunk << chunksect_bits) + zone->dev_start + sect_in_chunk;
470
471 bio->bi_bdev = tmp_dev->bdev; 486 bio->bi_bdev = tmp_dev->bdev;
472 bio->bi_sector = rsect + tmp_dev->data_offset; 487 bio->bi_sector = sector_offset + zone->dev_start +
473 488 tmp_dev->data_offset;
474 /* 489 /*
475 * Let the main block layer submit the IO and resolve recursion: 490 * Let the main block layer submit the IO and resolve recursion:
476 */ 491 */
@@ -485,31 +500,35 @@ bad_map:
485 return 0; 500 return 0;
486} 501}
487 502
488static void raid0_status (struct seq_file *seq, mddev_t *mddev) 503static void raid0_status(struct seq_file *seq, mddev_t *mddev)
489{ 504{
490#undef MD_DEBUG 505#undef MD_DEBUG
491#ifdef MD_DEBUG 506#ifdef MD_DEBUG
492 int j, k, h; 507 int j, k, h;
493 char b[BDEVNAME_SIZE]; 508 char b[BDEVNAME_SIZE];
494 raid0_conf_t *conf = mddev_to_conf(mddev); 509 raid0_conf_t *conf = mddev->private;
495 510
511 sector_t zone_size;
512 sector_t zone_start = 0;
496 h = 0; 513 h = 0;
514
497 for (j = 0; j < conf->nr_strip_zones; j++) { 515 for (j = 0; j < conf->nr_strip_zones; j++) {
498 seq_printf(seq, " z%d", j); 516 seq_printf(seq, " z%d", j);
499 if (conf->hash_table[h] == conf->strip_zone+j)
500 seq_printf(seq, "(h%d)", h++);
501 seq_printf(seq, "=["); 517 seq_printf(seq, "=[");
502 for (k = 0; k < conf->strip_zone[j].nb_dev; k++) 518 for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
503 seq_printf(seq, "%s/", bdevname( 519 seq_printf(seq, "%s/", bdevname(
504 conf->strip_zone[j].dev[k]->bdev,b)); 520 conf->devlist[j*mddev->raid_disks + k]
505 521 ->bdev, b));
506 seq_printf(seq, "] zs=%d ds=%d s=%d\n", 522
507 conf->strip_zone[j].zone_start, 523 zone_size = conf->strip_zone[j].zone_end - zone_start;
508 conf->strip_zone[j].dev_start, 524 seq_printf(seq, "] ze=%lld ds=%lld s=%lld\n",
509 conf->strip_zone[j].sectors); 525 (unsigned long long)zone_start>>1,
526 (unsigned long long)conf->strip_zone[j].dev_start>>1,
527 (unsigned long long)zone_size>>1);
528 zone_start = conf->strip_zone[j].zone_end;
510 } 529 }
511#endif 530#endif
512 seq_printf(seq, " %dk chunks", mddev->chunk_size/1024); 531 seq_printf(seq, " %dk chunks", mddev->chunk_sectors / 2);
513 return; 532 return;
514} 533}
515 534
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 824b12eb1d4f..91f8e876ee64 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -3,26 +3,18 @@
3 3
4struct strip_zone 4struct strip_zone
5{ 5{
6 sector_t zone_start; /* Zone offset in md_dev (in sectors) */ 6 sector_t zone_end; /* Start of the next zone (in sectors) */
7 sector_t dev_start; /* Zone offset in real dev (in sectors) */ 7 sector_t dev_start; /* Zone offset in real dev (in sectors) */
8 sector_t sectors; /* Zone size in sectors */
9 int nb_dev; /* # of devices attached to the zone */ 8 int nb_dev; /* # of devices attached to the zone */
10 mdk_rdev_t **dev; /* Devices attached to the zone */
11}; 9};
12 10
13struct raid0_private_data 11struct raid0_private_data
14{ 12{
15 struct strip_zone **hash_table; /* Table of indexes into strip_zone */
16 struct strip_zone *strip_zone; 13 struct strip_zone *strip_zone;
17 mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ 14 mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
18 int nr_strip_zones; 15 int nr_strip_zones;
19
20 sector_t spacing;
21 int sector_shift; /* shift this before divide by spacing */
22}; 16};
23 17
24typedef struct raid0_private_data raid0_conf_t; 18typedef struct raid0_private_data raid0_conf_t;
25 19
26#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private)
27
28#endif 20#endif
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 36df9109cde1..89939a7aef57 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -182,7 +182,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
182 182
183static void free_r1bio(r1bio_t *r1_bio) 183static void free_r1bio(r1bio_t *r1_bio)
184{ 184{
185 conf_t *conf = mddev_to_conf(r1_bio->mddev); 185 conf_t *conf = r1_bio->mddev->private;
186 186
187 /* 187 /*
188 * Wake up any possible resync thread that waits for the device 188 * Wake up any possible resync thread that waits for the device
@@ -196,7 +196,7 @@ static void free_r1bio(r1bio_t *r1_bio)
196 196
197static void put_buf(r1bio_t *r1_bio) 197static void put_buf(r1bio_t *r1_bio)
198{ 198{
199 conf_t *conf = mddev_to_conf(r1_bio->mddev); 199 conf_t *conf = r1_bio->mddev->private;
200 int i; 200 int i;
201 201
202 for (i=0; i<conf->raid_disks; i++) { 202 for (i=0; i<conf->raid_disks; i++) {
@@ -214,7 +214,7 @@ static void reschedule_retry(r1bio_t *r1_bio)
214{ 214{
215 unsigned long flags; 215 unsigned long flags;
216 mddev_t *mddev = r1_bio->mddev; 216 mddev_t *mddev = r1_bio->mddev;
217 conf_t *conf = mddev_to_conf(mddev); 217 conf_t *conf = mddev->private;
218 218
219 spin_lock_irqsave(&conf->device_lock, flags); 219 spin_lock_irqsave(&conf->device_lock, flags);
220 list_add(&r1_bio->retry_list, &conf->retry_list); 220 list_add(&r1_bio->retry_list, &conf->retry_list);
@@ -253,7 +253,7 @@ static void raid_end_bio_io(r1bio_t *r1_bio)
253 */ 253 */
254static inline void update_head_pos(int disk, r1bio_t *r1_bio) 254static inline void update_head_pos(int disk, r1bio_t *r1_bio)
255{ 255{
256 conf_t *conf = mddev_to_conf(r1_bio->mddev); 256 conf_t *conf = r1_bio->mddev->private;
257 257
258 conf->mirrors[disk].head_position = 258 conf->mirrors[disk].head_position =
259 r1_bio->sector + (r1_bio->sectors); 259 r1_bio->sector + (r1_bio->sectors);
@@ -264,7 +264,7 @@ static void raid1_end_read_request(struct bio *bio, int error)
264 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 264 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
265 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 265 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
266 int mirror; 266 int mirror;
267 conf_t *conf = mddev_to_conf(r1_bio->mddev); 267 conf_t *conf = r1_bio->mddev->private;
268 268
269 mirror = r1_bio->read_disk; 269 mirror = r1_bio->read_disk;
270 /* 270 /*
@@ -309,7 +309,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
309 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 309 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
310 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 310 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
311 int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); 311 int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
312 conf_t *conf = mddev_to_conf(r1_bio->mddev); 312 conf_t *conf = r1_bio->mddev->private;
313 struct bio *to_put = NULL; 313 struct bio *to_put = NULL;
314 314
315 315
@@ -541,7 +541,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
541 541
542static void unplug_slaves(mddev_t *mddev) 542static void unplug_slaves(mddev_t *mddev)
543{ 543{
544 conf_t *conf = mddev_to_conf(mddev); 544 conf_t *conf = mddev->private;
545 int i; 545 int i;
546 546
547 rcu_read_lock(); 547 rcu_read_lock();
@@ -573,7 +573,7 @@ static void raid1_unplug(struct request_queue *q)
573static int raid1_congested(void *data, int bits) 573static int raid1_congested(void *data, int bits)
574{ 574{
575 mddev_t *mddev = data; 575 mddev_t *mddev = data;
576 conf_t *conf = mddev_to_conf(mddev); 576 conf_t *conf = mddev->private;
577 int i, ret = 0; 577 int i, ret = 0;
578 578
579 rcu_read_lock(); 579 rcu_read_lock();
@@ -772,7 +772,7 @@ do_sync_io:
772static int make_request(struct request_queue *q, struct bio * bio) 772static int make_request(struct request_queue *q, struct bio * bio)
773{ 773{
774 mddev_t *mddev = q->queuedata; 774 mddev_t *mddev = q->queuedata;
775 conf_t *conf = mddev_to_conf(mddev); 775 conf_t *conf = mddev->private;
776 mirror_info_t *mirror; 776 mirror_info_t *mirror;
777 r1bio_t *r1_bio; 777 r1bio_t *r1_bio;
778 struct bio *read_bio; 778 struct bio *read_bio;
@@ -991,7 +991,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
991 991
992static void status(struct seq_file *seq, mddev_t *mddev) 992static void status(struct seq_file *seq, mddev_t *mddev)
993{ 993{
994 conf_t *conf = mddev_to_conf(mddev); 994 conf_t *conf = mddev->private;
995 int i; 995 int i;
996 996
997 seq_printf(seq, " [%d/%d] [", conf->raid_disks, 997 seq_printf(seq, " [%d/%d] [", conf->raid_disks,
@@ -1010,7 +1010,7 @@ static void status(struct seq_file *seq, mddev_t *mddev)
1010static void error(mddev_t *mddev, mdk_rdev_t *rdev) 1010static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1011{ 1011{
1012 char b[BDEVNAME_SIZE]; 1012 char b[BDEVNAME_SIZE];
1013 conf_t *conf = mddev_to_conf(mddev); 1013 conf_t *conf = mddev->private;
1014 1014
1015 /* 1015 /*
1016 * If it is not operational, then we have already marked it as dead 1016 * If it is not operational, then we have already marked it as dead
@@ -1130,7 +1130,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1130 * a one page request is never in violation. 1130 * a one page request is never in violation.
1131 */ 1131 */
1132 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 1132 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
1133 mddev->queue->max_sectors > (PAGE_SIZE>>9)) 1133 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
1134 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 1134 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
1135 1135
1136 p->head_position = 0; 1136 p->head_position = 0;
@@ -1214,7 +1214,7 @@ static void end_sync_write(struct bio *bio, int error)
1214 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1214 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1215 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 1215 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
1216 mddev_t *mddev = r1_bio->mddev; 1216 mddev_t *mddev = r1_bio->mddev;
1217 conf_t *conf = mddev_to_conf(mddev); 1217 conf_t *conf = mddev->private;
1218 int i; 1218 int i;
1219 int mirror=0; 1219 int mirror=0;
1220 1220
@@ -1248,7 +1248,7 @@ static void end_sync_write(struct bio *bio, int error)
1248 1248
1249static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) 1249static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1250{ 1250{
1251 conf_t *conf = mddev_to_conf(mddev); 1251 conf_t *conf = mddev->private;
1252 int i; 1252 int i;
1253 int disks = conf->raid_disks; 1253 int disks = conf->raid_disks;
1254 struct bio *bio, *wbio; 1254 struct bio *bio, *wbio;
@@ -1562,7 +1562,7 @@ static void raid1d(mddev_t *mddev)
1562 r1bio_t *r1_bio; 1562 r1bio_t *r1_bio;
1563 struct bio *bio; 1563 struct bio *bio;
1564 unsigned long flags; 1564 unsigned long flags;
1565 conf_t *conf = mddev_to_conf(mddev); 1565 conf_t *conf = mddev->private;
1566 struct list_head *head = &conf->retry_list; 1566 struct list_head *head = &conf->retry_list;
1567 int unplug=0; 1567 int unplug=0;
1568 mdk_rdev_t *rdev; 1568 mdk_rdev_t *rdev;
@@ -1585,7 +1585,7 @@ static void raid1d(mddev_t *mddev)
1585 spin_unlock_irqrestore(&conf->device_lock, flags); 1585 spin_unlock_irqrestore(&conf->device_lock, flags);
1586 1586
1587 mddev = r1_bio->mddev; 1587 mddev = r1_bio->mddev;
1588 conf = mddev_to_conf(mddev); 1588 conf = mddev->private;
1589 if (test_bit(R1BIO_IsSync, &r1_bio->state)) { 1589 if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
1590 sync_request_write(mddev, r1_bio); 1590 sync_request_write(mddev, r1_bio);
1591 unplug = 1; 1591 unplug = 1;
@@ -1706,7 +1706,7 @@ static int init_resync(conf_t *conf)
1706 1706
1707static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) 1707static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1708{ 1708{
1709 conf_t *conf = mddev_to_conf(mddev); 1709 conf_t *conf = mddev->private;
1710 r1bio_t *r1_bio; 1710 r1bio_t *r1_bio;
1711 struct bio *bio; 1711 struct bio *bio;
1712 sector_t max_sector, nr_sectors; 1712 sector_t max_sector, nr_sectors;
@@ -1996,7 +1996,7 @@ static int run(mddev_t *mddev)
1996 * a one page request is never in violation. 1996 * a one page request is never in violation.
1997 */ 1997 */
1998 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 1998 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
1999 mddev->queue->max_sectors > (PAGE_SIZE>>9)) 1999 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
2000 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 2000 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
2001 2001
2002 disk->head_position = 0; 2002 disk->head_position = 0;
@@ -2052,6 +2052,10 @@ static int run(mddev_t *mddev)
2052 goto out_free_conf; 2052 goto out_free_conf;
2053 } 2053 }
2054 2054
2055 if (mddev->recovery_cp != MaxSector)
2056 printk(KERN_NOTICE "raid1: %s is not clean"
2057 " -- starting background reconstruction\n",
2058 mdname(mddev));
2055 printk(KERN_INFO 2059 printk(KERN_INFO
2056 "raid1: raid set %s active with %d out of %d mirrors\n", 2060 "raid1: raid set %s active with %d out of %d mirrors\n",
2057 mdname(mddev), mddev->raid_disks - mddev->degraded, 2061 mdname(mddev), mddev->raid_disks - mddev->degraded,
@@ -2087,7 +2091,7 @@ out:
2087 2091
2088static int stop(mddev_t *mddev) 2092static int stop(mddev_t *mddev)
2089{ 2093{
2090 conf_t *conf = mddev_to_conf(mddev); 2094 conf_t *conf = mddev->private;
2091 struct bitmap *bitmap = mddev->bitmap; 2095 struct bitmap *bitmap = mddev->bitmap;
2092 int behind_wait = 0; 2096 int behind_wait = 0;
2093 2097
@@ -2155,16 +2159,16 @@ static int raid1_reshape(mddev_t *mddev)
2155 mempool_t *newpool, *oldpool; 2159 mempool_t *newpool, *oldpool;
2156 struct pool_info *newpoolinfo; 2160 struct pool_info *newpoolinfo;
2157 mirror_info_t *newmirrors; 2161 mirror_info_t *newmirrors;
2158 conf_t *conf = mddev_to_conf(mddev); 2162 conf_t *conf = mddev->private;
2159 int cnt, raid_disks; 2163 int cnt, raid_disks;
2160 unsigned long flags; 2164 unsigned long flags;
2161 int d, d2, err; 2165 int d, d2, err;
2162 2166
2163 /* Cannot change chunk_size, layout, or level */ 2167 /* Cannot change chunk_size, layout, or level */
2164 if (mddev->chunk_size != mddev->new_chunk || 2168 if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
2165 mddev->layout != mddev->new_layout || 2169 mddev->layout != mddev->new_layout ||
2166 mddev->level != mddev->new_level) { 2170 mddev->level != mddev->new_level) {
2167 mddev->new_chunk = mddev->chunk_size; 2171 mddev->new_chunk_sectors = mddev->chunk_sectors;
2168 mddev->new_layout = mddev->layout; 2172 mddev->new_layout = mddev->layout;
2169 mddev->new_level = mddev->level; 2173 mddev->new_level = mddev->level;
2170 return -EINVAL; 2174 return -EINVAL;
@@ -2252,7 +2256,7 @@ static int raid1_reshape(mddev_t *mddev)
2252 2256
2253static void raid1_quiesce(mddev_t *mddev, int state) 2257static void raid1_quiesce(mddev_t *mddev, int state)
2254{ 2258{
2255 conf_t *conf = mddev_to_conf(mddev); 2259 conf_t *conf = mddev->private;
2256 2260
2257 switch(state) { 2261 switch(state) {
2258 case 1: 2262 case 1:
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 1620eea3d57c..e87b84deff68 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -64,12 +64,6 @@ struct r1_private_data_s {
64typedef struct r1_private_data_s conf_t; 64typedef struct r1_private_data_s conf_t;
65 65
66/* 66/*
67 * this is the only point in the RAID code where we violate
68 * C type safety. mddev->private is an 'opaque' pointer.
69 */
70#define mddev_to_conf(mddev) ((conf_t *) mddev->private)
71
72/*
73 * this is our 'private' RAID1 bio. 67 * this is our 'private' RAID1 bio.
74 * 68 *
75 * it contains information about what kind of IO operations were started 69 * it contains information about what kind of IO operations were started
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 499620afb44b..ae12ceafe10c 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -188,7 +188,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
188 188
189static void free_r10bio(r10bio_t *r10_bio) 189static void free_r10bio(r10bio_t *r10_bio)
190{ 190{
191 conf_t *conf = mddev_to_conf(r10_bio->mddev); 191 conf_t *conf = r10_bio->mddev->private;
192 192
193 /* 193 /*
194 * Wake up any possible resync thread that waits for the device 194 * Wake up any possible resync thread that waits for the device
@@ -202,7 +202,7 @@ static void free_r10bio(r10bio_t *r10_bio)
202 202
203static void put_buf(r10bio_t *r10_bio) 203static void put_buf(r10bio_t *r10_bio)
204{ 204{
205 conf_t *conf = mddev_to_conf(r10_bio->mddev); 205 conf_t *conf = r10_bio->mddev->private;
206 206
207 mempool_free(r10_bio, conf->r10buf_pool); 207 mempool_free(r10_bio, conf->r10buf_pool);
208 208
@@ -213,7 +213,7 @@ static void reschedule_retry(r10bio_t *r10_bio)
213{ 213{
214 unsigned long flags; 214 unsigned long flags;
215 mddev_t *mddev = r10_bio->mddev; 215 mddev_t *mddev = r10_bio->mddev;
216 conf_t *conf = mddev_to_conf(mddev); 216 conf_t *conf = mddev->private;
217 217
218 spin_lock_irqsave(&conf->device_lock, flags); 218 spin_lock_irqsave(&conf->device_lock, flags);
219 list_add(&r10_bio->retry_list, &conf->retry_list); 219 list_add(&r10_bio->retry_list, &conf->retry_list);
@@ -245,7 +245,7 @@ static void raid_end_bio_io(r10bio_t *r10_bio)
245 */ 245 */
246static inline void update_head_pos(int slot, r10bio_t *r10_bio) 246static inline void update_head_pos(int slot, r10bio_t *r10_bio)
247{ 247{
248 conf_t *conf = mddev_to_conf(r10_bio->mddev); 248 conf_t *conf = r10_bio->mddev->private;
249 249
250 conf->mirrors[r10_bio->devs[slot].devnum].head_position = 250 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
251 r10_bio->devs[slot].addr + (r10_bio->sectors); 251 r10_bio->devs[slot].addr + (r10_bio->sectors);
@@ -256,7 +256,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
256 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 256 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
257 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 257 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
258 int slot, dev; 258 int slot, dev;
259 conf_t *conf = mddev_to_conf(r10_bio->mddev); 259 conf_t *conf = r10_bio->mddev->private;
260 260
261 261
262 slot = r10_bio->read_slot; 262 slot = r10_bio->read_slot;
@@ -297,7 +297,7 @@ static void raid10_end_write_request(struct bio *bio, int error)
297 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 297 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
298 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 298 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
299 int slot, dev; 299 int slot, dev;
300 conf_t *conf = mddev_to_conf(r10_bio->mddev); 300 conf_t *conf = r10_bio->mddev->private;
301 301
302 for (slot = 0; slot < conf->copies; slot++) 302 for (slot = 0; slot < conf->copies; slot++)
303 if (r10_bio->devs[slot].bio == bio) 303 if (r10_bio->devs[slot].bio == bio)
@@ -461,7 +461,7 @@ static int raid10_mergeable_bvec(struct request_queue *q,
461 mddev_t *mddev = q->queuedata; 461 mddev_t *mddev = q->queuedata;
462 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 462 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
463 int max; 463 int max;
464 unsigned int chunk_sectors = mddev->chunk_size >> 9; 464 unsigned int chunk_sectors = mddev->chunk_sectors;
465 unsigned int bio_sectors = bvm->bi_size >> 9; 465 unsigned int bio_sectors = bvm->bi_size >> 9;
466 466
467 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 467 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
@@ -596,7 +596,7 @@ rb_out:
596 596
597static void unplug_slaves(mddev_t *mddev) 597static void unplug_slaves(mddev_t *mddev)
598{ 598{
599 conf_t *conf = mddev_to_conf(mddev); 599 conf_t *conf = mddev->private;
600 int i; 600 int i;
601 601
602 rcu_read_lock(); 602 rcu_read_lock();
@@ -628,7 +628,7 @@ static void raid10_unplug(struct request_queue *q)
628static int raid10_congested(void *data, int bits) 628static int raid10_congested(void *data, int bits)
629{ 629{
630 mddev_t *mddev = data; 630 mddev_t *mddev = data;
631 conf_t *conf = mddev_to_conf(mddev); 631 conf_t *conf = mddev->private;
632 int i, ret = 0; 632 int i, ret = 0;
633 633
634 rcu_read_lock(); 634 rcu_read_lock();
@@ -788,7 +788,7 @@ static void unfreeze_array(conf_t *conf)
788static int make_request(struct request_queue *q, struct bio * bio) 788static int make_request(struct request_queue *q, struct bio * bio)
789{ 789{
790 mddev_t *mddev = q->queuedata; 790 mddev_t *mddev = q->queuedata;
791 conf_t *conf = mddev_to_conf(mddev); 791 conf_t *conf = mddev->private;
792 mirror_info_t *mirror; 792 mirror_info_t *mirror;
793 r10bio_t *r10_bio; 793 r10bio_t *r10_bio;
794 struct bio *read_bio; 794 struct bio *read_bio;
@@ -981,11 +981,11 @@ static int make_request(struct request_queue *q, struct bio * bio)
981 981
982static void status(struct seq_file *seq, mddev_t *mddev) 982static void status(struct seq_file *seq, mddev_t *mddev)
983{ 983{
984 conf_t *conf = mddev_to_conf(mddev); 984 conf_t *conf = mddev->private;
985 int i; 985 int i;
986 986
987 if (conf->near_copies < conf->raid_disks) 987 if (conf->near_copies < conf->raid_disks)
988 seq_printf(seq, " %dK chunks", mddev->chunk_size/1024); 988 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
989 if (conf->near_copies > 1) 989 if (conf->near_copies > 1)
990 seq_printf(seq, " %d near-copies", conf->near_copies); 990 seq_printf(seq, " %d near-copies", conf->near_copies);
991 if (conf->far_copies > 1) { 991 if (conf->far_copies > 1) {
@@ -1006,7 +1006,7 @@ static void status(struct seq_file *seq, mddev_t *mddev)
1006static void error(mddev_t *mddev, mdk_rdev_t *rdev) 1006static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1007{ 1007{
1008 char b[BDEVNAME_SIZE]; 1008 char b[BDEVNAME_SIZE];
1009 conf_t *conf = mddev_to_conf(mddev); 1009 conf_t *conf = mddev->private;
1010 1010
1011 /* 1011 /*
1012 * If it is not operational, then we have already marked it as dead 1012 * If it is not operational, then we have already marked it as dead
@@ -1158,8 +1158,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1158 * a one page request is never in violation. 1158 * a one page request is never in violation.
1159 */ 1159 */
1160 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 1160 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
1161 mddev->queue->max_sectors > (PAGE_SIZE>>9)) 1161 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
1162 mddev->queue->max_sectors = (PAGE_SIZE>>9); 1162 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
1163 1163
1164 p->head_position = 0; 1164 p->head_position = 0;
1165 rdev->raid_disk = mirror; 1165 rdev->raid_disk = mirror;
@@ -1215,7 +1215,7 @@ abort:
1215static void end_sync_read(struct bio *bio, int error) 1215static void end_sync_read(struct bio *bio, int error)
1216{ 1216{
1217 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 1217 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
1218 conf_t *conf = mddev_to_conf(r10_bio->mddev); 1218 conf_t *conf = r10_bio->mddev->private;
1219 int i,d; 1219 int i,d;
1220 1220
1221 for (i=0; i<conf->copies; i++) 1221 for (i=0; i<conf->copies; i++)
@@ -1253,7 +1253,7 @@ static void end_sync_write(struct bio *bio, int error)
1253 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1253 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1254 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 1254 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
1255 mddev_t *mddev = r10_bio->mddev; 1255 mddev_t *mddev = r10_bio->mddev;
1256 conf_t *conf = mddev_to_conf(mddev); 1256 conf_t *conf = mddev->private;
1257 int i,d; 1257 int i,d;
1258 1258
1259 for (i = 0; i < conf->copies; i++) 1259 for (i = 0; i < conf->copies; i++)
@@ -1300,7 +1300,7 @@ static void end_sync_write(struct bio *bio, int error)
1300 */ 1300 */
1301static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) 1301static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1302{ 1302{
1303 conf_t *conf = mddev_to_conf(mddev); 1303 conf_t *conf = mddev->private;
1304 int i, first; 1304 int i, first;
1305 struct bio *tbio, *fbio; 1305 struct bio *tbio, *fbio;
1306 1306
@@ -1400,7 +1400,7 @@ done:
1400 1400
1401static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) 1401static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1402{ 1402{
1403 conf_t *conf = mddev_to_conf(mddev); 1403 conf_t *conf = mddev->private;
1404 int i, d; 1404 int i, d;
1405 struct bio *bio, *wbio; 1405 struct bio *bio, *wbio;
1406 1406
@@ -1549,7 +1549,7 @@ static void raid10d(mddev_t *mddev)
1549 r10bio_t *r10_bio; 1549 r10bio_t *r10_bio;
1550 struct bio *bio; 1550 struct bio *bio;
1551 unsigned long flags; 1551 unsigned long flags;
1552 conf_t *conf = mddev_to_conf(mddev); 1552 conf_t *conf = mddev->private;
1553 struct list_head *head = &conf->retry_list; 1553 struct list_head *head = &conf->retry_list;
1554 int unplug=0; 1554 int unplug=0;
1555 mdk_rdev_t *rdev; 1555 mdk_rdev_t *rdev;
@@ -1572,7 +1572,7 @@ static void raid10d(mddev_t *mddev)
1572 spin_unlock_irqrestore(&conf->device_lock, flags); 1572 spin_unlock_irqrestore(&conf->device_lock, flags);
1573 1573
1574 mddev = r10_bio->mddev; 1574 mddev = r10_bio->mddev;
1575 conf = mddev_to_conf(mddev); 1575 conf = mddev->private;
1576 if (test_bit(R10BIO_IsSync, &r10_bio->state)) { 1576 if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
1577 sync_request_write(mddev, r10_bio); 1577 sync_request_write(mddev, r10_bio);
1578 unplug = 1; 1578 unplug = 1;
@@ -1680,7 +1680,7 @@ static int init_resync(conf_t *conf)
1680 1680
1681static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) 1681static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1682{ 1682{
1683 conf_t *conf = mddev_to_conf(mddev); 1683 conf_t *conf = mddev->private;
1684 r10bio_t *r10_bio; 1684 r10bio_t *r10_bio;
1685 struct bio *biolist = NULL, *bio; 1685 struct bio *biolist = NULL, *bio;
1686 sector_t max_sector, nr_sectors; 1686 sector_t max_sector, nr_sectors;
@@ -2026,7 +2026,7 @@ static sector_t
2026raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) 2026raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
2027{ 2027{
2028 sector_t size; 2028 sector_t size;
2029 conf_t *conf = mddev_to_conf(mddev); 2029 conf_t *conf = mddev->private;
2030 2030
2031 if (!raid_disks) 2031 if (!raid_disks)
2032 raid_disks = mddev->raid_disks; 2032 raid_disks = mddev->raid_disks;
@@ -2050,9 +2050,10 @@ static int run(mddev_t *mddev)
2050 int nc, fc, fo; 2050 int nc, fc, fo;
2051 sector_t stride, size; 2051 sector_t stride, size;
2052 2052
2053 if (mddev->chunk_size < PAGE_SIZE) { 2053 if (mddev->chunk_sectors < (PAGE_SIZE >> 9) ||
2054 !is_power_of_2(mddev->chunk_sectors)) {
2054 printk(KERN_ERR "md/raid10: chunk size must be " 2055 printk(KERN_ERR "md/raid10: chunk size must be "
2055 "at least PAGE_SIZE(%ld).\n", PAGE_SIZE); 2056 "at least PAGE_SIZE(%ld) and be a power of 2.\n", PAGE_SIZE);
2056 return -EINVAL; 2057 return -EINVAL;
2057 } 2058 }
2058 2059
@@ -2095,8 +2096,8 @@ static int run(mddev_t *mddev)
2095 conf->far_copies = fc; 2096 conf->far_copies = fc;
2096 conf->copies = nc*fc; 2097 conf->copies = nc*fc;
2097 conf->far_offset = fo; 2098 conf->far_offset = fo;
2098 conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; 2099 conf->chunk_mask = mddev->chunk_sectors - 1;
2099 conf->chunk_shift = ffz(~mddev->chunk_size) - 9; 2100 conf->chunk_shift = ffz(~mddev->chunk_sectors);
2100 size = mddev->dev_sectors >> conf->chunk_shift; 2101 size = mddev->dev_sectors >> conf->chunk_shift;
2101 sector_div(size, fc); 2102 sector_div(size, fc);
2102 size = size * conf->raid_disks; 2103 size = size * conf->raid_disks;
@@ -2145,8 +2146,8 @@ static int run(mddev_t *mddev)
2145 * a one page request is never in violation. 2146 * a one page request is never in violation.
2146 */ 2147 */
2147 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 2148 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
2148 mddev->queue->max_sectors > (PAGE_SIZE>>9)) 2149 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
2149 mddev->queue->max_sectors = (PAGE_SIZE>>9); 2150 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
2150 2151
2151 disk->head_position = 0; 2152 disk->head_position = 0;
2152 } 2153 }
@@ -2185,6 +2186,10 @@ static int run(mddev_t *mddev)
2185 goto out_free_conf; 2186 goto out_free_conf;
2186 } 2187 }
2187 2188
2189 if (mddev->recovery_cp != MaxSector)
2190 printk(KERN_NOTICE "raid10: %s is not clean"
2191 " -- starting background reconstruction\n",
2192 mdname(mddev));
2188 printk(KERN_INFO 2193 printk(KERN_INFO
2189 "raid10: raid set %s active with %d out of %d devices\n", 2194 "raid10: raid set %s active with %d out of %d devices\n",
2190 mdname(mddev), mddev->raid_disks - mddev->degraded, 2195 mdname(mddev), mddev->raid_disks - mddev->degraded,
@@ -2204,7 +2209,8 @@ static int run(mddev_t *mddev)
2204 * maybe... 2209 * maybe...
2205 */ 2210 */
2206 { 2211 {
2207 int stripe = conf->raid_disks * (mddev->chunk_size / PAGE_SIZE); 2212 int stripe = conf->raid_disks *
2213 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
2208 stripe /= conf->near_copies; 2214 stripe /= conf->near_copies;
2209 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) 2215 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
2210 mddev->queue->backing_dev_info.ra_pages = 2* stripe; 2216 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
@@ -2227,7 +2233,7 @@ out:
2227 2233
2228static int stop(mddev_t *mddev) 2234static int stop(mddev_t *mddev)
2229{ 2235{
2230 conf_t *conf = mddev_to_conf(mddev); 2236 conf_t *conf = mddev->private;
2231 2237
2232 raise_barrier(conf, 0); 2238 raise_barrier(conf, 0);
2233 lower_barrier(conf); 2239 lower_barrier(conf);
@@ -2245,7 +2251,7 @@ static int stop(mddev_t *mddev)
2245 2251
2246static void raid10_quiesce(mddev_t *mddev, int state) 2252static void raid10_quiesce(mddev_t *mddev, int state)
2247{ 2253{
2248 conf_t *conf = mddev_to_conf(mddev); 2254 conf_t *conf = mddev->private;
2249 2255
2250 switch(state) { 2256 switch(state) {
2251 case 1: 2257 case 1:
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 244dbe507a54..59cd1efb8d30 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -62,12 +62,6 @@ struct r10_private_data_s {
62typedef struct r10_private_data_s conf_t; 62typedef struct r10_private_data_s conf_t;
63 63
64/* 64/*
65 * this is the only point in the RAID code where we violate
66 * C type safety. mddev->private is an 'opaque' pointer.
67 */
68#define mddev_to_conf(mddev) ((conf_t *) mddev->private)
69
70/*
71 * this is our 'private' RAID10 bio. 65 * this is our 'private' RAID10 bio.
72 * 66 *
73 * it contains information about what kind of IO operations were started 67 * it contains information about what kind of IO operations were started
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 54ef8d75541d..cac6f4d3a143 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1617,8 +1617,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1617 sector_t new_sector; 1617 sector_t new_sector;
1618 int algorithm = previous ? conf->prev_algo 1618 int algorithm = previous ? conf->prev_algo
1619 : conf->algorithm; 1619 : conf->algorithm;
1620 int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) 1620 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
1621 : (conf->chunk_size >> 9); 1621 : conf->chunk_sectors;
1622 int raid_disks = previous ? conf->previous_raid_disks 1622 int raid_disks = previous ? conf->previous_raid_disks
1623 : conf->raid_disks; 1623 : conf->raid_disks;
1624 int data_disks = raid_disks - conf->max_degraded; 1624 int data_disks = raid_disks - conf->max_degraded;
@@ -1823,8 +1823,8 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
1823 int raid_disks = sh->disks; 1823 int raid_disks = sh->disks;
1824 int data_disks = raid_disks - conf->max_degraded; 1824 int data_disks = raid_disks - conf->max_degraded;
1825 sector_t new_sector = sh->sector, check; 1825 sector_t new_sector = sh->sector, check;
1826 int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) 1826 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
1827 : (conf->chunk_size >> 9); 1827 : conf->chunk_sectors;
1828 int algorithm = previous ? conf->prev_algo 1828 int algorithm = previous ? conf->prev_algo
1829 : conf->algorithm; 1829 : conf->algorithm;
1830 sector_t stripe; 1830 sector_t stripe;
@@ -2098,8 +2098,7 @@ static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
2098 struct stripe_head *sh) 2098 struct stripe_head *sh)
2099{ 2099{
2100 int sectors_per_chunk = 2100 int sectors_per_chunk =
2101 previous ? (conf->prev_chunk >> 9) 2101 previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
2102 : (conf->chunk_size >> 9);
2103 int dd_idx; 2102 int dd_idx;
2104 int chunk_offset = sector_div(stripe, sectors_per_chunk); 2103 int chunk_offset = sector_div(stripe, sectors_per_chunk);
2105 int disks = previous ? conf->previous_raid_disks : conf->raid_disks; 2104 int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
@@ -3496,7 +3495,7 @@ static void activate_bit_delay(raid5_conf_t *conf)
3496 3495
3497static void unplug_slaves(mddev_t *mddev) 3496static void unplug_slaves(mddev_t *mddev)
3498{ 3497{
3499 raid5_conf_t *conf = mddev_to_conf(mddev); 3498 raid5_conf_t *conf = mddev->private;
3500 int i; 3499 int i;
3501 3500
3502 rcu_read_lock(); 3501 rcu_read_lock();
@@ -3520,7 +3519,7 @@ static void unplug_slaves(mddev_t *mddev)
3520static void raid5_unplug_device(struct request_queue *q) 3519static void raid5_unplug_device(struct request_queue *q)
3521{ 3520{
3522 mddev_t *mddev = q->queuedata; 3521 mddev_t *mddev = q->queuedata;
3523 raid5_conf_t *conf = mddev_to_conf(mddev); 3522 raid5_conf_t *conf = mddev->private;
3524 unsigned long flags; 3523 unsigned long flags;
3525 3524
3526 spin_lock_irqsave(&conf->device_lock, flags); 3525 spin_lock_irqsave(&conf->device_lock, flags);
@@ -3539,7 +3538,7 @@ static void raid5_unplug_device(struct request_queue *q)
3539static int raid5_congested(void *data, int bits) 3538static int raid5_congested(void *data, int bits)
3540{ 3539{
3541 mddev_t *mddev = data; 3540 mddev_t *mddev = data;
3542 raid5_conf_t *conf = mddev_to_conf(mddev); 3541 raid5_conf_t *conf = mddev->private;
3543 3542
3544 /* No difference between reads and writes. Just check 3543 /* No difference between reads and writes. Just check
3545 * how busy the stripe_cache is 3544 * how busy the stripe_cache is
@@ -3564,14 +3563,14 @@ static int raid5_mergeable_bvec(struct request_queue *q,
3564 mddev_t *mddev = q->queuedata; 3563 mddev_t *mddev = q->queuedata;
3565 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 3564 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
3566 int max; 3565 int max;
3567 unsigned int chunk_sectors = mddev->chunk_size >> 9; 3566 unsigned int chunk_sectors = mddev->chunk_sectors;
3568 unsigned int bio_sectors = bvm->bi_size >> 9; 3567 unsigned int bio_sectors = bvm->bi_size >> 9;
3569 3568
3570 if ((bvm->bi_rw & 1) == WRITE) 3569 if ((bvm->bi_rw & 1) == WRITE)
3571 return biovec->bv_len; /* always allow writes to be mergeable */ 3570 return biovec->bv_len; /* always allow writes to be mergeable */
3572 3571
3573 if (mddev->new_chunk < mddev->chunk_size) 3572 if (mddev->new_chunk_sectors < mddev->chunk_sectors)
3574 chunk_sectors = mddev->new_chunk >> 9; 3573 chunk_sectors = mddev->new_chunk_sectors;
3575 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 3574 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
3576 if (max < 0) max = 0; 3575 if (max < 0) max = 0;
3577 if (max <= biovec->bv_len && bio_sectors == 0) 3576 if (max <= biovec->bv_len && bio_sectors == 0)
@@ -3584,11 +3583,11 @@ static int raid5_mergeable_bvec(struct request_queue *q,
3584static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) 3583static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
3585{ 3584{
3586 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 3585 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
3587 unsigned int chunk_sectors = mddev->chunk_size >> 9; 3586 unsigned int chunk_sectors = mddev->chunk_sectors;
3588 unsigned int bio_sectors = bio->bi_size >> 9; 3587 unsigned int bio_sectors = bio->bi_size >> 9;
3589 3588
3590 if (mddev->new_chunk < mddev->chunk_size) 3589 if (mddev->new_chunk_sectors < mddev->chunk_sectors)
3591 chunk_sectors = mddev->new_chunk >> 9; 3590 chunk_sectors = mddev->new_chunk_sectors;
3592 return chunk_sectors >= 3591 return chunk_sectors >=
3593 ((sector & (chunk_sectors - 1)) + bio_sectors); 3592 ((sector & (chunk_sectors - 1)) + bio_sectors);
3594} 3593}
@@ -3652,7 +3651,7 @@ static void raid5_align_endio(struct bio *bi, int error)
3652 bio_put(bi); 3651 bio_put(bi);
3653 3652
3654 mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; 3653 mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata;
3655 conf = mddev_to_conf(mddev); 3654 conf = mddev->private;
3656 rdev = (void*)raid_bi->bi_next; 3655 rdev = (void*)raid_bi->bi_next;
3657 raid_bi->bi_next = NULL; 3656 raid_bi->bi_next = NULL;
3658 3657
@@ -3675,10 +3674,10 @@ static int bio_fits_rdev(struct bio *bi)
3675{ 3674{
3676 struct request_queue *q = bdev_get_queue(bi->bi_bdev); 3675 struct request_queue *q = bdev_get_queue(bi->bi_bdev);
3677 3676
3678 if ((bi->bi_size>>9) > q->max_sectors) 3677 if ((bi->bi_size>>9) > queue_max_sectors(q))
3679 return 0; 3678 return 0;
3680 blk_recount_segments(q, bi); 3679 blk_recount_segments(q, bi);
3681 if (bi->bi_phys_segments > q->max_phys_segments) 3680 if (bi->bi_phys_segments > queue_max_phys_segments(q))
3682 return 0; 3681 return 0;
3683 3682
3684 if (q->merge_bvec_fn) 3683 if (q->merge_bvec_fn)
@@ -3694,7 +3693,7 @@ static int bio_fits_rdev(struct bio *bi)
3694static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) 3693static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
3695{ 3694{
3696 mddev_t *mddev = q->queuedata; 3695 mddev_t *mddev = q->queuedata;
3697 raid5_conf_t *conf = mddev_to_conf(mddev); 3696 raid5_conf_t *conf = mddev->private;
3698 unsigned int dd_idx; 3697 unsigned int dd_idx;
3699 struct bio* align_bi; 3698 struct bio* align_bi;
3700 mdk_rdev_t *rdev; 3699 mdk_rdev_t *rdev;
@@ -3811,7 +3810,7 @@ static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
3811static int make_request(struct request_queue *q, struct bio * bi) 3810static int make_request(struct request_queue *q, struct bio * bi)
3812{ 3811{
3813 mddev_t *mddev = q->queuedata; 3812 mddev_t *mddev = q->queuedata;
3814 raid5_conf_t *conf = mddev_to_conf(mddev); 3813 raid5_conf_t *conf = mddev->private;
3815 int dd_idx; 3814 int dd_idx;
3816 sector_t new_sector; 3815 sector_t new_sector;
3817 sector_t logical_sector, last_sector; 3816 sector_t logical_sector, last_sector;
@@ -3908,6 +3907,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
3908 spin_unlock_irq(&conf->device_lock); 3907 spin_unlock_irq(&conf->device_lock);
3909 if (must_retry) { 3908 if (must_retry) {
3910 release_stripe(sh); 3909 release_stripe(sh);
3910 schedule();
3911 goto retry; 3911 goto retry;
3912 } 3912 }
3913 } 3913 }
@@ -4003,10 +4003,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
4003 * If old and new chunk sizes differ, we need to process the 4003 * If old and new chunk sizes differ, we need to process the
4004 * largest of these 4004 * largest of these
4005 */ 4005 */
4006 if (mddev->new_chunk > mddev->chunk_size) 4006 if (mddev->new_chunk_sectors > mddev->chunk_sectors)
4007 reshape_sectors = mddev->new_chunk / 512; 4007 reshape_sectors = mddev->new_chunk_sectors;
4008 else 4008 else
4009 reshape_sectors = mddev->chunk_size / 512; 4009 reshape_sectors = mddev->chunk_sectors;
4010 4010
4011 /* we update the metadata when there is more than 3Meg 4011 /* we update the metadata when there is more than 3Meg
4012 * in the block range (that is rather arbitrary, should 4012 * in the block range (that is rather arbitrary, should
@@ -4129,7 +4129,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
4129 1, &dd_idx, NULL); 4129 1, &dd_idx, NULL);
4130 last_sector = 4130 last_sector =
4131 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 4131 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
4132 *(new_data_disks) - 1), 4132 * new_data_disks - 1),
4133 1, &dd_idx, NULL); 4133 1, &dd_idx, NULL);
4134 if (last_sector >= mddev->dev_sectors) 4134 if (last_sector >= mddev->dev_sectors)
4135 last_sector = mddev->dev_sectors - 1; 4135 last_sector = mddev->dev_sectors - 1;
@@ -4158,7 +4158,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
4158 wait_event(conf->wait_for_overlap, 4158 wait_event(conf->wait_for_overlap,
4159 atomic_read(&conf->reshape_stripes) == 0); 4159 atomic_read(&conf->reshape_stripes) == 0);
4160 mddev->reshape_position = conf->reshape_progress; 4160 mddev->reshape_position = conf->reshape_progress;
4161 mddev->curr_resync_completed = mddev->curr_resync; 4161 mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors;
4162 conf->reshape_checkpoint = jiffies; 4162 conf->reshape_checkpoint = jiffies;
4163 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4163 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4164 md_wakeup_thread(mddev->thread); 4164 md_wakeup_thread(mddev->thread);
@@ -4371,7 +4371,7 @@ static void synchronize_stripe_processing(struct list_head *domain)
4371static void raid5d(mddev_t *mddev) 4371static void raid5d(mddev_t *mddev)
4372{ 4372{
4373 struct stripe_head *sh; 4373 struct stripe_head *sh;
4374 raid5_conf_t *conf = mddev_to_conf(mddev); 4374 raid5_conf_t *conf = mddev->private;
4375 int handled; 4375 int handled;
4376 LIST_HEAD(raid_domain); 4376 LIST_HEAD(raid_domain);
4377 4377
@@ -4428,7 +4428,7 @@ static void raid5d(mddev_t *mddev)
4428static ssize_t 4428static ssize_t
4429raid5_show_stripe_cache_size(mddev_t *mddev, char *page) 4429raid5_show_stripe_cache_size(mddev_t *mddev, char *page)
4430{ 4430{
4431 raid5_conf_t *conf = mddev_to_conf(mddev); 4431 raid5_conf_t *conf = mddev->private;
4432 if (conf) 4432 if (conf)
4433 return sprintf(page, "%d\n", conf->max_nr_stripes); 4433 return sprintf(page, "%d\n", conf->max_nr_stripes);
4434 else 4434 else
@@ -4438,7 +4438,7 @@ raid5_show_stripe_cache_size(mddev_t *mddev, char *page)
4438static ssize_t 4438static ssize_t
4439raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) 4439raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
4440{ 4440{
4441 raid5_conf_t *conf = mddev_to_conf(mddev); 4441 raid5_conf_t *conf = mddev->private;
4442 unsigned long new; 4442 unsigned long new;
4443 int err; 4443 int err;
4444 4444
@@ -4476,7 +4476,7 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
4476static ssize_t 4476static ssize_t
4477raid5_show_preread_threshold(mddev_t *mddev, char *page) 4477raid5_show_preread_threshold(mddev_t *mddev, char *page)
4478{ 4478{
4479 raid5_conf_t *conf = mddev_to_conf(mddev); 4479 raid5_conf_t *conf = mddev->private;
4480 if (conf) 4480 if (conf)
4481 return sprintf(page, "%d\n", conf->bypass_threshold); 4481 return sprintf(page, "%d\n", conf->bypass_threshold);
4482 else 4482 else
@@ -4486,7 +4486,7 @@ raid5_show_preread_threshold(mddev_t *mddev, char *page)
4486static ssize_t 4486static ssize_t
4487raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len) 4487raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len)
4488{ 4488{
4489 raid5_conf_t *conf = mddev_to_conf(mddev); 4489 raid5_conf_t *conf = mddev->private;
4490 unsigned long new; 4490 unsigned long new;
4491 if (len >= PAGE_SIZE) 4491 if (len >= PAGE_SIZE)
4492 return -EINVAL; 4492 return -EINVAL;
@@ -4510,7 +4510,7 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
4510static ssize_t 4510static ssize_t
4511stripe_cache_active_show(mddev_t *mddev, char *page) 4511stripe_cache_active_show(mddev_t *mddev, char *page)
4512{ 4512{
4513 raid5_conf_t *conf = mddev_to_conf(mddev); 4513 raid5_conf_t *conf = mddev->private;
4514 if (conf) 4514 if (conf)
4515 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 4515 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
4516 else 4516 else
@@ -4534,7 +4534,7 @@ static struct attribute_group raid5_attrs_group = {
4534static sector_t 4534static sector_t
4535raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) 4535raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
4536{ 4536{
4537 raid5_conf_t *conf = mddev_to_conf(mddev); 4537 raid5_conf_t *conf = mddev->private;
4538 4538
4539 if (!sectors) 4539 if (!sectors)
4540 sectors = mddev->dev_sectors; 4540 sectors = mddev->dev_sectors;
@@ -4546,8 +4546,8 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
4546 raid_disks = conf->previous_raid_disks; 4546 raid_disks = conf->previous_raid_disks;
4547 } 4547 }
4548 4548
4549 sectors &= ~((sector_t)mddev->chunk_size/512 - 1); 4549 sectors &= ~((sector_t)mddev->chunk_sectors - 1);
4550 sectors &= ~((sector_t)mddev->new_chunk/512 - 1); 4550 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1);
4551 return sectors * (raid_disks - conf->max_degraded); 4551 return sectors * (raid_disks - conf->max_degraded);
4552} 4552}
4553 4553
@@ -4691,9 +4691,11 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4691 return ERR_PTR(-EINVAL); 4691 return ERR_PTR(-EINVAL);
4692 } 4692 }
4693 4693
4694 if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE) { 4694 if (!mddev->new_chunk_sectors ||
4695 (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
4696 !is_power_of_2(mddev->new_chunk_sectors)) {
4695 printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", 4697 printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
4696 mddev->new_chunk, mdname(mddev)); 4698 mddev->new_chunk_sectors << 9, mdname(mddev));
4697 return ERR_PTR(-EINVAL); 4699 return ERR_PTR(-EINVAL);
4698 } 4700 }
4699 4701
@@ -4756,7 +4758,8 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4756 conf->fullsync = 1; 4758 conf->fullsync = 1;
4757 } 4759 }
4758 4760
4759 conf->chunk_size = mddev->new_chunk; 4761 conf->chunk_sectors = mddev->new_chunk_sectors;
4762 conf->level = mddev->new_level;
4760 if (conf->level == 6) 4763 if (conf->level == 6)
4761 conf->max_degraded = 2; 4764 conf->max_degraded = 2;
4762 else 4765 else
@@ -4765,7 +4768,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4765 conf->max_nr_stripes = NR_STRIPES; 4768 conf->max_nr_stripes = NR_STRIPES;
4766 conf->reshape_progress = mddev->reshape_position; 4769 conf->reshape_progress = mddev->reshape_position;
4767 if (conf->reshape_progress != MaxSector) { 4770 if (conf->reshape_progress != MaxSector) {
4768 conf->prev_chunk = mddev->chunk_size; 4771 conf->prev_chunk_sectors = mddev->chunk_sectors;
4769 conf->prev_algo = mddev->layout; 4772 conf->prev_algo = mddev->layout;
4770 } 4773 }
4771 4774
@@ -4803,6 +4806,10 @@ static int run(mddev_t *mddev)
4803 int working_disks = 0; 4806 int working_disks = 0;
4804 mdk_rdev_t *rdev; 4807 mdk_rdev_t *rdev;
4805 4808
4809 if (mddev->recovery_cp != MaxSector)
4810 printk(KERN_NOTICE "raid5: %s is not clean"
4811 " -- starting background reconstruction\n",
4812 mdname(mddev));
4806 if (mddev->reshape_position != MaxSector) { 4813 if (mddev->reshape_position != MaxSector) {
4807 /* Check that we can continue the reshape. 4814 /* Check that we can continue the reshape.
4808 * Currently only disks can change, it must 4815 * Currently only disks can change, it must
@@ -4825,7 +4832,7 @@ static int run(mddev_t *mddev)
4825 * geometry. 4832 * geometry.
4826 */ 4833 */
4827 here_new = mddev->reshape_position; 4834 here_new = mddev->reshape_position;
4828 if (sector_div(here_new, (mddev->new_chunk>>9)* 4835 if (sector_div(here_new, mddev->new_chunk_sectors *
4829 (mddev->raid_disks - max_degraded))) { 4836 (mddev->raid_disks - max_degraded))) {
4830 printk(KERN_ERR "raid5: reshape_position not " 4837 printk(KERN_ERR "raid5: reshape_position not "
4831 "on a stripe boundary\n"); 4838 "on a stripe boundary\n");
@@ -4833,7 +4840,7 @@ static int run(mddev_t *mddev)
4833 } 4840 }
4834 /* here_new is the stripe we will write to */ 4841 /* here_new is the stripe we will write to */
4835 here_old = mddev->reshape_position; 4842 here_old = mddev->reshape_position;
4836 sector_div(here_old, (mddev->chunk_size>>9)* 4843 sector_div(here_old, mddev->chunk_sectors *
4837 (old_disks-max_degraded)); 4844 (old_disks-max_degraded));
4838 /* here_old is the first stripe that we might need to read 4845 /* here_old is the first stripe that we might need to read
4839 * from */ 4846 * from */
@@ -4848,7 +4855,7 @@ static int run(mddev_t *mddev)
4848 } else { 4855 } else {
4849 BUG_ON(mddev->level != mddev->new_level); 4856 BUG_ON(mddev->level != mddev->new_level);
4850 BUG_ON(mddev->layout != mddev->new_layout); 4857 BUG_ON(mddev->layout != mddev->new_layout);
4851 BUG_ON(mddev->chunk_size != mddev->new_chunk); 4858 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
4852 BUG_ON(mddev->delta_disks != 0); 4859 BUG_ON(mddev->delta_disks != 0);
4853 } 4860 }
4854 4861
@@ -4882,7 +4889,7 @@ static int run(mddev_t *mddev)
4882 } 4889 }
4883 4890
4884 /* device size must be a multiple of chunk size */ 4891 /* device size must be a multiple of chunk size */
4885 mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1); 4892 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
4886 mddev->resync_max_sectors = mddev->dev_sectors; 4893 mddev->resync_max_sectors = mddev->dev_sectors;
4887 4894
4888 if (mddev->degraded > 0 && 4895 if (mddev->degraded > 0 &&
@@ -4931,7 +4938,7 @@ static int run(mddev_t *mddev)
4931 { 4938 {
4932 int data_disks = conf->previous_raid_disks - conf->max_degraded; 4939 int data_disks = conf->previous_raid_disks - conf->max_degraded;
4933 int stripe = data_disks * 4940 int stripe = data_disks *
4934 (mddev->chunk_size / PAGE_SIZE); 4941 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
4935 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 4942 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4936 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 4943 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4937 } 4944 }
@@ -5021,7 +5028,8 @@ static void status(struct seq_file *seq, mddev_t *mddev)
5021 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 5028 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
5022 int i; 5029 int i;
5023 5030
5024 seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout); 5031 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
5032 mddev->chunk_sectors / 2, mddev->layout);
5025 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 5033 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
5026 for (i = 0; i < conf->raid_disks; i++) 5034 for (i = 0; i < conf->raid_disks; i++)
5027 seq_printf (seq, "%s", 5035 seq_printf (seq, "%s",
@@ -5169,7 +5177,7 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
5169 * any io in the removed space completes, but it hardly seems 5177 * any io in the removed space completes, but it hardly seems
5170 * worth it. 5178 * worth it.
5171 */ 5179 */
5172 sectors &= ~((sector_t)mddev->chunk_size/512 - 1); 5180 sectors &= ~((sector_t)mddev->chunk_sectors - 1);
5173 md_set_array_sectors(mddev, raid5_size(mddev, sectors, 5181 md_set_array_sectors(mddev, raid5_size(mddev, sectors,
5174 mddev->raid_disks)); 5182 mddev->raid_disks));
5175 if (mddev->array_sectors > 5183 if (mddev->array_sectors >
@@ -5186,14 +5194,37 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
5186 return 0; 5194 return 0;
5187} 5195}
5188 5196
5189static int raid5_check_reshape(mddev_t *mddev) 5197static int check_stripe_cache(mddev_t *mddev)
5198{
5199 /* Can only proceed if there are plenty of stripe_heads.
5200 * We need a minimum of one full stripe,, and for sensible progress
5201 * it is best to have about 4 times that.
5202 * If we require 4 times, then the default 256 4K stripe_heads will
5203 * allow for chunk sizes up to 256K, which is probably OK.
5204 * If the chunk size is greater, user-space should request more
5205 * stripe_heads first.
5206 */
5207 raid5_conf_t *conf = mddev->private;
5208 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
5209 > conf->max_nr_stripes ||
5210 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
5211 > conf->max_nr_stripes) {
5212 printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n",
5213 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
5214 / STRIPE_SIZE)*4);
5215 return 0;
5216 }
5217 return 1;
5218}
5219
5220static int check_reshape(mddev_t *mddev)
5190{ 5221{
5191 raid5_conf_t *conf = mddev_to_conf(mddev); 5222 raid5_conf_t *conf = mddev->private;
5192 5223
5193 if (mddev->delta_disks == 0 && 5224 if (mddev->delta_disks == 0 &&
5194 mddev->new_layout == mddev->layout && 5225 mddev->new_layout == mddev->layout &&
5195 mddev->new_chunk == mddev->chunk_size) 5226 mddev->new_chunk_sectors == mddev->chunk_sectors)
5196 return -EINVAL; /* nothing to do */ 5227 return 0; /* nothing to do */
5197 if (mddev->bitmap) 5228 if (mddev->bitmap)
5198 /* Cannot grow a bitmap yet */ 5229 /* Cannot grow a bitmap yet */
5199 return -EBUSY; 5230 return -EBUSY;
@@ -5212,28 +5243,15 @@ static int raid5_check_reshape(mddev_t *mddev)
5212 return -EINVAL; 5243 return -EINVAL;
5213 } 5244 }
5214 5245
5215 /* Can only proceed if there are plenty of stripe_heads. 5246 if (!check_stripe_cache(mddev))
5216 * We need a minimum of one full stripe,, and for sensible progress
5217 * it is best to have about 4 times that.
5218 * If we require 4 times, then the default 256 4K stripe_heads will
5219 * allow for chunk sizes up to 256K, which is probably OK.
5220 * If the chunk size is greater, user-space should request more
5221 * stripe_heads first.
5222 */
5223 if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes ||
5224 (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
5225 printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n",
5226 (max(mddev->chunk_size, mddev->new_chunk)
5227 / STRIPE_SIZE)*4);
5228 return -ENOSPC; 5247 return -ENOSPC;
5229 }
5230 5248
5231 return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); 5249 return resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
5232} 5250}
5233 5251
5234static int raid5_start_reshape(mddev_t *mddev) 5252static int raid5_start_reshape(mddev_t *mddev)
5235{ 5253{
5236 raid5_conf_t *conf = mddev_to_conf(mddev); 5254 raid5_conf_t *conf = mddev->private;
5237 mdk_rdev_t *rdev; 5255 mdk_rdev_t *rdev;
5238 int spares = 0; 5256 int spares = 0;
5239 int added_devices = 0; 5257 int added_devices = 0;
@@ -5242,6 +5260,9 @@ static int raid5_start_reshape(mddev_t *mddev)
5242 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5260 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5243 return -EBUSY; 5261 return -EBUSY;
5244 5262
5263 if (!check_stripe_cache(mddev))
5264 return -ENOSPC;
5265
5245 list_for_each_entry(rdev, &mddev->disks, same_set) 5266 list_for_each_entry(rdev, &mddev->disks, same_set)
5246 if (rdev->raid_disk < 0 && 5267 if (rdev->raid_disk < 0 &&
5247 !test_bit(Faulty, &rdev->flags)) 5268 !test_bit(Faulty, &rdev->flags))
@@ -5268,8 +5289,8 @@ static int raid5_start_reshape(mddev_t *mddev)
5268 spin_lock_irq(&conf->device_lock); 5289 spin_lock_irq(&conf->device_lock);
5269 conf->previous_raid_disks = conf->raid_disks; 5290 conf->previous_raid_disks = conf->raid_disks;
5270 conf->raid_disks += mddev->delta_disks; 5291 conf->raid_disks += mddev->delta_disks;
5271 conf->prev_chunk = conf->chunk_size; 5292 conf->prev_chunk_sectors = conf->chunk_sectors;
5272 conf->chunk_size = mddev->new_chunk; 5293 conf->chunk_sectors = mddev->new_chunk_sectors;
5273 conf->prev_algo = conf->algorithm; 5294 conf->prev_algo = conf->algorithm;
5274 conf->algorithm = mddev->new_layout; 5295 conf->algorithm = mddev->new_layout;
5275 if (mddev->delta_disks < 0) 5296 if (mddev->delta_disks < 0)
@@ -5351,7 +5372,7 @@ static void end_reshape(raid5_conf_t *conf)
5351 */ 5372 */
5352 { 5373 {
5353 int data_disks = conf->raid_disks - conf->max_degraded; 5374 int data_disks = conf->raid_disks - conf->max_degraded;
5354 int stripe = data_disks * (conf->chunk_size 5375 int stripe = data_disks * ((conf->chunk_sectors << 9)
5355 / PAGE_SIZE); 5376 / PAGE_SIZE);
5356 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5377 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
5357 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5378 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
@@ -5365,7 +5386,7 @@ static void end_reshape(raid5_conf_t *conf)
5365static void raid5_finish_reshape(mddev_t *mddev) 5386static void raid5_finish_reshape(mddev_t *mddev)
5366{ 5387{
5367 struct block_device *bdev; 5388 struct block_device *bdev;
5368 raid5_conf_t *conf = mddev_to_conf(mddev); 5389 raid5_conf_t *conf = mddev->private;
5369 5390
5370 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5391 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5371 5392
@@ -5396,7 +5417,7 @@ static void raid5_finish_reshape(mddev_t *mddev)
5396 raid5_remove_disk(mddev, d); 5417 raid5_remove_disk(mddev, d);
5397 } 5418 }
5398 mddev->layout = conf->algorithm; 5419 mddev->layout = conf->algorithm;
5399 mddev->chunk_size = conf->chunk_size; 5420 mddev->chunk_sectors = conf->chunk_sectors;
5400 mddev->reshape_position = MaxSector; 5421 mddev->reshape_position = MaxSector;
5401 mddev->delta_disks = 0; 5422 mddev->delta_disks = 0;
5402 } 5423 }
@@ -5404,7 +5425,7 @@ static void raid5_finish_reshape(mddev_t *mddev)
5404 5425
5405static void raid5_quiesce(mddev_t *mddev, int state) 5426static void raid5_quiesce(mddev_t *mddev, int state)
5406{ 5427{
5407 raid5_conf_t *conf = mddev_to_conf(mddev); 5428 raid5_conf_t *conf = mddev->private;
5408 5429
5409 switch(state) { 5430 switch(state) {
5410 case 2: /* resume for a suspend */ 5431 case 2: /* resume for a suspend */
@@ -5454,7 +5475,7 @@ static void *raid5_takeover_raid1(mddev_t *mddev)
5454 5475
5455 mddev->new_level = 5; 5476 mddev->new_level = 5;
5456 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; 5477 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
5457 mddev->new_chunk = chunksect << 9; 5478 mddev->new_chunk_sectors = chunksect;
5458 5479
5459 return setup_conf(mddev); 5480 return setup_conf(mddev);
5460} 5481}
@@ -5493,24 +5514,24 @@ static void *raid5_takeover_raid6(mddev_t *mddev)
5493} 5514}
5494 5515
5495 5516
5496static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) 5517static int raid5_check_reshape(mddev_t *mddev)
5497{ 5518{
5498 /* For a 2-drive array, the layout and chunk size can be changed 5519 /* For a 2-drive array, the layout and chunk size can be changed
5499 * immediately as not restriping is needed. 5520 * immediately as not restriping is needed.
5500 * For larger arrays we record the new value - after validation 5521 * For larger arrays we record the new value - after validation
5501 * to be used by a reshape pass. 5522 * to be used by a reshape pass.
5502 */ 5523 */
5503 raid5_conf_t *conf = mddev_to_conf(mddev); 5524 raid5_conf_t *conf = mddev->private;
5525 int new_chunk = mddev->new_chunk_sectors;
5504 5526
5505 if (new_layout >= 0 && !algorithm_valid_raid5(new_layout)) 5527 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
5506 return -EINVAL; 5528 return -EINVAL;
5507 if (new_chunk > 0) { 5529 if (new_chunk > 0) {
5508 if (new_chunk & (new_chunk-1)) 5530 if (!is_power_of_2(new_chunk))
5509 /* not a power of 2 */
5510 return -EINVAL; 5531 return -EINVAL;
5511 if (new_chunk < PAGE_SIZE) 5532 if (new_chunk < (PAGE_SIZE>>9))
5512 return -EINVAL; 5533 return -EINVAL;
5513 if (mddev->array_sectors & ((new_chunk>>9)-1)) 5534 if (mddev->array_sectors & (new_chunk-1))
5514 /* not factor of array size */ 5535 /* not factor of array size */
5515 return -EINVAL; 5536 return -EINVAL;
5516 } 5537 }
@@ -5518,49 +5539,39 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
5518 /* They look valid */ 5539 /* They look valid */
5519 5540
5520 if (mddev->raid_disks == 2) { 5541 if (mddev->raid_disks == 2) {
5521 5542 /* can make the change immediately */
5522 if (new_layout >= 0) { 5543 if (mddev->new_layout >= 0) {
5523 conf->algorithm = new_layout; 5544 conf->algorithm = mddev->new_layout;
5524 mddev->layout = mddev->new_layout = new_layout; 5545 mddev->layout = mddev->new_layout;
5525 } 5546 }
5526 if (new_chunk > 0) { 5547 if (new_chunk > 0) {
5527 conf->chunk_size = new_chunk; 5548 conf->chunk_sectors = new_chunk ;
5528 mddev->chunk_size = mddev->new_chunk = new_chunk; 5549 mddev->chunk_sectors = new_chunk;
5529 } 5550 }
5530 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5551 set_bit(MD_CHANGE_DEVS, &mddev->flags);
5531 md_wakeup_thread(mddev->thread); 5552 md_wakeup_thread(mddev->thread);
5532 } else {
5533 if (new_layout >= 0)
5534 mddev->new_layout = new_layout;
5535 if (new_chunk > 0)
5536 mddev->new_chunk = new_chunk;
5537 } 5553 }
5538 return 0; 5554 return check_reshape(mddev);
5539} 5555}
5540 5556
5541static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk) 5557static int raid6_check_reshape(mddev_t *mddev)
5542{ 5558{
5543 if (new_layout >= 0 && !algorithm_valid_raid6(new_layout)) 5559 int new_chunk = mddev->new_chunk_sectors;
5560
5561 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
5544 return -EINVAL; 5562 return -EINVAL;
5545 if (new_chunk > 0) { 5563 if (new_chunk > 0) {
5546 if (new_chunk & (new_chunk-1)) 5564 if (!is_power_of_2(new_chunk))
5547 /* not a power of 2 */
5548 return -EINVAL; 5565 return -EINVAL;
5549 if (new_chunk < PAGE_SIZE) 5566 if (new_chunk < (PAGE_SIZE >> 9))
5550 return -EINVAL; 5567 return -EINVAL;
5551 if (mddev->array_sectors & ((new_chunk>>9)-1)) 5568 if (mddev->array_sectors & (new_chunk-1))
5552 /* not factor of array size */ 5569 /* not factor of array size */
5553 return -EINVAL; 5570 return -EINVAL;
5554 } 5571 }
5555 5572
5556 /* They look valid */ 5573 /* They look valid */
5557 5574 return check_reshape(mddev);
5558 if (new_layout >= 0)
5559 mddev->new_layout = new_layout;
5560 if (new_chunk > 0)
5561 mddev->new_chunk = new_chunk;
5562
5563 return 0;
5564} 5575}
5565 5576
5566static void *raid5_takeover(mddev_t *mddev) 5577static void *raid5_takeover(mddev_t *mddev)
@@ -5570,8 +5581,6 @@ static void *raid5_takeover(mddev_t *mddev)
5570 * raid1 - if there are two drives. We need to know the chunk size 5581 * raid1 - if there are two drives. We need to know the chunk size
5571 * raid4 - trivial - just use a raid4 layout. 5582 * raid4 - trivial - just use a raid4 layout.
5572 * raid6 - Providing it is a *_6 layout 5583 * raid6 - Providing it is a *_6 layout
5573 *
5574 * For now, just do raid1
5575 */ 5584 */
5576 5585
5577 if (mddev->level == 1) 5586 if (mddev->level == 1)
@@ -5653,12 +5662,11 @@ static struct mdk_personality raid6_personality =
5653 .sync_request = sync_request, 5662 .sync_request = sync_request,
5654 .resize = raid5_resize, 5663 .resize = raid5_resize,
5655 .size = raid5_size, 5664 .size = raid5_size,
5656 .check_reshape = raid5_check_reshape, 5665 .check_reshape = raid6_check_reshape,
5657 .start_reshape = raid5_start_reshape, 5666 .start_reshape = raid5_start_reshape,
5658 .finish_reshape = raid5_finish_reshape, 5667 .finish_reshape = raid5_finish_reshape,
5659 .quiesce = raid5_quiesce, 5668 .quiesce = raid5_quiesce,
5660 .takeover = raid6_takeover, 5669 .takeover = raid6_takeover,
5661 .reconfig = raid6_reconfig,
5662}; 5670};
5663static struct mdk_personality raid5_personality = 5671static struct mdk_personality raid5_personality =
5664{ 5672{
@@ -5681,7 +5689,6 @@ static struct mdk_personality raid5_personality =
5681 .finish_reshape = raid5_finish_reshape, 5689 .finish_reshape = raid5_finish_reshape,
5682 .quiesce = raid5_quiesce, 5690 .quiesce = raid5_quiesce,
5683 .takeover = raid5_takeover, 5691 .takeover = raid5_takeover,
5684 .reconfig = raid5_reconfig,
5685}; 5692};
5686 5693
5687static struct mdk_personality raid4_personality = 5694static struct mdk_personality raid4_personality =
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 116d0b44b2a9..2390e0e83daf 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -337,7 +337,8 @@ struct raid5_private_data {
337 struct hlist_head *stripe_hashtbl; 337 struct hlist_head *stripe_hashtbl;
338 mddev_t *mddev; 338 mddev_t *mddev;
339 struct disk_info *spare; 339 struct disk_info *spare;
340 int chunk_size, level, algorithm; 340 int chunk_sectors;
341 int level, algorithm;
341 int max_degraded; 342 int max_degraded;
342 int raid_disks; 343 int raid_disks;
343 int max_nr_stripes; 344 int max_nr_stripes;
@@ -353,7 +354,8 @@ struct raid5_private_data {
353 */ 354 */
354 sector_t reshape_safe; 355 sector_t reshape_safe;
355 int previous_raid_disks; 356 int previous_raid_disks;
356 int prev_chunk, prev_algo; 357 int prev_chunk_sectors;
358 int prev_algo;
357 short generation; /* increments with every reshape */ 359 short generation; /* increments with every reshape */
358 unsigned long reshape_checkpoint; /* Time we last updated 360 unsigned long reshape_checkpoint; /* Time we last updated
359 * metadata */ 361 * metadata */
@@ -424,8 +426,6 @@ struct raid5_private_data {
424 426
425typedef struct raid5_private_data raid5_conf_t; 427typedef struct raid5_private_data raid5_conf_t;
426 428
427#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
428
429/* 429/*
430 * Our supported algorithms 430 * Our supported algorithms
431 */ 431 */