author     Paul Mundt <lethal@linux-sh.org>    2011-08-08 00:45:28 -0400
committer  Paul Mundt <lethal@linux-sh.org>    2011-08-08 00:45:28 -0400
commit     77c7ee51a062bb595c501ec098125a68999c20c3 (patch)
tree       c5060ca5786ef353e005dae04b61d2c49967284d /drivers/md
parent     1ba762209491e2496e58baffa3fd65d661f54404 (diff)
parent     322a8b034003c0d46d39af85bf24fee27b902f48 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux into sh-latest
Conflicts:
	drivers/tty/serial/sh-sci.c

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig                    5
-rw-r--r--  drivers/md/dm-crypt.c                62
-rw-r--r--  drivers/md/dm-flakey.c              270
-rw-r--r--  drivers/md/dm-io.c                   29
-rw-r--r--  drivers/md/dm-ioctl.c                89
-rw-r--r--  drivers/md/dm-kcopyd.c               42
-rw-r--r--  drivers/md/dm-log-userspace-base.c    3
-rw-r--r--  drivers/md/dm-log.c                  32
-rw-r--r--  drivers/md/dm-mpath.c               147
-rw-r--r--  drivers/md/dm-raid.c                621
-rw-r--r--  drivers/md/dm-snap-persistent.c      80
-rw-r--r--  drivers/md/dm-snap.c                 84
-rw-r--r--  drivers/md/dm-table.c               155
-rw-r--r--  drivers/md/dm.c                      75
-rw-r--r--  drivers/md/dm.h                       2
15 files changed, 1351 insertions, 345 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 8420129fc5ee..f75a66e7d312 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -241,12 +241,13 @@ config DM_MIRROR
 	  needed for live data migration tools such as 'pvmove'.

 config DM_RAID
-	tristate "RAID 4/5/6 target (EXPERIMENTAL)"
+	tristate "RAID 1/4/5/6 target (EXPERIMENTAL)"
 	depends on BLK_DEV_DM && EXPERIMENTAL
+	select MD_RAID1
 	select MD_RAID456
 	select BLK_DEV_MD
 	---help---
-	  A dm target that supports RAID4, RAID5 and RAID6 mappings
+	  A dm target that supports RAID1, RAID4, RAID5 and RAID6 mappings

 	  A RAID-5 set of N drives with a capacity of C MB per drive provides
 	  the capacity of C * (N - 1) MB, and protects against a failure
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index bae6c4e23d3f..49da55c1528a 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -30,7 +30,6 @@
 #include <linux/device-mapper.h>

 #define DM_MSG_PREFIX "crypt"
-#define MESG_STR(x) x, sizeof(x)

 /*
  * context holding the current state of a multi-part conversion
@@ -239,7 +238,7 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
 			      struct dm_crypt_request *dmreq)
 {
 	memset(iv, 0, cc->iv_size);
-	*(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);
+	*(__le32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);

 	return 0;
 }
@@ -248,7 +247,7 @@ static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
 				struct dm_crypt_request *dmreq)
 {
 	memset(iv, 0, cc->iv_size);
-	*(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
+	*(__le64 *)iv = cpu_to_le64(dmreq->iv_sector);

 	return 0;
 }
@@ -415,7 +414,7 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
 	struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private;

 	memset(iv, 0, cc->iv_size);
-	*(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
+	*(__le64 *)iv = cpu_to_le64(dmreq->iv_sector);
 	crypto_cipher_encrypt_one(essiv_tfm, iv, iv);

 	return 0;
@@ -1575,11 +1574,17 @@ bad_mem:
 static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	struct crypt_config *cc;
-	unsigned int key_size;
+	unsigned int key_size, opt_params;
 	unsigned long long tmpll;
 	int ret;
+	struct dm_arg_set as;
+	const char *opt_string;
+
+	static struct dm_arg _args[] = {
+		{0, 1, "Invalid number of feature args"},
+	};

-	if (argc != 5) {
+	if (argc < 5) {
 		ti->error = "Not enough arguments";
 		return -EINVAL;
 	}
@@ -1648,6 +1653,30 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 	cc->start = tmpll;

+	argv += 5;
+	argc -= 5;
+
+	/* Optional parameters */
+	if (argc) {
+		as.argc = argc;
+		as.argv = argv;
+
+		ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
+		if (ret)
+			goto bad;
+
+		opt_string = dm_shift_arg(&as);
+
+		if (opt_params == 1 && opt_string &&
+		    !strcasecmp(opt_string, "allow_discards"))
+			ti->num_discard_requests = 1;
+		else if (opt_params) {
+			ret = -EINVAL;
+			ti->error = "Invalid feature arguments";
+			goto bad;
+		}
+	}
+
 	ret = -ENOMEM;
 	cc->io_queue = alloc_workqueue("kcryptd_io",
 				       WQ_NON_REENTRANT|
@@ -1682,9 +1711,16 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 	struct dm_crypt_io *io;
 	struct crypt_config *cc;

-	if (bio->bi_rw & REQ_FLUSH) {
+	/*
+	 * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues.
+	 * - for REQ_FLUSH device-mapper core ensures that no IO is in-flight
+	 * - for REQ_DISCARD caller must use flush if IO ordering matters
+	 */
+	if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) {
 		cc = ti->private;
 		bio->bi_bdev = cc->dev->bdev;
+		if (bio_sectors(bio))
+			bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector);
 		return DM_MAPIO_REMAPPED;
 	}

@@ -1727,6 +1763,10 @@ static int crypt_status(struct dm_target *ti, status_type_t type,

 		DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset,
 				cc->dev->name, (unsigned long long)cc->start);
+
+		if (ti->num_discard_requests)
+			DMEMIT(" 1 allow_discards");
+
 		break;
 	}
 	return 0;
@@ -1770,12 +1810,12 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
 	if (argc < 2)
 		goto error;

-	if (!strnicmp(argv[0], MESG_STR("key"))) {
+	if (!strcasecmp(argv[0], "key")) {
 		if (!test_bit(DM_CRYPT_SUSPENDED, &cc->flags)) {
 			DMWARN("not suspended during key manipulation.");
 			return -EINVAL;
 		}
-		if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) {
+		if (argc == 3 && !strcasecmp(argv[1], "set")) {
 			ret = crypt_set_key(cc, argv[2]);
 			if (ret)
 				return ret;
@@ -1783,7 +1823,7 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
 			ret = cc->iv_gen_ops->init(cc);
 			return ret;
 		}
-		if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) {
+		if (argc == 2 && !strcasecmp(argv[1], "wipe")) {
 			if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) {
 				ret = cc->iv_gen_ops->wipe(cc);
 				if (ret)
@@ -1823,7 +1863,7 @@ static int crypt_iterate_devices(struct dm_target *ti,

 static struct target_type crypt_target = {
 	.name   = "crypt",
-	.version = {1, 10, 0},
+	.version = {1, 11, 0},
 	.module = THIS_MODULE,
 	.ctr    = crypt_ctr,
 	.dtr    = crypt_dtr,
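The extra constructor arguments accepted above follow the usual device-mapper <#opt_params> <opt_params> convention. As an illustration only (device path, sizes and key are hypothetical, not taken from this patch), the new flag and the now case-insensitive message interface could be exercised like this:

    # create a crypt mapping that passes discards through to the backing device
    echo "0 409600 crypt aes-cbc-essiv:sha256 <hex_key> 0 /dev/sdb 0 1 allow_discards" | dmsetup create cryptdev
    # key messages are matched with strcasecmp after this change
    dmsetup message cryptdev 0 key wipe

When allow_discards is active, crypt_status appends " 1 allow_discards" to the reported table line, as the status hunk above shows.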
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index ea790623c30b..89f73ca22cfa 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) 2003 Sistina Software (UK) Limited. 2 * Copyright (C) 2003 Sistina Software (UK) Limited.
3 * Copyright (C) 2004, 2010 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004, 2010-2011 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
@@ -15,6 +15,9 @@
15 15
16#define DM_MSG_PREFIX "flakey" 16#define DM_MSG_PREFIX "flakey"
17 17
18#define all_corrupt_bio_flags_match(bio, fc) \
19 (((bio)->bi_rw & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags)
20
18/* 21/*
19 * Flakey: Used for testing only, simulates intermittent, 22 * Flakey: Used for testing only, simulates intermittent,
20 * catastrophic device failure. 23 * catastrophic device failure.
@@ -25,60 +28,189 @@ struct flakey_c {
25 sector_t start; 28 sector_t start;
26 unsigned up_interval; 29 unsigned up_interval;
27 unsigned down_interval; 30 unsigned down_interval;
31 unsigned long flags;
32 unsigned corrupt_bio_byte;
33 unsigned corrupt_bio_rw;
34 unsigned corrupt_bio_value;
35 unsigned corrupt_bio_flags;
36};
37
38enum feature_flag_bits {
39 DROP_WRITES
28}; 40};
29 41
42static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
43 struct dm_target *ti)
44{
45 int r;
46 unsigned argc;
47 const char *arg_name;
48
49 static struct dm_arg _args[] = {
50 {0, 6, "Invalid number of feature args"},
51 {1, UINT_MAX, "Invalid corrupt bio byte"},
52 {0, 255, "Invalid corrupt value to write into bio byte (0-255)"},
53 {0, UINT_MAX, "Invalid corrupt bio flags mask"},
54 };
55
56 /* No feature arguments supplied. */
57 if (!as->argc)
58 return 0;
59
60 r = dm_read_arg_group(_args, as, &argc, &ti->error);
61 if (r)
62 return r;
63
64 while (argc) {
65 arg_name = dm_shift_arg(as);
66 argc--;
67
68 /*
69 * drop_writes
70 */
71 if (!strcasecmp(arg_name, "drop_writes")) {
72 if (test_and_set_bit(DROP_WRITES, &fc->flags)) {
73 ti->error = "Feature drop_writes duplicated";
74 return -EINVAL;
75 }
76
77 continue;
78 }
79
80 /*
81 * corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>
82 */
83 if (!strcasecmp(arg_name, "corrupt_bio_byte")) {
84 if (!argc)
85 ti->error = "Feature corrupt_bio_byte requires parameters";
86
87 r = dm_read_arg(_args + 1, as, &fc->corrupt_bio_byte, &ti->error);
88 if (r)
89 return r;
90 argc--;
91
92 /*
93 * Direction r or w?
94 */
95 arg_name = dm_shift_arg(as);
96 if (!strcasecmp(arg_name, "w"))
97 fc->corrupt_bio_rw = WRITE;
98 else if (!strcasecmp(arg_name, "r"))
99 fc->corrupt_bio_rw = READ;
100 else {
101 ti->error = "Invalid corrupt bio direction (r or w)";
102 return -EINVAL;
103 }
104 argc--;
105
106 /*
107 * Value of byte (0-255) to write in place of correct one.
108 */
109 r = dm_read_arg(_args + 2, as, &fc->corrupt_bio_value, &ti->error);
110 if (r)
111 return r;
112 argc--;
113
114 /*
115 * Only corrupt bios with these flags set.
116 */
117 r = dm_read_arg(_args + 3, as, &fc->corrupt_bio_flags, &ti->error);
118 if (r)
119 return r;
120 argc--;
121
122 continue;
123 }
124
125 ti->error = "Unrecognised flakey feature requested";
126 return -EINVAL;
127 }
128
129 if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) {
130 ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set";
131 return -EINVAL;
132 }
133
134 return 0;
135}
136
30/* 137/*
31 * Construct a flakey mapping: <dev_path> <offset> <up interval> <down interval> 138 * Construct a flakey mapping:
139 * <dev_path> <offset> <up interval> <down interval> [<#feature args> [<arg>]*]
140 *
141 * Feature args:
142 * [drop_writes]
143 * [corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>]
144 *
145 * Nth_byte starts from 1 for the first byte.
146 * Direction is r for READ or w for WRITE.
147 * bio_flags is ignored if 0.
32 */ 148 */
33static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) 149static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
34{ 150{
151 static struct dm_arg _args[] = {
152 {0, UINT_MAX, "Invalid up interval"},
153 {0, UINT_MAX, "Invalid down interval"},
154 };
155
156 int r;
35 struct flakey_c *fc; 157 struct flakey_c *fc;
36 unsigned long long tmp; 158 unsigned long long tmpll;
159 struct dm_arg_set as;
160 const char *devname;
37 161
38 if (argc != 4) { 162 as.argc = argc;
39 ti->error = "dm-flakey: Invalid argument count"; 163 as.argv = argv;
164
165 if (argc < 4) {
166 ti->error = "Invalid argument count";
40 return -EINVAL; 167 return -EINVAL;
41 } 168 }
42 169
43 fc = kmalloc(sizeof(*fc), GFP_KERNEL); 170 fc = kzalloc(sizeof(*fc), GFP_KERNEL);
44 if (!fc) { 171 if (!fc) {
45 ti->error = "dm-flakey: Cannot allocate linear context"; 172 ti->error = "Cannot allocate linear context";
46 return -ENOMEM; 173 return -ENOMEM;
47 } 174 }
48 fc->start_time = jiffies; 175 fc->start_time = jiffies;
49 176
50 if (sscanf(argv[1], "%llu", &tmp) != 1) { 177 devname = dm_shift_arg(&as);
51 ti->error = "dm-flakey: Invalid device sector"; 178
179 if (sscanf(dm_shift_arg(&as), "%llu", &tmpll) != 1) {
180 ti->error = "Invalid device sector";
52 goto bad; 181 goto bad;
53 } 182 }
54 fc->start = tmp; 183 fc->start = tmpll;
55 184
56 if (sscanf(argv[2], "%u", &fc->up_interval) != 1) { 185 r = dm_read_arg(_args, &as, &fc->up_interval, &ti->error);
57 ti->error = "dm-flakey: Invalid up interval"; 186 if (r)
58 goto bad; 187 goto bad;
59 }
60 188
61 if (sscanf(argv[3], "%u", &fc->down_interval) != 1) { 189 r = dm_read_arg(_args, &as, &fc->down_interval, &ti->error);
62 ti->error = "dm-flakey: Invalid down interval"; 190 if (r)
63 goto bad; 191 goto bad;
64 }
65 192
66 if (!(fc->up_interval + fc->down_interval)) { 193 if (!(fc->up_interval + fc->down_interval)) {
67 ti->error = "dm-flakey: Total (up + down) interval is zero"; 194 ti->error = "Total (up + down) interval is zero";
68 goto bad; 195 goto bad;
69 } 196 }
70 197
71 if (fc->up_interval + fc->down_interval < fc->up_interval) { 198 if (fc->up_interval + fc->down_interval < fc->up_interval) {
72 ti->error = "dm-flakey: Interval overflow"; 199 ti->error = "Interval overflow";
73 goto bad; 200 goto bad;
74 } 201 }
75 202
76 if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &fc->dev)) { 203 r = parse_features(&as, fc, ti);
77 ti->error = "dm-flakey: Device lookup failed"; 204 if (r)
205 goto bad;
206
207 if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &fc->dev)) {
208 ti->error = "Device lookup failed";
78 goto bad; 209 goto bad;
79 } 210 }
80 211
81 ti->num_flush_requests = 1; 212 ti->num_flush_requests = 1;
213 ti->num_discard_requests = 1;
82 ti->private = fc; 214 ti->private = fc;
83 return 0; 215 return 0;
84 216
@@ -99,7 +231,7 @@ static sector_t flakey_map_sector(struct dm_target *ti, sector_t bi_sector)
99{ 231{
100 struct flakey_c *fc = ti->private; 232 struct flakey_c *fc = ti->private;
101 233
102 return fc->start + (bi_sector - ti->begin); 234 return fc->start + dm_target_offset(ti, bi_sector);
103} 235}
104 236
105static void flakey_map_bio(struct dm_target *ti, struct bio *bio) 237static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
@@ -111,6 +243,25 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
111 bio->bi_sector = flakey_map_sector(ti, bio->bi_sector); 243 bio->bi_sector = flakey_map_sector(ti, bio->bi_sector);
112} 244}
113 245
246static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
247{
248 unsigned bio_bytes = bio_cur_bytes(bio);
249 char *data = bio_data(bio);
250
251 /*
252 * Overwrite the Nth byte of the data returned.
253 */
254 if (data && bio_bytes >= fc->corrupt_bio_byte) {
255 data[fc->corrupt_bio_byte - 1] = fc->corrupt_bio_value;
256
257 DMDEBUG("Corrupting data bio=%p by writing %u to byte %u "
258 "(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n",
259 bio, fc->corrupt_bio_value, fc->corrupt_bio_byte,
260 (bio_data_dir(bio) == WRITE) ? 'w' : 'r',
261 bio->bi_rw, (unsigned long long)bio->bi_sector, bio_bytes);
262 }
263}
264
114static int flakey_map(struct dm_target *ti, struct bio *bio, 265static int flakey_map(struct dm_target *ti, struct bio *bio,
115 union map_info *map_context) 266 union map_info *map_context)
116{ 267{
@@ -119,18 +270,71 @@ static int flakey_map(struct dm_target *ti, struct bio *bio,
119 270
120 /* Are we alive ? */ 271 /* Are we alive ? */
121 elapsed = (jiffies - fc->start_time) / HZ; 272 elapsed = (jiffies - fc->start_time) / HZ;
122 if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) 273 if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) {
274 /*
275 * Flag this bio as submitted while down.
276 */
277 map_context->ll = 1;
278
279 /*
280 * Map reads as normal.
281 */
282 if (bio_data_dir(bio) == READ)
283 goto map_bio;
284
285 /*
286 * Drop writes?
287 */
288 if (test_bit(DROP_WRITES, &fc->flags)) {
289 bio_endio(bio, 0);
290 return DM_MAPIO_SUBMITTED;
291 }
292
293 /*
294 * Corrupt matching writes.
295 */
296 if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == WRITE)) {
297 if (all_corrupt_bio_flags_match(bio, fc))
298 corrupt_bio_data(bio, fc);
299 goto map_bio;
300 }
301
302 /*
303 * By default, error all I/O.
304 */
123 return -EIO; 305 return -EIO;
306 }
124 307
308map_bio:
125 flakey_map_bio(ti, bio); 309 flakey_map_bio(ti, bio);
126 310
127 return DM_MAPIO_REMAPPED; 311 return DM_MAPIO_REMAPPED;
128} 312}
129 313
314static int flakey_end_io(struct dm_target *ti, struct bio *bio,
315 int error, union map_info *map_context)
316{
317 struct flakey_c *fc = ti->private;
318 unsigned bio_submitted_while_down = map_context->ll;
319
320 /*
321 * Corrupt successful READs while in down state.
322 * If flags were specified, only corrupt those that match.
323 */
324 if (!error && bio_submitted_while_down &&
325 (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) &&
326 all_corrupt_bio_flags_match(bio, fc))
327 corrupt_bio_data(bio, fc);
328
329 return error;
330}
331
130static int flakey_status(struct dm_target *ti, status_type_t type, 332static int flakey_status(struct dm_target *ti, status_type_t type,
131 char *result, unsigned int maxlen) 333 char *result, unsigned int maxlen)
132{ 334{
335 unsigned sz = 0;
133 struct flakey_c *fc = ti->private; 336 struct flakey_c *fc = ti->private;
337 unsigned drop_writes;
134 338
135 switch (type) { 339 switch (type) {
136 case STATUSTYPE_INFO: 340 case STATUSTYPE_INFO:
@@ -138,9 +342,22 @@ static int flakey_status(struct dm_target *ti, status_type_t type,
138 break; 342 break;
139 343
140 case STATUSTYPE_TABLE: 344 case STATUSTYPE_TABLE:
141 snprintf(result, maxlen, "%s %llu %u %u", fc->dev->name, 345 DMEMIT("%s %llu %u %u ", fc->dev->name,
142 (unsigned long long)fc->start, fc->up_interval, 346 (unsigned long long)fc->start, fc->up_interval,
143 fc->down_interval); 347 fc->down_interval);
348
349 drop_writes = test_bit(DROP_WRITES, &fc->flags);
350 DMEMIT("%u ", drop_writes + (fc->corrupt_bio_byte > 0) * 5);
351
352 if (drop_writes)
353 DMEMIT("drop_writes ");
354
355 if (fc->corrupt_bio_byte)
356 DMEMIT("corrupt_bio_byte %u %c %u %u ",
357 fc->corrupt_bio_byte,
358 (fc->corrupt_bio_rw == WRITE) ? 'w' : 'r',
359 fc->corrupt_bio_value, fc->corrupt_bio_flags);
360
144 break; 361 break;
145 } 362 }
146 return 0; 363 return 0;
@@ -177,11 +394,12 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_
177 394
178static struct target_type flakey_target = { 395static struct target_type flakey_target = {
179 .name = "flakey", 396 .name = "flakey",
180 .version = {1, 1, 0}, 397 .version = {1, 2, 0},
181 .module = THIS_MODULE, 398 .module = THIS_MODULE,
182 .ctr = flakey_ctr, 399 .ctr = flakey_ctr,
183 .dtr = flakey_dtr, 400 .dtr = flakey_dtr,
184 .map = flakey_map, 401 .map = flakey_map,
402 .end_io = flakey_end_io,
185 .status = flakey_status, 403 .status = flakey_status,
186 .ioctl = flakey_ioctl, 404 .ioctl = flakey_ioctl,
187 .merge = flakey_merge, 405 .merge = flakey_merge,
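The feature arguments parsed by the new flakey constructor can be driven from user space with table lines such as the following (device names and sizes are made up for illustration). The first mapping is up for 9 seconds and down for 5, silently dropping writes while down; the second corrupts byte 32 of READ bios with the value 224 and no bio-flag filtering:

    dmsetup create flakey0 --table "0 409600 flakey /dev/sdc 0 9 5 1 drop_writes"
    dmsetup create flakey1 --table "0 409600 flakey /dev/sdc 0 9 5 5 corrupt_bio_byte 32 r 224 0"

Note that corrupt_bio_byte plus its four parameters counts as five feature arguments, which matches the feature count emitted by flakey_status above.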
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 2067288f61f9..ad2eba40e319 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -38,6 +38,8 @@ struct io {
 	struct dm_io_client *client;
 	io_notify_fn callback;
 	void *context;
+	void *vma_invalidate_address;
+	unsigned long vma_invalidate_size;
 } __attribute__((aligned(DM_IO_MAX_REGIONS)));

 static struct kmem_cache *_dm_io_cache;
@@ -116,6 +118,10 @@ static void dec_count(struct io *io, unsigned int region, int error)
 		set_bit(region, &io->error_bits);

 	if (atomic_dec_and_test(&io->count)) {
+		if (io->vma_invalidate_size)
+			invalidate_kernel_vmap_range(io->vma_invalidate_address,
+						     io->vma_invalidate_size);
+
 		if (io->sleeper)
 			wake_up_process(io->sleeper);

@@ -159,6 +165,9 @@ struct dpages {

 	unsigned context_u;
 	void *context_ptr;
+
+	void *vma_invalidate_address;
+	unsigned long vma_invalidate_size;
 };

 /*
@@ -377,6 +386,9 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 	io->sleeper = current;
 	io->client = client;

+	io->vma_invalidate_address = dp->vma_invalidate_address;
+	io->vma_invalidate_size = dp->vma_invalidate_size;
+
 	dispatch_io(rw, num_regions, where, dp, io, 1);

 	while (1) {
@@ -415,13 +427,21 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
 	io->callback = fn;
 	io->context = context;

+	io->vma_invalidate_address = dp->vma_invalidate_address;
+	io->vma_invalidate_size = dp->vma_invalidate_size;
+
 	dispatch_io(rw, num_regions, where, dp, io, 0);
 	return 0;
 }

-static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
+static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
+		   unsigned long size)
 {
 	/* Set up dpages based on memory type */
+
+	dp->vma_invalidate_address = NULL;
+	dp->vma_invalidate_size = 0;
+
 	switch (io_req->mem.type) {
 	case DM_IO_PAGE_LIST:
 		list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset);
@@ -432,6 +452,11 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
 		break;

 	case DM_IO_VMA:
+		flush_kernel_vmap_range(io_req->mem.ptr.vma, size);
+		if ((io_req->bi_rw & RW_MASK) == READ) {
+			dp->vma_invalidate_address = io_req->mem.ptr.vma;
+			dp->vma_invalidate_size = size;
+		}
 		vm_dp_init(dp, io_req->mem.ptr.vma);
 		break;

@@ -460,7 +485,7 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions,
 	int r;
 	struct dpages dp;

-	r = dp_init(io_req, &dp);
+	r = dp_init(io_req, &dp, (unsigned long)where->count << SECTOR_SHIFT);
 	if (r)
 		return r;

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 4cacdad2270a..2e9a3ca37bdd 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -128,6 +128,24 @@ static struct hash_cell *__get_uuid_cell(const char *str)
 	return NULL;
 }

+static struct hash_cell *__get_dev_cell(uint64_t dev)
+{
+	struct mapped_device *md;
+	struct hash_cell *hc;
+
+	md = dm_get_md(huge_decode_dev(dev));
+	if (!md)
+		return NULL;
+
+	hc = dm_get_mdptr(md);
+	if (!hc) {
+		dm_put(md);
+		return NULL;
+	}
+
+	return hc;
+}
+
 /*-----------------------------------------------------------------
  * Inserting, removing and renaming a device.
  *---------------------------------------------------------------*/
@@ -718,25 +736,45 @@ static int dev_create(struct dm_ioctl *param, size_t param_size)
  */
 static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
 {
-	struct mapped_device *md;
-	void *mdptr = NULL;
+	struct hash_cell *hc = NULL;

-	if (*param->uuid)
-		return __get_uuid_cell(param->uuid);
+	if (*param->uuid) {
+		if (*param->name || param->dev)
+			return NULL;

-	if (*param->name)
-		return __get_name_cell(param->name);
+		hc = __get_uuid_cell(param->uuid);
+		if (!hc)
+			return NULL;
+	} else if (*param->name) {
+		if (param->dev)
+			return NULL;

-	md = dm_get_md(huge_decode_dev(param->dev));
-	if (!md)
-		goto out;
+		hc = __get_name_cell(param->name);
+		if (!hc)
+			return NULL;
+	} else if (param->dev) {
+		hc = __get_dev_cell(param->dev);
+		if (!hc)
+			return NULL;
+	} else
+		return NULL;

-	mdptr = dm_get_mdptr(md);
-	if (!mdptr)
-		dm_put(md);
+	/*
+	 * Sneakily write in both the name and the uuid
+	 * while we have the cell.
+	 */
+	strlcpy(param->name, hc->name, sizeof(param->name));
+	if (hc->uuid)
+		strlcpy(param->uuid, hc->uuid, sizeof(param->uuid));
+	else
+		param->uuid[0] = '\0';

-out:
-	return mdptr;
+	if (hc->new_map)
+		param->flags |= DM_INACTIVE_PRESENT_FLAG;
+	else
+		param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
+
+	return hc;
 }

 static struct mapped_device *find_device(struct dm_ioctl *param)
@@ -746,24 +784,8 @@ static struct mapped_device *find_device(struct dm_ioctl *param)

 	down_read(&_hash_lock);
 	hc = __find_device_hash_cell(param);
-	if (hc) {
+	if (hc)
 		md = hc->md;
-
-		/*
-		 * Sneakily write in both the name and the uuid
-		 * while we have the cell.
-		 */
-		strlcpy(param->name, hc->name, sizeof(param->name));
-		if (hc->uuid)
-			strlcpy(param->uuid, hc->uuid, sizeof(param->uuid));
-		else
-			param->uuid[0] = '\0';
-
-		if (hc->new_map)
-			param->flags |= DM_INACTIVE_PRESENT_FLAG;
-		else
-			param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
-	}
 	up_read(&_hash_lock);

 	return md;
@@ -1402,6 +1424,11 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
 		goto out;
 	}

+	if (!argc) {
+		DMWARN("Empty message received.");
+		goto out;
+	}
+
 	table = dm_get_live_table(md);
 	if (!table)
 		goto out_argv;
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 320401dec104..f82147029636 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -224,8 +224,6 @@ struct kcopyd_job {
 	unsigned int num_dests;
 	struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS];

-	sector_t offset;
-	unsigned int nr_pages;
 	struct page_list *pages;

 	/*
@@ -380,7 +378,7 @@ static int run_io_job(struct kcopyd_job *job)
 		.bi_rw = job->rw,
 		.mem.type = DM_IO_PAGE_LIST,
 		.mem.ptr.pl = job->pages,
-		.mem.offset = job->offset,
+		.mem.offset = 0,
 		.notify.fn = complete_io,
 		.notify.context = job,
 		.client = job->kc->io_client,
@@ -397,10 +395,9 @@ static int run_io_job(struct kcopyd_job *job)
 static int run_pages_job(struct kcopyd_job *job)
 {
 	int r;
+	unsigned nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9);

-	job->nr_pages = dm_div_up(job->dests[0].count + job->offset,
-				  PAGE_SIZE >> 9);
-	r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
+	r = kcopyd_get_pages(job->kc, nr_pages, &job->pages);
 	if (!r) {
 		/* this job is ready for io */
 		push(&job->kc->io_jobs, job);
@@ -602,8 +599,6 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 	job->num_dests = num_dests;
 	memcpy(&job->dests, dests, sizeof(*dests) * num_dests);

-	job->offset = 0;
-	job->nr_pages = 0;
 	job->pages = NULL;

 	job->fn = fn;
@@ -622,6 +617,37 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 }
 EXPORT_SYMBOL(dm_kcopyd_copy);

+void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
+				 dm_kcopyd_notify_fn fn, void *context)
+{
+	struct kcopyd_job *job;
+
+	job = mempool_alloc(kc->job_pool, GFP_NOIO);
+
+	memset(job, 0, sizeof(struct kcopyd_job));
+	job->kc = kc;
+	job->fn = fn;
+	job->context = context;
+
+	atomic_inc(&kc->nr_jobs);
+
+	return job;
+}
+EXPORT_SYMBOL(dm_kcopyd_prepare_callback);
+
+void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err)
+{
+	struct kcopyd_job *job = j;
+	struct dm_kcopyd_client *kc = job->kc;
+
+	job->read_err = read_err;
+	job->write_err = write_err;
+
+	push(&kc->complete_jobs, job);
+	wake(kc);
+}
+EXPORT_SYMBOL(dm_kcopyd_do_callback);
+
 /*
  * Cancels a kcopyd job, eg. someone might be deactivating a
  * mirror.
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index aa2e0c374ab3..1021c8986011 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -394,8 +394,7 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
 		group[count] = fe->region;
 		count++;

-		list_del(&fe->list);
-		list_add(&fe->list, &tmp_list);
+		list_move(&fe->list, &tmp_list);

 		type = fe->type;
 		if (count >= MAX_FLUSH_GROUP_COUNT)
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 948e3f4925bf..3b52bb72bd1f 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -197,15 +197,21 @@ EXPORT_SYMBOL(dm_dirty_log_destroy);
 #define MIRROR_DISK_VERSION 2
 #define LOG_OFFSET 2

-struct log_header {
-	uint32_t magic;
+struct log_header_disk {
+	__le32 magic;

 	/*
 	 * Simple, incrementing version. no backward
 	 * compatibility.
 	 */
+	__le32 version;
+	__le64 nr_regions;
+} __packed;
+
+struct log_header_core {
+	uint32_t magic;
 	uint32_t version;
-	sector_t nr_regions;
+	uint64_t nr_regions;
 };

 struct log_c {
@@ -239,10 +245,10 @@ struct log_c {
 	int log_dev_failed;
 	int log_dev_flush_failed;
 	struct dm_dev *log_dev;
-	struct log_header header;
+	struct log_header_core header;

 	struct dm_io_region header_location;
-	struct log_header *disk_header;
+	struct log_header_disk *disk_header;
 };

 /*
@@ -251,34 +257,34 @@ struct log_c {
  */
 static inline int log_test_bit(uint32_t *bs, unsigned bit)
 {
-	return test_bit_le(bit, (unsigned long *) bs) ? 1 : 0;
+	return test_bit_le(bit, bs) ? 1 : 0;
 }

 static inline void log_set_bit(struct log_c *l,
 			       uint32_t *bs, unsigned bit)
 {
-	__test_and_set_bit_le(bit, (unsigned long *) bs);
+	__set_bit_le(bit, bs);
 	l->touched_cleaned = 1;
 }

 static inline void log_clear_bit(struct log_c *l,
 				 uint32_t *bs, unsigned bit)
 {
-	__test_and_clear_bit_le(bit, (unsigned long *) bs);
+	__clear_bit_le(bit, bs);
 	l->touched_dirtied = 1;
 }

 /*----------------------------------------------------------------
  * Header IO
  *--------------------------------------------------------------*/
-static void header_to_disk(struct log_header *core, struct log_header *disk)
+static void header_to_disk(struct log_header_core *core, struct log_header_disk *disk)
 {
 	disk->magic = cpu_to_le32(core->magic);
 	disk->version = cpu_to_le32(core->version);
 	disk->nr_regions = cpu_to_le64(core->nr_regions);
 }

-static void header_from_disk(struct log_header *core, struct log_header *disk)
+static void header_from_disk(struct log_header_core *core, struct log_header_disk *disk)
 {
 	core->magic = le32_to_cpu(disk->magic);
 	core->version = le32_to_cpu(disk->version);
@@ -486,7 +492,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 	memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size);
 	lc->sync_count = (sync == NOSYNC) ? region_count : 0;

-	lc->recovering_bits = vmalloc(bitset_size);
+	lc->recovering_bits = vzalloc(bitset_size);
 	if (!lc->recovering_bits) {
 		DMWARN("couldn't allocate sync bitset");
 		vfree(lc->sync_bits);
@@ -498,7 +504,6 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		kfree(lc);
 		return -ENOMEM;
 	}
-	memset(lc->recovering_bits, 0, bitset_size);
 	lc->sync_search = 0;
 	log->context = lc;

@@ -739,8 +744,7 @@ static int core_get_resync_work(struct dm_dirty_log *log, region_t *region)
 		return 0;

 	do {
-		*region = find_next_zero_bit_le(
-				(unsigned long *) lc->sync_bits,
+		*region = find_next_zero_bit_le(lc->sync_bits,
 				lc->region_count,
 				lc->sync_search);
 		lc->sync_search = *region + 1;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index c3547016f0f1..5e0090ef4182 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -22,7 +22,6 @@
22#include <linux/atomic.h> 22#include <linux/atomic.h>
23 23
24#define DM_MSG_PREFIX "multipath" 24#define DM_MSG_PREFIX "multipath"
25#define MESG_STR(x) x, sizeof(x)
26#define DM_PG_INIT_DELAY_MSECS 2000 25#define DM_PG_INIT_DELAY_MSECS 2000
27#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1) 26#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
28 27
@@ -505,80 +504,29 @@ static void trigger_event(struct work_struct *work)
505 * <#paths> <#per-path selector args> 504 * <#paths> <#per-path selector args>
506 * [<path> [<arg>]* ]+ ]+ 505 * [<path> [<arg>]* ]+ ]+
507 *---------------------------------------------------------------*/ 506 *---------------------------------------------------------------*/
508struct param { 507static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
509 unsigned min;
510 unsigned max;
511 char *error;
512};
513
514static int read_param(struct param *param, char *str, unsigned *v, char **error)
515{
516 if (!str ||
517 (sscanf(str, "%u", v) != 1) ||
518 (*v < param->min) ||
519 (*v > param->max)) {
520 *error = param->error;
521 return -EINVAL;
522 }
523
524 return 0;
525}
526
527struct arg_set {
528 unsigned argc;
529 char **argv;
530};
531
532static char *shift(struct arg_set *as)
533{
534 char *r;
535
536 if (as->argc) {
537 as->argc--;
538 r = *as->argv;
539 as->argv++;
540 return r;
541 }
542
543 return NULL;
544}
545
546static void consume(struct arg_set *as, unsigned n)
547{
548 BUG_ON (as->argc < n);
549 as->argc -= n;
550 as->argv += n;
551}
552
553static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
554 struct dm_target *ti) 508 struct dm_target *ti)
555{ 509{
556 int r; 510 int r;
557 struct path_selector_type *pst; 511 struct path_selector_type *pst;
558 unsigned ps_argc; 512 unsigned ps_argc;
559 513
560 static struct param _params[] = { 514 static struct dm_arg _args[] = {
561 {0, 1024, "invalid number of path selector args"}, 515 {0, 1024, "invalid number of path selector args"},
562 }; 516 };
563 517
564 pst = dm_get_path_selector(shift(as)); 518 pst = dm_get_path_selector(dm_shift_arg(as));
565 if (!pst) { 519 if (!pst) {
566 ti->error = "unknown path selector type"; 520 ti->error = "unknown path selector type";
567 return -EINVAL; 521 return -EINVAL;
568 } 522 }
569 523
570 r = read_param(_params, shift(as), &ps_argc, &ti->error); 524 r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
571 if (r) { 525 if (r) {
572 dm_put_path_selector(pst); 526 dm_put_path_selector(pst);
573 return -EINVAL; 527 return -EINVAL;
574 } 528 }
575 529
576 if (ps_argc > as->argc) {
577 dm_put_path_selector(pst);
578 ti->error = "not enough arguments for path selector";
579 return -EINVAL;
580 }
581
582 r = pst->create(&pg->ps, ps_argc, as->argv); 530 r = pst->create(&pg->ps, ps_argc, as->argv);
583 if (r) { 531 if (r) {
584 dm_put_path_selector(pst); 532 dm_put_path_selector(pst);
@@ -587,12 +535,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
587 } 535 }
588 536
589 pg->ps.type = pst; 537 pg->ps.type = pst;
590 consume(as, ps_argc); 538 dm_consume_args(as, ps_argc);
591 539
592 return 0; 540 return 0;
593} 541}
594 542
595static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, 543static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
596 struct dm_target *ti) 544 struct dm_target *ti)
597{ 545{
598 int r; 546 int r;
@@ -609,7 +557,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
609 if (!p) 557 if (!p)
610 return ERR_PTR(-ENOMEM); 558 return ERR_PTR(-ENOMEM);
611 559
612 r = dm_get_device(ti, shift(as), dm_table_get_mode(ti->table), 560 r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
613 &p->path.dev); 561 &p->path.dev);
614 if (r) { 562 if (r) {
615 ti->error = "error getting device"; 563 ti->error = "error getting device";
@@ -660,16 +608,16 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
660 return ERR_PTR(r); 608 return ERR_PTR(r);
661} 609}
662 610
663static struct priority_group *parse_priority_group(struct arg_set *as, 611static struct priority_group *parse_priority_group(struct dm_arg_set *as,
664 struct multipath *m) 612 struct multipath *m)
665{ 613{
666 static struct param _params[] = { 614 static struct dm_arg _args[] = {
667 {1, 1024, "invalid number of paths"}, 615 {1, 1024, "invalid number of paths"},
668 {0, 1024, "invalid number of selector args"} 616 {0, 1024, "invalid number of selector args"}
669 }; 617 };
670 618
671 int r; 619 int r;
672 unsigned i, nr_selector_args, nr_params; 620 unsigned i, nr_selector_args, nr_args;
673 struct priority_group *pg; 621 struct priority_group *pg;
674 struct dm_target *ti = m->ti; 622 struct dm_target *ti = m->ti;
675 623
@@ -693,26 +641,26 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
693 /* 641 /*
694 * read the paths 642 * read the paths
695 */ 643 */
696 r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error); 644 r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);
697 if (r) 645 if (r)
698 goto bad; 646 goto bad;
699 647
700 r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error); 648 r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);
701 if (r) 649 if (r)
702 goto bad; 650 goto bad;
703 651
704 nr_params = 1 + nr_selector_args; 652 nr_args = 1 + nr_selector_args;
705 for (i = 0; i < pg->nr_pgpaths; i++) { 653 for (i = 0; i < pg->nr_pgpaths; i++) {
706 struct pgpath *pgpath; 654 struct pgpath *pgpath;
707 struct arg_set path_args; 655 struct dm_arg_set path_args;
708 656
709 if (as->argc < nr_params) { 657 if (as->argc < nr_args) {
710 ti->error = "not enough path parameters"; 658 ti->error = "not enough path parameters";
711 r = -EINVAL; 659 r = -EINVAL;
712 goto bad; 660 goto bad;
713 } 661 }
714 662
715 path_args.argc = nr_params; 663 path_args.argc = nr_args;
716 path_args.argv = as->argv; 664 path_args.argv = as->argv;
717 665
718 pgpath = parse_path(&path_args, &pg->ps, ti); 666 pgpath = parse_path(&path_args, &pg->ps, ti);
@@ -723,7 +671,7 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
723 671
724 pgpath->pg = pg; 672 pgpath->pg = pg;
725 list_add_tail(&pgpath->list, &pg->pgpaths); 673 list_add_tail(&pgpath->list, &pg->pgpaths);
726 consume(as, nr_params); 674 dm_consume_args(as, nr_args);
727 } 675 }
728 676
729 return pg; 677 return pg;
@@ -733,28 +681,23 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
733 return ERR_PTR(r); 681 return ERR_PTR(r);
734} 682}
735 683
736static int parse_hw_handler(struct arg_set *as, struct multipath *m) 684static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
737{ 685{
738 unsigned hw_argc; 686 unsigned hw_argc;
739 int ret; 687 int ret;
740 struct dm_target *ti = m->ti; 688 struct dm_target *ti = m->ti;
741 689
742 static struct param _params[] = { 690 static struct dm_arg _args[] = {
743 {0, 1024, "invalid number of hardware handler args"}, 691 {0, 1024, "invalid number of hardware handler args"},
744 }; 692 };
745 693
746 if (read_param(_params, shift(as), &hw_argc, &ti->error)) 694 if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))
747 return -EINVAL; 695 return -EINVAL;
748 696
749 if (!hw_argc) 697 if (!hw_argc)
750 return 0; 698 return 0;
751 699
752 if (hw_argc > as->argc) { 700 m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
753 ti->error = "not enough arguments for hardware handler";
754 return -EINVAL;
755 }
756
757 m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL);
758 request_module("scsi_dh_%s", m->hw_handler_name); 701 request_module("scsi_dh_%s", m->hw_handler_name);
759 if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { 702 if (scsi_dh_handler_exist(m->hw_handler_name) == 0) {
760 ti->error = "unknown hardware handler type"; 703 ti->error = "unknown hardware handler type";
@@ -778,7 +721,7 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m)
778 for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1) 721 for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1)
779 j = sprintf(p, "%s", as->argv[i]); 722 j = sprintf(p, "%s", as->argv[i]);
780 } 723 }
781 consume(as, hw_argc - 1); 724 dm_consume_args(as, hw_argc - 1);
782 725
783 return 0; 726 return 0;
784fail: 727fail:
@@ -787,20 +730,20 @@ fail:
787 return ret; 730 return ret;
788} 731}
789 732
790static int parse_features(struct arg_set *as, struct multipath *m) 733static int parse_features(struct dm_arg_set *as, struct multipath *m)
791{ 734{
792 int r; 735 int r;
793 unsigned argc; 736 unsigned argc;
794 struct dm_target *ti = m->ti; 737 struct dm_target *ti = m->ti;
795 const char *param_name; 738 const char *arg_name;
796 739
797 static struct param _params[] = { 740 static struct dm_arg _args[] = {
798 {0, 5, "invalid number of feature args"}, 741 {0, 5, "invalid number of feature args"},
799 {1, 50, "pg_init_retries must be between 1 and 50"}, 742 {1, 50, "pg_init_retries must be between 1 and 50"},
800 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, 743 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
801 }; 744 };
802 745
803 r = read_param(_params, shift(as), &argc, &ti->error); 746 r = dm_read_arg_group(_args, as, &argc, &ti->error);
804 if (r) 747 if (r)
805 return -EINVAL; 748 return -EINVAL;
806 749
@@ -808,26 +751,24 @@ static int parse_features(struct arg_set *as, struct multipath *m)
808 return 0; 751 return 0;
809 752
810 do { 753 do {
811 param_name = shift(as); 754 arg_name = dm_shift_arg(as);
812 argc--; 755 argc--;
813 756
814 if (!strnicmp(param_name, MESG_STR("queue_if_no_path"))) { 757 if (!strcasecmp(arg_name, "queue_if_no_path")) {
815 r = queue_if_no_path(m, 1, 0); 758 r = queue_if_no_path(m, 1, 0);
816 continue; 759 continue;
817 } 760 }
818 761
819 if (!strnicmp(param_name, MESG_STR("pg_init_retries")) && 762 if (!strcasecmp(arg_name, "pg_init_retries") &&
820 (argc >= 1)) { 763 (argc >= 1)) {
821 r = read_param(_params + 1, shift(as), 764 r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
822 &m->pg_init_retries, &ti->error);
823 argc--; 765 argc--;
824 continue; 766 continue;
825 } 767 }
826 768
827 if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) && 769 if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
828 (argc >= 1)) { 770 (argc >= 1)) {
829 r = read_param(_params + 2, shift(as), 771 r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
830 &m->pg_init_delay_msecs, &ti->error);
831 argc--; 772 argc--;
832 continue; 773 continue;
833 } 774 }
@@ -842,15 +783,15 @@ static int parse_features(struct arg_set *as, struct multipath *m)
842static int multipath_ctr(struct dm_target *ti, unsigned int argc, 783static int multipath_ctr(struct dm_target *ti, unsigned int argc,
843 char **argv) 784 char **argv)
844{ 785{
845 /* target parameters */ 786 /* target arguments */
846 static struct param _params[] = { 787 static struct dm_arg _args[] = {
847 {0, 1024, "invalid number of priority groups"}, 788 {0, 1024, "invalid number of priority groups"},
848 {0, 1024, "invalid initial priority group number"}, 789 {0, 1024, "invalid initial priority group number"},
849 }; 790 };
850 791
851 int r; 792 int r;
852 struct multipath *m; 793 struct multipath *m;
853 struct arg_set as; 794 struct dm_arg_set as;
854 unsigned pg_count = 0; 795 unsigned pg_count = 0;
855 unsigned next_pg_num; 796 unsigned next_pg_num;
856 797
@@ -871,11 +812,11 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
871 if (r) 812 if (r)
872 goto bad; 813 goto bad;
873 814
874 r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error); 815 r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
875 if (r) 816 if (r)
876 goto bad; 817 goto bad;
877 818
878 r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error); 819 r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
879 if (r) 820 if (r)
880 goto bad; 821 goto bad;
881 822
@@ -1505,10 +1446,10 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1505 } 1446 }
1506 1447
1507 if (argc == 1) { 1448 if (argc == 1) {
1508 if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) { 1449 if (!strcasecmp(argv[0], "queue_if_no_path")) {
1509 r = queue_if_no_path(m, 1, 0); 1450 r = queue_if_no_path(m, 1, 0);
1510 goto out; 1451 goto out;
1511 } else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) { 1452 } else if (!strcasecmp(argv[0], "fail_if_no_path")) {
1512 r = queue_if_no_path(m, 0, 0); 1453 r = queue_if_no_path(m, 0, 0);
1513 goto out; 1454 goto out;
1514 } 1455 }
@@ -1519,18 +1460,18 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1519 goto out; 1460 goto out;
1520 } 1461 }
1521 1462
1522 if (!strnicmp(argv[0], MESG_STR("disable_group"))) { 1463 if (!strcasecmp(argv[0], "disable_group")) {
1523 r = bypass_pg_num(m, argv[1], 1); 1464 r = bypass_pg_num(m, argv[1], 1);
1524 goto out; 1465 goto out;
1525 } else if (!strnicmp(argv[0], MESG_STR("enable_group"))) { 1466 } else if (!strcasecmp(argv[0], "enable_group")) {
1526 r = bypass_pg_num(m, argv[1], 0); 1467 r = bypass_pg_num(m, argv[1], 0);
1527 goto out; 1468 goto out;
1528 } else if (!strnicmp(argv[0], MESG_STR("switch_group"))) { 1469 } else if (!strcasecmp(argv[0], "switch_group")) {
1529 r = switch_pg_num(m, argv[1]); 1470 r = switch_pg_num(m, argv[1]);
1530 goto out; 1471 goto out;
1531 } else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) 1472 } else if (!strcasecmp(argv[0], "reinstate_path"))
1532 action = reinstate_path; 1473 action = reinstate_path;
1533 else if (!strnicmp(argv[0], MESG_STR("fail_path"))) 1474 else if (!strcasecmp(argv[0], "fail_path"))
1534 action = fail_path; 1475 action = fail_path;
1535 else { 1476 else {
1536 DMWARN("Unrecognised multipath message received."); 1477 DMWARN("Unrecognised multipath message received.");
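Functionally the multipath message strings are unchanged by the strnicmp/MESG_STR to strcasecmp conversion above; they are simply matched against the whole word. For illustration only (device and path names hypothetical), the message interface is still driven as:

    dmsetup message mpatha 0 queue_if_no_path
    dmsetup message mpatha 0 fail_path /dev/sdc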
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index e5d8904fc8f6..a002dd85db1e 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -8,19 +8,19 @@
8#include <linux/slab.h> 8#include <linux/slab.h>
9 9
10#include "md.h" 10#include "md.h"
11#include "raid1.h"
11#include "raid5.h" 12#include "raid5.h"
12#include "dm.h"
13#include "bitmap.h" 13#include "bitmap.h"
14 14
15#include <linux/device-mapper.h>
16
15#define DM_MSG_PREFIX "raid" 17#define DM_MSG_PREFIX "raid"
16 18
17/* 19/*
18 * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then 20 * The following flags are used by dm-raid.c to set up the array state.
19 * make it so the flag doesn't set anything. 21 * They must be cleared before md_run is called.
20 */ 22 */
21#ifndef MD_SYNC_STATE_FORCED 23#define FirstUse 10 /* rdev flag */
22#define MD_SYNC_STATE_FORCED 0
23#endif
24 24
25struct raid_dev { 25struct raid_dev {
26 /* 26 /*
@@ -43,14 +43,15 @@ struct raid_dev {
43/* 43/*
44 * Flags for rs->print_flags field. 44 * Flags for rs->print_flags field.
45 */ 45 */
46#define DMPF_DAEMON_SLEEP 0x1 46#define DMPF_SYNC 0x1
47#define DMPF_MAX_WRITE_BEHIND 0x2 47#define DMPF_NOSYNC 0x2
48#define DMPF_SYNC 0x4 48#define DMPF_REBUILD 0x4
49#define DMPF_NOSYNC 0x8 49#define DMPF_DAEMON_SLEEP 0x8
50#define DMPF_STRIPE_CACHE 0x10 50#define DMPF_MIN_RECOVERY_RATE 0x10
51#define DMPF_MIN_RECOVERY_RATE 0x20 51#define DMPF_MAX_RECOVERY_RATE 0x20
52#define DMPF_MAX_RECOVERY_RATE 0x40 52#define DMPF_MAX_WRITE_BEHIND 0x40
53 53#define DMPF_STRIPE_CACHE 0x80
54#define DMPF_REGION_SIZE 0X100
54struct raid_set { 55struct raid_set {
55 struct dm_target *ti; 56 struct dm_target *ti;
56 57
@@ -72,6 +73,7 @@ static struct raid_type {
72 const unsigned level; /* RAID level. */ 73 const unsigned level; /* RAID level. */
73 const unsigned algorithm; /* RAID algorithm. */ 74 const unsigned algorithm; /* RAID algorithm. */
74} raid_types[] = { 75} raid_types[] = {
76 {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */},
75 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, 77 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
76 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, 78 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
77 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, 79 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
@@ -105,7 +107,8 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
105 } 107 }
106 108
107 sectors_per_dev = ti->len; 109 sectors_per_dev = ti->len;
108 if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { 110 if ((raid_type->level > 1) &&
111 sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
109 ti->error = "Target length not divisible by number of data devices"; 112 ti->error = "Target length not divisible by number of data devices";
110 return ERR_PTR(-EINVAL); 113 return ERR_PTR(-EINVAL);
111 } 114 }
@@ -147,9 +150,16 @@ static void context_free(struct raid_set *rs)
147{ 150{
148 int i; 151 int i;
149 152
150 for (i = 0; i < rs->md.raid_disks; i++) 153 for (i = 0; i < rs->md.raid_disks; i++) {
154 if (rs->dev[i].meta_dev)
155 dm_put_device(rs->ti, rs->dev[i].meta_dev);
156 if (rs->dev[i].rdev.sb_page)
157 put_page(rs->dev[i].rdev.sb_page);
158 rs->dev[i].rdev.sb_page = NULL;
159 rs->dev[i].rdev.sb_loaded = 0;
151 if (rs->dev[i].data_dev) 160 if (rs->dev[i].data_dev)
152 dm_put_device(rs->ti, rs->dev[i].data_dev); 161 dm_put_device(rs->ti, rs->dev[i].data_dev);
162 }
153 163
154 kfree(rs); 164 kfree(rs);
155} 165}
@@ -159,7 +169,16 @@ static void context_free(struct raid_set *rs)
159 * <meta_dev>: meta device name or '-' if missing 169 * <meta_dev>: meta device name or '-' if missing
160 * <data_dev>: data device name or '-' if missing 170 * <data_dev>: data device name or '-' if missing
161 * 171 *
162 * This code parses those words. 172 * The following are permitted:
173 * - -
174 * - <data_dev>
175 * <meta_dev> <data_dev>
176 *
177 * The following is not allowed:
178 * <meta_dev> -
179 *
180 * This code parses those words. If there is a failure,
181 * the caller must use context_free to unwind the operations.
163 */ 182 */
164static int dev_parms(struct raid_set *rs, char **argv) 183static int dev_parms(struct raid_set *rs, char **argv)
165{ 184{
@@ -182,8 +201,16 @@ static int dev_parms(struct raid_set *rs, char **argv)
182 rs->dev[i].rdev.mddev = &rs->md; 201 rs->dev[i].rdev.mddev = &rs->md;
183 202
184 if (strcmp(argv[0], "-")) { 203 if (strcmp(argv[0], "-")) {
185 rs->ti->error = "Metadata devices not supported"; 204 ret = dm_get_device(rs->ti, argv[0],
186 return -EINVAL; 205 dm_table_get_mode(rs->ti->table),
206 &rs->dev[i].meta_dev);
207 rs->ti->error = "RAID metadata device lookup failure";
208 if (ret)
209 return ret;
210
211 rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
212 if (!rs->dev[i].rdev.sb_page)
213 return -ENOMEM;
187 } 214 }
188 215
189 if (!strcmp(argv[1], "-")) { 216 if (!strcmp(argv[1], "-")) {
@@ -193,6 +220,10 @@ static int dev_parms(struct raid_set *rs, char **argv)
193 return -EINVAL; 220 return -EINVAL;
194 } 221 }
195 222
223 rs->ti->error = "No data device supplied with metadata device";
224 if (rs->dev[i].meta_dev)
225 return -EINVAL;
226
196 continue; 227 continue;
197 } 228 }
198 229
@@ -204,6 +235,10 @@ static int dev_parms(struct raid_set *rs, char **argv)
204 return ret; 235 return ret;
205 } 236 }
206 237
238 if (rs->dev[i].meta_dev) {
239 metadata_available = 1;
240 rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
241 }
207 rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev; 242 rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
208 list_add(&rs->dev[i].rdev.same_set, &rs->md.disks); 243 list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
209 if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) 244 if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
@@ -235,33 +270,109 @@ static int dev_parms(struct raid_set *rs, char **argv)
235} 270}
236 271
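The <meta_dev>/<data_dev> rules documented above dev_parms() boil down to one predicate: a metadata device may only appear together with a data device. A standalone sketch of that check, for illustration only (dev_parms() itself interleaves it with dm_get_device() calls and superblock page allocation):

#include <stdbool.h>
#include <string.h>

/* Illustrative helper, not part of the patch: true if a
 * "<meta_dev> <data_dev>" word pair is one of the accepted forms. */
static bool dev_pair_is_valid(const char *meta, const char *data)
{
        bool has_meta = strcmp(meta, "-") != 0;
        bool has_data = strcmp(data, "-") != 0;

        /* "- -", "- <data_dev>" and "<meta_dev> <data_dev>" are fine;
         * only "<meta_dev> -" is rejected. */
        return has_data || !has_meta;
}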
237/* 272/*
273 * validate_region_size
274 * @rs
275 * @region_size: region size in sectors. If 0, pick a size (4MiB default).
276 *
277 * Set rs->md.bitmap_info.chunksize (which really refers to 'region size').
278 * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap.
279 *
280 * Returns: 0 on success, -EINVAL on failure.
281 */
282static int validate_region_size(struct raid_set *rs, unsigned long region_size)
283{
284 unsigned long min_region_size = rs->ti->len / (1 << 21);
285
286 if (!region_size) {
287 /*
288 * Choose a reasonable default. All figures in sectors.
289 */
290 if (min_region_size > (1 << 13)) {
 291 region_size = min_region_size;
 292 DMINFO("Choosing default region size of %lu sectors",
 293 region_size);
294 } else {
295 DMINFO("Choosing default region size of 4MiB");
296 region_size = 1 << 13; /* sectors */
297 }
298 } else {
299 /*
300 * Validate user-supplied value.
301 */
302 if (region_size > rs->ti->len) {
303 rs->ti->error = "Supplied region size is too large";
304 return -EINVAL;
305 }
306
307 if (region_size < min_region_size) {
308 DMERR("Supplied region_size (%lu sectors) below minimum (%lu)",
309 region_size, min_region_size);
310 rs->ti->error = "Supplied region size is too small";
311 return -EINVAL;
312 }
313
314 if (!is_power_of_2(region_size)) {
315 rs->ti->error = "Region size is not a power of 2";
316 return -EINVAL;
317 }
318
319 if (region_size < rs->md.chunk_sectors) {
320 rs->ti->error = "Region size is smaller than the chunk size";
321 return -EINVAL;
322 }
323 }
324
325 /*
326 * Convert sectors to bytes.
327 */
328 rs->md.bitmap_info.chunksize = (region_size << 9);
329
330 return 0;
331}
332
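To make the 2^21 region-count limit concrete: a 1 TiB target is 2^31 sectors, so min_region_size is 2^31 / 2^21 = 1024 sectors and the 8192-sector (4 MiB) default wins; only targets larger than 8 TiB push the minimum above that default. A userspace sketch of the same default selection (illustrative only, names are made up):

#include <stdint.h>
#include <stdio.h>

/* Mirrors the default-selection branch of validate_region_size(). */
static uint64_t default_region_size(uint64_t target_sectors)
{
        uint64_t min_region_size = target_sectors / (1ULL << 21);

        return (min_region_size > (1 << 13)) ? min_region_size
                                             : (1 << 13); /* 4 MiB in sectors */
}

int main(void)
{
        /* 1 TiB target: prints 8192, i.e. 4 MiB regions. */
        printf("%llu\n",
               (unsigned long long)default_region_size(1ULL << 31));
        return 0;
}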
333/*
238 * Possible arguments are... 334 * Possible arguments are...
239 * RAID456:
240 * <chunk_size> [optional_args] 335 * <chunk_size> [optional_args]
241 * 336 *
242 * Optional args: 337 * Argument definitions
243 * [[no]sync] Force or prevent recovery of the entire array 338 * <chunk_size> The number of sectors per disk that
339 * will form the "stripe"
340 * [[no]sync] Force or prevent recovery of the
341 * entire array
244 * [rebuild <idx>] Rebuild the drive indicated by the index 342 * [rebuild <idx>] Rebuild the drive indicated by the index
245 * [daemon_sleep <ms>] Time between bitmap daemon work to clear bits 343 * [daemon_sleep <ms>] Time between bitmap daemon work to
344 * clear bits
246 * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization 345 * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization
247 * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization 346 * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization
347 * [write_mostly <idx>] Indicate a write mostly drive via index
248 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) 348 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
249 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs 349 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs
350 * [region_size <sectors>] Defines granularity of bitmap
250 */ 351 */
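As an illustration (not taken from the patch), a plausible <raid_params> string for a RAID1 set would be "0 rebuild 1 write_mostly 2 region_size 8192", i.e. seven parameter words: the chunk size is ignored for RAID1 (hence the leading 0), device 1 is to be rebuilt, device 2 is marked write-mostly, and the bitmap granularity is 8192 sectors (4 MiB).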
251static int parse_raid_params(struct raid_set *rs, char **argv, 352static int parse_raid_params(struct raid_set *rs, char **argv,
252 unsigned num_raid_params) 353 unsigned num_raid_params)
253{ 354{
254 unsigned i, rebuild_cnt = 0; 355 unsigned i, rebuild_cnt = 0;
255 unsigned long value; 356 unsigned long value, region_size = 0;
256 char *key; 357 char *key;
257 358
258 /* 359 /*
259 * First, parse the in-order required arguments 360 * First, parse the in-order required arguments
361 * "chunk_size" is the only argument of this type.
260 */ 362 */
261 if ((strict_strtoul(argv[0], 10, &value) < 0) || 363 if ((strict_strtoul(argv[0], 10, &value) < 0)) {
262 !is_power_of_2(value) || (value < 8)) {
263 rs->ti->error = "Bad chunk size"; 364 rs->ti->error = "Bad chunk size";
264 return -EINVAL; 365 return -EINVAL;
366 } else if (rs->raid_type->level == 1) {
367 if (value)
368 DMERR("Ignoring chunk size parameter for RAID 1");
369 value = 0;
370 } else if (!is_power_of_2(value)) {
371 rs->ti->error = "Chunk size must be a power of 2";
372 return -EINVAL;
373 } else if (value < 8) {
374 rs->ti->error = "Chunk size value is too small";
375 return -EINVAL;
265 } 376 }
266 377
267 rs->md.new_chunk_sectors = rs->md.chunk_sectors = value; 378 rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
@@ -269,22 +380,39 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
269 num_raid_params--; 380 num_raid_params--;
270 381
271 /* 382 /*
272 * Second, parse the unordered optional arguments 383 * We set each individual device as In_sync with a completed
384 * 'recovery_offset'. If there has been a device failure or
385 * replacement then one of the following cases applies:
386 *
387 * 1) User specifies 'rebuild'.
388 * - Device is reset when param is read.
389 * 2) A new device is supplied.
390 * - No matching superblock found, resets device.
391 * 3) Device failure was transient and returns on reload.
392 * - Failure noticed, resets device for bitmap replay.
393 * 4) Device hadn't completed recovery after previous failure.
394 * - Superblock is read and overrides recovery_offset.
395 *
396 * What is found in the superblocks of the devices is always
397 * authoritative, unless 'rebuild' or '[no]sync' was specified.
273 */ 398 */
274 for (i = 0; i < rs->md.raid_disks; i++) 399 for (i = 0; i < rs->md.raid_disks; i++) {
275 set_bit(In_sync, &rs->dev[i].rdev.flags); 400 set_bit(In_sync, &rs->dev[i].rdev.flags);
401 rs->dev[i].rdev.recovery_offset = MaxSector;
402 }
276 403
404 /*
405 * Second, parse the unordered optional arguments
406 */
277 for (i = 0; i < num_raid_params; i++) { 407 for (i = 0; i < num_raid_params; i++) {
278 if (!strcmp(argv[i], "nosync")) { 408 if (!strcasecmp(argv[i], "nosync")) {
279 rs->md.recovery_cp = MaxSector; 409 rs->md.recovery_cp = MaxSector;
280 rs->print_flags |= DMPF_NOSYNC; 410 rs->print_flags |= DMPF_NOSYNC;
281 rs->md.flags |= MD_SYNC_STATE_FORCED;
282 continue; 411 continue;
283 } 412 }
284 if (!strcmp(argv[i], "sync")) { 413 if (!strcasecmp(argv[i], "sync")) {
285 rs->md.recovery_cp = 0; 414 rs->md.recovery_cp = 0;
286 rs->print_flags |= DMPF_SYNC; 415 rs->print_flags |= DMPF_SYNC;
287 rs->md.flags |= MD_SYNC_STATE_FORCED;
288 continue; 416 continue;
289 } 417 }
290 418
@@ -300,9 +428,13 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
300 return -EINVAL; 428 return -EINVAL;
301 } 429 }
302 430
303 if (!strcmp(key, "rebuild")) { 431 if (!strcasecmp(key, "rebuild")) {
304 if (++rebuild_cnt > rs->raid_type->parity_devs) { 432 rebuild_cnt++;
305 rs->ti->error = "Too many rebuild drives given"; 433 if (((rs->raid_type->level != 1) &&
434 (rebuild_cnt > rs->raid_type->parity_devs)) ||
435 ((rs->raid_type->level == 1) &&
436 (rebuild_cnt > (rs->md.raid_disks - 1)))) {
437 rs->ti->error = "Too many rebuild devices specified for given RAID type";
306 return -EINVAL; 438 return -EINVAL;
307 } 439 }
308 if (value > rs->md.raid_disks) { 440 if (value > rs->md.raid_disks) {
@@ -311,7 +443,22 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
311 } 443 }
312 clear_bit(In_sync, &rs->dev[value].rdev.flags); 444 clear_bit(In_sync, &rs->dev[value].rdev.flags);
313 rs->dev[value].rdev.recovery_offset = 0; 445 rs->dev[value].rdev.recovery_offset = 0;
314 } else if (!strcmp(key, "max_write_behind")) { 446 rs->print_flags |= DMPF_REBUILD;
447 } else if (!strcasecmp(key, "write_mostly")) {
448 if (rs->raid_type->level != 1) {
449 rs->ti->error = "write_mostly option is only valid for RAID1";
450 return -EINVAL;
451 }
452 if (value > rs->md.raid_disks) {
453 rs->ti->error = "Invalid write_mostly drive index given";
454 return -EINVAL;
455 }
456 set_bit(WriteMostly, &rs->dev[value].rdev.flags);
457 } else if (!strcasecmp(key, "max_write_behind")) {
458 if (rs->raid_type->level != 1) {
459 rs->ti->error = "max_write_behind option is only valid for RAID1";
460 return -EINVAL;
461 }
315 rs->print_flags |= DMPF_MAX_WRITE_BEHIND; 462 rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
316 463
317 /* 464 /*
@@ -324,14 +471,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
324 return -EINVAL; 471 return -EINVAL;
325 } 472 }
326 rs->md.bitmap_info.max_write_behind = value; 473 rs->md.bitmap_info.max_write_behind = value;
327 } else if (!strcmp(key, "daemon_sleep")) { 474 } else if (!strcasecmp(key, "daemon_sleep")) {
328 rs->print_flags |= DMPF_DAEMON_SLEEP; 475 rs->print_flags |= DMPF_DAEMON_SLEEP;
329 if (!value || (value > MAX_SCHEDULE_TIMEOUT)) { 476 if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
330 rs->ti->error = "daemon sleep period out of range"; 477 rs->ti->error = "daemon sleep period out of range";
331 return -EINVAL; 478 return -EINVAL;
332 } 479 }
333 rs->md.bitmap_info.daemon_sleep = value; 480 rs->md.bitmap_info.daemon_sleep = value;
334 } else if (!strcmp(key, "stripe_cache")) { 481 } else if (!strcasecmp(key, "stripe_cache")) {
335 rs->print_flags |= DMPF_STRIPE_CACHE; 482 rs->print_flags |= DMPF_STRIPE_CACHE;
336 483
337 /* 484 /*
@@ -348,20 +495,23 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
348 rs->ti->error = "Bad stripe_cache size"; 495 rs->ti->error = "Bad stripe_cache size";
349 return -EINVAL; 496 return -EINVAL;
350 } 497 }
351 } else if (!strcmp(key, "min_recovery_rate")) { 498 } else if (!strcasecmp(key, "min_recovery_rate")) {
352 rs->print_flags |= DMPF_MIN_RECOVERY_RATE; 499 rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
353 if (value > INT_MAX) { 500 if (value > INT_MAX) {
354 rs->ti->error = "min_recovery_rate out of range"; 501 rs->ti->error = "min_recovery_rate out of range";
355 return -EINVAL; 502 return -EINVAL;
356 } 503 }
357 rs->md.sync_speed_min = (int)value; 504 rs->md.sync_speed_min = (int)value;
358 } else if (!strcmp(key, "max_recovery_rate")) { 505 } else if (!strcasecmp(key, "max_recovery_rate")) {
359 rs->print_flags |= DMPF_MAX_RECOVERY_RATE; 506 rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
360 if (value > INT_MAX) { 507 if (value > INT_MAX) {
361 rs->ti->error = "max_recovery_rate out of range"; 508 rs->ti->error = "max_recovery_rate out of range";
362 return -EINVAL; 509 return -EINVAL;
363 } 510 }
364 rs->md.sync_speed_max = (int)value; 511 rs->md.sync_speed_max = (int)value;
512 } else if (!strcasecmp(key, "region_size")) {
513 rs->print_flags |= DMPF_REGION_SIZE;
514 region_size = value;
365 } else { 515 } else {
366 DMERR("Unable to parse RAID parameter: %s", key); 516 DMERR("Unable to parse RAID parameter: %s", key);
367 rs->ti->error = "Unable to parse RAID parameters"; 517 rs->ti->error = "Unable to parse RAID parameters";
@@ -369,6 +519,19 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
369 } 519 }
370 } 520 }
371 521
522 if (validate_region_size(rs, region_size))
523 return -EINVAL;
524
525 if (rs->md.chunk_sectors)
526 rs->ti->split_io = rs->md.chunk_sectors;
527 else
528 rs->ti->split_io = region_size;
529
530 if (rs->md.chunk_sectors)
531 rs->ti->split_io = rs->md.chunk_sectors;
532 else
533 rs->ti->split_io = region_size;
534
372 /* Assume there are no metadata devices until the drives are parsed */ 535 /* Assume there are no metadata devices until the drives are parsed */
373 rs->md.persistent = 0; 536 rs->md.persistent = 0;
374 rs->md.external = 1; 537 rs->md.external = 1;
@@ -387,17 +550,351 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
387{ 550{
388 struct raid_set *rs = container_of(cb, struct raid_set, callbacks); 551 struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
389 552
553 if (rs->raid_type->level == 1)
554 return md_raid1_congested(&rs->md, bits);
555
390 return md_raid5_congested(&rs->md, bits); 556 return md_raid5_congested(&rs->md, bits);
391} 557}
392 558
393/* 559/*
560 * This structure is never routinely used by userspace, unlike md superblocks.
561 * Devices with this superblock should only ever be accessed via device-mapper.
562 */
563#define DM_RAID_MAGIC 0x64526D44
564struct dm_raid_superblock {
565 __le32 magic; /* "DmRd" */
566 __le32 features; /* Used to indicate possible future changes */
567
568 __le32 num_devices; /* Number of devices in this array. (Max 64) */
569 __le32 array_position; /* The position of this drive in the array */
570
571 __le64 events; /* Incremented by md when superblock updated */
572 __le64 failed_devices; /* Bit field of devices to indicate failures */
573
574 /*
575 * This offset tracks the progress of the repair or replacement of
576 * an individual drive.
577 */
578 __le64 disk_recovery_offset;
579
580 /*
581 * This offset tracks the progress of the initial array
582 * synchronisation/parity calculation.
583 */
584 __le64 array_resync_offset;
585
586 /*
587 * RAID characteristics
588 */
589 __le32 level;
590 __le32 layout;
591 __le32 stripe_sectors;
592
593 __u8 pad[452]; /* Round struct to 512 bytes. */
594 /* Always set to 0 when writing. */
595} __packed;
596
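A short kernel-style sketch of how these on-disk fields are meant to be consumed (illustrative only, assuming the usual device-mapper headers; the real checks live in super_load() and super_init_validation() below). Everything is little-endian on disk, so each field is converted before use:

/* Illustrative, not part of the patch: basic plausibility check of a
 * superblock page that has already been read from a metadata device. */
static bool dm_raid_sb_plausible(struct dm_raid_superblock *sb)
{
        u32 nr = le32_to_cpu(sb->num_devices);

        return le32_to_cpu(sb->magic) == DM_RAID_MAGIC &&
               le32_to_cpu(sb->features) == 0 &&   /* no features defined yet */
               nr >= 1 && nr <= 64 &&              /* failed_devices is a 64-bit mask */
               le32_to_cpu(sb->array_position) < nr;
}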
597static int read_disk_sb(mdk_rdev_t *rdev, int size)
598{
599 BUG_ON(!rdev->sb_page);
600
601 if (rdev->sb_loaded)
602 return 0;
603
604 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
605 DMERR("Failed to read device superblock");
606 return -EINVAL;
607 }
608
609 rdev->sb_loaded = 1;
610
611 return 0;
612}
613
614static void super_sync(mddev_t *mddev, mdk_rdev_t *rdev)
615{
616 mdk_rdev_t *r, *t;
617 uint64_t failed_devices;
618 struct dm_raid_superblock *sb;
619
620 sb = page_address(rdev->sb_page);
621 failed_devices = le64_to_cpu(sb->failed_devices);
622
623 rdev_for_each(r, t, mddev)
624 if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
625 failed_devices |= (1ULL << r->raid_disk);
626
627 memset(sb, 0, sizeof(*sb));
628
629 sb->magic = cpu_to_le32(DM_RAID_MAGIC);
630 sb->features = cpu_to_le32(0); /* No features yet */
631
632 sb->num_devices = cpu_to_le32(mddev->raid_disks);
633 sb->array_position = cpu_to_le32(rdev->raid_disk);
634
635 sb->events = cpu_to_le64(mddev->events);
636 sb->failed_devices = cpu_to_le64(failed_devices);
637
638 sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
639 sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);
640
641 sb->level = cpu_to_le32(mddev->level);
642 sb->layout = cpu_to_le32(mddev->layout);
643 sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
644}
645
646/*
647 * super_load
648 *
649 * This function creates a superblock if one is not found on the device
650 * and will decide which superblock to use if there's a choice.
651 *
652 * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
653 */
654static int super_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev)
655{
656 int ret;
657 struct dm_raid_superblock *sb;
658 struct dm_raid_superblock *refsb;
659 uint64_t events_sb, events_refsb;
660
661 rdev->sb_start = 0;
662 rdev->sb_size = sizeof(*sb);
663
664 ret = read_disk_sb(rdev, rdev->sb_size);
665 if (ret)
666 return ret;
667
668 sb = page_address(rdev->sb_page);
669 if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) {
670 super_sync(rdev->mddev, rdev);
671
672 set_bit(FirstUse, &rdev->flags);
673
674 /* Force writing of superblocks to disk */
675 set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
676
677 /* Any superblock is better than none, choose that if given */
678 return refdev ? 0 : 1;
679 }
680
681 if (!refdev)
682 return 1;
683
684 events_sb = le64_to_cpu(sb->events);
685
686 refsb = page_address(refdev->sb_page);
687 events_refsb = le64_to_cpu(refsb->events);
688
689 return (events_sb > events_refsb) ? 1 : 0;
690}
691
692static int super_init_validation(mddev_t *mddev, mdk_rdev_t *rdev)
693{
694 int role;
695 struct raid_set *rs = container_of(mddev, struct raid_set, md);
696 uint64_t events_sb;
697 uint64_t failed_devices;
698 struct dm_raid_superblock *sb;
699 uint32_t new_devs = 0;
700 uint32_t rebuilds = 0;
701 mdk_rdev_t *r, *t;
702 struct dm_raid_superblock *sb2;
703
704 sb = page_address(rdev->sb_page);
705 events_sb = le64_to_cpu(sb->events);
706 failed_devices = le64_to_cpu(sb->failed_devices);
707
708 /*
709 * Initialise to 1 if this is a new superblock.
710 */
711 mddev->events = events_sb ? : 1;
712
713 /*
714 * Reshaping is not currently allowed
715 */
716 if ((le32_to_cpu(sb->level) != mddev->level) ||
717 (le32_to_cpu(sb->layout) != mddev->layout) ||
718 (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
719 DMERR("Reshaping arrays not yet supported.");
720 return -EINVAL;
721 }
722
723 /* We can only change the number of devices in RAID1 right now */
724 if ((rs->raid_type->level != 1) &&
725 (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
726 DMERR("Reshaping arrays not yet supported.");
727 return -EINVAL;
728 }
729
730 if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)))
731 mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
732
733 /*
734 * During load, we set FirstUse if a new superblock was written.
735 * There are two reasons we might not have a superblock:
736 * 1) The array is brand new - in which case, all of the
737 * devices must have their In_sync bit set. Also,
738 * recovery_cp must be 0, unless forced.
739 * 2) This is a new device being added to an old array
740 * and the new device needs to be rebuilt - in which
741 * case the In_sync bit will /not/ be set and
742 * recovery_cp must be MaxSector.
743 */
744 rdev_for_each(r, t, mddev) {
745 if (!test_bit(In_sync, &r->flags)) {
746 if (!test_bit(FirstUse, &r->flags))
747 DMERR("Superblock area of "
748 "rebuild device %d should have been "
749 "cleared.", r->raid_disk);
750 set_bit(FirstUse, &r->flags);
751 rebuilds++;
752 } else if (test_bit(FirstUse, &r->flags))
753 new_devs++;
754 }
755
756 if (!rebuilds) {
757 if (new_devs == mddev->raid_disks) {
758 DMINFO("Superblocks created for new array");
759 set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
760 } else if (new_devs) {
761 DMERR("New device injected "
762 "into existing array without 'rebuild' "
763 "parameter specified");
764 return -EINVAL;
765 }
766 } else if (new_devs) {
767 DMERR("'rebuild' devices cannot be "
768 "injected into an array with other first-time devices");
769 return -EINVAL;
770 } else if (mddev->recovery_cp != MaxSector) {
771 DMERR("'rebuild' specified while array is not in-sync");
772 return -EINVAL;
773 }
774
775 /*
776 * Now we set the Faulty bit for those devices that are
777 * recorded in the superblock as failed.
778 */
779 rdev_for_each(r, t, mddev) {
780 if (!r->sb_page)
781 continue;
782 sb2 = page_address(r->sb_page);
783 sb2->failed_devices = 0;
784
785 /*
786 * Check for any device re-ordering.
787 */
788 if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) {
789 role = le32_to_cpu(sb2->array_position);
790 if (role != r->raid_disk) {
791 if (rs->raid_type->level != 1) {
792 rs->ti->error = "Cannot change device "
793 "positions in RAID array";
794 return -EINVAL;
795 }
796 DMINFO("RAID1 device #%d now at position #%d",
797 role, r->raid_disk);
798 }
799
800 /*
801 * Partial recovery is performed on
802 * returning failed devices.
803 */
804 if (failed_devices & (1 << role))
805 set_bit(Faulty, &r->flags);
806 }
807 }
808
809 return 0;
810}
811
812static int super_validate(mddev_t *mddev, mdk_rdev_t *rdev)
813{
814 struct dm_raid_superblock *sb = page_address(rdev->sb_page);
815
816 /*
817 * If mddev->events is not set, we know we have not yet initialized
818 * the array.
819 */
820 if (!mddev->events && super_init_validation(mddev, rdev))
821 return -EINVAL;
822
823 mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
824 rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
825 if (!test_bit(FirstUse, &rdev->flags)) {
826 rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
827 if (rdev->recovery_offset != MaxSector)
828 clear_bit(In_sync, &rdev->flags);
829 }
830
831 /*
832 * If a device comes back, set it as not In_sync and no longer faulty.
833 */
834 if (test_bit(Faulty, &rdev->flags)) {
835 clear_bit(Faulty, &rdev->flags);
836 clear_bit(In_sync, &rdev->flags);
837 rdev->saved_raid_disk = rdev->raid_disk;
838 rdev->recovery_offset = 0;
839 }
840
841 clear_bit(FirstUse, &rdev->flags);
842
843 return 0;
844}
845
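A concrete instance of the Faulty branch above (illustrative): a RAID1 leg that dropped out and reappears at the next table load is still recorded as failed in the superblock, so it is flipped back to usable but out-of-sync, with recovery_offset 0 and its old slot kept in saved_raid_disk, letting md recover it (via the write-intent bitmap where possible) rather than dropping it permanently.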
846/*
847 * Analyse superblocks and select the freshest.
848 */
849static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
850{
851 int ret;
852 mdk_rdev_t *rdev, *freshest, *tmp;
853 mddev_t *mddev = &rs->md;
854
855 freshest = NULL;
856 rdev_for_each(rdev, tmp, mddev) {
857 if (!rdev->meta_bdev)
858 continue;
859
860 ret = super_load(rdev, freshest);
861
862 switch (ret) {
863 case 1:
864 freshest = rdev;
865 break;
866 case 0:
867 break;
868 default:
869 ti->error = "Failed to load superblock";
870 return ret;
871 }
872 }
873
874 if (!freshest)
875 return 0;
876
877 /*
878 * Validation of the freshest device provides the source of
879 * validation for the remaining devices.
880 */
881 ti->error = "Unable to assemble array: Invalid superblocks";
882 if (super_validate(mddev, freshest))
883 return -EINVAL;
884
885 rdev_for_each(rdev, tmp, mddev)
886 if ((rdev != freshest) && super_validate(mddev, rdev))
887 return -EINVAL;
888
889 return 0;
890}
891
892/*
394 * Construct a RAID4/5/6 mapping: 893 * Construct a RAID4/5/6 mapping:
395 * Args: 894 * Args:
396 * <raid_type> <#raid_params> <raid_params> \ 895 * <raid_type> <#raid_params> <raid_params> \
397 * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> } 896 * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
398 * 897 *
399 * ** metadata devices are not supported yet, use '-' instead **
400 *
401 * <raid_params> varies by <raid_type>. See 'parse_raid_params' for 898 * <raid_params> varies by <raid_type>. See 'parse_raid_params' for
402 * details on possible <raid_params>. 899 * details on possible <raid_params>.
403 */ 900 */
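For illustration only (device names and sizes are invented, not part of this patch), a two-way RAID1 using the metadata-device and region_size support added here could be built from a table line such as:

    0 1960893648 raid raid1 3 0 region_size 8192 \
        2 /dev/sdb1 /dev/sdb2 /dev/sdc1 /dev/sdc2

Here 3 is <#raid_params> covering "0 region_size 8192" (the chunk size is ignored for RAID1), 2 is <#raid_devs>, and each RAID device is a <meta_dev> <data_dev> pair; '-' would stand in for an absent metadata device.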
@@ -465,8 +962,12 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
465 if (ret) 962 if (ret)
466 goto bad; 963 goto bad;
467 964
965 rs->md.sync_super = super_sync;
966 ret = analyse_superblocks(ti, rs);
967 if (ret)
968 goto bad;
969
468 INIT_WORK(&rs->md.event_work, do_table_event); 970 INIT_WORK(&rs->md.event_work, do_table_event);
469 ti->split_io = rs->md.chunk_sectors;
470 ti->private = rs; 971 ti->private = rs;
471 972
472 mutex_lock(&rs->md.reconfig_mutex); 973 mutex_lock(&rs->md.reconfig_mutex);
@@ -482,6 +983,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
482 rs->callbacks.congested_fn = raid_is_congested; 983 rs->callbacks.congested_fn = raid_is_congested;
483 dm_table_add_target_callbacks(ti->table, &rs->callbacks); 984 dm_table_add_target_callbacks(ti->table, &rs->callbacks);
484 985
986 mddev_suspend(&rs->md);
485 return 0; 987 return 0;
486 988
487bad: 989bad:
@@ -546,12 +1048,17 @@ static int raid_status(struct dm_target *ti, status_type_t type,
546 break; 1048 break;
547 case STATUSTYPE_TABLE: 1049 case STATUSTYPE_TABLE:
548 /* The string you would use to construct this array */ 1050 /* The string you would use to construct this array */
549 for (i = 0; i < rs->md.raid_disks; i++) 1051 for (i = 0; i < rs->md.raid_disks; i++) {
550 if (rs->dev[i].data_dev && 1052 if ((rs->print_flags & DMPF_REBUILD) &&
1053 rs->dev[i].data_dev &&
551 !test_bit(In_sync, &rs->dev[i].rdev.flags)) 1054 !test_bit(In_sync, &rs->dev[i].rdev.flags))
552 raid_param_cnt++; /* for rebuilds */ 1055 raid_param_cnt += 2; /* for rebuilds */
1056 if (rs->dev[i].data_dev &&
1057 test_bit(WriteMostly, &rs->dev[i].rdev.flags))
1058 raid_param_cnt += 2;
1059 }
553 1060
554 raid_param_cnt += (hweight64(rs->print_flags) * 2); 1061 raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2);
555 if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)) 1062 if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
556 raid_param_cnt--; 1063 raid_param_cnt--;
557 1064
@@ -565,7 +1072,8 @@ static int raid_status(struct dm_target *ti, status_type_t type,
565 DMEMIT(" nosync"); 1072 DMEMIT(" nosync");
566 1073
567 for (i = 0; i < rs->md.raid_disks; i++) 1074 for (i = 0; i < rs->md.raid_disks; i++)
568 if (rs->dev[i].data_dev && 1075 if ((rs->print_flags & DMPF_REBUILD) &&
1076 rs->dev[i].data_dev &&
569 !test_bit(In_sync, &rs->dev[i].rdev.flags)) 1077 !test_bit(In_sync, &rs->dev[i].rdev.flags))
570 DMEMIT(" rebuild %u", i); 1078 DMEMIT(" rebuild %u", i);
571 1079
@@ -579,6 +1087,11 @@ static int raid_status(struct dm_target *ti, status_type_t type,
579 if (rs->print_flags & DMPF_MAX_RECOVERY_RATE) 1087 if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
580 DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max); 1088 DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
581 1089
1090 for (i = 0; i < rs->md.raid_disks; i++)
1091 if (rs->dev[i].data_dev &&
1092 test_bit(WriteMostly, &rs->dev[i].rdev.flags))
1093 DMEMIT(" write_mostly %u", i);
1094
582 if (rs->print_flags & DMPF_MAX_WRITE_BEHIND) 1095 if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
583 DMEMIT(" max_write_behind %lu", 1096 DMEMIT(" max_write_behind %lu",
584 rs->md.bitmap_info.max_write_behind); 1097 rs->md.bitmap_info.max_write_behind);
@@ -591,9 +1104,16 @@ static int raid_status(struct dm_target *ti, status_type_t type,
591 conf ? conf->max_nr_stripes * 2 : 0); 1104 conf ? conf->max_nr_stripes * 2 : 0);
592 } 1105 }
593 1106
1107 if (rs->print_flags & DMPF_REGION_SIZE)
1108 DMEMIT(" region_size %lu",
1109 rs->md.bitmap_info.chunksize >> 9);
1110
594 DMEMIT(" %d", rs->md.raid_disks); 1111 DMEMIT(" %d", rs->md.raid_disks);
595 for (i = 0; i < rs->md.raid_disks; i++) { 1112 for (i = 0; i < rs->md.raid_disks; i++) {
596 DMEMIT(" -"); /* metadata device */ 1113 if (rs->dev[i].meta_dev)
1114 DMEMIT(" %s", rs->dev[i].meta_dev->name);
1115 else
1116 DMEMIT(" -");
597 1117
598 if (rs->dev[i].data_dev) 1118 if (rs->dev[i].data_dev)
599 DMEMIT(" %s", rs->dev[i].data_dev->name); 1119 DMEMIT(" %s", rs->dev[i].data_dev->name);
@@ -650,12 +1170,13 @@ static void raid_resume(struct dm_target *ti)
650{ 1170{
651 struct raid_set *rs = ti->private; 1171 struct raid_set *rs = ti->private;
652 1172
1173 bitmap_load(&rs->md);
653 mddev_resume(&rs->md); 1174 mddev_resume(&rs->md);
654} 1175}
655 1176
656static struct target_type raid_target = { 1177static struct target_type raid_target = {
657 .name = "raid", 1178 .name = "raid",
658 .version = {1, 0, 0}, 1179 .version = {1, 1, 0},
659 .module = THIS_MODULE, 1180 .module = THIS_MODULE,
660 .ctr = raid_ctr, 1181 .ctr = raid_ctr,
661 .dtr = raid_dtr, 1182 .dtr = raid_dtr,
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 135c2f1fdbfc..d1f1d7017103 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -58,25 +58,30 @@
58#define NUM_SNAPSHOT_HDR_CHUNKS 1 58#define NUM_SNAPSHOT_HDR_CHUNKS 1
59 59
60struct disk_header { 60struct disk_header {
61 uint32_t magic; 61 __le32 magic;
62 62
63 /* 63 /*
64 * Is this snapshot valid. There is no way of recovering 64 * Is this snapshot valid. There is no way of recovering
65 * an invalid snapshot. 65 * an invalid snapshot.
66 */ 66 */
67 uint32_t valid; 67 __le32 valid;
68 68
69 /* 69 /*
70 * Simple, incrementing version. no backward 70 * Simple, incrementing version. no backward
71 * compatibility. 71 * compatibility.
72 */ 72 */
73 uint32_t version; 73 __le32 version;
74 74
75 /* In sectors */ 75 /* In sectors */
76 uint32_t chunk_size; 76 __le32 chunk_size;
77}; 77} __packed;
78 78
79struct disk_exception { 79struct disk_exception {
80 __le64 old_chunk;
81 __le64 new_chunk;
82} __packed;
83
84struct core_exception {
80 uint64_t old_chunk; 85 uint64_t old_chunk;
81 uint64_t new_chunk; 86 uint64_t new_chunk;
82}; 87};
@@ -169,10 +174,9 @@ static int alloc_area(struct pstore *ps)
169 if (!ps->area) 174 if (!ps->area)
170 goto err_area; 175 goto err_area;
171 176
172 ps->zero_area = vmalloc(len); 177 ps->zero_area = vzalloc(len);
173 if (!ps->zero_area) 178 if (!ps->zero_area)
174 goto err_zero_area; 179 goto err_zero_area;
175 memset(ps->zero_area, 0, len);
176 180
177 ps->header_area = vmalloc(len); 181 ps->header_area = vmalloc(len);
178 if (!ps->header_area) 182 if (!ps->header_area)
@@ -396,32 +400,32 @@ static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
396} 400}
397 401
398static void read_exception(struct pstore *ps, 402static void read_exception(struct pstore *ps,
399 uint32_t index, struct disk_exception *result) 403 uint32_t index, struct core_exception *result)
400{ 404{
401 struct disk_exception *e = get_exception(ps, index); 405 struct disk_exception *de = get_exception(ps, index);
402 406
403 /* copy it */ 407 /* copy it */
404 result->old_chunk = le64_to_cpu(e->old_chunk); 408 result->old_chunk = le64_to_cpu(de->old_chunk);
405 result->new_chunk = le64_to_cpu(e->new_chunk); 409 result->new_chunk = le64_to_cpu(de->new_chunk);
406} 410}
407 411
408static void write_exception(struct pstore *ps, 412static void write_exception(struct pstore *ps,
409 uint32_t index, struct disk_exception *de) 413 uint32_t index, struct core_exception *e)
410{ 414{
411 struct disk_exception *e = get_exception(ps, index); 415 struct disk_exception *de = get_exception(ps, index);
412 416
413 /* copy it */ 417 /* copy it */
414 e->old_chunk = cpu_to_le64(de->old_chunk); 418 de->old_chunk = cpu_to_le64(e->old_chunk);
415 e->new_chunk = cpu_to_le64(de->new_chunk); 419 de->new_chunk = cpu_to_le64(e->new_chunk);
416} 420}
417 421
418static void clear_exception(struct pstore *ps, uint32_t index) 422static void clear_exception(struct pstore *ps, uint32_t index)
419{ 423{
420 struct disk_exception *e = get_exception(ps, index); 424 struct disk_exception *de = get_exception(ps, index);
421 425
422 /* clear it */ 426 /* clear it */
423 e->old_chunk = 0; 427 de->old_chunk = 0;
424 e->new_chunk = 0; 428 de->new_chunk = 0;
425} 429}
426 430
427/* 431/*
@@ -437,13 +441,13 @@ static int insert_exceptions(struct pstore *ps,
437{ 441{
438 int r; 442 int r;
439 unsigned int i; 443 unsigned int i;
440 struct disk_exception de; 444 struct core_exception e;
441 445
442 /* presume the area is full */ 446 /* presume the area is full */
443 *full = 1; 447 *full = 1;
444 448
445 for (i = 0; i < ps->exceptions_per_area; i++) { 449 for (i = 0; i < ps->exceptions_per_area; i++) {
446 read_exception(ps, i, &de); 450 read_exception(ps, i, &e);
447 451
448 /* 452 /*
449 * If the new_chunk is pointing at the start of 453 * If the new_chunk is pointing at the start of
@@ -451,7 +455,7 @@ static int insert_exceptions(struct pstore *ps,
451 * is we know that we've hit the end of the 455 * is we know that we've hit the end of the
452 * exceptions. Therefore the area is not full. 456 * exceptions. Therefore the area is not full.
453 */ 457 */
454 if (de.new_chunk == 0LL) { 458 if (e.new_chunk == 0LL) {
455 ps->current_committed = i; 459 ps->current_committed = i;
456 *full = 0; 460 *full = 0;
457 break; 461 break;
@@ -460,13 +464,13 @@ static int insert_exceptions(struct pstore *ps,
460 /* 464 /*
461 * Keep track of the start of the free chunks. 465 * Keep track of the start of the free chunks.
462 */ 466 */
463 if (ps->next_free <= de.new_chunk) 467 if (ps->next_free <= e.new_chunk)
464 ps->next_free = de.new_chunk + 1; 468 ps->next_free = e.new_chunk + 1;
465 469
466 /* 470 /*
467 * Otherwise we add the exception to the snapshot. 471 * Otherwise we add the exception to the snapshot.
468 */ 472 */
469 r = callback(callback_context, de.old_chunk, de.new_chunk); 473 r = callback(callback_context, e.old_chunk, e.new_chunk);
470 if (r) 474 if (r)
471 return r; 475 return r;
472 } 476 }
@@ -563,7 +567,7 @@ static int persistent_read_metadata(struct dm_exception_store *store,
563 ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) / 567 ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) /
564 sizeof(struct disk_exception); 568 sizeof(struct disk_exception);
565 ps->callbacks = dm_vcalloc(ps->exceptions_per_area, 569 ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
566 sizeof(*ps->callbacks)); 570 sizeof(*ps->callbacks));
567 if (!ps->callbacks) 571 if (!ps->callbacks)
568 return -ENOMEM; 572 return -ENOMEM;
569 573
@@ -641,12 +645,12 @@ static void persistent_commit_exception(struct dm_exception_store *store,
641{ 645{
642 unsigned int i; 646 unsigned int i;
643 struct pstore *ps = get_info(store); 647 struct pstore *ps = get_info(store);
644 struct disk_exception de; 648 struct core_exception ce;
645 struct commit_callback *cb; 649 struct commit_callback *cb;
646 650
647 de.old_chunk = e->old_chunk; 651 ce.old_chunk = e->old_chunk;
648 de.new_chunk = e->new_chunk; 652 ce.new_chunk = e->new_chunk;
649 write_exception(ps, ps->current_committed++, &de); 653 write_exception(ps, ps->current_committed++, &ce);
650 654
651 /* 655 /*
652 * Add the callback to the back of the array. This code 656 * Add the callback to the back of the array. This code
@@ -670,7 +674,7 @@ static void persistent_commit_exception(struct dm_exception_store *store,
670 * If we completely filled the current area, then wipe the next one. 674 * If we completely filled the current area, then wipe the next one.
671 */ 675 */
672 if ((ps->current_committed == ps->exceptions_per_area) && 676 if ((ps->current_committed == ps->exceptions_per_area) &&
673 zero_disk_area(ps, ps->current_area + 1)) 677 zero_disk_area(ps, ps->current_area + 1))
674 ps->valid = 0; 678 ps->valid = 0;
675 679
676 /* 680 /*
@@ -701,7 +705,7 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
701 chunk_t *last_new_chunk) 705 chunk_t *last_new_chunk)
702{ 706{
703 struct pstore *ps = get_info(store); 707 struct pstore *ps = get_info(store);
704 struct disk_exception de; 708 struct core_exception ce;
705 int nr_consecutive; 709 int nr_consecutive;
706 int r; 710 int r;
707 711
@@ -722,9 +726,9 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
722 ps->current_committed = ps->exceptions_per_area; 726 ps->current_committed = ps->exceptions_per_area;
723 } 727 }
724 728
725 read_exception(ps, ps->current_committed - 1, &de); 729 read_exception(ps, ps->current_committed - 1, &ce);
726 *last_old_chunk = de.old_chunk; 730 *last_old_chunk = ce.old_chunk;
727 *last_new_chunk = de.new_chunk; 731 *last_new_chunk = ce.new_chunk;
728 732
729 /* 733 /*
730 * Find number of consecutive chunks within the current area, 734 * Find number of consecutive chunks within the current area,
@@ -733,9 +737,9 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
733 for (nr_consecutive = 1; nr_consecutive < ps->current_committed; 737 for (nr_consecutive = 1; nr_consecutive < ps->current_committed;
734 nr_consecutive++) { 738 nr_consecutive++) {
735 read_exception(ps, ps->current_committed - 1 - nr_consecutive, 739 read_exception(ps, ps->current_committed - 1 - nr_consecutive,
736 &de); 740 &ce);
737 if (de.old_chunk != *last_old_chunk - nr_consecutive || 741 if (ce.old_chunk != *last_old_chunk - nr_consecutive ||
738 de.new_chunk != *last_new_chunk - nr_consecutive) 742 ce.new_chunk != *last_new_chunk - nr_consecutive)
739 break; 743 break;
740 } 744 }
741 745
@@ -753,7 +757,7 @@ static int persistent_commit_merge(struct dm_exception_store *store,
753 for (i = 0; i < nr_merged; i++) 757 for (i = 0; i < nr_merged; i++)
754 clear_exception(ps, ps->current_committed - 1 - i); 758 clear_exception(ps, ps->current_committed - 1 - i);
755 759
756 r = area_io(ps, WRITE); 760 r = area_io(ps, WRITE_FLUSH_FUA);
757 if (r < 0) 761 if (r < 0)
758 return r; 762 return r;
759 763
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 9ecff5f3023a..6f758870fc19 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -30,16 +30,6 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
30 ((ti)->type->name == dm_snapshot_merge_target_name) 30 ((ti)->type->name == dm_snapshot_merge_target_name)
31 31
32/* 32/*
33 * The percentage increment we will wake up users at
34 */
35#define WAKE_UP_PERCENT 5
36
37/*
38 * kcopyd priority of snapshot operations
39 */
40#define SNAPSHOT_COPY_PRIORITY 2
41
42/*
43 * The size of the mempool used to track chunks in use. 33 * The size of the mempool used to track chunks in use.
44 */ 34 */
45#define MIN_IOS 256 35#define MIN_IOS 256
@@ -180,6 +170,13 @@ struct dm_snap_pending_exception {
180 * kcopyd. 170 * kcopyd.
181 */ 171 */
182 int started; 172 int started;
173
174 /*
175 * For writing a complete chunk, bypassing the copy.
176 */
177 struct bio *full_bio;
178 bio_end_io_t *full_bio_end_io;
179 void *full_bio_private;
183}; 180};
184 181
185/* 182/*
@@ -1055,8 +1052,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1055 1052
1056 s = kmalloc(sizeof(*s), GFP_KERNEL); 1053 s = kmalloc(sizeof(*s), GFP_KERNEL);
1057 if (!s) { 1054 if (!s) {
1058 ti->error = "Cannot allocate snapshot context private " 1055 ti->error = "Cannot allocate private snapshot structure";
1059 "structure";
1060 r = -ENOMEM; 1056 r = -ENOMEM;
1061 goto bad; 1057 goto bad;
1062 } 1058 }
@@ -1380,6 +1376,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
1380 struct dm_snapshot *s = pe->snap; 1376 struct dm_snapshot *s = pe->snap;
1381 struct bio *origin_bios = NULL; 1377 struct bio *origin_bios = NULL;
1382 struct bio *snapshot_bios = NULL; 1378 struct bio *snapshot_bios = NULL;
1379 struct bio *full_bio = NULL;
1383 int error = 0; 1380 int error = 0;
1384 1381
1385 if (!success) { 1382 if (!success) {
@@ -1415,10 +1412,15 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
1415 */ 1412 */
1416 dm_insert_exception(&s->complete, e); 1413 dm_insert_exception(&s->complete, e);
1417 1414
1418 out: 1415out:
1419 dm_remove_exception(&pe->e); 1416 dm_remove_exception(&pe->e);
1420 snapshot_bios = bio_list_get(&pe->snapshot_bios); 1417 snapshot_bios = bio_list_get(&pe->snapshot_bios);
1421 origin_bios = bio_list_get(&pe->origin_bios); 1418 origin_bios = bio_list_get(&pe->origin_bios);
1419 full_bio = pe->full_bio;
1420 if (full_bio) {
1421 full_bio->bi_end_io = pe->full_bio_end_io;
1422 full_bio->bi_private = pe->full_bio_private;
1423 }
1422 free_pending_exception(pe); 1424 free_pending_exception(pe);
1423 1425
1424 increment_pending_exceptions_done_count(); 1426 increment_pending_exceptions_done_count();
@@ -1426,10 +1428,15 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
1426 up_write(&s->lock); 1428 up_write(&s->lock);
1427 1429
1428 /* Submit any pending write bios */ 1430 /* Submit any pending write bios */
1429 if (error) 1431 if (error) {
1432 if (full_bio)
1433 bio_io_error(full_bio);
1430 error_bios(snapshot_bios); 1434 error_bios(snapshot_bios);
1431 else 1435 } else {
1436 if (full_bio)
1437 bio_endio(full_bio, 0);
1432 flush_bios(snapshot_bios); 1438 flush_bios(snapshot_bios);
1439 }
1433 1440
1434 retry_origin_bios(s, origin_bios); 1441 retry_origin_bios(s, origin_bios);
1435} 1442}
@@ -1480,8 +1487,33 @@ static void start_copy(struct dm_snap_pending_exception *pe)
1480 dest.count = src.count; 1487 dest.count = src.count;
1481 1488
1482 /* Hand over to kcopyd */ 1489 /* Hand over to kcopyd */
1483 dm_kcopyd_copy(s->kcopyd_client, 1490 dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
1484 &src, 1, &dest, 0, copy_callback, pe); 1491}
1492
1493static void full_bio_end_io(struct bio *bio, int error)
1494{
1495 void *callback_data = bio->bi_private;
1496
1497 dm_kcopyd_do_callback(callback_data, 0, error ? 1 : 0);
1498}
1499
1500static void start_full_bio(struct dm_snap_pending_exception *pe,
1501 struct bio *bio)
1502{
1503 struct dm_snapshot *s = pe->snap;
1504 void *callback_data;
1505
1506 pe->full_bio = bio;
1507 pe->full_bio_end_io = bio->bi_end_io;
1508 pe->full_bio_private = bio->bi_private;
1509
1510 callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
1511 copy_callback, pe);
1512
1513 bio->bi_end_io = full_bio_end_io;
1514 bio->bi_private = callback_data;
1515
1516 generic_make_request(bio);
1485} 1517}
1486 1518
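Concretely (an invented example, not from the patch): for a snapshot created with 16-sector chunks, a write bio of exactly 16 << 9 = 8192 bytes covers its chunk completely, so nothing of the origin needs to be preserved; snapshot_map() below spots this via bi_size and routes the bio through start_full_bio() instead of scheduling a kcopyd copy, saving the read of the origin chunk.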
1487static struct dm_snap_pending_exception * 1519static struct dm_snap_pending_exception *
@@ -1519,6 +1551,7 @@ __find_pending_exception(struct dm_snapshot *s,
1519 bio_list_init(&pe->origin_bios); 1551 bio_list_init(&pe->origin_bios);
1520 bio_list_init(&pe->snapshot_bios); 1552 bio_list_init(&pe->snapshot_bios);
1521 pe->started = 0; 1553 pe->started = 0;
1554 pe->full_bio = NULL;
1522 1555
1523 if (s->store->type->prepare_exception(s->store, &pe->e)) { 1556 if (s->store->type->prepare_exception(s->store, &pe->e)) {
1524 free_pending_exception(pe); 1557 free_pending_exception(pe);
@@ -1612,10 +1645,19 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
1612 } 1645 }
1613 1646
1614 remap_exception(s, &pe->e, bio, chunk); 1647 remap_exception(s, &pe->e, bio, chunk);
1615 bio_list_add(&pe->snapshot_bios, bio);
1616 1648
1617 r = DM_MAPIO_SUBMITTED; 1649 r = DM_MAPIO_SUBMITTED;
1618 1650
1651 if (!pe->started &&
1652 bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) {
1653 pe->started = 1;
1654 up_write(&s->lock);
1655 start_full_bio(pe, bio);
1656 goto out;
1657 }
1658
1659 bio_list_add(&pe->snapshot_bios, bio);
1660
1619 if (!pe->started) { 1661 if (!pe->started) {
1620 /* this is protected by snap->lock */ 1662 /* this is protected by snap->lock */
1621 pe->started = 1; 1663 pe->started = 1;
@@ -1628,9 +1670,9 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
1628 map_context->ptr = track_chunk(s, chunk); 1670 map_context->ptr = track_chunk(s, chunk);
1629 } 1671 }
1630 1672
1631 out_unlock: 1673out_unlock:
1632 up_write(&s->lock); 1674 up_write(&s->lock);
1633 out: 1675out:
1634 return r; 1676 return r;
1635} 1677}
1636 1678
@@ -1974,7 +2016,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
1974 pe_to_start_now = pe; 2016 pe_to_start_now = pe;
1975 } 2017 }
1976 2018
1977 next_snapshot: 2019next_snapshot:
1978 up_write(&snap->lock); 2020 up_write(&snap->lock);
1979 2021
1980 if (pe_to_start_now) { 2022 if (pe_to_start_now) {
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index bfe9c2333cea..986b8754bb08 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -54,7 +54,6 @@ struct dm_table {
54 sector_t *highs; 54 sector_t *highs;
55 struct dm_target *targets; 55 struct dm_target *targets;
56 56
57 unsigned discards_supported:1;
58 unsigned integrity_supported:1; 57 unsigned integrity_supported:1;
59 58
60 /* 59 /*
@@ -154,12 +153,11 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size)
154 return NULL; 153 return NULL;
155 154
156 size = nmemb * elem_size; 155 size = nmemb * elem_size;
157 addr = vmalloc(size); 156 addr = vzalloc(size);
158 if (addr)
159 memset(addr, 0, size);
160 157
161 return addr; 158 return addr;
162} 159}
160EXPORT_SYMBOL(dm_vcalloc);
163 161
164/* 162/*
165 * highs, and targets are managed as dynamic arrays during a 163 * highs, and targets are managed as dynamic arrays during a
@@ -209,7 +207,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
209 INIT_LIST_HEAD(&t->devices); 207 INIT_LIST_HEAD(&t->devices);
210 INIT_LIST_HEAD(&t->target_callbacks); 208 INIT_LIST_HEAD(&t->target_callbacks);
211 atomic_set(&t->holders, 0); 209 atomic_set(&t->holders, 0);
212 t->discards_supported = 1;
213 210
214 if (!num_targets) 211 if (!num_targets)
215 num_targets = KEYS_PER_NODE; 212 num_targets = KEYS_PER_NODE;
@@ -281,6 +278,7 @@ void dm_table_get(struct dm_table *t)
281{ 278{
282 atomic_inc(&t->holders); 279 atomic_inc(&t->holders);
283} 280}
281EXPORT_SYMBOL(dm_table_get);
284 282
285void dm_table_put(struct dm_table *t) 283void dm_table_put(struct dm_table *t)
286{ 284{
@@ -290,6 +288,7 @@ void dm_table_put(struct dm_table *t)
290 smp_mb__before_atomic_dec(); 288 smp_mb__before_atomic_dec();
291 atomic_dec(&t->holders); 289 atomic_dec(&t->holders);
292} 290}
291EXPORT_SYMBOL(dm_table_put);
293 292
294/* 293/*
295 * Checks to see if we need to extend highs or targets. 294 * Checks to see if we need to extend highs or targets.
@@ -455,13 +454,14 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
455 * Add a device to the list, or just increment the usage count if 454 * Add a device to the list, or just increment the usage count if
456 * it's already present. 455 * it's already present.
457 */ 456 */
458static int __table_get_device(struct dm_table *t, struct dm_target *ti, 457int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
459 const char *path, fmode_t mode, struct dm_dev **result) 458 struct dm_dev **result)
460{ 459{
461 int r; 460 int r;
462 dev_t uninitialized_var(dev); 461 dev_t uninitialized_var(dev);
463 struct dm_dev_internal *dd; 462 struct dm_dev_internal *dd;
464 unsigned int major, minor; 463 unsigned int major, minor;
464 struct dm_table *t = ti->table;
465 465
466 BUG_ON(!t); 466 BUG_ON(!t);
467 467
@@ -509,6 +509,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
509 *result = &dd->dm_dev; 509 *result = &dd->dm_dev;
510 return 0; 510 return 0;
511} 511}
512EXPORT_SYMBOL(dm_get_device);
512 513
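With dm_get_device() now exported directly (the __table_get_device() wrapper is dropped later in this hunk), a target constructor acquires its backing device in a single call. A minimal sketch of the usual calling pattern, modeled on simple targets such as dm-linear (assumes <linux/device-mapper.h>; names are illustrative):

/* Illustrative target ctr acquiring one backing device. */
static int example_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
        struct dm_dev *dev;
        int r;

        if (argc != 1) {
                ti->error = "Invalid argument count";
                return -EINVAL;
        }

        r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev);
        if (r) {
                ti->error = "Device lookup failed";
                return r;
        }

        ti->private = dev;      /* released with dm_put_device() in .dtr */
        return 0;
}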
513int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, 514int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
514 sector_t start, sector_t len, void *data) 515 sector_t start, sector_t len, void *data)
@@ -539,23 +540,15 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
539 * If not we'll force DM to use PAGE_SIZE or 540 * If not we'll force DM to use PAGE_SIZE or
540 * smaller I/O, just to be safe. 541 * smaller I/O, just to be safe.
541 */ 542 */
542 543 if (dm_queue_merge_is_compulsory(q) && !ti->type->merge)
543 if (q->merge_bvec_fn && !ti->type->merge)
544 blk_limits_max_hw_sectors(limits, 544 blk_limits_max_hw_sectors(limits,
545 (unsigned int) (PAGE_SIZE >> 9)); 545 (unsigned int) (PAGE_SIZE >> 9));
546 return 0; 546 return 0;
547} 547}
548EXPORT_SYMBOL_GPL(dm_set_device_limits); 548EXPORT_SYMBOL_GPL(dm_set_device_limits);
549 549
550int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
551 struct dm_dev **result)
552{
553 return __table_get_device(ti->table, ti, path, mode, result);
554}
555
556
557/* 550/*
558 * Decrement a devices use count and remove it if necessary. 551 * Decrement a device's use count and remove it if necessary.
559 */ 552 */
560void dm_put_device(struct dm_target *ti, struct dm_dev *d) 553void dm_put_device(struct dm_target *ti, struct dm_dev *d)
561{ 554{
@@ -568,6 +561,7 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d)
568 kfree(dd); 561 kfree(dd);
569 } 562 }
570} 563}
564EXPORT_SYMBOL(dm_put_device);
571 565
572/* 566/*
573 * Checks to see if the target joins onto the end of the table. 567 * Checks to see if the target joins onto the end of the table.
@@ -791,8 +785,9 @@ int dm_table_add_target(struct dm_table *t, const char *type,
791 785
792 t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; 786 t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
793 787
794 if (!tgt->num_discard_requests) 788 if (!tgt->num_discard_requests && tgt->discards_supported)
795 t->discards_supported = 0; 789 DMWARN("%s: %s: ignoring discards_supported because num_discard_requests is zero.",
790 dm_device_name(t->md), type);
796 791
797 return 0; 792 return 0;
798 793
@@ -802,6 +797,63 @@ int dm_table_add_target(struct dm_table *t, const char *type,
802 return r; 797 return r;
803} 798}
804 799
800/*
801 * Target argument parsing helpers.
802 */
803static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
804 unsigned *value, char **error, unsigned grouped)
805{
806 const char *arg_str = dm_shift_arg(arg_set);
807
808 if (!arg_str ||
809 (sscanf(arg_str, "%u", value) != 1) ||
810 (*value < arg->min) ||
811 (*value > arg->max) ||
812 (grouped && arg_set->argc < *value)) {
813 *error = arg->error;
814 return -EINVAL;
815 }
816
817 return 0;
818}
819
820int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
821 unsigned *value, char **error)
822{
823 return validate_next_arg(arg, arg_set, value, error, 0);
824}
825EXPORT_SYMBOL(dm_read_arg);
826
827int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set,
828 unsigned *value, char **error)
829{
830 return validate_next_arg(arg, arg_set, value, error, 1);
831}
832EXPORT_SYMBOL(dm_read_arg_group);
833
834const char *dm_shift_arg(struct dm_arg_set *as)
835{
836 char *r;
837
838 if (as->argc) {
839 as->argc--;
840 r = *as->argv;
841 as->argv++;
842 return r;
843 }
844
845 return NULL;
846}
847EXPORT_SYMBOL(dm_shift_arg);
848
849void dm_consume_args(struct dm_arg_set *as, unsigned num_args)
850{
851 BUG_ON(as->argc < num_args);
852 as->argc -= num_args;
853 as->argv += num_args;
854}
855EXPORT_SYMBOL(dm_consume_args);
856
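These helpers give targets a common way to consume "<count> <args...>" feature groups. A hedged sketch of the intended calling pattern, modeled on how dm-flakey and dm-mpath in this same series use it (the option names and bounds are invented; assumes <linux/device-mapper.h>):

/* Illustrative: parse "<num_features> [opt1] [opt2 <value>]". */
static int parse_features(struct dm_arg_set *as, struct dm_target *ti)
{
        static struct dm_arg _args[] = {
                {0, 3, "Invalid number of feature arguments"},
        };
        static struct dm_arg _opt2_arg = {1, 1024, "opt2 value out of range"};
        unsigned argc, value;
        const char *arg_name;
        int r;

        r = dm_read_arg_group(_args, as, &argc, &ti->error);
        if (r)
                return r;

        while (argc) {
                arg_name = dm_shift_arg(as);
                argc--;

                if (!strcasecmp(arg_name, "opt1"))
                        continue;

                if (!strcasecmp(arg_name, "opt2")) {
                        r = dm_read_arg(&_opt2_arg, as, &value, &ti->error);
                        if (r)
                                return r;
                        argc--;
                        continue;
                }

                ti->error = "Unrecognised feature argument";
                return -EINVAL;
        }

        return 0;
}

dm_read_arg_group() validates both the numeric bounds of <count> and that at least that many words remain, so the loop can shift arguments without re-checking as->argc.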
805static int dm_table_set_type(struct dm_table *t) 857static int dm_table_set_type(struct dm_table *t)
806{ 858{
807 unsigned i; 859 unsigned i;
@@ -1077,11 +1129,13 @@ void dm_table_event(struct dm_table *t)
1077 t->event_fn(t->event_context); 1129 t->event_fn(t->event_context);
1078 mutex_unlock(&_event_lock); 1130 mutex_unlock(&_event_lock);
1079} 1131}
1132EXPORT_SYMBOL(dm_table_event);
1080 1133
1081sector_t dm_table_get_size(struct dm_table *t) 1134sector_t dm_table_get_size(struct dm_table *t)
1082{ 1135{
1083 return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; 1136 return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
1084} 1137}
1138EXPORT_SYMBOL(dm_table_get_size);
1085 1139
1086struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) 1140struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
1087{ 1141{
@@ -1194,9 +1248,45 @@ static void dm_table_set_integrity(struct dm_table *t)
1194 blk_get_integrity(template_disk)); 1248 blk_get_integrity(template_disk));
1195} 1249}
1196 1250
1251static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
1252 sector_t start, sector_t len, void *data)
1253{
1254 unsigned flush = (*(unsigned *)data);
1255 struct request_queue *q = bdev_get_queue(dev->bdev);
1256
1257 return q && (q->flush_flags & flush);
1258}
1259
1260static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
1261{
1262 struct dm_target *ti;
1263 unsigned i = 0;
1264
1265 /*
1266 * Require at least one underlying device to support flushes.
1267 * t->devices includes internal dm devices such as mirror logs
1268 * so we need to use iterate_devices here, which targets
1269 * supporting flushes must provide.
1270 */
1271 while (i < dm_table_get_num_targets(t)) {
1272 ti = dm_table_get_target(t, i++);
1273
1274 if (!ti->num_flush_requests)
1275 continue;
1276
1277 if (ti->type->iterate_devices &&
1278 ti->type->iterate_devices(ti, device_flush_capable, &flush))
1279 return 1;
1280 }
1281
1282 return 0;
1283}
1284
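Both this flush check and the discard check at the bottom of the file can only reach real underlying devices through each target's iterate_devices method. For reference, a typical implementation just hands every dm_dev it holds to the callback; this sketch follows dm-linear's one-liner (the context struct is hypothetical):

struct example_ctx {                    /* hypothetical per-target state */
        struct dm_dev *dev;
        sector_t start;
};

static int example_iterate_devices(struct dm_target *ti,
                                   iterate_devices_callout_fn fn, void *data)
{
        struct example_ctx *ec = ti->private;

        return fn(ti, ec->dev, ec->start, ti->len, data);
}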
1197void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, 1285void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
1198 struct queue_limits *limits) 1286 struct queue_limits *limits)
1199{ 1287{
1288 unsigned flush = 0;
1289
1200 /* 1290 /*
1201 * Copy table's limits to the DM device's request_queue 1291 * Copy table's limits to the DM device's request_queue
1202 */ 1292 */
@@ -1207,6 +1297,13 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
1207 else 1297 else
1208 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); 1298 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
1209 1299
1300 if (dm_table_supports_flush(t, REQ_FLUSH)) {
1301 flush |= REQ_FLUSH;
1302 if (dm_table_supports_flush(t, REQ_FUA))
1303 flush |= REQ_FUA;
1304 }
1305 blk_queue_flush(q, flush);
1306
1210 dm_table_set_integrity(t); 1307 dm_table_set_integrity(t);
1211 1308
1212 /* 1309 /*
@@ -1237,6 +1334,7 @@ fmode_t dm_table_get_mode(struct dm_table *t)
1237{ 1334{
1238 return t->mode; 1335 return t->mode;
1239} 1336}
1337EXPORT_SYMBOL(dm_table_get_mode);
1240 1338
1241static void suspend_targets(struct dm_table *t, unsigned postsuspend) 1339static void suspend_targets(struct dm_table *t, unsigned postsuspend)
1242{ 1340{
@@ -1345,6 +1443,7 @@ struct mapped_device *dm_table_get_md(struct dm_table *t)
1345{ 1443{
1346 return t->md; 1444 return t->md;
1347} 1445}
1446EXPORT_SYMBOL(dm_table_get_md);
1348 1447
1349static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, 1448static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
1350 sector_t start, sector_t len, void *data) 1449 sector_t start, sector_t len, void *data)
@@ -1359,19 +1458,19 @@ bool dm_table_supports_discards(struct dm_table *t)
1359 struct dm_target *ti; 1458 struct dm_target *ti;
1360 unsigned i = 0; 1459 unsigned i = 0;
1361 1460
1362 if (!t->discards_supported)
1363 return 0;
1364
1365 /* 1461 /*
1366 * Unless any target used by the table set discards_supported, 1462 * Unless any target used by the table set discards_supported,
1367 * require at least one underlying device to support discards. 1463 * require at least one underlying device to support discards.
1368 * t->devices includes internal dm devices such as mirror logs 1464 * t->devices includes internal dm devices such as mirror logs
1369 * so we need to use iterate_devices here, which targets 1465 * so we need to use iterate_devices here, which targets
1370 * supporting discard must provide. 1466 * supporting discard selectively must provide.
1371 */ 1467 */
1372 while (i < dm_table_get_num_targets(t)) { 1468 while (i < dm_table_get_num_targets(t)) {
1373 ti = dm_table_get_target(t, i++); 1469 ti = dm_table_get_target(t, i++);
1374 1470
1471 if (!ti->num_discard_requests)
1472 continue;
1473
1375 if (ti->discards_supported) 1474 if (ti->discards_supported)
1376 return 1; 1475 return 1;
1377 1476
@@ -1382,13 +1481,3 @@ bool dm_table_supports_discards(struct dm_table *t)
1382 1481
1383 return 0; 1482 return 0;
1384} 1483}
1385
1386EXPORT_SYMBOL(dm_vcalloc);
1387EXPORT_SYMBOL(dm_get_device);
1388EXPORT_SYMBOL(dm_put_device);
1389EXPORT_SYMBOL(dm_table_event);
1390EXPORT_SYMBOL(dm_table_get_size);
1391EXPORT_SYMBOL(dm_table_get_mode);
1392EXPORT_SYMBOL(dm_table_get_md);
1393EXPORT_SYMBOL(dm_table_put);
1394EXPORT_SYMBOL(dm_table_get);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0cf68b478878..52b39f335bb3 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -37,6 +37,8 @@ static const char *_name = DM_NAME;
37static unsigned int major = 0; 37static unsigned int major = 0;
38static unsigned int _major = 0; 38static unsigned int _major = 0;
39 39
40static DEFINE_IDR(_minor_idr);
41
40static DEFINE_SPINLOCK(_minor_lock); 42static DEFINE_SPINLOCK(_minor_lock);
41/* 43/*
42 * For bio-based dm. 44 * For bio-based dm.
@@ -109,6 +111,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
109#define DMF_FREEING 3 111#define DMF_FREEING 3
110#define DMF_DELETING 4 112#define DMF_DELETING 4
111#define DMF_NOFLUSH_SUSPENDING 5 113#define DMF_NOFLUSH_SUSPENDING 5
114#define DMF_MERGE_IS_OPTIONAL 6
112 115
113/* 116/*
114 * Work processed by per-device workqueue. 117 * Work processed by per-device workqueue.
@@ -313,6 +316,12 @@ static void __exit dm_exit(void)
313 316
314 while (i--) 317 while (i--)
315 _exits[i](); 318 _exits[i]();
319
320 /*
321 * Should be empty by this point.
322 */
323 idr_remove_all(&_minor_idr);
324 idr_destroy(&_minor_idr);
316} 325}
317 326
318/* 327/*
@@ -1171,7 +1180,8 @@ static int __clone_and_map_discard(struct clone_info *ci)
1171 1180
1172 /* 1181 /*
1173 * Even though the device advertised discard support, 1182 * Even though the device advertised discard support,
1174 * reconfiguration might have changed that since the 1183 * that does not mean every target supports it, and
1184 * reconfiguration might also have changed that since the
1175 * check was performed. 1185 * check was performed.
1176 */ 1186 */
1177 if (!ti->num_discard_requests) 1187 if (!ti->num_discard_requests)
@@ -1705,8 +1715,6 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 /*-----------------------------------------------------------------
  * An IDR is used to keep track of allocated minor numbers.
  *---------------------------------------------------------------*/
-static DEFINE_IDR(_minor_idr);
-
 static void free_minor(int minor)
 {
 	spin_lock(&_minor_lock);
@@ -1800,7 +1808,6 @@ static void dm_init_md_queue(struct mapped_device *md)
 	blk_queue_make_request(md->queue, dm_request);
 	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
 	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
-	blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
 }
 
 /*
@@ -1986,6 +1993,59 @@ static void __set_size(struct mapped_device *md, sector_t size)
 }
 
 /*
+ * Return 1 if the queue has a compulsory merge_bvec_fn function.
+ *
+ * If this function returns 0, then the device is either a non-dm
+ * device without a merge_bvec_fn, or it is a dm device that is
+ * able to split any bios it receives that are too big.
+ */
+int dm_queue_merge_is_compulsory(struct request_queue *q)
+{
+	struct mapped_device *dev_md;
+
+	if (!q->merge_bvec_fn)
+		return 0;
+
+	if (q->make_request_fn == dm_request) {
+		dev_md = q->queuedata;
+		if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
+			return 0;
+	}
+
+	return 1;
+}
+
+static int dm_device_merge_is_compulsory(struct dm_target *ti,
+					 struct dm_dev *dev, sector_t start,
+					 sector_t len, void *data)
+{
+	struct block_device *bdev = dev->bdev;
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	return dm_queue_merge_is_compulsory(q);
+}
+
+/*
+ * Return 1 if it is acceptable to ignore merge_bvec_fn based
+ * on the properties of the underlying devices.
+ */
+static int dm_table_merge_is_optional(struct dm_table *table)
+{
+	unsigned i = 0;
+	struct dm_target *ti;
+
+	while (i < dm_table_get_num_targets(table)) {
+		ti = dm_table_get_target(table, i++);
+
+		if (ti->type->iterate_devices &&
+		    ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
  * Returns old map, which caller must destroy.
  */
 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
@@ -1995,6 +2055,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 	struct request_queue *q = md->queue;
 	sector_t size;
 	unsigned long flags;
+	int merge_is_optional;
 
 	size = dm_table_get_size(t);
 
@@ -2020,10 +2081,16 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 
 	__bind_mempools(md, t);
 
+	merge_is_optional = dm_table_merge_is_optional(t);
+
 	write_lock_irqsave(&md->map_lock, flags);
 	old_map = md->map;
 	md->map = t;
 	dm_table_set_restrictions(t, q, limits);
+	if (merge_is_optional)
+		set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
+	else
+		clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
 	write_unlock_irqrestore(&md->map_lock, flags);
 
 	return old_map;
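dm_device_merge_is_compulsory() above is an ordinary iterate_devices callback, the same mechanism the discard check in dm-table.c leans on: the core hands the target type a predicate and lets the target walk its own devices. A generic sketch of that pattern, reusing only the signatures visible in this diff (the predicate itself is hypothetical):

/* Callback with the signature used by dm_device_merge_is_compulsory(). */
static int example_predicate(struct dm_target *ti, struct dm_dev *dev,
			     sector_t start, sector_t len, void *data)
{
	struct request_queue *q = bdev_get_queue(dev->bdev);

	/* Non-zero if this underlying device has the property we are
	 * probing for (hypothetical example: it has a merge_bvec_fn). */
	return q && q->merge_bvec_fn;
}

/* Ask each target whether any of its devices matches the predicate,
 * mirroring the loop in dm_table_merge_is_optional() above. */
static bool any_device_matches(struct dm_table *table)
{
	struct dm_target *ti;
	unsigned i = 0;

	while (i < dm_table_get_num_targets(table)) {
		ti = dm_table_get_target(table, i++);

		if (ti->type->iterate_devices &&
		    ti->type->iterate_devices(ti, example_predicate, NULL))
			return true;
	}

	return false;
}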
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 1aaf16746da8..6745dbd278a4 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -66,6 +66,8 @@ int dm_table_alloc_md_mempools(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
 struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
 
+int dm_queue_merge_is_compulsory(struct request_queue *q);
+
 void dm_lock_md_type(struct mapped_device *md);
 void dm_unlock_md_type(struct mapped_device *md);
 void dm_set_md_type(struct mapped_device *md, unsigned type);