author    Linus Torvalds <torvalds@linux-foundation.org>  2011-08-03 02:49:21 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2011-08-03 02:49:21 -0400
commit    f3406816bb2486fc44558bec77179cd9bcbd4450
tree      718db1ef45e55314b5e7290f77e70e6328d855a4
parent    4400478ba3d939b680810aa004f1e954b4f8ba16
parent    ed8b752bccf2560e305e25125721d2f0ac759e88
Merge git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm
* git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm: (34 commits)
  dm table: set flush capability based on underlying devices
  dm crypt: optionally support discard requests
  dm raid: add md raid1 support
  dm raid: support metadata devices
  dm raid: add write_mostly parameter
  dm raid: add region_size parameter
  dm raid: improve table parameters documentation
  dm ioctl: forbid multiple device specifiers
  dm ioctl: introduce __get_dev_cell
  dm ioctl: fill in device parameters in more ioctls
  dm flakey: add corrupt_bio_byte feature
  dm flakey: add drop_writes
  dm flakey: support feature args
  dm flakey: use dm_target_offset and support discards
  dm table: share target argument parsing functions
  dm snapshot: skip reading origin when overwriting complete chunk
  dm: ignore merge_bvec for snapshots when safe
  dm table: clean dm_get_device and move exports
  dm raid: tidy includes
  dm ioctl: prevent empty message
  ...
 Documentation/device-mapper/dm-crypt.txt  |  21
 Documentation/device-mapper/dm-flakey.txt |  48
 Documentation/device-mapper/dm-raid.txt   | 138
 drivers/md/Kconfig                        |   5
 drivers/md/dm-crypt.c                     |  62
 drivers/md/dm-flakey.c                    | 270
 drivers/md/dm-io.c                        |  29
 drivers/md/dm-ioctl.c                     |  89
 drivers/md/dm-kcopyd.c                    |  42
 drivers/md/dm-log-userspace-base.c        |   3
 drivers/md/dm-log.c                       |  32
 drivers/md/dm-mpath.c                     | 147
 drivers/md/dm-raid.c                      | 621
 drivers/md/dm-snap-persistent.c           |  80
 drivers/md/dm-snap.c                      |  84
 drivers/md/dm-table.c                     | 155
 drivers/md/dm.c                           |  75
 drivers/md/dm.h                           |   2
 include/linux/device-mapper.h             |  43
 include/linux/dm-ioctl.h                  |   4
 include/linux/dm-kcopyd.h                 |  15
 21 files changed, 1561 insertions(+), 404 deletions(-)
diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt
index 6b5c42dbbe8..2c656ae43ba 100644
--- a/Documentation/device-mapper/dm-crypt.txt
+++ b/Documentation/device-mapper/dm-crypt.txt
@@ -4,7 +4,8 @@ dm-crypt
 Device-Mapper's "crypt" target provides transparent encryption of block devices
 using the kernel crypto API.

-Parameters: <cipher> <key> <iv_offset> <device path> <offset>
+Parameters: <cipher> <key> <iv_offset> <device path> \
+	      <offset> [<#opt_params> <opt_params>]

 <cipher>
     Encryption cipher and an optional IV generation mode.
@@ -37,6 +38,24 @@ Parameters: <cipher> <key> <iv_offset> <device path> <offset>
 <offset>
     Starting sector within the device where the encrypted data begins.

+<#opt_params>
+    Number of optional parameters. If there are no optional parameters,
+    the optional parameters section can be skipped or #opt_params can be zero.
+    Otherwise #opt_params is the number of following arguments.
+
+    Example of optional parameters section:
+        1 allow_discards
+
+allow_discards
+    Block discard requests (a.k.a. TRIM) are passed through the crypt device.
+    The default is to ignore discard requests.
+
+    WARNING: Assess the specific security risks carefully before enabling this
+    option.  For example, allowing discards on encrypted devices may lead to
+    the leak of information about the ciphertext device (filesystem type,
+    used space etc.) if the discarded blocks can be located easily on the
+    device later.
+
 Example scripts
 ===============
 LUKS (Linux Unified Key Setup) is now the preferred way to set up disk
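As an illustration (not from the patch itself), a crypt table using the new
optional-argument syntax documented above might look like this; the device
and length are invented, and <key> is a placeholder as in the examples above:

  0 409600 crypt aes-cbc-essiv:sha256 <key> 0 /dev/sdb1 0 1 allow_discards

The trailing "1 allow_discards" is <#opt_params> followed by the single
optional argument.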
diff --git a/Documentation/device-mapper/dm-flakey.txt b/Documentation/device-mapper/dm-flakey.txt
index c8efdfd19a6..6ff5c232722 100644
--- a/Documentation/device-mapper/dm-flakey.txt
+++ b/Documentation/device-mapper/dm-flakey.txt
@@ -1,17 +1,53 @@
 dm-flakey
 =========

-This target is the same as the linear target except that it returns I/O
-errors periodically. It's been found useful in simulating failing
-devices for testing purposes.
+This target is the same as the linear target except that it exhibits
+unreliable behaviour periodically.  It's been found useful in simulating
+failing devices for testing purposes.

 Starting from the time the table is loaded, the device is available for
-<up interval> seconds, then returns errors for <down interval> seconds,
-and then this cycle repeats.
+<up interval> seconds, then exhibits unreliable behaviour for <down
+interval> seconds, and then this cycle repeats.

-Parameters: <dev path> <offset> <up interval> <down interval>
+Consider using this in combination with the dm-delay target, which can
+delay reads and writes and/or send them to different underlying devices.
+
+Table parameters
+----------------
+  <dev path> <offset> <up interval> <down interval> \
+    [<num_features> [<feature arguments>]]
+
+Mandatory parameters:
     <dev path>: Full pathname to the underlying block-device, or a
                 "major:minor" device-number.
     <offset>: Starting sector within the device.
     <up interval>: Number of seconds device is available.
     <down interval>: Number of seconds device returns errors.
+
+Optional feature parameters:
+  If no feature parameters are present, during the periods of
+  unreliability, all I/O returns errors.
+
+  drop_writes:
+	All write I/O is silently ignored.
+	Read I/O is handled correctly.
+
+  corrupt_bio_byte <Nth_byte> <direction> <value> <flags>:
+	During <down interval>, replace <Nth_byte> of the data of
+	each matching bio with <value>.
+
+    <Nth_byte>: The offset of the byte to replace.
+		Counting starts at 1, to replace the first byte.
+    <direction>: Either 'r' to corrupt reads or 'w' to corrupt writes.
+		 'w' is incompatible with drop_writes.
+    <value>: The value (from 0-255) to write.
+    <flags>: Perform the replacement only if bio->bi_rw has all the
+	     selected flags set.
+
+Examples:
+  corrupt_bio_byte 32 r 1 0
+	- replaces the 32nd byte of READ bios with the value 1
+
+  corrupt_bio_byte 224 w 0 32
+	- replaces the 224th byte of REQ_META (=32) bios with the value 0
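As an illustration (not from the patch itself), complete flakey tables
using the feature arguments documented above; the device and length are
invented:

  # Available for 59s of each 61s cycle; while down, corrupt the first
  # byte of each READ bio with the value 0 (match any bi_rw flags).
  0 409600 flakey /dev/sdb1 0 59 2 5 corrupt_bio_byte 1 r 0 0

  # Available for 30s of each 45s cycle; while down, silently drop writes.
  0 409600 flakey /dev/sdb1 0 30 15 1 drop_writes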
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index 33b6b7071ac..2a8c11331d2 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -1,70 +1,108 @@
-Device-mapper RAID (dm-raid) is a bridge from DM to MD.  It
-provides a way to use device-mapper interfaces to access the MD RAID
-drivers.
+dm-raid
+-------

-As with all device-mapper targets, the nominal public interfaces are the
-constructor (CTR) tables and the status outputs (both STATUSTYPE_INFO
-and STATUSTYPE_TABLE).  The CTR table looks like the following:
+The device-mapper RAID (dm-raid) target provides a bridge from DM to MD.
+It allows the MD RAID drivers to be accessed using a device-mapper
+interface.

-1: <s> <l> raid \
-2:	<raid_type> <#raid_params> <raid_params> \
-3:	<#raid_devs> <meta_dev1> <dev1> .. <meta_devN> <devN>
-
-Line 1 contains the standard first three arguments to any device-mapper
-target - the start, length, and target type fields.  The target type in
-this case is "raid".
-
-Line 2 contains the arguments that define the particular raid
-type/personality/level, the required arguments for that raid type, and
-any optional arguments.  Possible raid types include: raid4, raid5_la,
-raid5_ls, raid5_rs, raid6_zr, raid6_nr, and raid6_nc.  (raid1 is
-planned for the future.)  The list of required and optional parameters
-is the same for all the current raid types.  The required parameters are
-positional, while the optional parameters are given as key/value pairs.
-The possible parameters are as follows:
- <chunk_size>			Chunk size in sectors.
- [[no]sync]			Force/Prevent RAID initialization
- [rebuild <idx>]		Rebuild the drive indicated by the index
- [daemon_sleep <ms>]		Time between bitmap daemon work to clear bits
- [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
- [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
- [max_write_behind <sectors>]		See '-write-behind=' (man mdadm)
- [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
-
-Line 3 contains the list of devices that compose the array in
-metadata/data device pairs.  If the metadata is stored separately, a '-'
-is given for the metadata device position.  If a drive has failed or is
-missing at creation time, a '-' can be given for both the metadata and
-data drives for a given position.
-
-NB. Currently all metadata devices must be specified as '-'.
-
-Examples:
-# RAID4 - 4 data drives, 1 parity
+The target is named "raid" and it accepts the following parameters:
+
+  <raid_type> <#raid_params> <raid_params> \
+    <#raid_devs> <metadata_dev0> <dev0> [.. <metadata_devN> <devN>]
+
+<raid_type>:
+  raid1		RAID1 mirroring
+  raid4		RAID4 dedicated parity disk
+  raid5_la	RAID5 left asymmetric
+		- rotating parity 0 with data continuation
+  raid5_ra	RAID5 right asymmetric
+		- rotating parity N with data continuation
+  raid5_ls	RAID5 left symmetric
+		- rotating parity 0 with data restart
+  raid5_rs	RAID5 right symmetric
+		- rotating parity N with data restart
+  raid6_zr	RAID6 zero restart
+		- rotating parity zero (left-to-right) with data restart
+  raid6_nr	RAID6 N restart
+		- rotating parity N (right-to-left) with data restart
+  raid6_nc	RAID6 N continue
+		- rotating parity N (right-to-left) with data continuation
+
+  Reference: Chapter 4 of
+  http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf
+
+<#raid_params>: The number of parameters that follow.
+
+<raid_params> consists of
+    Mandatory parameters:
+        <chunk_size>: Chunk size in sectors.  This parameter is often known as
+		      "stripe size".  It is the only mandatory parameter and
+		      is placed first.
+
+    followed by optional parameters (in any order):
+	[sync|nosync]   Force or prevent RAID initialization.
+
+	[rebuild <idx>]	Rebuild drive number idx (first drive is 0).
+
+	[daemon_sleep <ms>]
+		Interval between runs of the bitmap daemon that
+		clear bits.  A longer interval means less bitmap I/O but
+		resyncing after a failure is likely to take longer.
+
+	[min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
+	[max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
+	[write_mostly <idx>]			Drive index is write-mostly
+	[max_write_behind <sectors>]		See '-write-behind=' (man mdadm)
+	[stripe_cache <sectors>]		Stripe cache size (higher RAIDs only)
+	[region_size <sectors>]
+		The region_size multiplied by the number of regions is the
+		logical size of the array.  The bitmap records the device
+		synchronisation state for each region.
+
+<#raid_devs>: The number of devices composing the array.
+    Each device consists of two entries.  The first is the device
+    containing the metadata (if any); the second is the one containing the
+    data.
+
+    If a drive has failed or is missing at creation time, a '-' can be
+    given for both the metadata and data drives for a given position.
+
+
+Example tables
+--------------
+# RAID4 - 4 data drives, 1 parity (no metadata devices)
 # No metadata devices specified to hold superblock/bitmap info
 # Chunk size of 1MiB
 # (Lines separated for easy reading)
+
 0 1960893648 raid \
 	raid4 1 2048 \
 	5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81

-# RAID4 - 4 data drives, 1 parity (no metadata devices)
+# RAID4 - 4 data drives, 1 parity (with metadata devices)
 # Chunk size of 1MiB, force RAID initialization,
 #	min recovery rate at 20 kiB/sec/disk
+
 0 1960893648 raid \
-	raid4 4 2048 min_recovery_rate 20 sync\
-	5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
+	raid4 4 2048 sync min_recovery_rate 20 \
+	5 8:17 8:18 8:33 8:34 8:49 8:50 8:65 8:66 8:81 8:82

-Performing a 'dmsetup table' should display the CTR table used to
-construct the mapping (with possible reordering of optional
-parameters).
+'dmsetup table' displays the table used to construct the mapping.
+The optional parameters are always printed in the order listed
+above with "sync" or "nosync" always output ahead of the other
+arguments, regardless of the order used when originally loading the table.
+Arguments that can be repeated are ordered by value.

-Performing a 'dmsetup status' will yield information on the state and
-health of the array.  The output is as follows:
+'dmsetup status' yields information on the state and health of the
+array.
+The output is as follows:
 1: <s> <l> raid \
 2:	<raid_type> <#devices> <1 health char for each dev> <resync_ratio>

-Line 1 is standard DM output.  Line 2 is best shown by example:
+Line 1 is the standard output produced by device-mapper.
+Line 2 is produced by the raid target, and best explained by example:
 	0 1960893648 raid raid4 5 AAAAA 2/490221568
 Here we can see the RAID type is raid4, there are 5 devices - all of
 which are 'A'live, and the array is 2/490221568 complete with recovery.
+Faulty or missing devices are marked 'D'.  Devices that are out-of-sync
+are marked 'a'.
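As an illustration (not from the patch itself), the raid1 support added by
this merge could be used with a table like the one below; the devices are
invented, and the leading chunk_size is given as 0 on the assumption that
it is ignored for RAID1:

  # RAID1 - 2 mirrored drives, with metadata devices
  0 1960893648 raid \
	raid1 1 0 \
	2 8:17 8:18 8:33 8:34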
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 8420129fc5e..f75a66e7d31 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -241,12 +241,13 @@ config DM_MIRROR
 	  needed for live data migration tools such as 'pvmove'.

 config DM_RAID
-       tristate "RAID 4/5/6 target (EXPERIMENTAL)"
+       tristate "RAID 1/4/5/6 target (EXPERIMENTAL)"
        depends on BLK_DEV_DM && EXPERIMENTAL
+       select MD_RAID1
        select MD_RAID456
        select BLK_DEV_MD
        ---help---
-	 A dm target that supports RAID4, RAID5 and RAID6 mappings
+	 A dm target that supports RAID1, RAID4, RAID5 and RAID6 mappings

 	 A RAID-5 set of N drives with a capacity of C MB per drive provides
 	 the capacity of C * (N - 1) MB, and protects against a failure
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index bae6c4e23d3..49da55c1528 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -30,7 +30,6 @@
 #include <linux/device-mapper.h>

 #define DM_MSG_PREFIX "crypt"
-#define MESG_STR(x) x, sizeof(x)

 /*
  * context holding the current state of a multi-part conversion
@@ -239,7 +238,7 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
 			      struct dm_crypt_request *dmreq)
 {
 	memset(iv, 0, cc->iv_size);
-	*(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);
+	*(__le32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);

 	return 0;
 }
@@ -248,7 +247,7 @@ static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
 				struct dm_crypt_request *dmreq)
 {
 	memset(iv, 0, cc->iv_size);
-	*(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
+	*(__le64 *)iv = cpu_to_le64(dmreq->iv_sector);

 	return 0;
 }
@@ -415,7 +414,7 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
 	struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private;

 	memset(iv, 0, cc->iv_size);
-	*(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
+	*(__le64 *)iv = cpu_to_le64(dmreq->iv_sector);
 	crypto_cipher_encrypt_one(essiv_tfm, iv, iv);

 	return 0;
@@ -1575,11 +1574,17 @@ bad_mem:
 static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	struct crypt_config *cc;
-	unsigned int key_size;
+	unsigned int key_size, opt_params;
 	unsigned long long tmpll;
 	int ret;
+	struct dm_arg_set as;
+	const char *opt_string;
+
+	static struct dm_arg _args[] = {
+		{0, 1, "Invalid number of feature args"},
+	};

-	if (argc != 5) {
+	if (argc < 5) {
 		ti->error = "Not enough arguments";
 		return -EINVAL;
 	}
@@ -1648,6 +1653,30 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 	cc->start = tmpll;

+	argv += 5;
+	argc -= 5;
+
+	/* Optional parameters */
+	if (argc) {
+		as.argc = argc;
+		as.argv = argv;
+
+		ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
+		if (ret)
+			goto bad;
+
+		opt_string = dm_shift_arg(&as);
+
+		if (opt_params == 1 && opt_string &&
+		    !strcasecmp(opt_string, "allow_discards"))
+			ti->num_discard_requests = 1;
+		else if (opt_params) {
+			ret = -EINVAL;
+			ti->error = "Invalid feature arguments";
+			goto bad;
+		}
+	}
+
 	ret = -ENOMEM;
 	cc->io_queue = alloc_workqueue("kcryptd_io",
 				       WQ_NON_REENTRANT|
@@ -1682,9 +1711,16 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 	struct dm_crypt_io *io;
 	struct crypt_config *cc;

-	if (bio->bi_rw & REQ_FLUSH) {
+	/*
+	 * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues.
+	 * - for REQ_FLUSH device-mapper core ensures that no IO is in-flight
+	 * - for REQ_DISCARD caller must use flush if IO ordering matters
+	 */
+	if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) {
 		cc = ti->private;
 		bio->bi_bdev = cc->dev->bdev;
+		if (bio_sectors(bio))
+			bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector);
 		return DM_MAPIO_REMAPPED;
 	}

@@ -1727,6 +1763,10 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
 
 		DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset,
 				cc->dev->name, (unsigned long long)cc->start);
+
+		if (ti->num_discard_requests)
+			DMEMIT(" 1 allow_discards");
+
 		break;
 	}
 	return 0;
@@ -1770,12 +1810,12 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
 	if (argc < 2)
 		goto error;

-	if (!strnicmp(argv[0], MESG_STR("key"))) {
+	if (!strcasecmp(argv[0], "key")) {
 		if (!test_bit(DM_CRYPT_SUSPENDED, &cc->flags)) {
 			DMWARN("not suspended during key manipulation.");
 			return -EINVAL;
 		}
-		if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) {
+		if (argc == 3 && !strcasecmp(argv[1], "set")) {
 			ret = crypt_set_key(cc, argv[2]);
 			if (ret)
 				return ret;
@@ -1783,7 +1823,7 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
 			ret = cc->iv_gen_ops->init(cc);
 			return ret;
 		}
-		if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) {
+		if (argc == 2 && !strcasecmp(argv[1], "wipe")) {
 			if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) {
 				ret = cc->iv_gen_ops->wipe(cc);
 				if (ret)
@@ -1823,7 +1863,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
 
 static struct target_type crypt_target = {
 	.name   = "crypt",
-	.version = {1, 10, 0},
+	.version = {1, 11, 0},
 	.module = THIS_MODULE,
 	.ctr    = crypt_ctr,
 	.dtr    = crypt_dtr,
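As an illustration (not from the patch itself), the shared argument-parsing
helpers visible above - dm_read_arg_group(), dm_read_arg(), dm_shift_arg()
and dm_consume_args(), introduced by "dm table: share target argument
parsing functions" - follow a common pattern in a target constructor.  The
function and feature names below are hypothetical:

static int example_parse_features(struct dm_arg_set *as, struct dm_target *ti)
{
	int r;
	unsigned argc;
	const char *arg_name;

	static struct dm_arg _args[] = {
		{0, 2, "Invalid number of feature args"},	/* min, max, error */
	};

	/* Read the leading count and validate it against the bounds above. */
	r = dm_read_arg_group(_args, as, &argc, &ti->error);
	if (r)
		return r;

	while (argc--) {
		arg_name = dm_shift_arg(as);	/* consume one argument */
		if (!strcasecmp(arg_name, "example_feature"))
			continue;

		ti->error = "Unrecognised feature requested";
		return -EINVAL;
	}

	return 0;
}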
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index ea790623c30..89f73ca22cf 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2003 Sistina Software (UK) Limited.
- * Copyright (C) 2004, 2010 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004, 2010-2011 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the GPL.
  */
@@ -15,6 +15,9 @@
 
 #define DM_MSG_PREFIX "flakey"

+#define all_corrupt_bio_flags_match(bio, fc)	\
+	(((bio)->bi_rw & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags)
+
 /*
  * Flakey: Used for testing only, simulates intermittent,
  * catastrophic device failure.
@@ -25,60 +28,189 @@ struct flakey_c {
 	sector_t start;
 	unsigned up_interval;
 	unsigned down_interval;
+	unsigned long flags;
+	unsigned corrupt_bio_byte;
+	unsigned corrupt_bio_rw;
+	unsigned corrupt_bio_value;
+	unsigned corrupt_bio_flags;
+};
+
+enum feature_flag_bits {
+	DROP_WRITES
 };

+static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
+			  struct dm_target *ti)
+{
+	int r;
+	unsigned argc;
+	const char *arg_name;
+
+	static struct dm_arg _args[] = {
+		{0, 6, "Invalid number of feature args"},
+		{1, UINT_MAX, "Invalid corrupt bio byte"},
+		{0, 255, "Invalid corrupt value to write into bio byte (0-255)"},
+		{0, UINT_MAX, "Invalid corrupt bio flags mask"},
+	};
+
+	/* No feature arguments supplied. */
+	if (!as->argc)
+		return 0;
+
+	r = dm_read_arg_group(_args, as, &argc, &ti->error);
+	if (r)
+		return r;
+
+	while (argc) {
+		arg_name = dm_shift_arg(as);
+		argc--;
+
+		/*
+		 * drop_writes
+		 */
+		if (!strcasecmp(arg_name, "drop_writes")) {
+			if (test_and_set_bit(DROP_WRITES, &fc->flags)) {
+				ti->error = "Feature drop_writes duplicated";
+				return -EINVAL;
+			}
+
+			continue;
+		}
+
+		/*
+		 * corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>
+		 */
+		if (!strcasecmp(arg_name, "corrupt_bio_byte")) {
+			if (!argc)
+				ti->error = "Feature corrupt_bio_byte requires parameters";
+
+			r = dm_read_arg(_args + 1, as, &fc->corrupt_bio_byte, &ti->error);
+			if (r)
+				return r;
+			argc--;
+
+			/*
+			 * Direction r or w?
+			 */
+			arg_name = dm_shift_arg(as);
+			if (!strcasecmp(arg_name, "w"))
+				fc->corrupt_bio_rw = WRITE;
+			else if (!strcasecmp(arg_name, "r"))
+				fc->corrupt_bio_rw = READ;
+			else {
+				ti->error = "Invalid corrupt bio direction (r or w)";
+				return -EINVAL;
+			}
+			argc--;
+
+			/*
+			 * Value of byte (0-255) to write in place of correct one.
+			 */
+			r = dm_read_arg(_args + 2, as, &fc->corrupt_bio_value, &ti->error);
+			if (r)
+				return r;
+			argc--;
+
+			/*
+			 * Only corrupt bios with these flags set.
+			 */
+			r = dm_read_arg(_args + 3, as, &fc->corrupt_bio_flags, &ti->error);
+			if (r)
+				return r;
+			argc--;
+
+			continue;
+		}
+
+		ti->error = "Unrecognised flakey feature requested";
+		return -EINVAL;
+	}
+
+	if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) {
+		ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set";
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 /*
- * Construct a flakey mapping: <dev_path> <offset> <up interval> <down interval>
+ * Construct a flakey mapping:
+ * <dev_path> <offset> <up interval> <down interval> [<#feature args> [<arg>]*]
+ *
+ * Feature args:
+ *   [drop_writes]
+ *   [corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>]
+ *
+ * Nth_byte starts from 1 for the first byte.
+ * Direction is r for READ or w for WRITE.
+ * bio_flags is ignored if 0.
  */
 static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
+	static struct dm_arg _args[] = {
+		{0, UINT_MAX, "Invalid up interval"},
+		{0, UINT_MAX, "Invalid down interval"},
+	};
+
+	int r;
 	struct flakey_c *fc;
-	unsigned long long tmp;
+	unsigned long long tmpll;
+	struct dm_arg_set as;
+	const char *devname;

-	if (argc != 4) {
-		ti->error = "dm-flakey: Invalid argument count";
+	as.argc = argc;
+	as.argv = argv;
+
+	if (argc < 4) {
+		ti->error = "Invalid argument count";
 		return -EINVAL;
 	}

-	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+	fc = kzalloc(sizeof(*fc), GFP_KERNEL);
 	if (!fc) {
-		ti->error = "dm-flakey: Cannot allocate linear context";
+		ti->error = "Cannot allocate linear context";
 		return -ENOMEM;
 	}
 	fc->start_time = jiffies;

-	if (sscanf(argv[1], "%llu", &tmp) != 1) {
-		ti->error = "dm-flakey: Invalid device sector";
+	devname = dm_shift_arg(&as);
+
+	if (sscanf(dm_shift_arg(&as), "%llu", &tmpll) != 1) {
+		ti->error = "Invalid device sector";
 		goto bad;
 	}
-	fc->start = tmp;
+	fc->start = tmpll;

-	if (sscanf(argv[2], "%u", &fc->up_interval) != 1) {
-		ti->error = "dm-flakey: Invalid up interval";
+	r = dm_read_arg(_args, &as, &fc->up_interval, &ti->error);
+	if (r)
 		goto bad;
-	}

-	if (sscanf(argv[3], "%u", &fc->down_interval) != 1) {
-		ti->error = "dm-flakey: Invalid down interval";
+	r = dm_read_arg(_args, &as, &fc->down_interval, &ti->error);
+	if (r)
 		goto bad;
-	}

 	if (!(fc->up_interval + fc->down_interval)) {
-		ti->error = "dm-flakey: Total (up + down) interval is zero";
+		ti->error = "Total (up + down) interval is zero";
 		goto bad;
 	}

 	if (fc->up_interval + fc->down_interval < fc->up_interval) {
-		ti->error = "dm-flakey: Interval overflow";
+		ti->error = "Interval overflow";
 		goto bad;
 	}

-	if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &fc->dev)) {
-		ti->error = "dm-flakey: Device lookup failed";
+	r = parse_features(&as, fc, ti);
+	if (r)
+		goto bad;
+
+	if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &fc->dev)) {
+		ti->error = "Device lookup failed";
 		goto bad;
 	}

 	ti->num_flush_requests = 1;
+	ti->num_discard_requests = 1;
 	ti->private = fc;
 	return 0;

@@ -99,7 +231,7 @@ static sector_t flakey_map_sector(struct dm_target *ti, sector_t bi_sector)
 {
 	struct flakey_c *fc = ti->private;

-	return fc->start + (bi_sector - ti->begin);
+	return fc->start + dm_target_offset(ti, bi_sector);
 }

 static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
@@ -111,6 +243,25 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
 		bio->bi_sector = flakey_map_sector(ti, bio->bi_sector);
 }

+static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
+{
+	unsigned bio_bytes = bio_cur_bytes(bio);
+	char *data = bio_data(bio);
+
+	/*
+	 * Overwrite the Nth byte of the data returned.
+	 */
+	if (data && bio_bytes >= fc->corrupt_bio_byte) {
+		data[fc->corrupt_bio_byte - 1] = fc->corrupt_bio_value;
+
+		DMDEBUG("Corrupting data bio=%p by writing %u to byte %u "
+			"(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n",
+			bio, fc->corrupt_bio_value, fc->corrupt_bio_byte,
+			(bio_data_dir(bio) == WRITE) ? 'w' : 'r',
+			bio->bi_rw, (unsigned long long)bio->bi_sector, bio_bytes);
+	}
+}
+
 static int flakey_map(struct dm_target *ti, struct bio *bio,
 		      union map_info *map_context)
 {
@@ -119,18 +270,71 @@ static int flakey_map(struct dm_target *ti, struct bio *bio,
 
 	/* Are we alive ? */
 	elapsed = (jiffies - fc->start_time) / HZ;
-	if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval)
+	if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) {
+		/*
+		 * Flag this bio as submitted while down.
+		 */
+		map_context->ll = 1;
+
+		/*
+		 * Map reads as normal.
+		 */
+		if (bio_data_dir(bio) == READ)
+			goto map_bio;
+
+		/*
+		 * Drop writes?
+		 */
+		if (test_bit(DROP_WRITES, &fc->flags)) {
+			bio_endio(bio, 0);
+			return DM_MAPIO_SUBMITTED;
+		}
+
+		/*
+		 * Corrupt matching writes.
+		 */
+		if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == WRITE)) {
+			if (all_corrupt_bio_flags_match(bio, fc))
+				corrupt_bio_data(bio, fc);
+			goto map_bio;
+		}
+
+		/*
+		 * By default, error all I/O.
+		 */
 		return -EIO;
+	}

+map_bio:
 	flakey_map_bio(ti, bio);

 	return DM_MAPIO_REMAPPED;
 }

+static int flakey_end_io(struct dm_target *ti, struct bio *bio,
+			 int error, union map_info *map_context)
+{
+	struct flakey_c *fc = ti->private;
+	unsigned bio_submitted_while_down = map_context->ll;
+
+	/*
+	 * Corrupt successful READs while in down state.
+	 * If flags were specified, only corrupt those that match.
+	 */
+	if (!error && bio_submitted_while_down &&
+	    (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) &&
+	    all_corrupt_bio_flags_match(bio, fc))
+		corrupt_bio_data(bio, fc);
+
+	return error;
+}
+
 static int flakey_status(struct dm_target *ti, status_type_t type,
 			 char *result, unsigned int maxlen)
 {
+	unsigned sz = 0;
 	struct flakey_c *fc = ti->private;
+	unsigned drop_writes;

 	switch (type) {
 	case STATUSTYPE_INFO:
@@ -138,9 +342,22 @@ static int flakey_status(struct dm_target *ti, status_type_t type,
 		break;

 	case STATUSTYPE_TABLE:
-		snprintf(result, maxlen, "%s %llu %u %u", fc->dev->name,
-			 (unsigned long long)fc->start, fc->up_interval,
-			 fc->down_interval);
+		DMEMIT("%s %llu %u %u ", fc->dev->name,
+		       (unsigned long long)fc->start, fc->up_interval,
+		       fc->down_interval);
+
+		drop_writes = test_bit(DROP_WRITES, &fc->flags);
+		DMEMIT("%u ", drop_writes + (fc->corrupt_bio_byte > 0) * 5);
+
+		if (drop_writes)
+			DMEMIT("drop_writes ");
+
+		if (fc->corrupt_bio_byte)
+			DMEMIT("corrupt_bio_byte %u %c %u %u ",
+			       fc->corrupt_bio_byte,
+			       (fc->corrupt_bio_rw == WRITE) ? 'w' : 'r',
+			       fc->corrupt_bio_value, fc->corrupt_bio_flags);
+
 		break;
 	}
 	return 0;
@@ -177,11 +394,12 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_
 
 static struct target_type flakey_target = {
 	.name   = "flakey",
-	.version = {1, 1, 0},
+	.version = {1, 2, 0},
 	.module = THIS_MODULE,
 	.ctr    = flakey_ctr,
 	.dtr    = flakey_dtr,
 	.map    = flakey_map,
+	.end_io = flakey_end_io,
 	.status = flakey_status,
 	.ioctl	= flakey_ioctl,
 	.merge	= flakey_merge,
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 2067288f61f..ad2eba40e31 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -38,6 +38,8 @@ struct io {
 	struct dm_io_client *client;
 	io_notify_fn callback;
 	void *context;
+	void *vma_invalidate_address;
+	unsigned long vma_invalidate_size;
 } __attribute__((aligned(DM_IO_MAX_REGIONS)));

 static struct kmem_cache *_dm_io_cache;
@@ -116,6 +118,10 @@ static void dec_count(struct io *io, unsigned int region, int error)
 		set_bit(region, &io->error_bits);

 	if (atomic_dec_and_test(&io->count)) {
+		if (io->vma_invalidate_size)
+			invalidate_kernel_vmap_range(io->vma_invalidate_address,
+						     io->vma_invalidate_size);
+
 		if (io->sleeper)
 			wake_up_process(io->sleeper);

@@ -159,6 +165,9 @@ struct dpages {
 
 	unsigned context_u;
 	void *context_ptr;
+
+	void *vma_invalidate_address;
+	unsigned long vma_invalidate_size;
 };

 /*
@@ -377,6 +386,9 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 	io->sleeper = current;
 	io->client = client;

+	io->vma_invalidate_address = dp->vma_invalidate_address;
+	io->vma_invalidate_size = dp->vma_invalidate_size;
+
 	dispatch_io(rw, num_regions, where, dp, io, 1);

 	while (1) {
@@ -415,13 +427,21 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
 	io->callback = fn;
 	io->context = context;

+	io->vma_invalidate_address = dp->vma_invalidate_address;
+	io->vma_invalidate_size = dp->vma_invalidate_size;
+
 	dispatch_io(rw, num_regions, where, dp, io, 0);
 	return 0;
 }

-static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
+static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
+		   unsigned long size)
 {
 	/* Set up dpages based on memory type */
+
+	dp->vma_invalidate_address = NULL;
+	dp->vma_invalidate_size = 0;
+
 	switch (io_req->mem.type) {
 	case DM_IO_PAGE_LIST:
 		list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset);
@@ -432,6 +452,11 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
 		break;

 	case DM_IO_VMA:
+		flush_kernel_vmap_range(io_req->mem.ptr.vma, size);
+		if ((io_req->bi_rw & RW_MASK) == READ) {
+			dp->vma_invalidate_address = io_req->mem.ptr.vma;
+			dp->vma_invalidate_size = size;
+		}
 		vm_dp_init(dp, io_req->mem.ptr.vma);
 		break;

@@ -460,7 +485,7 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions,
 	int r;
 	struct dpages dp;

-	r = dp_init(io_req, &dp);
+	r = dp_init(io_req, &dp, (unsigned long)where->count << SECTOR_SHIFT);
 	if (r)
 		return r;

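As an illustration (not from the patch itself), a synchronous dm-io READ
into a vmalloc'd buffer using the DM_IO_VMA memory type.  With the changes
above, dp_init() flushes the vmap range before the I/O is dispatched and
dec_count() invalidates it once a READ completes, as required on
architectures with aliasing caches.  The client, device and sizes here are
hypothetical:

static int example_vma_read(struct dm_io_client *client,
			    struct block_device *bdev, void *vma_buf)
{
	unsigned long error_bits = 0;
	struct dm_io_region where = {
		.bdev = bdev,
		.sector = 0,
		.count = 8,			/* 4KiB in 512-byte sectors */
	};
	struct dm_io_request io_req = {
		.bi_rw = READ,
		.mem.type = DM_IO_VMA,
		.mem.ptr.vma = vma_buf,		/* buffer from vmalloc() */
		.notify.fn = NULL,		/* NULL notify => synchronous */
		.client = client,
	};

	return dm_io(&io_req, 1, &where, &error_bits);
}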
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 4cacdad2270..2e9a3ca37bd 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -128,6 +128,24 @@ static struct hash_cell *__get_uuid_cell(const char *str)
 	return NULL;
 }

+static struct hash_cell *__get_dev_cell(uint64_t dev)
+{
+	struct mapped_device *md;
+	struct hash_cell *hc;
+
+	md = dm_get_md(huge_decode_dev(dev));
+	if (!md)
+		return NULL;
+
+	hc = dm_get_mdptr(md);
+	if (!hc) {
+		dm_put(md);
+		return NULL;
+	}
+
+	return hc;
+}
+
 /*-----------------------------------------------------------------
  * Inserting, removing and renaming a device.
  *---------------------------------------------------------------*/
@@ -718,25 +736,45 @@ static int dev_create(struct dm_ioctl *param, size_t param_size)
  */
 static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
 {
-	struct mapped_device *md;
-	void *mdptr = NULL;
+	struct hash_cell *hc = NULL;

-	if (*param->uuid)
-		return __get_uuid_cell(param->uuid);
+	if (*param->uuid) {
+		if (*param->name || param->dev)
+			return NULL;

-	if (*param->name)
-		return __get_name_cell(param->name);
+		hc = __get_uuid_cell(param->uuid);
+		if (!hc)
+			return NULL;
+	} else if (*param->name) {
+		if (param->dev)
+			return NULL;

-	md = dm_get_md(huge_decode_dev(param->dev));
-	if (!md)
-		goto out;
+		hc = __get_name_cell(param->name);
+		if (!hc)
+			return NULL;
+	} else if (param->dev) {
+		hc = __get_dev_cell(param->dev);
+		if (!hc)
+			return NULL;
+	} else
+		return NULL;

-	mdptr = dm_get_mdptr(md);
-	if (!mdptr)
-		dm_put(md);
+	/*
+	 * Sneakily write in both the name and the uuid
+	 * while we have the cell.
+	 */
+	strlcpy(param->name, hc->name, sizeof(param->name));
+	if (hc->uuid)
+		strlcpy(param->uuid, hc->uuid, sizeof(param->uuid));
+	else
+		param->uuid[0] = '\0';

-out:
-	return mdptr;
+	if (hc->new_map)
+		param->flags |= DM_INACTIVE_PRESENT_FLAG;
+	else
+		param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
+
+	return hc;
 }

 static struct mapped_device *find_device(struct dm_ioctl *param)
@@ -746,24 +784,8 @@ static struct mapped_device *find_device(struct dm_ioctl *param)
 
 	down_read(&_hash_lock);
 	hc = __find_device_hash_cell(param);
-	if (hc) {
+	if (hc)
 		md = hc->md;
-
-		/*
-		 * Sneakily write in both the name and the uuid
-		 * while we have the cell.
-		 */
-		strlcpy(param->name, hc->name, sizeof(param->name));
-		if (hc->uuid)
-			strlcpy(param->uuid, hc->uuid, sizeof(param->uuid));
-		else
-			param->uuid[0] = '\0';
-
-		if (hc->new_map)
-			param->flags |= DM_INACTIVE_PRESENT_FLAG;
-		else
-			param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
-	}
 	up_read(&_hash_lock);

 	return md;
@@ -1402,6 +1424,11 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
 		goto out;
 	}

+	if (!argc) {
+		DMWARN("Empty message received.");
+		goto out;
+	}
+
 	table = dm_get_live_table(md);
 	if (!table)
 		goto out_argv;
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 320401dec10..f8214702963 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -224,8 +224,6 @@ struct kcopyd_job {
 	unsigned int num_dests;
 	struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS];

-	sector_t offset;
-	unsigned int nr_pages;
 	struct page_list *pages;

 	/*
@@ -380,7 +378,7 @@ static int run_io_job(struct kcopyd_job *job)
 		.bi_rw = job->rw,
 		.mem.type = DM_IO_PAGE_LIST,
 		.mem.ptr.pl = job->pages,
-		.mem.offset = job->offset,
+		.mem.offset = 0,
 		.notify.fn = complete_io,
 		.notify.context = job,
 		.client = job->kc->io_client,
@@ -397,10 +395,9 @@ static int run_io_job(struct kcopyd_job *job)
 static int run_pages_job(struct kcopyd_job *job)
 {
 	int r;
+	unsigned nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9);

-	job->nr_pages = dm_div_up(job->dests[0].count + job->offset,
-				  PAGE_SIZE >> 9);
-	r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
+	r = kcopyd_get_pages(job->kc, nr_pages, &job->pages);
 	if (!r) {
 		/* this job is ready for io */
 		push(&job->kc->io_jobs, job);
@@ -602,8 +599,6 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 	job->num_dests = num_dests;
 	memcpy(&job->dests, dests, sizeof(*dests) * num_dests);

-	job->offset = 0;
-	job->nr_pages = 0;
 	job->pages = NULL;

 	job->fn = fn;
@@ -622,6 +617,37 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 }
 EXPORT_SYMBOL(dm_kcopyd_copy);

+void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
+				 dm_kcopyd_notify_fn fn, void *context)
+{
+	struct kcopyd_job *job;
+
+	job = mempool_alloc(kc->job_pool, GFP_NOIO);
+
+	memset(job, 0, sizeof(struct kcopyd_job));
+	job->kc = kc;
+	job->fn = fn;
+	job->context = context;
+
+	atomic_inc(&kc->nr_jobs);
+
+	return job;
+}
+EXPORT_SYMBOL(dm_kcopyd_prepare_callback);
+
+void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err)
+{
+	struct kcopyd_job *job = j;
+	struct dm_kcopyd_client *kc = job->kc;
+
+	job->read_err = read_err;
+	job->write_err = write_err;
+
+	push(&kc->complete_jobs, job);
+	wake(kc);
+}
+EXPORT_SYMBOL(dm_kcopyd_do_callback);
+
 /*
  * Cancels a kcopyd job, eg. someone might be deactivating a
  * mirror.
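As an illustration (not from the patch itself), the new
dm_kcopyd_prepare_callback()/dm_kcopyd_do_callback() pair lets a client
receive a kcopyd completion callback without performing a copy - the
pattern "dm snapshot: skip reading origin when overwriting complete chunk"
relies on.  The function names and context below are hypothetical:

static void example_copy_done(int read_err, unsigned long write_err,
			      void *context)
{
	/* Runs from kcopyd's completion path, just as after a real copy. */
}

static void example_complete_without_copy(struct dm_kcopyd_client *kc,
					  void *context)
{
	void *job = dm_kcopyd_prepare_callback(kc, example_copy_done, context);

	/* Nothing to copy; report success (no read or write errors). */
	dm_kcopyd_do_callback(job, 0, 0);
}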
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index aa2e0c374ab..1021c898601 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -394,8 +394,7 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
 		group[count] = fe->region;
 		count++;

-		list_del(&fe->list);
-		list_add(&fe->list, &tmp_list);
+		list_move(&fe->list, &tmp_list);

 		type = fe->type;
 		if (count >= MAX_FLUSH_GROUP_COUNT)
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 948e3f4925b..3b52bb72bd1 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -197,15 +197,21 @@ EXPORT_SYMBOL(dm_dirty_log_destroy);
 #define MIRROR_DISK_VERSION 2
 #define LOG_OFFSET 2

-struct log_header {
-	uint32_t magic;
+struct log_header_disk {
+	__le32 magic;

 	/*
 	 * Simple, incrementing version. no backward
 	 * compatibility.
 	 */
+	__le32 version;
+	__le64 nr_regions;
+} __packed;
+
+struct log_header_core {
+	uint32_t magic;
 	uint32_t version;
-	sector_t nr_regions;
+	uint64_t nr_regions;
 };

 struct log_c {
@@ -239,10 +245,10 @@ struct log_c {
 	int log_dev_failed;
 	int log_dev_flush_failed;
 	struct dm_dev *log_dev;
-	struct log_header header;
+	struct log_header_core header;

 	struct dm_io_region header_location;
-	struct log_header *disk_header;
+	struct log_header_disk *disk_header;
 };

 /*
@@ -251,34 +257,34 @@ struct log_c {
  */
 static inline int log_test_bit(uint32_t *bs, unsigned bit)
 {
-	return test_bit_le(bit, (unsigned long *) bs) ? 1 : 0;
+	return test_bit_le(bit, bs) ? 1 : 0;
 }

 static inline void log_set_bit(struct log_c *l,
 			       uint32_t *bs, unsigned bit)
 {
-	__test_and_set_bit_le(bit, (unsigned long *) bs);
+	__set_bit_le(bit, bs);
 	l->touched_cleaned = 1;
 }

 static inline void log_clear_bit(struct log_c *l,
 				 uint32_t *bs, unsigned bit)
 {
-	__test_and_clear_bit_le(bit, (unsigned long *) bs);
+	__clear_bit_le(bit, bs);
 	l->touched_dirtied = 1;
 }

 /*----------------------------------------------------------------
  * Header IO
  *--------------------------------------------------------------*/
-static void header_to_disk(struct log_header *core, struct log_header *disk)
+static void header_to_disk(struct log_header_core *core, struct log_header_disk *disk)
 {
 	disk->magic = cpu_to_le32(core->magic);
 	disk->version = cpu_to_le32(core->version);
 	disk->nr_regions = cpu_to_le64(core->nr_regions);
 }

-static void header_from_disk(struct log_header *core, struct log_header *disk)
+static void header_from_disk(struct log_header_core *core, struct log_header_disk *disk)
 {
 	core->magic = le32_to_cpu(disk->magic);
 	core->version = le32_to_cpu(disk->version);
@@ -486,7 +492,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 	memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size);
 	lc->sync_count = (sync == NOSYNC) ? region_count : 0;

-	lc->recovering_bits = vmalloc(bitset_size);
+	lc->recovering_bits = vzalloc(bitset_size);
 	if (!lc->recovering_bits) {
 		DMWARN("couldn't allocate sync bitset");
 		vfree(lc->sync_bits);
@@ -498,7 +504,6 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		kfree(lc);
 		return -ENOMEM;
 	}
-	memset(lc->recovering_bits, 0, bitset_size);
 	lc->sync_search = 0;
 	log->context = lc;

@@ -739,8 +744,7 @@ static int core_get_resync_work(struct dm_dirty_log *log, region_t *region)
 		return 0;

 	do {
-		*region = find_next_zero_bit_le(
-				(unsigned long *) lc->sync_bits,
+		*region = find_next_zero_bit_le(lc->sync_bits,
 				lc->region_count,
 				lc->sync_search);
 		lc->sync_search = *region + 1;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index c3547016f0f..5e0090ef418 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -22,7 +22,6 @@
 #include <linux/atomic.h>

 #define DM_MSG_PREFIX "multipath"
-#define MESG_STR(x) x, sizeof(x)
 #define DM_PG_INIT_DELAY_MSECS 2000
 #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)

@@ -505,80 +504,29 @@ static void trigger_event(struct work_struct *work)
  * <#paths> <#per-path selector args>
  * [<path> [<arg>]* ]+ ]+
  *---------------------------------------------------------------*/
-struct param {
-	unsigned min;
-	unsigned max;
-	char *error;
-};
-
-static int read_param(struct param *param, char *str, unsigned *v, char **error)
-{
-	if (!str ||
-	    (sscanf(str, "%u", v) != 1) ||
-	    (*v < param->min) ||
-	    (*v > param->max)) {
-		*error = param->error;
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-struct arg_set {
-	unsigned argc;
-	char **argv;
-};
-
-static char *shift(struct arg_set *as)
-{
-	char *r;
-
-	if (as->argc) {
-		as->argc--;
-		r = *as->argv;
-		as->argv++;
-		return r;
-	}
-
-	return NULL;
-}
-
-static void consume(struct arg_set *as, unsigned n)
-{
-	BUG_ON (as->argc < n);
-	as->argc -= n;
-	as->argv += n;
-}
-
-static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
+static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
 			       struct dm_target *ti)
 {
 	int r;
 	struct path_selector_type *pst;
 	unsigned ps_argc;

-	static struct param _params[] = {
+	static struct dm_arg _args[] = {
 		{0, 1024, "invalid number of path selector args"},
 	};

-	pst = dm_get_path_selector(shift(as));
+	pst = dm_get_path_selector(dm_shift_arg(as));
 	if (!pst) {
 		ti->error = "unknown path selector type";
 		return -EINVAL;
 	}

-	r = read_param(_params, shift(as), &ps_argc, &ti->error);
+	r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
 	if (r) {
 		dm_put_path_selector(pst);
 		return -EINVAL;
 	}

-	if (ps_argc > as->argc) {
-		dm_put_path_selector(pst);
-		ti->error = "not enough arguments for path selector";
-		return -EINVAL;
-	}
-
 	r = pst->create(&pg->ps, ps_argc, as->argv);
 	if (r) {
 		dm_put_path_selector(pst);
@@ -587,12 +535,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
 	}

 	pg->ps.type = pst;
-	consume(as, ps_argc);
+	dm_consume_args(as, ps_argc);

 	return 0;
 }

-static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
+static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
 				 struct dm_target *ti)
 {
 	int r;
@@ -609,7 +557,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
 	if (!p)
 		return ERR_PTR(-ENOMEM);

-	r = dm_get_device(ti, shift(as), dm_table_get_mode(ti->table),
+	r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
 			  &p->path.dev);
 	if (r) {
 		ti->error = "error getting device";
@@ -660,16 +608,16 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
 	return ERR_PTR(r);
 }

-static struct priority_group *parse_priority_group(struct arg_set *as,
+static struct priority_group *parse_priority_group(struct dm_arg_set *as,
 						   struct multipath *m)
 {
-	static struct param _params[] = {
+	static struct dm_arg _args[] = {
 		{1, 1024, "invalid number of paths"},
 		{0, 1024, "invalid number of selector args"}
 	};

 	int r;
-	unsigned i, nr_selector_args, nr_params;
+	unsigned i, nr_selector_args, nr_args;
 	struct priority_group *pg;
 	struct dm_target *ti = m->ti;

@@ -693,26 +641,26 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
 	/*
 	 * read the paths
 	 */
-	r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error);
+	r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);
 	if (r)
 		goto bad;

-	r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error);
+	r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);
 	if (r)
 		goto bad;

-	nr_params = 1 + nr_selector_args;
+	nr_args = 1 + nr_selector_args;
 	for (i = 0; i < pg->nr_pgpaths; i++) {
 		struct pgpath *pgpath;
-		struct arg_set path_args;
+		struct dm_arg_set path_args;

-		if (as->argc < nr_params) {
+		if (as->argc < nr_args) {
 			ti->error = "not enough path parameters";
 			r = -EINVAL;
 			goto bad;
 		}

-		path_args.argc = nr_params;
+		path_args.argc = nr_args;
 		path_args.argv = as->argv;

 		pgpath = parse_path(&path_args, &pg->ps, ti);
@@ -723,7 +671,7 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
723 671
724 pgpath->pg = pg; 672 pgpath->pg = pg;
725 list_add_tail(&pgpath->list, &pg->pgpaths); 673 list_add_tail(&pgpath->list, &pg->pgpaths);
726 consume(as, nr_params); 674 dm_consume_args(as, nr_args);
727 } 675 }
728 676
729 return pg; 677 return pg;
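The conversions in these hunks replace dm-mpath's private shift()/consume()/read_param() helpers with argument-parsing functions shared across targets (added to drivers/md/dm-table.c by "dm table: share target argument parsing functions"). A simplified sketch of their semantics, condensed from that patch rather than quoted verbatim; dm_read_arg_group() additionally checks that enough arguments remain to cover the announced group:

struct dm_arg_set {
	unsigned argc;
	char **argv;
};

struct dm_arg {
	unsigned min;	/* minimum allowed value */
	unsigned max;	/* maximum allowed value */
	char *error;	/* error message to set on failure */
};

/* Return the current argument and advance past it. */
static const char *dm_shift_arg(struct dm_arg_set *as)
{
	char *r = NULL;

	if (as->argc) {
		as->argc--;
		r = *as->argv++;
	}

	return r;
}

/* Skip num_args arguments that a sub-parser has already handled. */
static void dm_consume_args(struct dm_arg_set *as, unsigned num_args)
{
	BUG_ON(as->argc < num_args);

	as->argc -= num_args;
	as->argv += num_args;
}

/* Shift one argument and parse it as a bounds-checked unsigned int. */
static int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *as,
		       unsigned *value, char **error)
{
	const char *arg_str = dm_shift_arg(as);

	if (!arg_str || sscanf(arg_str, "%u", value) != 1 ||
	    *value < arg->min || *value > arg->max) {
		*error = arg->error;
		return -EINVAL;
	}

	return 0;
}

Note how parse_hw_handler() below no longer needs its own "not enough arguments" check: dm_read_arg_group() performs it centrally.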
@@ -733,28 +681,23 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
733 return ERR_PTR(r); 681 return ERR_PTR(r);
734} 682}
735 683
736static int parse_hw_handler(struct arg_set *as, struct multipath *m) 684static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
737{ 685{
738 unsigned hw_argc; 686 unsigned hw_argc;
739 int ret; 687 int ret;
740 struct dm_target *ti = m->ti; 688 struct dm_target *ti = m->ti;
741 689
742 static struct param _params[] = { 690 static struct dm_arg _args[] = {
743 {0, 1024, "invalid number of hardware handler args"}, 691 {0, 1024, "invalid number of hardware handler args"},
744 }; 692 };
745 693
746 if (read_param(_params, shift(as), &hw_argc, &ti->error)) 694 if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))
747 return -EINVAL; 695 return -EINVAL;
748 696
749 if (!hw_argc) 697 if (!hw_argc)
750 return 0; 698 return 0;
751 699
752 if (hw_argc > as->argc) { 700 m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
753 ti->error = "not enough arguments for hardware handler";
754 return -EINVAL;
755 }
756
757 m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL);
758 request_module("scsi_dh_%s", m->hw_handler_name); 701 request_module("scsi_dh_%s", m->hw_handler_name);
759 if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { 702 if (scsi_dh_handler_exist(m->hw_handler_name) == 0) {
760 ti->error = "unknown hardware handler type"; 703 ti->error = "unknown hardware handler type";
@@ -778,7 +721,7 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m)
778 for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1) 721 for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1)
779 j = sprintf(p, "%s", as->argv[i]); 722 j = sprintf(p, "%s", as->argv[i]);
780 } 723 }
781 consume(as, hw_argc - 1); 724 dm_consume_args(as, hw_argc - 1);
782 725
783 return 0; 726 return 0;
784fail: 727fail:
@@ -787,20 +730,20 @@ fail:
787 return ret; 730 return ret;
788} 731}
789 732
790static int parse_features(struct arg_set *as, struct multipath *m) 733static int parse_features(struct dm_arg_set *as, struct multipath *m)
791{ 734{
792 int r; 735 int r;
793 unsigned argc; 736 unsigned argc;
794 struct dm_target *ti = m->ti; 737 struct dm_target *ti = m->ti;
795 const char *param_name; 738 const char *arg_name;
796 739
797 static struct param _params[] = { 740 static struct dm_arg _args[] = {
798 {0, 5, "invalid number of feature args"}, 741 {0, 5, "invalid number of feature args"},
799 {1, 50, "pg_init_retries must be between 1 and 50"}, 742 {1, 50, "pg_init_retries must be between 1 and 50"},
800 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, 743 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
801 }; 744 };
802 745
803 r = read_param(_params, shift(as), &argc, &ti->error); 746 r = dm_read_arg_group(_args, as, &argc, &ti->error);
804 if (r) 747 if (r)
805 return -EINVAL; 748 return -EINVAL;
806 749
@@ -808,26 +751,24 @@ static int parse_features(struct arg_set *as, struct multipath *m)
808 return 0; 751 return 0;
809 752
810 do { 753 do {
811 param_name = shift(as); 754 arg_name = dm_shift_arg(as);
812 argc--; 755 argc--;
813 756
814 if (!strnicmp(param_name, MESG_STR("queue_if_no_path"))) { 757 if (!strcasecmp(arg_name, "queue_if_no_path")) {
815 r = queue_if_no_path(m, 1, 0); 758 r = queue_if_no_path(m, 1, 0);
816 continue; 759 continue;
817 } 760 }
818 761
819 if (!strnicmp(param_name, MESG_STR("pg_init_retries")) && 762 if (!strcasecmp(arg_name, "pg_init_retries") &&
820 (argc >= 1)) { 763 (argc >= 1)) {
821 r = read_param(_params + 1, shift(as), 764 r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
822 &m->pg_init_retries, &ti->error);
823 argc--; 765 argc--;
824 continue; 766 continue;
825 } 767 }
826 768
827 if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) && 769 if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
828 (argc >= 1)) { 770 (argc >= 1)) {
829 r = read_param(_params + 2, shift(as), 771 r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
830 &m->pg_init_delay_msecs, &ti->error);
831 argc--; 772 argc--;
832 continue; 773 continue;
833 } 774 }
@@ -842,15 +783,15 @@ static int parse_features(struct arg_set *as, struct multipath *m)
842static int multipath_ctr(struct dm_target *ti, unsigned int argc, 783static int multipath_ctr(struct dm_target *ti, unsigned int argc,
843 char **argv) 784 char **argv)
844{ 785{
845 /* target parameters */ 786 /* target arguments */
846 static struct param _params[] = { 787 static struct dm_arg _args[] = {
847 {0, 1024, "invalid number of priority groups"}, 788 {0, 1024, "invalid number of priority groups"},
848 {0, 1024, "invalid initial priority group number"}, 789 {0, 1024, "invalid initial priority group number"},
849 }; 790 };
850 791
851 int r; 792 int r;
852 struct multipath *m; 793 struct multipath *m;
853 struct arg_set as; 794 struct dm_arg_set as;
854 unsigned pg_count = 0; 795 unsigned pg_count = 0;
855 unsigned next_pg_num; 796 unsigned next_pg_num;
856 797
@@ -871,11 +812,11 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
871 if (r) 812 if (r)
872 goto bad; 813 goto bad;
873 814
874 r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error); 815 r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
875 if (r) 816 if (r)
876 goto bad; 817 goto bad;
877 818
878 r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error); 819 r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
879 if (r) 820 if (r)
880 goto bad; 821 goto bad;
881 822
@@ -1505,10 +1446,10 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1505 } 1446 }
1506 1447
1507 if (argc == 1) { 1448 if (argc == 1) {
1508 if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) { 1449 if (!strcasecmp(argv[0], "queue_if_no_path")) {
1509 r = queue_if_no_path(m, 1, 0); 1450 r = queue_if_no_path(m, 1, 0);
1510 goto out; 1451 goto out;
1511 } else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) { 1452 } else if (!strcasecmp(argv[0], "fail_if_no_path")) {
1512 r = queue_if_no_path(m, 0, 0); 1453 r = queue_if_no_path(m, 0, 0);
1513 goto out; 1454 goto out;
1514 } 1455 }
@@ -1519,18 +1460,18 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1519 goto out; 1460 goto out;
1520 } 1461 }
1521 1462
1522 if (!strnicmp(argv[0], MESG_STR("disable_group"))) { 1463 if (!strcasecmp(argv[0], "disable_group")) {
1523 r = bypass_pg_num(m, argv[1], 1); 1464 r = bypass_pg_num(m, argv[1], 1);
1524 goto out; 1465 goto out;
1525 } else if (!strnicmp(argv[0], MESG_STR("enable_group"))) { 1466 } else if (!strcasecmp(argv[0], "enable_group")) {
1526 r = bypass_pg_num(m, argv[1], 0); 1467 r = bypass_pg_num(m, argv[1], 0);
1527 goto out; 1468 goto out;
1528 } else if (!strnicmp(argv[0], MESG_STR("switch_group"))) { 1469 } else if (!strcasecmp(argv[0], "switch_group")) {
1529 r = switch_pg_num(m, argv[1]); 1470 r = switch_pg_num(m, argv[1]);
1530 goto out; 1471 goto out;
1531 } else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) 1472 } else if (!strcasecmp(argv[0], "reinstate_path"))
1532 action = reinstate_path; 1473 action = reinstate_path;
1533 else if (!strnicmp(argv[0], MESG_STR("fail_path"))) 1474 else if (!strcasecmp(argv[0], "fail_path"))
1534 action = fail_path; 1475 action = fail_path;
1535 else { 1476 else {
1536 DMWARN("Unrecognised multipath message received."); 1477 DMWARN("Unrecognised multipath message received.");
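With the MESG_STR macro gone, message keywords are now matched with plain strcasecmp(). The messages themselves are still delivered through the usual interface, for example (device name and path argument hypothetical):

dmsetup message mpath0 0 "switch_group 2"
dmsetup message mpath0 0 "reinstate_path 8:32"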
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index e5d8904fc8f..a002dd85db1 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -8,19 +8,19 @@
8#include <linux/slab.h> 8#include <linux/slab.h>
9 9
10#include "md.h" 10#include "md.h"
11#include "raid1.h"
11#include "raid5.h" 12#include "raid5.h"
12#include "dm.h"
13#include "bitmap.h" 13#include "bitmap.h"
14 14
15#include <linux/device-mapper.h>
16
15#define DM_MSG_PREFIX "raid" 17#define DM_MSG_PREFIX "raid"
16 18
17/* 19/*
18 * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then 20 * The following flags are used by dm-raid.c to set up the array state.
19 * make it so the flag doesn't set anything. 21 * They must be cleared before md_run is called.
20 */ 22 */
21#ifndef MD_SYNC_STATE_FORCED 23#define FirstUse 10 /* rdev flag */
22#define MD_SYNC_STATE_FORCED 0
23#endif
24 24
25struct raid_dev { 25struct raid_dev {
26 /* 26 /*
@@ -43,14 +43,15 @@ struct raid_dev {
43/* 43/*
44 * Flags for rs->print_flags field. 44 * Flags for rs->print_flags field.
45 */ 45 */
46#define DMPF_DAEMON_SLEEP 0x1 46#define DMPF_SYNC 0x1
47#define DMPF_MAX_WRITE_BEHIND 0x2 47#define DMPF_NOSYNC 0x2
48#define DMPF_SYNC 0x4 48#define DMPF_REBUILD 0x4
49#define DMPF_NOSYNC 0x8 49#define DMPF_DAEMON_SLEEP 0x8
50#define DMPF_STRIPE_CACHE 0x10 50#define DMPF_MIN_RECOVERY_RATE 0x10
51#define DMPF_MIN_RECOVERY_RATE 0x20 51#define DMPF_MAX_RECOVERY_RATE 0x20
52#define DMPF_MAX_RECOVERY_RATE 0x40 52#define DMPF_MAX_WRITE_BEHIND 0x40
53 53#define DMPF_STRIPE_CACHE 0x80
54#define DMPF_REGION_SIZE 0X100
54struct raid_set { 55struct raid_set {
55 struct dm_target *ti; 56 struct dm_target *ti;
56 57
@@ -72,6 +73,7 @@ static struct raid_type {
72 const unsigned level; /* RAID level. */ 73 const unsigned level; /* RAID level. */
73 const unsigned algorithm; /* RAID algorithm. */ 74 const unsigned algorithm; /* RAID algorithm. */
74} raid_types[] = { 75} raid_types[] = {
76 {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */},
75 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, 77 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
76 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, 78 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
77 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, 79 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
@@ -105,7 +107,8 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
105 } 107 }
106 108
107 sectors_per_dev = ti->len; 109 sectors_per_dev = ti->len;
108 if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { 110 if ((raid_type->level > 1) &&
111 sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
109 ti->error = "Target length not divisible by number of data devices"; 112 ti->error = "Target length not divisible by number of data devices";
110 return ERR_PTR(-EINVAL); 113 return ERR_PTR(-EINVAL);
111 } 114 }
@@ -147,9 +150,16 @@ static void context_free(struct raid_set *rs)
147{ 150{
148 int i; 151 int i;
149 152
150 for (i = 0; i < rs->md.raid_disks; i++) 153 for (i = 0; i < rs->md.raid_disks; i++) {
154 if (rs->dev[i].meta_dev)
155 dm_put_device(rs->ti, rs->dev[i].meta_dev);
156 if (rs->dev[i].rdev.sb_page)
157 put_page(rs->dev[i].rdev.sb_page);
158 rs->dev[i].rdev.sb_page = NULL;
159 rs->dev[i].rdev.sb_loaded = 0;
151 if (rs->dev[i].data_dev) 160 if (rs->dev[i].data_dev)
152 dm_put_device(rs->ti, rs->dev[i].data_dev); 161 dm_put_device(rs->ti, rs->dev[i].data_dev);
162 }
153 163
154 kfree(rs); 164 kfree(rs);
155} 165}
@@ -159,7 +169,16 @@ static void context_free(struct raid_set *rs)
159 * <meta_dev>: meta device name or '-' if missing 169 * <meta_dev>: meta device name or '-' if missing
160 * <data_dev>: data device name or '-' if missing 170 * <data_dev>: data device name or '-' if missing
161 * 171 *
162 * This code parses those words. 172 * The following are permitted:
173 * - -
174 * - <data_dev>
175 * <meta_dev> <data_dev>
176 *
177 * The following is not allowed:
178 * <meta_dev> -
179 *
180 * This code parses those words. If there is a failure,
181 * the caller must use context_free to unwind the operations.
163 */ 182 */
164static int dev_parms(struct raid_set *rs, char **argv) 183static int dev_parms(struct raid_set *rs, char **argv)
165{ 184{
@@ -182,8 +201,16 @@ static int dev_parms(struct raid_set *rs, char **argv)
182 rs->dev[i].rdev.mddev = &rs->md; 201 rs->dev[i].rdev.mddev = &rs->md;
183 202
184 if (strcmp(argv[0], "-")) { 203 if (strcmp(argv[0], "-")) {
185 rs->ti->error = "Metadata devices not supported"; 204 ret = dm_get_device(rs->ti, argv[0],
186 return -EINVAL; 205 dm_table_get_mode(rs->ti->table),
206 &rs->dev[i].meta_dev);
207 rs->ti->error = "RAID metadata device lookup failure";
208 if (ret)
209 return ret;
210
211 rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
212 if (!rs->dev[i].rdev.sb_page)
213 return -ENOMEM;
187 } 214 }
188 215
189 if (!strcmp(argv[1], "-")) { 216 if (!strcmp(argv[1], "-")) {
@@ -193,6 +220,10 @@ static int dev_parms(struct raid_set *rs, char **argv)
193 return -EINVAL; 220 return -EINVAL;
194 } 221 }
195 222
223 rs->ti->error = "No data device supplied with metadata device";
224 if (rs->dev[i].meta_dev)
225 return -EINVAL;
226
196 continue; 227 continue;
197 } 228 }
198 229
@@ -204,6 +235,10 @@ static int dev_parms(struct raid_set *rs, char **argv)
204 return ret; 235 return ret;
205 } 236 }
206 237
238 if (rs->dev[i].meta_dev) {
239 metadata_available = 1;
240 rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
241 }
207 rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev; 242 rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
208 list_add(&rs->dev[i].rdev.same_set, &rs->md.disks); 243 list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
209 if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) 244 if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
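Taken together with the word format documented above, the device list of a dm-raid table may now take forms like these (a sketch; the 8:x major:minor numbers are illustrative and '-' marks an absent device):

# 3 drives, no metadata devices:
3 - 8:17 - 8:33 - 8:49

# 3 drives, each pairing a metadata device with its data device:
3 8:16 8:17 8:32 8:33 8:48 8:49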
@@ -235,33 +270,109 @@ static int dev_parms(struct raid_set *rs, char **argv)
235} 270}
236 271
237/* 272/*
273 * validate_region_size
274 * @rs
275 * @region_size: region size in sectors. If 0, pick a size (4MiB default).
276 *
277 * Set rs->md.bitmap_info.chunksize (which really refers to 'region size').
278 * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap.
279 *
280 * Returns: 0 on success, -EINVAL on failure.
281 */
282static int validate_region_size(struct raid_set *rs, unsigned long region_size)
283{
284 unsigned long min_region_size = rs->ti->len / (1 << 21);
285
286 if (!region_size) {
287 /*
288 * Choose a reasonable default. All figures in sectors.
289 */
 290 if (min_region_size > (1 << 13)) {
 291 region_size = min_region_size;
 292 DMINFO("Choosing default region size of %lu sectors",
 293 region_size);
294 } else {
295 DMINFO("Choosing default region size of 4MiB");
296 region_size = 1 << 13; /* sectors */
297 }
298 } else {
299 /*
300 * Validate user-supplied value.
301 */
302 if (region_size > rs->ti->len) {
303 rs->ti->error = "Supplied region size is too large";
304 return -EINVAL;
305 }
306
307 if (region_size < min_region_size) {
308 DMERR("Supplied region_size (%lu sectors) below minimum (%lu)",
309 region_size, min_region_size);
310 rs->ti->error = "Supplied region size is too small";
311 return -EINVAL;
312 }
313
314 if (!is_power_of_2(region_size)) {
315 rs->ti->error = "Region size is not a power of 2";
316 return -EINVAL;
317 }
318
319 if (region_size < rs->md.chunk_sectors) {
320 rs->ti->error = "Region size is smaller than the chunk size";
321 return -EINVAL;
322 }
323 }
324
325 /*
326 * Convert sectors to bytes.
327 */
328 rs->md.bitmap_info.chunksize = (region_size << 9);
329
330 return 0;
331}
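A worked example of the bounds validate_region_size() enforces, for a 1 TiB target (all figures in 512-byte sectors):

	ti->len         = 1 TiB / 512 B   = 1 << 31 sectors
	min_region_size = (1 << 31) >> 21 = 1024 sectors (512 KiB)

Since 1024 does not exceed 1 << 13, the 4 MiB default (8192 sectors) is chosen, giving 2^31 / 2^13 = 2^18 regions, comfortably below the 2^21 the MD bitmap supports.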
332
333/*
238 * Possible arguments are... 334 * Possible arguments are...
239 * RAID456:
240 * <chunk_size> [optional_args] 335 * <chunk_size> [optional_args]
241 * 336 *
242 * Optional args: 337 * Argument definitions
243 * [[no]sync] Force or prevent recovery of the entire array 338 * <chunk_size> The number of sectors per disk that
339 * will form the "stripe"
340 * [[no]sync] Force or prevent recovery of the
341 * entire array
244 * [rebuild <idx>] Rebuild the drive indicated by the index 342 * [rebuild <idx>] Rebuild the drive indicated by the index
245 * [daemon_sleep <ms>] Time between bitmap daemon work to clear bits 343 * [daemon_sleep <ms>] Time between bitmap daemon work to
344 * clear bits
246 * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization 345 * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization
247 * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization 346 * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization
347 * [write_mostly <idx>] Indicate a write mostly drive via index
248 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) 348 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
249 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs 349 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs
350 * [region_size <sectors>] Defines granularity of bitmap
250 */ 351 */
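Concretely, a table line built from these arguments looks like the following (in the style of the examples added to Documentation/device-mapper/dm-raid.txt by this series; device numbers are illustrative):

# RAID4: 4 data drives, 1 parity, no metadata devices,
# 1 MiB chunk size (2048 sectors); backslashes only for readability
0 1960893648 raid \
	raid4 1 2048 \
	5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81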
251static int parse_raid_params(struct raid_set *rs, char **argv, 352static int parse_raid_params(struct raid_set *rs, char **argv,
252 unsigned num_raid_params) 353 unsigned num_raid_params)
253{ 354{
254 unsigned i, rebuild_cnt = 0; 355 unsigned i, rebuild_cnt = 0;
255 unsigned long value; 356 unsigned long value, region_size = 0;
256 char *key; 357 char *key;
257 358
258 /* 359 /*
259 * First, parse the in-order required arguments 360 * First, parse the in-order required arguments
361 * "chunk_size" is the only argument of this type.
260 */ 362 */
261 if ((strict_strtoul(argv[0], 10, &value) < 0) || 363 if ((strict_strtoul(argv[0], 10, &value) < 0)) {
262 !is_power_of_2(value) || (value < 8)) {
263 rs->ti->error = "Bad chunk size"; 364 rs->ti->error = "Bad chunk size";
264 return -EINVAL; 365 return -EINVAL;
366 } else if (rs->raid_type->level == 1) {
367 if (value)
368 DMERR("Ignoring chunk size parameter for RAID 1");
369 value = 0;
370 } else if (!is_power_of_2(value)) {
371 rs->ti->error = "Chunk size must be a power of 2";
372 return -EINVAL;
373 } else if (value < 8) {
374 rs->ti->error = "Chunk size value is too small";
375 return -EINVAL;
265 } 376 }
266 377
267 rs->md.new_chunk_sectors = rs->md.chunk_sectors = value; 378 rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
@@ -269,22 +380,39 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
269 num_raid_params--; 380 num_raid_params--;
270 381
271 /* 382 /*
272 * Second, parse the unordered optional arguments 383 * We set each individual device as In_sync with a completed
384 * 'recovery_offset'. If there has been a device failure or
385 * replacement then one of the following cases applies:
386 *
387 * 1) User specifies 'rebuild'.
388 * - Device is reset when param is read.
389 * 2) A new device is supplied.
390 * - No matching superblock found, resets device.
391 * 3) Device failure was transient and returns on reload.
392 * - Failure noticed, resets device for bitmap replay.
393 * 4) Device hadn't completed recovery after previous failure.
394 * - Superblock is read and overrides recovery_offset.
395 *
396 * What is found in the superblocks of the devices is always
397 * authoritative, unless 'rebuild' or '[no]sync' was specified.
273 */ 398 */
274 for (i = 0; i < rs->md.raid_disks; i++) 399 for (i = 0; i < rs->md.raid_disks; i++) {
275 set_bit(In_sync, &rs->dev[i].rdev.flags); 400 set_bit(In_sync, &rs->dev[i].rdev.flags);
401 rs->dev[i].rdev.recovery_offset = MaxSector;
402 }
276 403
404 /*
405 * Second, parse the unordered optional arguments
406 */
277 for (i = 0; i < num_raid_params; i++) { 407 for (i = 0; i < num_raid_params; i++) {
278 if (!strcmp(argv[i], "nosync")) { 408 if (!strcasecmp(argv[i], "nosync")) {
279 rs->md.recovery_cp = MaxSector; 409 rs->md.recovery_cp = MaxSector;
280 rs->print_flags |= DMPF_NOSYNC; 410 rs->print_flags |= DMPF_NOSYNC;
281 rs->md.flags |= MD_SYNC_STATE_FORCED;
282 continue; 411 continue;
283 } 412 }
284 if (!strcmp(argv[i], "sync")) { 413 if (!strcasecmp(argv[i], "sync")) {
285 rs->md.recovery_cp = 0; 414 rs->md.recovery_cp = 0;
286 rs->print_flags |= DMPF_SYNC; 415 rs->print_flags |= DMPF_SYNC;
287 rs->md.flags |= MD_SYNC_STATE_FORCED;
288 continue; 416 continue;
289 } 417 }
290 418
@@ -300,9 +428,13 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
300 return -EINVAL; 428 return -EINVAL;
301 } 429 }
302 430
303 if (!strcmp(key, "rebuild")) { 431 if (!strcasecmp(key, "rebuild")) {
304 if (++rebuild_cnt > rs->raid_type->parity_devs) { 432 rebuild_cnt++;
305 rs->ti->error = "Too many rebuild drives given"; 433 if (((rs->raid_type->level != 1) &&
434 (rebuild_cnt > rs->raid_type->parity_devs)) ||
435 ((rs->raid_type->level == 1) &&
436 (rebuild_cnt > (rs->md.raid_disks - 1)))) {
437 rs->ti->error = "Too many rebuild devices specified for given RAID type";
306 return -EINVAL; 438 return -EINVAL;
307 } 439 }
308 if (value > rs->md.raid_disks) { 440 if (value > rs->md.raid_disks) {
@@ -311,7 +443,22 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
311 } 443 }
312 clear_bit(In_sync, &rs->dev[value].rdev.flags); 444 clear_bit(In_sync, &rs->dev[value].rdev.flags);
313 rs->dev[value].rdev.recovery_offset = 0; 445 rs->dev[value].rdev.recovery_offset = 0;
314 } else if (!strcmp(key, "max_write_behind")) { 446 rs->print_flags |= DMPF_REBUILD;
447 } else if (!strcasecmp(key, "write_mostly")) {
448 if (rs->raid_type->level != 1) {
449 rs->ti->error = "write_mostly option is only valid for RAID1";
450 return -EINVAL;
451 }
452 if (value > rs->md.raid_disks) {
453 rs->ti->error = "Invalid write_mostly drive index given";
454 return -EINVAL;
455 }
456 set_bit(WriteMostly, &rs->dev[value].rdev.flags);
457 } else if (!strcasecmp(key, "max_write_behind")) {
458 if (rs->raid_type->level != 1) {
459 rs->ti->error = "max_write_behind option is only valid for RAID1";
460 return -EINVAL;
461 }
315 rs->print_flags |= DMPF_MAX_WRITE_BEHIND; 462 rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
316 463
317 /* 464 /*
@@ -324,14 +471,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
324 return -EINVAL; 471 return -EINVAL;
325 } 472 }
326 rs->md.bitmap_info.max_write_behind = value; 473 rs->md.bitmap_info.max_write_behind = value;
327 } else if (!strcmp(key, "daemon_sleep")) { 474 } else if (!strcasecmp(key, "daemon_sleep")) {
328 rs->print_flags |= DMPF_DAEMON_SLEEP; 475 rs->print_flags |= DMPF_DAEMON_SLEEP;
329 if (!value || (value > MAX_SCHEDULE_TIMEOUT)) { 476 if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
330 rs->ti->error = "daemon sleep period out of range"; 477 rs->ti->error = "daemon sleep period out of range";
331 return -EINVAL; 478 return -EINVAL;
332 } 479 }
333 rs->md.bitmap_info.daemon_sleep = value; 480 rs->md.bitmap_info.daemon_sleep = value;
334 } else if (!strcmp(key, "stripe_cache")) { 481 } else if (!strcasecmp(key, "stripe_cache")) {
335 rs->print_flags |= DMPF_STRIPE_CACHE; 482 rs->print_flags |= DMPF_STRIPE_CACHE;
336 483
337 /* 484 /*
@@ -348,20 +495,23 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
348 rs->ti->error = "Bad stripe_cache size"; 495 rs->ti->error = "Bad stripe_cache size";
349 return -EINVAL; 496 return -EINVAL;
350 } 497 }
351 } else if (!strcmp(key, "min_recovery_rate")) { 498 } else if (!strcasecmp(key, "min_recovery_rate")) {
352 rs->print_flags |= DMPF_MIN_RECOVERY_RATE; 499 rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
353 if (value > INT_MAX) { 500 if (value > INT_MAX) {
354 rs->ti->error = "min_recovery_rate out of range"; 501 rs->ti->error = "min_recovery_rate out of range";
355 return -EINVAL; 502 return -EINVAL;
356 } 503 }
357 rs->md.sync_speed_min = (int)value; 504 rs->md.sync_speed_min = (int)value;
358 } else if (!strcmp(key, "max_recovery_rate")) { 505 } else if (!strcasecmp(key, "max_recovery_rate")) {
359 rs->print_flags |= DMPF_MAX_RECOVERY_RATE; 506 rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
360 if (value > INT_MAX) { 507 if (value > INT_MAX) {
361 rs->ti->error = "max_recovery_rate out of range"; 508 rs->ti->error = "max_recovery_rate out of range";
362 return -EINVAL; 509 return -EINVAL;
363 } 510 }
364 rs->md.sync_speed_max = (int)value; 511 rs->md.sync_speed_max = (int)value;
512 } else if (!strcasecmp(key, "region_size")) {
513 rs->print_flags |= DMPF_REGION_SIZE;
514 region_size = value;
365 } else { 515 } else {
366 DMERR("Unable to parse RAID parameter: %s", key); 516 DMERR("Unable to parse RAID parameter: %s", key);
367 rs->ti->error = "Unable to parse RAID parameters"; 517 rs->ti->error = "Unable to parse RAID parameters";
@@ -369,6 +519,19 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
369 } 519 }
370 } 520 }
371 521
522 if (validate_region_size(rs, region_size))
523 return -EINVAL;
524
525 if (rs->md.chunk_sectors)
526 rs->ti->split_io = rs->md.chunk_sectors;
527 else
528 rs->ti->split_io = region_size;
529
530 if (rs->md.chunk_sectors)
531 rs->ti->split_io = rs->md.chunk_sectors;
532 else
533 rs->ti->split_io = region_size;
534
372 /* Assume there are no metadata devices until the drives are parsed */ 535 /* Assume there are no metadata devices until the drives are parsed */
373 rs->md.persistent = 0; 536 rs->md.persistent = 0;
374 rs->md.external = 1; 537 rs->md.external = 1;
@@ -387,17 +550,351 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
387{ 550{
388 struct raid_set *rs = container_of(cb, struct raid_set, callbacks); 551 struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
389 552
553 if (rs->raid_type->level == 1)
554 return md_raid1_congested(&rs->md, bits);
555
390 return md_raid5_congested(&rs->md, bits); 556 return md_raid5_congested(&rs->md, bits);
391} 557}
392 558
393/* 559/*
560 * This structure is never routinely used by userspace, unlike md superblocks.
561 * Devices with this superblock should only ever be accessed via device-mapper.
562 */
563#define DM_RAID_MAGIC 0x64526D44
564struct dm_raid_superblock {
565 __le32 magic; /* "DmRd" */
566 __le32 features; /* Used to indicate possible future changes */
567
568 __le32 num_devices; /* Number of devices in this array. (Max 64) */
569 __le32 array_position; /* The position of this drive in the array */
570
571 __le64 events; /* Incremented by md when superblock updated */
572 __le64 failed_devices; /* Bit field of devices to indicate failures */
573
574 /*
575 * This offset tracks the progress of the repair or replacement of
576 * an individual drive.
577 */
578 __le64 disk_recovery_offset;
579
580 /*
581 * This offset tracks the progress of the initial array
582 * synchronisation/parity calculation.
583 */
584 __le64 array_resync_offset;
585
586 /*
587 * RAID characteristics
588 */
589 __le32 level;
590 __le32 layout;
591 __le32 stripe_sectors;
592
593 __u8 pad[452]; /* Round struct to 512 bytes. */
594 /* Always set to 0 when writing. */
595} __packed;
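A quick sanity check of this layout: the fixed fields occupy 60 bytes, so the 452-byte pad rounds the structure to exactly one 512-byte sector, and the little-endian magic spells the on-disk signature. A host-side sketch (userspace C, assumes a little-endian machine):

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	uint32_t magic = 0x64526D44;	/* sb->magic as stored on disk */
	char sig[5] = "";

	memcpy(sig, &magic, sizeof(magic));	/* bytes 44 6D 52 64 */
	assert(!strcmp(sig, "DmRd"));

	/* 4 x __le32 + 4 x __le64 + 3 x __le32 + pad = one sector */
	assert(4 * 4 + 4 * 8 + 3 * 4 + 452 == 512);

	return 0;
}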
596
597static int read_disk_sb(mdk_rdev_t *rdev, int size)
598{
599 BUG_ON(!rdev->sb_page);
600
601 if (rdev->sb_loaded)
602 return 0;
603
604 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
605 DMERR("Failed to read device superblock");
606 return -EINVAL;
607 }
608
609 rdev->sb_loaded = 1;
610
611 return 0;
612}
613
614static void super_sync(mddev_t *mddev, mdk_rdev_t *rdev)
615{
616 mdk_rdev_t *r, *t;
617 uint64_t failed_devices;
618 struct dm_raid_superblock *sb;
619
620 sb = page_address(rdev->sb_page);
621 failed_devices = le64_to_cpu(sb->failed_devices);
622
623 rdev_for_each(r, t, mddev)
624 if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
625 failed_devices |= (1ULL << r->raid_disk);
626
627 memset(sb, 0, sizeof(*sb));
628
629 sb->magic = cpu_to_le32(DM_RAID_MAGIC);
630 sb->features = cpu_to_le32(0); /* No features yet */
631
632 sb->num_devices = cpu_to_le32(mddev->raid_disks);
633 sb->array_position = cpu_to_le32(rdev->raid_disk);
634
635 sb->events = cpu_to_le64(mddev->events);
636 sb->failed_devices = cpu_to_le64(failed_devices);
637
638 sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
639 sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);
640
641 sb->level = cpu_to_le32(mddev->level);
642 sb->layout = cpu_to_le32(mddev->layout);
643 sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
644}
645
646/*
647 * super_load
648 *
649 * This function creates a superblock if one is not found on the device
650 * and will decide which superblock to use if there's a choice.
651 *
652 * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
653 */
654static int super_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev)
655{
656 int ret;
657 struct dm_raid_superblock *sb;
658 struct dm_raid_superblock *refsb;
659 uint64_t events_sb, events_refsb;
660
661 rdev->sb_start = 0;
662 rdev->sb_size = sizeof(*sb);
663
664 ret = read_disk_sb(rdev, rdev->sb_size);
665 if (ret)
666 return ret;
667
668 sb = page_address(rdev->sb_page);
669 if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) {
670 super_sync(rdev->mddev, rdev);
671
672 set_bit(FirstUse, &rdev->flags);
673
674 /* Force writing of superblocks to disk */
675 set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
676
677 /* Any superblock is better than none, choose that if given */
678 return refdev ? 0 : 1;
679 }
680
681 if (!refdev)
682 return 1;
683
684 events_sb = le64_to_cpu(sb->events);
685
686 refsb = page_address(refdev->sb_page);
687 events_refsb = le64_to_cpu(refsb->events);
688
689 return (events_sb > events_refsb) ? 1 : 0;
690}
691
692static int super_init_validation(mddev_t *mddev, mdk_rdev_t *rdev)
693{
694 int role;
695 struct raid_set *rs = container_of(mddev, struct raid_set, md);
696 uint64_t events_sb;
697 uint64_t failed_devices;
698 struct dm_raid_superblock *sb;
699 uint32_t new_devs = 0;
700 uint32_t rebuilds = 0;
701 mdk_rdev_t *r, *t;
702 struct dm_raid_superblock *sb2;
703
704 sb = page_address(rdev->sb_page);
705 events_sb = le64_to_cpu(sb->events);
706 failed_devices = le64_to_cpu(sb->failed_devices);
707
708 /*
709 * Initialise to 1 if this is a new superblock.
710 */
711 mddev->events = events_sb ? : 1;
712
713 /*
714 * Reshaping is not currently allowed
715 */
716 if ((le32_to_cpu(sb->level) != mddev->level) ||
717 (le32_to_cpu(sb->layout) != mddev->layout) ||
718 (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
719 DMERR("Reshaping arrays not yet supported.");
720 return -EINVAL;
721 }
722
723 /* We can only change the number of devices in RAID1 right now */
724 if ((rs->raid_type->level != 1) &&
725 (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
726 DMERR("Reshaping arrays not yet supported.");
727 return -EINVAL;
728 }
729
730 if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)))
731 mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
732
733 /*
734 * During load, we set FirstUse if a new superblock was written.
735 * There are two reasons we might not have a superblock:
736 * 1) The array is brand new - in which case, all of the
737 * devices must have their In_sync bit set. Also,
738 * recovery_cp must be 0, unless forced.
739 * 2) This is a new device being added to an old array
740 * and the new device needs to be rebuilt - in which
741 * case the In_sync bit will /not/ be set and
742 * recovery_cp must be MaxSector.
743 */
744 rdev_for_each(r, t, mddev) {
745 if (!test_bit(In_sync, &r->flags)) {
746 if (!test_bit(FirstUse, &r->flags))
747 DMERR("Superblock area of "
748 "rebuild device %d should have been "
749 "cleared.", r->raid_disk);
750 set_bit(FirstUse, &r->flags);
751 rebuilds++;
752 } else if (test_bit(FirstUse, &r->flags))
753 new_devs++;
754 }
755
756 if (!rebuilds) {
757 if (new_devs == mddev->raid_disks) {
758 DMINFO("Superblocks created for new array");
759 set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
760 } else if (new_devs) {
761 DMERR("New device injected "
762 "into existing array without 'rebuild' "
763 "parameter specified");
764 return -EINVAL;
765 }
766 } else if (new_devs) {
767 DMERR("'rebuild' devices cannot be "
768 "injected into an array with other first-time devices");
769 return -EINVAL;
770 } else if (mddev->recovery_cp != MaxSector) {
771 DMERR("'rebuild' specified while array is not in-sync");
772 return -EINVAL;
773 }
774
775 /*
776 * Now we set the Faulty bit for those devices that are
777 * recorded in the superblock as failed.
778 */
779 rdev_for_each(r, t, mddev) {
780 if (!r->sb_page)
781 continue;
782 sb2 = page_address(r->sb_page);
783 sb2->failed_devices = 0;
784
785 /*
786 * Check for any device re-ordering.
787 */
788 if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) {
789 role = le32_to_cpu(sb2->array_position);
790 if (role != r->raid_disk) {
791 if (rs->raid_type->level != 1) {
792 rs->ti->error = "Cannot change device "
793 "positions in RAID array";
794 return -EINVAL;
795 }
796 DMINFO("RAID1 device #%d now at position #%d",
797 role, r->raid_disk);
798 }
799
800 /*
801 * Partial recovery is performed on
802 * returning failed devices.
803 */
 804 if (failed_devices & (1ULL << role))
805 set_bit(Faulty, &r->flags);
806 }
807 }
808
809 return 0;
810}
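failed_devices is a 64-bit bitfield indexed by array position, so with up to 64 devices the shifts must be done in 64-bit arithmetic, as in the 1ULL shift used by super_sync() above. A minimal userspace illustration of the set/test pairing:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t failed_devices = 0;
	int role = 40;	/* an array position >= 32 */

	failed_devices |= (1ULL << role);	 /* as in super_sync() */
	assert(failed_devices & (1ULL << role)); /* as in super_init_validation() */
	/* A plain (1 << role) would overflow a 32-bit int here. */

	return 0;
}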
811
812static int super_validate(mddev_t *mddev, mdk_rdev_t *rdev)
813{
814 struct dm_raid_superblock *sb = page_address(rdev->sb_page);
815
816 /*
817 * If mddev->events is not set, we know we have not yet initialized
818 * the array.
819 */
820 if (!mddev->events && super_init_validation(mddev, rdev))
821 return -EINVAL;
822
823 mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
824 rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
825 if (!test_bit(FirstUse, &rdev->flags)) {
826 rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
827 if (rdev->recovery_offset != MaxSector)
828 clear_bit(In_sync, &rdev->flags);
829 }
830
831 /*
832 * If a device comes back, set it as not In_sync and no longer faulty.
833 */
834 if (test_bit(Faulty, &rdev->flags)) {
835 clear_bit(Faulty, &rdev->flags);
836 clear_bit(In_sync, &rdev->flags);
837 rdev->saved_raid_disk = rdev->raid_disk;
838 rdev->recovery_offset = 0;
839 }
840
841 clear_bit(FirstUse, &rdev->flags);
842
843 return 0;
844}
845
846/*
847 * Analyse superblocks and select the freshest.
848 */
849static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
850{
851 int ret;
852 mdk_rdev_t *rdev, *freshest, *tmp;
853 mddev_t *mddev = &rs->md;
854
855 freshest = NULL;
856 rdev_for_each(rdev, tmp, mddev) {
857 if (!rdev->meta_bdev)
858 continue;
859
860 ret = super_load(rdev, freshest);
861
862 switch (ret) {
863 case 1:
864 freshest = rdev;
865 break;
866 case 0:
867 break;
868 default:
869 ti->error = "Failed to load superblock";
870 return ret;
871 }
872 }
873
874 if (!freshest)
875 return 0;
876
877 /*
878 * Validation of the freshest device provides the source of
879 * validation for the remaining devices.
880 */
881 ti->error = "Unable to assemble array: Invalid superblocks";
882 if (super_validate(mddev, freshest))
883 return -EINVAL;
884
885 rdev_for_each(rdev, tmp, mddev)
886 if ((rdev != freshest) && super_validate(mddev, rdev))
887 return -EINVAL;
888
889 return 0;
890}
891
892/*
394 * Construct a RAID4/5/6 mapping: 893 * Construct a RAID4/5/6 mapping:
395 * Args: 894 * Args:
396 * <raid_type> <#raid_params> <raid_params> \ 895 * <raid_type> <#raid_params> <raid_params> \
397 * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> } 896 * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
398 * 897 *
399 * ** metadata devices are not supported yet, use '-' instead **
400 *
401 * <raid_params> varies by <raid_type>. See 'parse_raid_params' for 898 * <raid_params> varies by <raid_type>. See 'parse_raid_params' for
402 * details on possible <raid_params>. 899 * details on possible <raid_params>.
403 */ 900 */
@@ -465,8 +962,12 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
465 if (ret) 962 if (ret)
466 goto bad; 963 goto bad;
467 964
965 rs->md.sync_super = super_sync;
966 ret = analyse_superblocks(ti, rs);
967 if (ret)
968 goto bad;
969
468 INIT_WORK(&rs->md.event_work, do_table_event); 970 INIT_WORK(&rs->md.event_work, do_table_event);
469 ti->split_io = rs->md.chunk_sectors;
470 ti->private = rs; 971 ti->private = rs;
471 972
472 mutex_lock(&rs->md.reconfig_mutex); 973 mutex_lock(&rs->md.reconfig_mutex);
@@ -482,6 +983,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
482 rs->callbacks.congested_fn = raid_is_congested; 983 rs->callbacks.congested_fn = raid_is_congested;
483 dm_table_add_target_callbacks(ti->table, &rs->callbacks); 984 dm_table_add_target_callbacks(ti->table, &rs->callbacks);
484 985
986 mddev_suspend(&rs->md);
485 return 0; 987 return 0;
486 988
487bad: 989bad:
@@ -546,12 +1048,17 @@ static int raid_status(struct dm_target *ti, status_type_t type,
546 break; 1048 break;
547 case STATUSTYPE_TABLE: 1049 case STATUSTYPE_TABLE:
548 /* The string you would use to construct this array */ 1050 /* The string you would use to construct this array */
549 for (i = 0; i < rs->md.raid_disks; i++) 1051 for (i = 0; i < rs->md.raid_disks; i++) {
550 if (rs->dev[i].data_dev && 1052 if ((rs->print_flags & DMPF_REBUILD) &&
1053 rs->dev[i].data_dev &&
551 !test_bit(In_sync, &rs->dev[i].rdev.flags)) 1054 !test_bit(In_sync, &rs->dev[i].rdev.flags))
552 raid_param_cnt++; /* for rebuilds */ 1055 raid_param_cnt += 2; /* for rebuilds */
1056 if (rs->dev[i].data_dev &&
1057 test_bit(WriteMostly, &rs->dev[i].rdev.flags))
1058 raid_param_cnt += 2;
1059 }
553 1060
554 raid_param_cnt += (hweight64(rs->print_flags) * 2); 1061 raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2);
555 if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)) 1062 if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
556 raid_param_cnt--; 1063 raid_param_cnt--;
557 1064
@@ -565,7 +1072,8 @@ static int raid_status(struct dm_target *ti, status_type_t type,
565 DMEMIT(" nosync"); 1072 DMEMIT(" nosync");
566 1073
567 for (i = 0; i < rs->md.raid_disks; i++) 1074 for (i = 0; i < rs->md.raid_disks; i++)
568 if (rs->dev[i].data_dev && 1075 if ((rs->print_flags & DMPF_REBUILD) &&
1076 rs->dev[i].data_dev &&
569 !test_bit(In_sync, &rs->dev[i].rdev.flags)) 1077 !test_bit(In_sync, &rs->dev[i].rdev.flags))
570 DMEMIT(" rebuild %u", i); 1078 DMEMIT(" rebuild %u", i);
571 1079
@@ -579,6 +1087,11 @@ static int raid_status(struct dm_target *ti, status_type_t type,
579 if (rs->print_flags & DMPF_MAX_RECOVERY_RATE) 1087 if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
580 DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max); 1088 DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
581 1089
1090 for (i = 0; i < rs->md.raid_disks; i++)
1091 if (rs->dev[i].data_dev &&
1092 test_bit(WriteMostly, &rs->dev[i].rdev.flags))
1093 DMEMIT(" write_mostly %u", i);
1094
582 if (rs->print_flags & DMPF_MAX_WRITE_BEHIND) 1095 if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
583 DMEMIT(" max_write_behind %lu", 1096 DMEMIT(" max_write_behind %lu",
584 rs->md.bitmap_info.max_write_behind); 1097 rs->md.bitmap_info.max_write_behind);
@@ -591,9 +1104,16 @@ static int raid_status(struct dm_target *ti, status_type_t type,
591 conf ? conf->max_nr_stripes * 2 : 0); 1104 conf ? conf->max_nr_stripes * 2 : 0);
592 } 1105 }
593 1106
1107 if (rs->print_flags & DMPF_REGION_SIZE)
1108 DMEMIT(" region_size %lu",
1109 rs->md.bitmap_info.chunksize >> 9);
1110
594 DMEMIT(" %d", rs->md.raid_disks); 1111 DMEMIT(" %d", rs->md.raid_disks);
595 for (i = 0; i < rs->md.raid_disks; i++) { 1112 for (i = 0; i < rs->md.raid_disks; i++) {
596 DMEMIT(" -"); /* metadata device */ 1113 if (rs->dev[i].meta_dev)
1114 DMEMIT(" %s", rs->dev[i].meta_dev->name);
1115 else
1116 DMEMIT(" -");
597 1117
598 if (rs->dev[i].data_dev) 1118 if (rs->dev[i].data_dev)
599 DMEMIT(" %s", rs->dev[i].data_dev->name); 1119 DMEMIT(" %s", rs->dev[i].data_dev->name);
@@ -650,12 +1170,13 @@ static void raid_resume(struct dm_target *ti)
650{ 1170{
651 struct raid_set *rs = ti->private; 1171 struct raid_set *rs = ti->private;
652 1172
1173 bitmap_load(&rs->md);
653 mddev_resume(&rs->md); 1174 mddev_resume(&rs->md);
654} 1175}
655 1176
656static struct target_type raid_target = { 1177static struct target_type raid_target = {
657 .name = "raid", 1178 .name = "raid",
658 .version = {1, 0, 0}, 1179 .version = {1, 1, 0},
659 .module = THIS_MODULE, 1180 .module = THIS_MODULE,
660 .ctr = raid_ctr, 1181 .ctr = raid_ctr,
661 .dtr = raid_dtr, 1182 .dtr = raid_dtr,
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 135c2f1fdbf..d1f1d701710 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -58,25 +58,30 @@
58#define NUM_SNAPSHOT_HDR_CHUNKS 1 58#define NUM_SNAPSHOT_HDR_CHUNKS 1
59 59
60struct disk_header { 60struct disk_header {
61 uint32_t magic; 61 __le32 magic;
62 62
63 /* 63 /*
64 * Is this snapshot valid. There is no way of recovering 64 * Is this snapshot valid. There is no way of recovering
65 * an invalid snapshot. 65 * an invalid snapshot.
66 */ 66 */
67 uint32_t valid; 67 __le32 valid;
68 68
69 /* 69 /*
70 * Simple, incrementing version. no backward 70 * Simple, incrementing version. no backward
71 * compatibility. 71 * compatibility.
72 */ 72 */
73 uint32_t version; 73 __le32 version;
74 74
75 /* In sectors */ 75 /* In sectors */
76 uint32_t chunk_size; 76 __le32 chunk_size;
77}; 77} __packed;
78 78
79struct disk_exception { 79struct disk_exception {
80 __le64 old_chunk;
81 __le64 new_chunk;
82} __packed;
83
84struct core_exception {
80 uint64_t old_chunk; 85 uint64_t old_chunk;
81 uint64_t new_chunk; 86 uint64_t new_chunk;
82}; 87};
@@ -169,10 +174,9 @@ static int alloc_area(struct pstore *ps)
169 if (!ps->area) 174 if (!ps->area)
170 goto err_area; 175 goto err_area;
171 176
172 ps->zero_area = vmalloc(len); 177 ps->zero_area = vzalloc(len);
173 if (!ps->zero_area) 178 if (!ps->zero_area)
174 goto err_zero_area; 179 goto err_zero_area;
175 memset(ps->zero_area, 0, len);
176 180
177 ps->header_area = vmalloc(len); 181 ps->header_area = vmalloc(len);
178 if (!ps->header_area) 182 if (!ps->header_area)
@@ -396,32 +400,32 @@ static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
396} 400}
397 401
398static void read_exception(struct pstore *ps, 402static void read_exception(struct pstore *ps,
399 uint32_t index, struct disk_exception *result) 403 uint32_t index, struct core_exception *result)
400{ 404{
401 struct disk_exception *e = get_exception(ps, index); 405 struct disk_exception *de = get_exception(ps, index);
402 406
403 /* copy it */ 407 /* copy it */
404 result->old_chunk = le64_to_cpu(e->old_chunk); 408 result->old_chunk = le64_to_cpu(de->old_chunk);
405 result->new_chunk = le64_to_cpu(e->new_chunk); 409 result->new_chunk = le64_to_cpu(de->new_chunk);
406} 410}
407 411
408static void write_exception(struct pstore *ps, 412static void write_exception(struct pstore *ps,
409 uint32_t index, struct disk_exception *de) 413 uint32_t index, struct core_exception *e)
410{ 414{
411 struct disk_exception *e = get_exception(ps, index); 415 struct disk_exception *de = get_exception(ps, index);
412 416
413 /* copy it */ 417 /* copy it */
414 e->old_chunk = cpu_to_le64(de->old_chunk); 418 de->old_chunk = cpu_to_le64(e->old_chunk);
415 e->new_chunk = cpu_to_le64(de->new_chunk); 419 de->new_chunk = cpu_to_le64(e->new_chunk);
416} 420}
417 421
418static void clear_exception(struct pstore *ps, uint32_t index) 422static void clear_exception(struct pstore *ps, uint32_t index)
419{ 423{
420 struct disk_exception *e = get_exception(ps, index); 424 struct disk_exception *de = get_exception(ps, index);
421 425
422 /* clear it */ 426 /* clear it */
423 e->old_chunk = 0; 427 de->old_chunk = 0;
424 e->new_chunk = 0; 428 de->new_chunk = 0;
425} 429}
426 430
427/* 431/*
@@ -437,13 +441,13 @@ static int insert_exceptions(struct pstore *ps,
437{ 441{
438 int r; 442 int r;
439 unsigned int i; 443 unsigned int i;
440 struct disk_exception de; 444 struct core_exception e;
441 445
442 /* presume the area is full */ 446 /* presume the area is full */
443 *full = 1; 447 *full = 1;
444 448
445 for (i = 0; i < ps->exceptions_per_area; i++) { 449 for (i = 0; i < ps->exceptions_per_area; i++) {
446 read_exception(ps, i, &de); 450 read_exception(ps, i, &e);
447 451
448 /* 452 /*
449 * If the new_chunk is pointing at the start of 453 * If the new_chunk is pointing at the start of
@@ -451,7 +455,7 @@ static int insert_exceptions(struct pstore *ps,
451 * is we know that we've hit the end of the 455 * is we know that we've hit the end of the
452 * exceptions. Therefore the area is not full. 456 * exceptions. Therefore the area is not full.
453 */ 457 */
454 if (de.new_chunk == 0LL) { 458 if (e.new_chunk == 0LL) {
455 ps->current_committed = i; 459 ps->current_committed = i;
456 *full = 0; 460 *full = 0;
457 break; 461 break;
@@ -460,13 +464,13 @@ static int insert_exceptions(struct pstore *ps,
460 /* 464 /*
461 * Keep track of the start of the free chunks. 465 * Keep track of the start of the free chunks.
462 */ 466 */
463 if (ps->next_free <= de.new_chunk) 467 if (ps->next_free <= e.new_chunk)
464 ps->next_free = de.new_chunk + 1; 468 ps->next_free = e.new_chunk + 1;
465 469
466 /* 470 /*
467 * Otherwise we add the exception to the snapshot. 471 * Otherwise we add the exception to the snapshot.
468 */ 472 */
469 r = callback(callback_context, de.old_chunk, de.new_chunk); 473 r = callback(callback_context, e.old_chunk, e.new_chunk);
470 if (r) 474 if (r)
471 return r; 475 return r;
472 } 476 }
@@ -563,7 +567,7 @@ static int persistent_read_metadata(struct dm_exception_store *store,
563 ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) / 567 ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) /
564 sizeof(struct disk_exception); 568 sizeof(struct disk_exception);
565 ps->callbacks = dm_vcalloc(ps->exceptions_per_area, 569 ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
566 sizeof(*ps->callbacks)); 570 sizeof(*ps->callbacks));
567 if (!ps->callbacks) 571 if (!ps->callbacks)
568 return -ENOMEM; 572 return -ENOMEM;
569 573
@@ -641,12 +645,12 @@ static void persistent_commit_exception(struct dm_exception_store *store,
641{ 645{
642 unsigned int i; 646 unsigned int i;
643 struct pstore *ps = get_info(store); 647 struct pstore *ps = get_info(store);
644 struct disk_exception de; 648 struct core_exception ce;
645 struct commit_callback *cb; 649 struct commit_callback *cb;
646 650
647 de.old_chunk = e->old_chunk; 651 ce.old_chunk = e->old_chunk;
648 de.new_chunk = e->new_chunk; 652 ce.new_chunk = e->new_chunk;
649 write_exception(ps, ps->current_committed++, &de); 653 write_exception(ps, ps->current_committed++, &ce);
650 654
651 /* 655 /*
652 * Add the callback to the back of the array. This code 656 * Add the callback to the back of the array. This code
@@ -670,7 +674,7 @@ static void persistent_commit_exception(struct dm_exception_store *store,
670 * If we completely filled the current area, then wipe the next one. 674 * If we completely filled the current area, then wipe the next one.
671 */ 675 */
672 if ((ps->current_committed == ps->exceptions_per_area) && 676 if ((ps->current_committed == ps->exceptions_per_area) &&
673 zero_disk_area(ps, ps->current_area + 1)) 677 zero_disk_area(ps, ps->current_area + 1))
674 ps->valid = 0; 678 ps->valid = 0;
675 679
676 /* 680 /*
@@ -701,7 +705,7 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
701 chunk_t *last_new_chunk) 705 chunk_t *last_new_chunk)
702{ 706{
703 struct pstore *ps = get_info(store); 707 struct pstore *ps = get_info(store);
704 struct disk_exception de; 708 struct core_exception ce;
705 int nr_consecutive; 709 int nr_consecutive;
706 int r; 710 int r;
707 711
@@ -722,9 +726,9 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
722 ps->current_committed = ps->exceptions_per_area; 726 ps->current_committed = ps->exceptions_per_area;
723 } 727 }
724 728
725 read_exception(ps, ps->current_committed - 1, &de); 729 read_exception(ps, ps->current_committed - 1, &ce);
726 *last_old_chunk = de.old_chunk; 730 *last_old_chunk = ce.old_chunk;
727 *last_new_chunk = de.new_chunk; 731 *last_new_chunk = ce.new_chunk;
728 732
729 /* 733 /*
730 * Find number of consecutive chunks within the current area, 734 * Find number of consecutive chunks within the current area,
@@ -733,9 +737,9 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
733 for (nr_consecutive = 1; nr_consecutive < ps->current_committed; 737 for (nr_consecutive = 1; nr_consecutive < ps->current_committed;
734 nr_consecutive++) { 738 nr_consecutive++) {
735 read_exception(ps, ps->current_committed - 1 - nr_consecutive, 739 read_exception(ps, ps->current_committed - 1 - nr_consecutive,
736 &de); 740 &ce);
737 if (de.old_chunk != *last_old_chunk - nr_consecutive || 741 if (ce.old_chunk != *last_old_chunk - nr_consecutive ||
738 de.new_chunk != *last_new_chunk - nr_consecutive) 742 ce.new_chunk != *last_new_chunk - nr_consecutive)
739 break; 743 break;
740 } 744 }
741 745
@@ -753,7 +757,7 @@ static int persistent_commit_merge(struct dm_exception_store *store,
753 for (i = 0; i < nr_merged; i++) 757 for (i = 0; i < nr_merged; i++)
754 clear_exception(ps, ps->current_committed - 1 - i); 758 clear_exception(ps, ps->current_committed - 1 - i);
755 759
756 r = area_io(ps, WRITE); 760 r = area_io(ps, WRITE_FLUSH_FUA);
757 if (r < 0) 761 if (r < 0)
758 return r; 762 return r;
759 763
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 9ecff5f3023..6f758870fc1 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -30,16 +30,6 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
30 ((ti)->type->name == dm_snapshot_merge_target_name) 30 ((ti)->type->name == dm_snapshot_merge_target_name)
31 31
32/* 32/*
33 * The percentage increment we will wake up users at
34 */
35#define WAKE_UP_PERCENT 5
36
37/*
38 * kcopyd priority of snapshot operations
39 */
40#define SNAPSHOT_COPY_PRIORITY 2
41
42/*
43 * The size of the mempool used to track chunks in use. 33 * The size of the mempool used to track chunks in use.
44 */ 34 */
45#define MIN_IOS 256 35#define MIN_IOS 256
@@ -180,6 +170,13 @@ struct dm_snap_pending_exception {
180 * kcopyd. 170 * kcopyd.
181 */ 171 */
182 int started; 172 int started;
173
174 /*
175 * For writing a complete chunk, bypassing the copy.
176 */
177 struct bio *full_bio;
178 bio_end_io_t *full_bio_end_io;
179 void *full_bio_private;
183}; 180};
184 181
185/* 182/*
@@ -1055,8 +1052,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1055 1052
1056 s = kmalloc(sizeof(*s), GFP_KERNEL); 1053 s = kmalloc(sizeof(*s), GFP_KERNEL);
1057 if (!s) { 1054 if (!s) {
1058 ti->error = "Cannot allocate snapshot context private " 1055 ti->error = "Cannot allocate private snapshot structure";
1059 "structure";
1060 r = -ENOMEM; 1056 r = -ENOMEM;
1061 goto bad; 1057 goto bad;
1062 } 1058 }
@@ -1380,6 +1376,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
1380 struct dm_snapshot *s = pe->snap; 1376 struct dm_snapshot *s = pe->snap;
1381 struct bio *origin_bios = NULL; 1377 struct bio *origin_bios = NULL;
1382 struct bio *snapshot_bios = NULL; 1378 struct bio *snapshot_bios = NULL;
1379 struct bio *full_bio = NULL;
1383 int error = 0; 1380 int error = 0;
1384 1381
1385 if (!success) { 1382 if (!success) {
@@ -1415,10 +1412,15 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
1415 */ 1412 */
1416 dm_insert_exception(&s->complete, e); 1413 dm_insert_exception(&s->complete, e);
1417 1414
1418 out: 1415out:
1419 dm_remove_exception(&pe->e); 1416 dm_remove_exception(&pe->e);
1420 snapshot_bios = bio_list_get(&pe->snapshot_bios); 1417 snapshot_bios = bio_list_get(&pe->snapshot_bios);
1421 origin_bios = bio_list_get(&pe->origin_bios); 1418 origin_bios = bio_list_get(&pe->origin_bios);
1419 full_bio = pe->full_bio;
1420 if (full_bio) {
1421 full_bio->bi_end_io = pe->full_bio_end_io;
1422 full_bio->bi_private = pe->full_bio_private;
1423 }
1422 free_pending_exception(pe); 1424 free_pending_exception(pe);
1423 1425
1424 increment_pending_exceptions_done_count(); 1426 increment_pending_exceptions_done_count();
@@ -1426,10 +1428,15 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
1426 up_write(&s->lock); 1428 up_write(&s->lock);
1427 1429
1428 /* Submit any pending write bios */ 1430 /* Submit any pending write bios */
1429 if (error) 1431 if (error) {
1432 if (full_bio)
1433 bio_io_error(full_bio);
1430 error_bios(snapshot_bios); 1434 error_bios(snapshot_bios);
1431 else 1435 } else {
1436 if (full_bio)
1437 bio_endio(full_bio, 0);
1432 flush_bios(snapshot_bios); 1438 flush_bios(snapshot_bios);
1439 }
1433 1440
1434 retry_origin_bios(s, origin_bios); 1441 retry_origin_bios(s, origin_bios);
1435} 1442}
@@ -1480,8 +1487,33 @@ static void start_copy(struct dm_snap_pending_exception *pe)
1480 dest.count = src.count; 1487 dest.count = src.count;
1481 1488
1482 /* Hand over to kcopyd */ 1489 /* Hand over to kcopyd */
1483 dm_kcopyd_copy(s->kcopyd_client, 1490 dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
1484 &src, 1, &dest, 0, copy_callback, pe); 1491}
1492
1493static void full_bio_end_io(struct bio *bio, int error)
1494{
1495 void *callback_data = bio->bi_private;
1496
1497 dm_kcopyd_do_callback(callback_data, 0, error ? 1 : 0);
1498}
1499
1500static void start_full_bio(struct dm_snap_pending_exception *pe,
1501 struct bio *bio)
1502{
1503 struct dm_snapshot *s = pe->snap;
1504 void *callback_data;
1505
1506 pe->full_bio = bio;
1507 pe->full_bio_end_io = bio->bi_end_io;
1508 pe->full_bio_private = bio->bi_private;
1509
1510 callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
1511 copy_callback, pe);
1512
1513 bio->bi_end_io = full_bio_end_io;
1514 bio->bi_private = callback_data;
1515
1516 generic_make_request(bio);
1485} 1517}
1486 1518
1487static struct dm_snap_pending_exception * 1519static struct dm_snap_pending_exception *
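start_full_bio() leans on two kcopyd entry points added earlier in this series: dm_kcopyd_prepare_callback() allocates a job whose only purpose is to invoke the notify function later, and dm_kcopyd_do_callback() queues that completion once the caller's own I/O, here the whole-chunk bio, has finished. Their declarations as added to include/linux/dm-kcopyd.h:

void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
				 dm_kcopyd_notify_fn fn, void *context);
void dm_kcopyd_do_callback(void *job, int read_err, unsigned long write_err);

This lets copy_callback() serve both the kcopyd-copy path and the direct-write path, keeping pending-exception completion in one place.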
@@ -1519,6 +1551,7 @@ __find_pending_exception(struct dm_snapshot *s,
1519 bio_list_init(&pe->origin_bios); 1551 bio_list_init(&pe->origin_bios);
1520 bio_list_init(&pe->snapshot_bios); 1552 bio_list_init(&pe->snapshot_bios);
1521 pe->started = 0; 1553 pe->started = 0;
1554 pe->full_bio = NULL;
1522 1555
1523 if (s->store->type->prepare_exception(s->store, &pe->e)) { 1556 if (s->store->type->prepare_exception(s->store, &pe->e)) {
1524 free_pending_exception(pe); 1557 free_pending_exception(pe);
@@ -1612,10 +1645,19 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
1612 } 1645 }
1613 1646
1614 remap_exception(s, &pe->e, bio, chunk); 1647 remap_exception(s, &pe->e, bio, chunk);
1615 bio_list_add(&pe->snapshot_bios, bio);
1616 1648
1617 r = DM_MAPIO_SUBMITTED; 1649 r = DM_MAPIO_SUBMITTED;
1618 1650
1651 if (!pe->started &&
1652 bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) {
1653 pe->started = 1;
1654 up_write(&s->lock);
1655 start_full_bio(pe, bio);
1656 goto out;
1657 }
1658
1659 bio_list_add(&pe->snapshot_bios, bio);
1660
1619 if (!pe->started) { 1661 if (!pe->started) {
1620 /* this is protected by snap->lock */ 1662 /* this is protected by snap->lock */
1621 pe->started = 1; 1663 pe->started = 1;
@@ -1628,9 +1670,9 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
1628 map_context->ptr = track_chunk(s, chunk); 1670 map_context->ptr = track_chunk(s, chunk);
1629 } 1671 }
1630 1672
1631 out_unlock: 1673out_unlock:
1632 up_write(&s->lock); 1674 up_write(&s->lock);
1633 out: 1675out:
1634 return r; 1676 return r;
1635} 1677}
1636 1678
@@ -1974,7 +2016,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
1974 pe_to_start_now = pe; 2016 pe_to_start_now = pe;
1975 } 2017 }
1976 2018
1977 next_snapshot: 2019next_snapshot:
1978 up_write(&snap->lock); 2020 up_write(&snap->lock);
1979 2021
1980 if (pe_to_start_now) { 2022 if (pe_to_start_now) {
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index bfe9c2333ce..986b8754bb0 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -54,7 +54,6 @@ struct dm_table {
54 sector_t *highs; 54 sector_t *highs;
55 struct dm_target *targets; 55 struct dm_target *targets;
56 56
57 unsigned discards_supported:1;
58 unsigned integrity_supported:1; 57 unsigned integrity_supported:1;
59 58
60 /* 59 /*
@@ -154,12 +153,11 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size)
154 return NULL; 153 return NULL;
155 154
156 size = nmemb * elem_size; 155 size = nmemb * elem_size;
157 addr = vmalloc(size); 156 addr = vzalloc(size);
158 if (addr)
159 memset(addr, 0, size);
160 157
161 return addr; 158 return addr;
162} 159}
160EXPORT_SYMBOL(dm_vcalloc);
163 161
164/* 162/*
165 * highs, and targets are managed as dynamic arrays during a 163 * highs, and targets are managed as dynamic arrays during a
@@ -209,7 +207,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
209 INIT_LIST_HEAD(&t->devices); 207 INIT_LIST_HEAD(&t->devices);
210 INIT_LIST_HEAD(&t->target_callbacks); 208 INIT_LIST_HEAD(&t->target_callbacks);
211 atomic_set(&t->holders, 0); 209 atomic_set(&t->holders, 0);
212 t->discards_supported = 1;
213 210
214 if (!num_targets) 211 if (!num_targets)
215 num_targets = KEYS_PER_NODE; 212 num_targets = KEYS_PER_NODE;
@@ -281,6 +278,7 @@ void dm_table_get(struct dm_table *t)
281{ 278{
282 atomic_inc(&t->holders); 279 atomic_inc(&t->holders);
283} 280}
281EXPORT_SYMBOL(dm_table_get);
284 282
285void dm_table_put(struct dm_table *t) 283void dm_table_put(struct dm_table *t)
286{ 284{
@@ -290,6 +288,7 @@ void dm_table_put(struct dm_table *t)
290 smp_mb__before_atomic_dec(); 288 smp_mb__before_atomic_dec();
291 atomic_dec(&t->holders); 289 atomic_dec(&t->holders);
292} 290}
291EXPORT_SYMBOL(dm_table_put);
293 292
294/* 293/*
295 * Checks to see if we need to extend highs or targets. 294 * Checks to see if we need to extend highs or targets.
@@ -455,13 +454,14 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
455 * Add a device to the list, or just increment the usage count if 454 * Add a device to the list, or just increment the usage count if
456 * it's already present. 455 * it's already present.
457 */ 456 */
458static int __table_get_device(struct dm_table *t, struct dm_target *ti, 457int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
459 const char *path, fmode_t mode, struct dm_dev **result) 458 struct dm_dev **result)
460{ 459{
461 int r; 460 int r;
462 dev_t uninitialized_var(dev); 461 dev_t uninitialized_var(dev);
463 struct dm_dev_internal *dd; 462 struct dm_dev_internal *dd;
464 unsigned int major, minor; 463 unsigned int major, minor;
464 struct dm_table *t = ti->table;
465 465
466 BUG_ON(!t); 466 BUG_ON(!t);
467 467
@@ -509,6 +509,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
509 *result = &dd->dm_dev; 509 *result = &dd->dm_dev;
510 return 0; 510 return 0;
511} 511}
512EXPORT_SYMBOL(dm_get_device);
512 513
513int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, 514int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
514 sector_t start, sector_t len, void *data) 515 sector_t start, sector_t len, void *data)
@@ -539,23 +540,15 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
539 * If not we'll force DM to use PAGE_SIZE or 540 * If not we'll force DM to use PAGE_SIZE or
540 * smaller I/O, just to be safe. 541 * smaller I/O, just to be safe.
541 */ 542 */
542 543 if (dm_queue_merge_is_compulsory(q) && !ti->type->merge)
543 if (q->merge_bvec_fn && !ti->type->merge)
544 blk_limits_max_hw_sectors(limits, 544 blk_limits_max_hw_sectors(limits,
545 (unsigned int) (PAGE_SIZE >> 9)); 545 (unsigned int) (PAGE_SIZE >> 9));
546 return 0; 546 return 0;
547} 547}
548EXPORT_SYMBOL_GPL(dm_set_device_limits); 548EXPORT_SYMBOL_GPL(dm_set_device_limits);
549 549
550int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
551 struct dm_dev **result)
552{
553 return __table_get_device(ti->table, ti, path, mode, result);
554}
555
556
557/* 550/*
558 * Decrement a devices use count and remove it if necessary. 551 * Decrement a device's use count and remove it if necessary.
559 */ 552 */
560void dm_put_device(struct dm_target *ti, struct dm_dev *d) 553void dm_put_device(struct dm_target *ti, struct dm_dev *d)
561{ 554{
@@ -568,6 +561,7 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d)
568 kfree(dd); 561 kfree(dd);
569 } 562 }
570} 563}
564EXPORT_SYMBOL(dm_put_device);
571 565
572/* 566/*
573 * Checks to see if the target joins onto the end of the table. 567 * Checks to see if the target joins onto the end of the table.
@@ -791,8 +785,9 @@ int dm_table_add_target(struct dm_table *t, const char *type,
791 785
792 t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; 786 t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
793 787
794 if (!tgt->num_discard_requests) 788 if (!tgt->num_discard_requests && tgt->discards_supported)
795 t->discards_supported = 0; 789 DMWARN("%s: %s: ignoring discards_supported because num_discard_requests is zero.",
790 dm_device_name(t->md), type);
796 791
797 return 0; 792 return 0;
798 793
@@ -802,6 +797,63 @@ int dm_table_add_target(struct dm_table *t, const char *type,
802 return r; 797 return r;
803} 798}
804 799
800/*
801 * Target argument parsing helpers.
802 */
803static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
804 unsigned *value, char **error, unsigned grouped)
805{
806 const char *arg_str = dm_shift_arg(arg_set);
807
808 if (!arg_str ||
809 (sscanf(arg_str, "%u", value) != 1) ||
810 (*value < arg->min) ||
811 (*value > arg->max) ||
812 (grouped && arg_set->argc < *value)) {
813 *error = arg->error;
814 return -EINVAL;
815 }
816
817 return 0;
818}
819
820int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
821 unsigned *value, char **error)
822{
823 return validate_next_arg(arg, arg_set, value, error, 0);
824}
825EXPORT_SYMBOL(dm_read_arg);
826
827int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set,
828 unsigned *value, char **error)
829{
830 return validate_next_arg(arg, arg_set, value, error, 1);
831}
832EXPORT_SYMBOL(dm_read_arg_group);
833
834const char *dm_shift_arg(struct dm_arg_set *as)
835{
836 char *r;
837
838 if (as->argc) {
839 as->argc--;
840 r = *as->argv;
841 as->argv++;
842 return r;
843 }
844
845 return NULL;
846}
847EXPORT_SYMBOL(dm_shift_arg);
848
849void dm_consume_args(struct dm_arg_set *as, unsigned num_args)
850{
851 BUG_ON(as->argc < num_args);
852 as->argc -= num_args;
853 as->argv += num_args;
854}
855EXPORT_SYMBOL(dm_consume_args);
856
805static int dm_table_set_type(struct dm_table *t) 857static int dm_table_set_type(struct dm_table *t)
806{ 858{
807 unsigned i; 859 unsigned i;
@@ -1077,11 +1129,13 @@ void dm_table_event(struct dm_table *t)
1077 t->event_fn(t->event_context); 1129 t->event_fn(t->event_context);
1078 mutex_unlock(&_event_lock); 1130 mutex_unlock(&_event_lock);
1079} 1131}
1132EXPORT_SYMBOL(dm_table_event);
1080 1133
1081sector_t dm_table_get_size(struct dm_table *t) 1134sector_t dm_table_get_size(struct dm_table *t)
1082{ 1135{
1083 return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; 1136 return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
1084} 1137}
1138EXPORT_SYMBOL(dm_table_get_size);
1085 1139
1086struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) 1140struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
1087{ 1141{
@@ -1194,9 +1248,45 @@ static void dm_table_set_integrity(struct dm_table *t)
1194 blk_get_integrity(template_disk)); 1248 blk_get_integrity(template_disk));
1195} 1249}
1196 1250
1251static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
1252 sector_t start, sector_t len, void *data)
1253{
1254 unsigned flush = (*(unsigned *)data);
1255 struct request_queue *q = bdev_get_queue(dev->bdev);
1256
1257 return q && (q->flush_flags & flush);
1258}
1259
1260static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
1261{
1262 struct dm_target *ti;
1263 unsigned i = 0;
1264
1265 /*
1266 * Require at least one underlying device to support flushes.
1267 * t->devices includes internal dm devices such as mirror logs
1268 * so we need to use iterate_devices here, which targets
1269 * supporting flushes must provide.
1270 */
1271 while (i < dm_table_get_num_targets(t)) {
1272 ti = dm_table_get_target(t, i++);
1273
1274 if (!ti->num_flush_requests)
1275 continue;
1276
1277 if (ti->type->iterate_devices &&
1278 ti->type->iterate_devices(ti, device_flush_capable, &flush))
1279 return 1;
1280 }
1281
1282 return 0;
1283}
1284
1197void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, 1285void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
1198 struct queue_limits *limits) 1286 struct queue_limits *limits)
1199{ 1287{
1288 unsigned flush = 0;
1289
1200 /* 1290 /*
1201 * Copy table's limits to the DM device's request_queue 1291 * Copy table's limits to the DM device's request_queue
1202 */ 1292 */
@@ -1207,6 +1297,13 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
1207 else 1297 else
1208 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); 1298 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
1209 1299
1300 if (dm_table_supports_flush(t, REQ_FLUSH)) {
1301 flush |= REQ_FLUSH;
1302 if (dm_table_supports_flush(t, REQ_FUA))
1303 flush |= REQ_FUA;
1304 }
1305 blk_queue_flush(q, flush);
1306
1210 dm_table_set_integrity(t); 1307 dm_table_set_integrity(t);
1211 1308
1212 /* 1309 /*
@@ -1237,6 +1334,7 @@ fmode_t dm_table_get_mode(struct dm_table *t)
1237{ 1334{
1238 return t->mode; 1335 return t->mode;
1239} 1336}
1337EXPORT_SYMBOL(dm_table_get_mode);
1240 1338
1241static void suspend_targets(struct dm_table *t, unsigned postsuspend) 1339static void suspend_targets(struct dm_table *t, unsigned postsuspend)
1242{ 1340{
@@ -1345,6 +1443,7 @@ struct mapped_device *dm_table_get_md(struct dm_table *t)
1345{ 1443{
1346 return t->md; 1444 return t->md;
1347} 1445}
1446EXPORT_SYMBOL(dm_table_get_md);
1348 1447
1349static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, 1448static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
1350 sector_t start, sector_t len, void *data) 1449 sector_t start, sector_t len, void *data)
@@ -1359,19 +1458,19 @@ bool dm_table_supports_discards(struct dm_table *t)
1359 struct dm_target *ti; 1458 struct dm_target *ti;
1360 unsigned i = 0; 1459 unsigned i = 0;
1361 1460
1362 if (!t->discards_supported)
1363 return 0;
1364
1365 /* 1461 /*
1366 * Unless any target used by the table set discards_supported, 1462 * Unless any target used by the table set discards_supported,
1367 * require at least one underlying device to support discards. 1463 * require at least one underlying device to support discards.
1368 * t->devices includes internal dm devices such as mirror logs 1464 * t->devices includes internal dm devices such as mirror logs
1369 * so we need to use iterate_devices here, which targets 1465 * so we need to use iterate_devices here, which targets
1370 * supporting discard must provide. 1466 * supporting discard selectively must provide.
1371 */ 1467 */
1372 while (i < dm_table_get_num_targets(t)) { 1468 while (i < dm_table_get_num_targets(t)) {
1373 ti = dm_table_get_target(t, i++); 1469 ti = dm_table_get_target(t, i++);
1374 1470
1471 if (!ti->num_discard_requests)
1472 continue;
1473
1375 if (ti->discards_supported) 1474 if (ti->discards_supported)
1376 return 1; 1475 return 1;
1377 1476
@@ -1382,13 +1481,3 @@ bool dm_table_supports_discards(struct dm_table *t)
1382 1481
1383 return 0; 1482 return 0;
1384} 1483}
1385
1386EXPORT_SYMBOL(dm_vcalloc);
1387EXPORT_SYMBOL(dm_get_device);
1388EXPORT_SYMBOL(dm_put_device);
1389EXPORT_SYMBOL(dm_table_event);
1390EXPORT_SYMBOL(dm_table_get_size);
1391EXPORT_SYMBOL(dm_table_get_mode);
1392EXPORT_SYMBOL(dm_table_get_md);
1393EXPORT_SYMBOL(dm_table_put);
1394EXPORT_SYMBOL(dm_table_get);
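
dm_table_set_restrictions() above derives the queue's flush flags from the table: REQ_FUA is only advertised once REQ_FLUSH is, and each flag requires at least one capable underlying device. A userspace sketch of that derivation, with stand-in flag values and a simplified capability test in place of dm_table_supports_flush():

#include <stdbool.h>
#include <stdio.h>

#define REQ_FLUSH (1u << 0)	/* stand-in value, not the kernel's */
#define REQ_FUA   (1u << 1)	/* stand-in value, not the kernel's */

/* Stand-in for dm_table_supports_flush(t, flag). */
static bool table_supports(unsigned device_flags, unsigned flag)
{
	return device_flags & flag;
}

static unsigned derive_flush_flags(unsigned device_flags)
{
	unsigned flush = 0;

	if (table_supports(device_flags, REQ_FLUSH)) {
		flush |= REQ_FLUSH;
		if (table_supports(device_flags, REQ_FUA))
			flush |= REQ_FUA;
	}
	return flush;	/* what would be handed to blk_queue_flush() */
}

int main(void)
{
	printf("%u\n", derive_flush_flags(REQ_FUA));		/* 0: FUA alone is never advertised */
	printf("%u\n", derive_flush_flags(REQ_FLUSH | REQ_FUA));	/* 3 */
	return 0;
}
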
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0cf68b47887..52b39f335bb 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -37,6 +37,8 @@ static const char *_name = DM_NAME;
37static unsigned int major = 0; 37static unsigned int major = 0;
38static unsigned int _major = 0; 38static unsigned int _major = 0;
39 39
40static DEFINE_IDR(_minor_idr);
41
40static DEFINE_SPINLOCK(_minor_lock); 42static DEFINE_SPINLOCK(_minor_lock);
41/* 43/*
42 * For bio-based dm. 44 * For bio-based dm.
@@ -109,6 +111,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
109#define DMF_FREEING 3 111#define DMF_FREEING 3
110#define DMF_DELETING 4 112#define DMF_DELETING 4
111#define DMF_NOFLUSH_SUSPENDING 5 113#define DMF_NOFLUSH_SUSPENDING 5
114#define DMF_MERGE_IS_OPTIONAL 6
112 115
113/* 116/*
114 * Work processed by per-device workqueue. 117 * Work processed by per-device workqueue.
@@ -313,6 +316,12 @@ static void __exit dm_exit(void)
313 316
314 while (i--) 317 while (i--)
315 _exits[i](); 318 _exits[i]();
319
320 /*
321 * Should be empty by this point.
322 */
323 idr_remove_all(&_minor_idr);
324 idr_destroy(&_minor_idr);
316} 325}
317 326
318/* 327/*
@@ -1171,7 +1180,8 @@ static int __clone_and_map_discard(struct clone_info *ci)
1171 1180
1172 /* 1181 /*
1173 * Even though the device advertised discard support, 1182 * Even though the device advertised discard support,
1174 * reconfiguration might have changed that since the 1183 * that does not mean every target supports it, and
1184 * reconfiguration might also have changed that since the
1175 * check was performed. 1185 * check was performed.
1176 */ 1186 */
1177 if (!ti->num_discard_requests) 1187 if (!ti->num_discard_requests)
@@ -1705,8 +1715,6 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
1705/*----------------------------------------------------------------- 1715/*-----------------------------------------------------------------
1706 * An IDR is used to keep track of allocated minor numbers. 1716 * An IDR is used to keep track of allocated minor numbers.
1707 *---------------------------------------------------------------*/ 1717 *---------------------------------------------------------------*/
1708static DEFINE_IDR(_minor_idr);
1709
1710static void free_minor(int minor) 1718static void free_minor(int minor)
1711{ 1719{
1712 spin_lock(&_minor_lock); 1720 spin_lock(&_minor_lock);
@@ -1800,7 +1808,6 @@ static void dm_init_md_queue(struct mapped_device *md)
1800 blk_queue_make_request(md->queue, dm_request); 1808 blk_queue_make_request(md->queue, dm_request);
1801 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 1809 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1802 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 1810 blk_queue_merge_bvec(md->queue, dm_merge_bvec);
1803 blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
1804} 1811}
1805 1812
1806/* 1813/*
@@ -1986,6 +1993,59 @@ static void __set_size(struct mapped_device *md, sector_t size)
1986} 1993}
1987 1994
1988/* 1995/*
 1996 * Return 1 if the queue has a compulsory merge_bvec_fn.
1997 *
1998 * If this function returns 0, then the device is either a non-dm
1999 * device without a merge_bvec_fn, or it is a dm device that is
2000 * able to split any bios it receives that are too big.
2001 */
2002int dm_queue_merge_is_compulsory(struct request_queue *q)
2003{
2004 struct mapped_device *dev_md;
2005
2006 if (!q->merge_bvec_fn)
2007 return 0;
2008
2009 if (q->make_request_fn == dm_request) {
2010 dev_md = q->queuedata;
2011 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
2012 return 0;
2013 }
2014
2015 return 1;
2016}
2017
2018static int dm_device_merge_is_compulsory(struct dm_target *ti,
2019 struct dm_dev *dev, sector_t start,
2020 sector_t len, void *data)
2021{
2022 struct block_device *bdev = dev->bdev;
2023 struct request_queue *q = bdev_get_queue(bdev);
2024
2025 return dm_queue_merge_is_compulsory(q);
2026}
2027
2028/*
2029 * Return 1 if it is acceptable to ignore merge_bvec_fn based
2030 * on the properties of the underlying devices.
2031 */
2032static int dm_table_merge_is_optional(struct dm_table *table)
2033{
2034 unsigned i = 0;
2035 struct dm_target *ti;
2036
2037 while (i < dm_table_get_num_targets(table)) {
2038 ti = dm_table_get_target(table, i++);
2039
2040 if (ti->type->iterate_devices &&
2041 ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
2042 return 0;
2043 }
2044
2045 return 1;
2046}
2047
2048/*
1989 * Returns old map, which caller must destroy. 2049 * Returns old map, which caller must destroy.
1990 */ 2050 */
1991static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 2051static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
@@ -1995,6 +2055,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
1995 struct request_queue *q = md->queue; 2055 struct request_queue *q = md->queue;
1996 sector_t size; 2056 sector_t size;
1997 unsigned long flags; 2057 unsigned long flags;
2058 int merge_is_optional;
1998 2059
1999 size = dm_table_get_size(t); 2060 size = dm_table_get_size(t);
2000 2061
@@ -2020,10 +2081,16 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2020 2081
2021 __bind_mempools(md, t); 2082 __bind_mempools(md, t);
2022 2083
2084 merge_is_optional = dm_table_merge_is_optional(t);
2085
2023 write_lock_irqsave(&md->map_lock, flags); 2086 write_lock_irqsave(&md->map_lock, flags);
2024 old_map = md->map; 2087 old_map = md->map;
2025 md->map = t; 2088 md->map = t;
2026 dm_table_set_restrictions(t, q, limits); 2089 dm_table_set_restrictions(t, q, limits);
2090 if (merge_is_optional)
2091 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2092 else
2093 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2027 write_unlock_irqrestore(&md->map_lock, flags); 2094 write_unlock_irqrestore(&md->map_lock, flags);
2028 2095
2029 return old_map; 2096 return old_map;
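
The DMF_MERGE_IS_OPTIONAL machinery above boils down to a three-way decision per underlying queue. A userspace sketch of the truth table implemented by dm_queue_merge_is_compulsory(), with plain structs standing in for the request_queue and mapped_device state it inspects:

#include <stdbool.h>
#include <stdio.h>

struct fake_queue {
	bool has_merge_bvec_fn;	/* q->merge_bvec_fn != NULL            */
	bool is_dm;		/* q->make_request_fn == dm_request    */
	bool merge_is_optional;	/* DMF_MERGE_IS_OPTIONAL set on the md */
};

static bool merge_is_compulsory(const struct fake_queue *q)
{
	if (!q->has_merge_bvec_fn)
		return false;	/* nothing to honour */
	if (q->is_dm && q->merge_is_optional)
		return false;	/* stacked dm that can split oversized bios */
	return true;		/* merge_bvec_fn must be respected */
}

int main(void)
{
	struct fake_queue plain = { false, false, false };
	struct fake_queue raid  = { true,  false, false };
	struct fake_queue dm_ok = { true,  true,  true  };

	printf("%d %d %d\n", merge_is_compulsory(&plain),
	       merge_is_compulsory(&raid), merge_is_compulsory(&dm_ok));
	/* prints: 0 1 0 */
	return 0;
}
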
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 1aaf16746da..6745dbd278a 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -66,6 +66,8 @@ int dm_table_alloc_md_mempools(struct dm_table *t);
66void dm_table_free_md_mempools(struct dm_table *t); 66void dm_table_free_md_mempools(struct dm_table *t);
67struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); 67struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
68 68
69int dm_queue_merge_is_compulsory(struct request_queue *q);
70
69void dm_lock_md_type(struct mapped_device *md); 71void dm_lock_md_type(struct mapped_device *md);
70void dm_unlock_md_type(struct mapped_device *md); 72void dm_unlock_md_type(struct mapped_device *md);
71void dm_set_md_type(struct mapped_device *md, unsigned type); 73void dm_set_md_type(struct mapped_device *md, unsigned type);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 4427e045405..3fa1f3d90ce 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -208,6 +208,49 @@ struct dm_target_callbacks {
208int dm_register_target(struct target_type *t); 208int dm_register_target(struct target_type *t);
209void dm_unregister_target(struct target_type *t); 209void dm_unregister_target(struct target_type *t);
210 210
211/*
212 * Target argument parsing.
213 */
214struct dm_arg_set {
215 unsigned argc;
216 char **argv;
217};
218
219/*
220 * The minimum and maximum value of a numeric argument, together with
221 * the error message to use if the number is found to be outside that range.
222 */
223struct dm_arg {
224 unsigned min;
225 unsigned max;
226 char *error;
227};
228
229/*
230 * Validate the next argument, either returning it as *value or, if invalid,
231 * returning -EINVAL and setting *error.
232 */
233int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
234 unsigned *value, char **error);
235
236/*
237 * Process the next argument as the start of a group containing between
238 * arg->min and arg->max further arguments. Either return the size as
239 * *num_args or, if invalid, return -EINVAL and set *error.
240 */
241int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set,
242 unsigned *num_args, char **error);
243
244/*
245 * Return the current argument and shift to the next.
246 */
247const char *dm_shift_arg(struct dm_arg_set *as);
248
249/*
250 * Move through num_args arguments.
251 */
252void dm_consume_args(struct dm_arg_set *as, unsigned num_args);
253
211/*----------------------------------------------------------------- 254/*-----------------------------------------------------------------
212 * Functions for creating and manipulating mapped devices. 255 * Functions for creating and manipulating mapped devices.
213 * Drop the reference with dm_put when you finish with the object. 256 * Drop the reference with dm_put when you finish with the object.
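
A hedged sketch of how a target constructor might drive this interface to consume an optional feature group, in the style the dm-flakey and dm-crypt changes in this merge adopt. The parse_features() wrapper and the "some_feature" flag are hypothetical; only struct dm_arg, struct dm_arg_set, dm_read_arg_group() and dm_shift_arg() come from the declarations above:

#include <linux/device-mapper.h>
#include <linux/errno.h>
#include <linux/string.h>

static int parse_features(struct dm_arg_set *as, struct dm_target *ti)
{
	int r;
	unsigned argc;
	const char *arg_name;

	static struct dm_arg _args[] = {
		{0, 4, "Invalid number of feature arguments"},
	};

	if (!as->argc)
		return 0;	/* an empty feature section is valid */

	/* Reads "<#features>" and rejects anything outside 0..4. */
	r = dm_read_arg_group(_args, as, &argc, &ti->error);
	if (r)
		return r;

	while (argc--) {
		arg_name = dm_shift_arg(as);

		if (!strcasecmp(arg_name, "some_feature"))	/* hypothetical flag */
			continue;

		ti->error = "Unrecognised feature requested";
		return -EINVAL;
	}

	return 0;
}
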
diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h
index 3708455ee6c..0cb8eff76bd 100644
--- a/include/linux/dm-ioctl.h
+++ b/include/linux/dm-ioctl.h
@@ -267,9 +267,9 @@ enum {
267#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) 267#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
268 268
269#define DM_VERSION_MAJOR 4 269#define DM_VERSION_MAJOR 4
270#define DM_VERSION_MINOR 20 270#define DM_VERSION_MINOR 21
271#define DM_VERSION_PATCHLEVEL 0 271#define DM_VERSION_PATCHLEVEL 0
272#define DM_VERSION_EXTRA "-ioctl (2011-02-02)" 272#define DM_VERSION_EXTRA "-ioctl (2011-07-06)"
273 273
274/* Status bits */ 274/* Status bits */
275#define DM_READONLY_FLAG (1 << 0) /* In/Out */ 275#define DM_READONLY_FLAG (1 << 0) /* In/Out */
diff --git a/include/linux/dm-kcopyd.h b/include/linux/dm-kcopyd.h
index 298d587e349..5e54458e920 100644
--- a/include/linux/dm-kcopyd.h
+++ b/include/linux/dm-kcopyd.h
@@ -42,5 +42,20 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
42 unsigned num_dests, struct dm_io_region *dests, 42 unsigned num_dests, struct dm_io_region *dests,
43 unsigned flags, dm_kcopyd_notify_fn fn, void *context); 43 unsigned flags, dm_kcopyd_notify_fn fn, void *context);
44 44
45/*
46 * Prepare a callback and submit it via the kcopyd thread.
47 *
48 * dm_kcopyd_prepare_callback allocates a callback structure and returns it.
49 * It must not be called from interrupt context.
50 * The returned value should be passed into dm_kcopyd_do_callback.
51 *
52 * dm_kcopyd_do_callback submits the callback.
53 * It may be called from interrupt context.
54 * The callback is issued from the kcopyd thread.
55 */
56void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
57 dm_kcopyd_notify_fn fn, void *context);
58void dm_kcopyd_do_callback(void *job, int read_err, unsigned long write_err);
59
45#endif /* __KERNEL__ */ 60#endif /* __KERNEL__ */
46#endif /* _LINUX_DM_KCOPYD_H */ 61#endif /* _LINUX_DM_KCOPYD_H */
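
To close, a hedged usage sketch of the new kcopyd callback pair documented above, modelled on how start_full_bio() in dm-snap.c uses it: prepare the job in process context, then fire it later, possibly from interrupt context. my_done() and submit_with_callback() are hypothetical names:

#include <linux/dm-kcopyd.h>

/* Hypothetical dm_kcopyd_notify_fn; it is invoked from the kcopyd thread. */
static void my_done(int read_err, unsigned long write_err, void *context)
{
	/* handle completion, exactly as a notify_fn passed to dm_kcopyd_copy */
}

static void submit_with_callback(struct dm_kcopyd_client *kc, void *context)
{
	void *job;

	/* Must be called from process context. */
	job = dm_kcopyd_prepare_callback(kc, my_done, context);

	/* ... issue the I/O whose completion should trigger the callback ... */

	/* May be called from interrupt context; my_done() still runs in kcopyd. */
	dm_kcopyd_do_callback(job, 0 /* read_err */, 0 /* write_err */);
}
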