-rw-r--r--  Documentation/device-mapper/dm-crypt.txt   |   7
-rw-r--r--  Documentation/device-mapper/dm-raid.txt    |  70
-rw-r--r--  drivers/md/Kconfig                         |  24
-rw-r--r--  drivers/md/Makefile                        |   1
-rw-r--r--  drivers/md/dm-crypt.c                      | 618
-rw-r--r--  drivers/md/dm-delay.c                      |   2
-rw-r--r--  drivers/md/dm-ioctl.c                      | 111
-rw-r--r--  drivers/md/dm-kcopyd.c                     |  57
-rw-r--r--  drivers/md/dm-log-userspace-base.c         | 139
-rw-r--r--  drivers/md/dm-log-userspace-transfer.c     |   1
-rw-r--r--  drivers/md/dm-log.c                        |   2
-rw-r--r--  drivers/md/dm-mpath.c                      |  67
-rw-r--r--  drivers/md/dm-raid.c                       | 697
-rw-r--r--  drivers/md/dm-raid1.c                      |  19
-rw-r--r--  drivers/md/dm-snap-persistent.c            |   4
-rw-r--r--  drivers/md/dm-snap.c                       |  62
-rw-r--r--  drivers/md/dm-stripe.c                     |  27
-rw-r--r--  drivers/md/dm-table.c                      |  19
-rw-r--r--  drivers/md/dm.c                            |  23
-rw-r--r--  include/linux/device-mapper.h              |  12
-rw-r--r--  include/linux/dm-ioctl.h                   |  14
-rw-r--r--  include/linux/dm-log-userspace.h           |  13
22 files changed, 1691 insertions(+), 298 deletions(-)
diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt
index 524de926290d..59293ac4a5d0 100644
--- a/Documentation/device-mapper/dm-crypt.txt
+++ b/Documentation/device-mapper/dm-crypt.txt
@@ -8,7 +8,7 @@ Parameters: <cipher> <key> <iv_offset> <device path> <offset>
8 8
9<cipher> 9<cipher>
10 Encryption cipher and an optional IV generation mode. 10 Encryption cipher and an optional IV generation mode.
11 (In format cipher-chainmode-ivopts:ivmode). 11 (In format cipher[:keycount]-chainmode-ivopts:ivmode).
12 Examples: 12 Examples:
13 des 13 des
14 aes-cbc-essiv:sha256 14 aes-cbc-essiv:sha256
@@ -20,6 +20,11 @@ Parameters: <cipher> <key> <iv_offset> <device path> <offset>
20 Key used for encryption. It is encoded as a hexadecimal number. 20 Key used for encryption. It is encoded as a hexadecimal number.
21 You can only use key sizes that are valid for the selected cipher. 21 You can only use key sizes that are valid for the selected cipher.
22 22
23<keycount>
24 Multi-key compatibility mode. You can define <keycount> keys and
25 then sectors are encrypted according to their offsets (sector 0 uses key0;
26 sector 1 uses key1 etc.). <keycount> must be a power of two.
27
23<iv_offset> 28<iv_offset>
24 The IV offset is a sector count that is added to the sector number 29 The IV offset is a sector count that is added to the sector number
25 before creating the IV. 30 before creating the IV.
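As a sketch of the multi-key syntax described above (device path, sizes and key placeholders are hypothetical, not part of the patch), a two-key mapping table could look like:

    0 409600 crypt aes:2-cbc-essiv:sha256 <key0><key1> 0 /dev/sdb 0

where <key0><key1> is the two equal-sized keys concatenated into a single hexadecimal string; even-numbered sectors are then encrypted with key0 and odd-numbered sectors with key1.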
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
new file mode 100644
index 000000000000..33b6b7071ac8
--- /dev/null
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -0,0 +1,70 @@
1Device-mapper RAID (dm-raid) is a bridge from DM to MD. It
2provides a way to use device-mapper interfaces to access the MD RAID
3drivers.
4
5As with all device-mapper targets, the nominal public interfaces are the
6constructor (CTR) tables and the status outputs (both STATUSTYPE_INFO
7and STATUSTYPE_TABLE). The CTR table looks like the following:
8
91: <s> <l> raid \
102: <raid_type> <#raid_params> <raid_params> \
113: <#raid_devs> <meta_dev1> <dev1> .. <meta_devN> <devN>
12
13Line 1 contains the standard first three arguments to any device-mapper
14target - the start, length, and target type fields. The target type in
15this case is "raid".
16
17Line 2 contains the arguments that define the particular raid
18type/personality/level, the required arguments for that raid type, and
19any optional arguments. Possible raid types include: raid4, raid5_la,
20raid5_ls, raid5_rs, raid6_zr, raid6_nr, and raid6_nc. (raid1 is
21planned for the future.) The list of required and optional parameters
22is the same for all the current raid types. The required parameters are
23positional, while the optional parameters are given as key/value pairs.
24The possible parameters are as follows:
25 <chunk_size> Chunk size in sectors.
26 [[no]sync] Force/Prevent RAID initialization
27 [rebuild <idx>] Rebuild the drive indicated by the index
 28 [daemon_sleep <ms>] Time between runs of the bitmap daemon that clear bits
29 [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization
30 [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization
 31 [max_write_behind <sectors>] See '--write-behind=' (man mdadm)
32 [stripe_cache <sectors>] Stripe cache size for higher RAIDs
33
34Line 3 contains the list of devices that compose the array in
35metadata/data device pairs. If the metadata is stored separately, a '-'
36is given for the metadata device position. If a drive has failed or is
37missing at creation time, a '-' can be given for both the metadata and
38data drives for a given position.
39
40NB. Currently all metadata devices must be specified as '-'.
41
42Examples:
43# RAID4 - 4 data drives, 1 parity
44# No metadata devices specified to hold superblock/bitmap info
45# Chunk size of 1MiB
46# (Lines separated for easy reading)
470 1960893648 raid \
48 raid4 1 2048 \
49 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
50
51# RAID4 - 4 data drives, 1 parity (no metadata devices)
52# Chunk size of 1MiB, force RAID initialization,
 53# min recovery rate at 20 kB/sec/disk
540 1960893648 raid \
 55 raid4 4 2048 min_recovery_rate 20 sync \
56 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
57
58Performing a 'dmsetup table' should display the CTR table used to
59construct the mapping (with possible reordering of optional
60parameters).
61
62Performing a 'dmsetup status' will yield information on the state and
63health of the array. The output is as follows:
641: <s> <l> raid \
652: <raid_type> <#devices> <1 health char for each dev> <resync_ratio>
66
67Line 1 is standard DM output. Line 2 is best shown by example:
68 0 1960893648 raid raid4 5 AAAAA 2/490221568
69Here we can see the RAID type is raid4, there are 5 devices - all of
 70which are 'A'live - and the array's recovery is 2/490221568 complete.
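For reference, such a table is normally loaded through the device-mapper ioctl interface, e.g. via dmsetup; a hypothetical invocation matching the first example above might be:

    dmsetup create my_raid4 --table "0 1960893648 raid raid4 1 2048 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81"
    dmsetup status my_raid4

(the device name my_raid4 is arbitrary; 'dmsetup status' then prints the STATUSTYPE_INFO line shown above).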
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index bf1a95e31559..98d9ec85e0eb 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -240,6 +240,30 @@ config DM_MIRROR
240 Allow volume managers to mirror logical volumes, also 240 Allow volume managers to mirror logical volumes, also
241 needed for live data migration tools such as 'pvmove'. 241 needed for live data migration tools such as 'pvmove'.
242 242
243config DM_RAID
244 tristate "RAID 4/5/6 target (EXPERIMENTAL)"
245 depends on BLK_DEV_DM && EXPERIMENTAL
246 select MD_RAID456
247 select BLK_DEV_MD
248 ---help---
249 A dm target that supports RAID4, RAID5 and RAID6 mappings
250
251 A RAID-5 set of N drives with a capacity of C MB per drive provides
252 the capacity of C * (N - 1) MB, and protects against a failure
253 of a single drive. For a given sector (row) number, (N - 1) drives
254 contain data sectors, and one drive contains the parity protection.
255 For a RAID-4 set, the parity blocks are present on a single drive,
256 while a RAID-5 set distributes the parity across the drives in one
257 of the available parity distribution methods.
258
259 A RAID-6 set of N drives with a capacity of C MB per drive
260 provides the capacity of C * (N - 2) MB, and protects
261 against a failure of any two drives. For a given sector
262 (row) number, (N - 2) drives contain data sectors, and two
 263 drives contain two independent redundancy syndromes. Like
264 RAID-5, RAID-6 distributes the syndromes across the drives
265 in one of the available parity distribution methods.
266
243config DM_LOG_USERSPACE 267config DM_LOG_USERSPACE
244 tristate "Mirror userspace logging (EXPERIMENTAL)" 268 tristate "Mirror userspace logging (EXPERIMENTAL)"
245 depends on DM_MIRROR && EXPERIMENTAL && NET 269 depends on DM_MIRROR && EXPERIMENTAL && NET
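To make the capacity formulas above concrete (drive sizes are hypothetical): a RAID-5 set built from five 1000 MB drives provides 4 * 1000 = 4000 MB of usable space and survives one drive failure; the same five drives as RAID-6 provide 3 * 1000 = 3000 MB and survive any two drive failures.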
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 5e3aac41919d..d0138606c2e8 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
36obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o 36obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
37obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o 37obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
38obj-$(CONFIG_DM_ZERO) += dm-zero.o 38obj-$(CONFIG_DM_ZERO) += dm-zero.o
39obj-$(CONFIG_DM_RAID) += dm-raid.o
39 40
40ifeq ($(CONFIG_DM_UEVENT),y) 41ifeq ($(CONFIG_DM_UEVENT),y)
41dm-mod-objs += dm-uevent.o 42dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index d5b0e4c0e702..4e054bd91664 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -18,10 +18,14 @@
18#include <linux/crypto.h> 18#include <linux/crypto.h>
19#include <linux/workqueue.h> 19#include <linux/workqueue.h>
20#include <linux/backing-dev.h> 20#include <linux/backing-dev.h>
21#include <linux/percpu.h>
21#include <asm/atomic.h> 22#include <asm/atomic.h>
22#include <linux/scatterlist.h> 23#include <linux/scatterlist.h>
23#include <asm/page.h> 24#include <asm/page.h>
24#include <asm/unaligned.h> 25#include <asm/unaligned.h>
26#include <crypto/hash.h>
27#include <crypto/md5.h>
28#include <crypto/algapi.h>
25 29
26#include <linux/device-mapper.h> 30#include <linux/device-mapper.h>
27 31
@@ -63,6 +67,7 @@ struct dm_crypt_request {
63 struct convert_context *ctx; 67 struct convert_context *ctx;
64 struct scatterlist sg_in; 68 struct scatterlist sg_in;
65 struct scatterlist sg_out; 69 struct scatterlist sg_out;
70 sector_t iv_sector;
66}; 71};
67 72
68struct crypt_config; 73struct crypt_config;
@@ -73,11 +78,13 @@ struct crypt_iv_operations {
73 void (*dtr)(struct crypt_config *cc); 78 void (*dtr)(struct crypt_config *cc);
74 int (*init)(struct crypt_config *cc); 79 int (*init)(struct crypt_config *cc);
75 int (*wipe)(struct crypt_config *cc); 80 int (*wipe)(struct crypt_config *cc);
76 int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); 81 int (*generator)(struct crypt_config *cc, u8 *iv,
82 struct dm_crypt_request *dmreq);
83 int (*post)(struct crypt_config *cc, u8 *iv,
84 struct dm_crypt_request *dmreq);
77}; 85};
78 86
79struct iv_essiv_private { 87struct iv_essiv_private {
80 struct crypto_cipher *tfm;
81 struct crypto_hash *hash_tfm; 88 struct crypto_hash *hash_tfm;
82 u8 *salt; 89 u8 *salt;
83}; 90};
@@ -86,11 +93,32 @@ struct iv_benbi_private {
86 int shift; 93 int shift;
87}; 94};
88 95
96#define LMK_SEED_SIZE 64 /* hash + 0 */
97struct iv_lmk_private {
98 struct crypto_shash *hash_tfm;
99 u8 *seed;
100};
101
89/* 102/*
90 * Crypt: maps a linear range of a block device 103 * Crypt: maps a linear range of a block device
91 * and encrypts / decrypts at the same time. 104 * and encrypts / decrypts at the same time.
92 */ 105 */
93enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; 106enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
107
108/*
109 * Duplicated per-CPU state for cipher.
110 */
111struct crypt_cpu {
112 struct ablkcipher_request *req;
113 /* ESSIV: struct crypto_cipher *essiv_tfm */
114 void *iv_private;
115 struct crypto_ablkcipher *tfms[0];
116};
117
118/*
119 * The fields in here must be read only after initialization,
120 * changing state should be in crypt_cpu.
121 */
94struct crypt_config { 122struct crypt_config {
95 struct dm_dev *dev; 123 struct dm_dev *dev;
96 sector_t start; 124 sector_t start;
@@ -108,17 +136,25 @@ struct crypt_config {
108 struct workqueue_struct *crypt_queue; 136 struct workqueue_struct *crypt_queue;
109 137
110 char *cipher; 138 char *cipher;
111 char *cipher_mode; 139 char *cipher_string;
112 140
113 struct crypt_iv_operations *iv_gen_ops; 141 struct crypt_iv_operations *iv_gen_ops;
114 union { 142 union {
115 struct iv_essiv_private essiv; 143 struct iv_essiv_private essiv;
116 struct iv_benbi_private benbi; 144 struct iv_benbi_private benbi;
145 struct iv_lmk_private lmk;
117 } iv_gen_private; 146 } iv_gen_private;
118 sector_t iv_offset; 147 sector_t iv_offset;
119 unsigned int iv_size; 148 unsigned int iv_size;
120 149
121 /* 150 /*
151 * Duplicated per cpu state. Access through
152 * per_cpu_ptr() only.
153 */
154 struct crypt_cpu __percpu *cpu;
155 unsigned tfms_count;
156
157 /*
122 * Layout of each crypto request: 158 * Layout of each crypto request:
123 * 159 *
124 * struct ablkcipher_request 160 * struct ablkcipher_request
@@ -132,11 +168,10 @@ struct crypt_config {
132 * correctly aligned. 168 * correctly aligned.
133 */ 169 */
134 unsigned int dmreq_start; 170 unsigned int dmreq_start;
135 struct ablkcipher_request *req;
136 171
137 struct crypto_ablkcipher *tfm;
138 unsigned long flags; 172 unsigned long flags;
139 unsigned int key_size; 173 unsigned int key_size;
174 unsigned int key_parts;
140 u8 key[0]; 175 u8 key[0];
141}; 176};
142 177
@@ -148,6 +183,20 @@ static struct kmem_cache *_crypt_io_pool;
148 183
149static void clone_init(struct dm_crypt_io *, struct bio *); 184static void clone_init(struct dm_crypt_io *, struct bio *);
150static void kcryptd_queue_crypt(struct dm_crypt_io *io); 185static void kcryptd_queue_crypt(struct dm_crypt_io *io);
186static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
187
188static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
189{
190 return this_cpu_ptr(cc->cpu);
191}
192
193/*
194 * Use this to access cipher attributes that are the same for each CPU.
195 */
196static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
197{
198 return __this_cpu_ptr(cc->cpu)->tfms[0];
199}
151 200
152/* 201/*
153 * Different IV generation algorithms: 202 * Different IV generation algorithms:
@@ -168,23 +217,38 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io);
168 * null: the initial vector is always zero. Provides compatibility with 217 * null: the initial vector is always zero. Provides compatibility with
169 * obsolete loop_fish2 devices. Do not use for new devices. 218 * obsolete loop_fish2 devices. Do not use for new devices.
170 * 219 *
220 * lmk: Compatible implementation of the block chaining mode used
221 * by the Loop-AES block device encryption system
222 * designed by Jari Ruusu. See http://loop-aes.sourceforge.net/
223 * It operates on full 512 byte sectors and uses CBC
224 * with an IV derived from the sector number, the data and
 225 * optionally an extra IV seed.
 226 * This means that after decryption the first block
 227 * of the sector must be tweaked according to the decrypted data.
 228 * Loop-AES can use three encryption schemes:
 229 * version 1: plain aes-cbc mode
 230 * version 2: a 64-key multikey scheme with the lmk IV generator
 231 * version 3: the same as version 2 with an additional IV seed
 232 * (it uses 65 keys; the last key is used as the IV seed)
233 *
171 * plumb: unimplemented, see: 234 * plumb: unimplemented, see:
172 * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 235 * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
173 */ 236 */
174 237
175static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 238static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
239 struct dm_crypt_request *dmreq)
176{ 240{
177 memset(iv, 0, cc->iv_size); 241 memset(iv, 0, cc->iv_size);
178 *(u32 *)iv = cpu_to_le32(sector & 0xffffffff); 242 *(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);
179 243
180 return 0; 244 return 0;
181} 245}
182 246
183static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, 247static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
184 sector_t sector) 248 struct dm_crypt_request *dmreq)
185{ 249{
186 memset(iv, 0, cc->iv_size); 250 memset(iv, 0, cc->iv_size);
187 *(u64 *)iv = cpu_to_le64(sector); 251 *(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
188 252
189 return 0; 253 return 0;
190} 254}
@@ -195,7 +259,8 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
195 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; 259 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
196 struct hash_desc desc; 260 struct hash_desc desc;
197 struct scatterlist sg; 261 struct scatterlist sg;
198 int err; 262 struct crypto_cipher *essiv_tfm;
263 int err, cpu;
199 264
200 sg_init_one(&sg, cc->key, cc->key_size); 265 sg_init_one(&sg, cc->key, cc->key_size);
201 desc.tfm = essiv->hash_tfm; 266 desc.tfm = essiv->hash_tfm;
@@ -205,8 +270,16 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
205 if (err) 270 if (err)
206 return err; 271 return err;
207 272
208 return crypto_cipher_setkey(essiv->tfm, essiv->salt, 273 for_each_possible_cpu(cpu) {
274 essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private,
275
276 err = crypto_cipher_setkey(essiv_tfm, essiv->salt,
209 crypto_hash_digestsize(essiv->hash_tfm)); 277 crypto_hash_digestsize(essiv->hash_tfm));
278 if (err)
279 return err;
280 }
281
282 return 0;
210} 283}
211 284
212/* Wipe salt and reset key derived from volume key */ 285/* Wipe salt and reset key derived from volume key */
@@ -214,24 +287,76 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc)
214{ 287{
215 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; 288 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
216 unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); 289 unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
290 struct crypto_cipher *essiv_tfm;
291 int cpu, r, err = 0;
217 292
218 memset(essiv->salt, 0, salt_size); 293 memset(essiv->salt, 0, salt_size);
219 294
220 return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size); 295 for_each_possible_cpu(cpu) {
296 essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private;
297 r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size);
298 if (r)
299 err = r;
300 }
301
302 return err;
303}
304
305/* Set up per cpu cipher state */
306static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
307 struct dm_target *ti,
308 u8 *salt, unsigned saltsize)
309{
310 struct crypto_cipher *essiv_tfm;
311 int err;
312
313 /* Setup the essiv_tfm with the given salt */
314 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
315 if (IS_ERR(essiv_tfm)) {
316 ti->error = "Error allocating crypto tfm for ESSIV";
317 return essiv_tfm;
318 }
319
320 if (crypto_cipher_blocksize(essiv_tfm) !=
321 crypto_ablkcipher_ivsize(any_tfm(cc))) {
322 ti->error = "Block size of ESSIV cipher does "
323 "not match IV size of block cipher";
324 crypto_free_cipher(essiv_tfm);
325 return ERR_PTR(-EINVAL);
326 }
327
328 err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
329 if (err) {
330 ti->error = "Failed to set key for ESSIV cipher";
331 crypto_free_cipher(essiv_tfm);
332 return ERR_PTR(err);
333 }
334
335 return essiv_tfm;
221} 336}
222 337
223static void crypt_iv_essiv_dtr(struct crypt_config *cc) 338static void crypt_iv_essiv_dtr(struct crypt_config *cc)
224{ 339{
340 int cpu;
341 struct crypt_cpu *cpu_cc;
342 struct crypto_cipher *essiv_tfm;
225 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; 343 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
226 344
227 crypto_free_cipher(essiv->tfm);
228 essiv->tfm = NULL;
229
230 crypto_free_hash(essiv->hash_tfm); 345 crypto_free_hash(essiv->hash_tfm);
231 essiv->hash_tfm = NULL; 346 essiv->hash_tfm = NULL;
232 347
233 kzfree(essiv->salt); 348 kzfree(essiv->salt);
234 essiv->salt = NULL; 349 essiv->salt = NULL;
350
351 for_each_possible_cpu(cpu) {
352 cpu_cc = per_cpu_ptr(cc->cpu, cpu);
353 essiv_tfm = cpu_cc->iv_private;
354
355 if (essiv_tfm)
356 crypto_free_cipher(essiv_tfm);
357
358 cpu_cc->iv_private = NULL;
359 }
235} 360}
236 361
237static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, 362static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
@@ -240,7 +365,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
240 struct crypto_cipher *essiv_tfm = NULL; 365 struct crypto_cipher *essiv_tfm = NULL;
241 struct crypto_hash *hash_tfm = NULL; 366 struct crypto_hash *hash_tfm = NULL;
242 u8 *salt = NULL; 367 u8 *salt = NULL;
243 int err; 368 int err, cpu;
244 369
245 if (!opts) { 370 if (!opts) {
246 ti->error = "Digest algorithm missing for ESSIV mode"; 371 ti->error = "Digest algorithm missing for ESSIV mode";
@@ -262,48 +387,44 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
262 goto bad; 387 goto bad;
263 } 388 }
264 389
265 /* Allocate essiv_tfm */
266 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
267 if (IS_ERR(essiv_tfm)) {
268 ti->error = "Error allocating crypto tfm for ESSIV";
269 err = PTR_ERR(essiv_tfm);
270 goto bad;
271 }
272 if (crypto_cipher_blocksize(essiv_tfm) !=
273 crypto_ablkcipher_ivsize(cc->tfm)) {
274 ti->error = "Block size of ESSIV cipher does "
275 "not match IV size of block cipher";
276 err = -EINVAL;
277 goto bad;
278 }
279
280 cc->iv_gen_private.essiv.salt = salt; 390 cc->iv_gen_private.essiv.salt = salt;
281 cc->iv_gen_private.essiv.tfm = essiv_tfm;
282 cc->iv_gen_private.essiv.hash_tfm = hash_tfm; 391 cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
283 392
393 for_each_possible_cpu(cpu) {
394 essiv_tfm = setup_essiv_cpu(cc, ti, salt,
395 crypto_hash_digestsize(hash_tfm));
396 if (IS_ERR(essiv_tfm)) {
397 crypt_iv_essiv_dtr(cc);
398 return PTR_ERR(essiv_tfm);
399 }
400 per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm;
401 }
402
284 return 0; 403 return 0;
285 404
286bad: 405bad:
287 if (essiv_tfm && !IS_ERR(essiv_tfm))
288 crypto_free_cipher(essiv_tfm);
289 if (hash_tfm && !IS_ERR(hash_tfm)) 406 if (hash_tfm && !IS_ERR(hash_tfm))
290 crypto_free_hash(hash_tfm); 407 crypto_free_hash(hash_tfm);
291 kfree(salt); 408 kfree(salt);
292 return err; 409 return err;
293} 410}
294 411
295static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 412static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
413 struct dm_crypt_request *dmreq)
296{ 414{
415 struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private;
416
297 memset(iv, 0, cc->iv_size); 417 memset(iv, 0, cc->iv_size);
298 *(u64 *)iv = cpu_to_le64(sector); 418 *(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
299 crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv); 419 crypto_cipher_encrypt_one(essiv_tfm, iv, iv);
420
300 return 0; 421 return 0;
301} 422}
302 423
303static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, 424static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
304 const char *opts) 425 const char *opts)
305{ 426{
306 unsigned bs = crypto_ablkcipher_blocksize(cc->tfm); 427 unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc));
307 int log = ilog2(bs); 428 int log = ilog2(bs);
308 429
309 /* we need to calculate how far we must shift the sector count 430 /* we need to calculate how far we must shift the sector count
@@ -328,25 +449,177 @@ static void crypt_iv_benbi_dtr(struct crypt_config *cc)
328{ 449{
329} 450}
330 451
331static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 452static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv,
453 struct dm_crypt_request *dmreq)
332{ 454{
333 __be64 val; 455 __be64 val;
334 456
335 memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ 457 memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
336 458
337 val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1); 459 val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1);
338 put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); 460 put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
339 461
340 return 0; 462 return 0;
341} 463}
342 464
343static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 465static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv,
466 struct dm_crypt_request *dmreq)
344{ 467{
345 memset(iv, 0, cc->iv_size); 468 memset(iv, 0, cc->iv_size);
346 469
347 return 0; 470 return 0;
348} 471}
349 472
473static void crypt_iv_lmk_dtr(struct crypt_config *cc)
474{
475 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
476
477 if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm))
478 crypto_free_shash(lmk->hash_tfm);
479 lmk->hash_tfm = NULL;
480
481 kzfree(lmk->seed);
482 lmk->seed = NULL;
483}
484
485static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti,
486 const char *opts)
487{
488 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
489
490 lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0);
491 if (IS_ERR(lmk->hash_tfm)) {
492 ti->error = "Error initializing LMK hash";
493 return PTR_ERR(lmk->hash_tfm);
494 }
495
496 /* No seed in LMK version 2 */
497 if (cc->key_parts == cc->tfms_count) {
498 lmk->seed = NULL;
499 return 0;
500 }
501
502 lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL);
503 if (!lmk->seed) {
504 crypt_iv_lmk_dtr(cc);
505 ti->error = "Error kmallocing seed storage in LMK";
506 return -ENOMEM;
507 }
508
509 return 0;
510}
511
512static int crypt_iv_lmk_init(struct crypt_config *cc)
513{
514 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
515 int subkey_size = cc->key_size / cc->key_parts;
516
 517 /* The LMK seed is stored in the position of key LMK_KEYS + 1 */
518 if (lmk->seed)
519 memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size),
520 crypto_shash_digestsize(lmk->hash_tfm));
521
522 return 0;
523}
524
525static int crypt_iv_lmk_wipe(struct crypt_config *cc)
526{
527 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
528
529 if (lmk->seed)
530 memset(lmk->seed, 0, LMK_SEED_SIZE);
531
532 return 0;
533}
534
535static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
536 struct dm_crypt_request *dmreq,
537 u8 *data)
538{
539 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
540 struct {
541 struct shash_desc desc;
542 char ctx[crypto_shash_descsize(lmk->hash_tfm)];
543 } sdesc;
544 struct md5_state md5state;
545 u32 buf[4];
546 int i, r;
547
548 sdesc.desc.tfm = lmk->hash_tfm;
549 sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
550
551 r = crypto_shash_init(&sdesc.desc);
552 if (r)
553 return r;
554
555 if (lmk->seed) {
556 r = crypto_shash_update(&sdesc.desc, lmk->seed, LMK_SEED_SIZE);
557 if (r)
558 return r;
559 }
560
561 /* Sector is always 512B, block size 16, add data of blocks 1-31 */
562 r = crypto_shash_update(&sdesc.desc, data + 16, 16 * 31);
563 if (r)
564 return r;
565
566 /* Sector is cropped to 56 bits here */
567 buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF);
568 buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000);
569 buf[2] = cpu_to_le32(4024);
570 buf[3] = 0;
571 r = crypto_shash_update(&sdesc.desc, (u8 *)buf, sizeof(buf));
572 if (r)
573 return r;
574
575 /* No MD5 padding here */
576 r = crypto_shash_export(&sdesc.desc, &md5state);
577 if (r)
578 return r;
579
580 for (i = 0; i < MD5_HASH_WORDS; i++)
581 __cpu_to_le32s(&md5state.hash[i]);
582 memcpy(iv, &md5state.hash, cc->iv_size);
583
584 return 0;
585}
586
587static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv,
588 struct dm_crypt_request *dmreq)
589{
590 u8 *src;
591 int r = 0;
592
593 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
594 src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0);
595 r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset);
596 kunmap_atomic(src, KM_USER0);
597 } else
598 memset(iv, 0, cc->iv_size);
599
600 return r;
601}
602
603static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
604 struct dm_crypt_request *dmreq)
605{
606 u8 *dst;
607 int r;
608
609 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE)
610 return 0;
611
612 dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0);
613 r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset);
614
615 /* Tweak the first block of plaintext sector */
616 if (!r)
617 crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size);
618
619 kunmap_atomic(dst, KM_USER0);
620 return r;
621}
622
350static struct crypt_iv_operations crypt_iv_plain_ops = { 623static struct crypt_iv_operations crypt_iv_plain_ops = {
351 .generator = crypt_iv_plain_gen 624 .generator = crypt_iv_plain_gen
352}; 625};
@@ -373,6 +646,15 @@ static struct crypt_iv_operations crypt_iv_null_ops = {
373 .generator = crypt_iv_null_gen 646 .generator = crypt_iv_null_gen
374}; 647};
375 648
649static struct crypt_iv_operations crypt_iv_lmk_ops = {
650 .ctr = crypt_iv_lmk_ctr,
651 .dtr = crypt_iv_lmk_dtr,
652 .init = crypt_iv_lmk_init,
653 .wipe = crypt_iv_lmk_wipe,
654 .generator = crypt_iv_lmk_gen,
655 .post = crypt_iv_lmk_post
656};
657
376static void crypt_convert_init(struct crypt_config *cc, 658static void crypt_convert_init(struct crypt_config *cc,
377 struct convert_context *ctx, 659 struct convert_context *ctx,
378 struct bio *bio_out, struct bio *bio_in, 660 struct bio *bio_out, struct bio *bio_in,
@@ -400,6 +682,13 @@ static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc,
400 return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); 682 return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start);
401} 683}
402 684
685static u8 *iv_of_dmreq(struct crypt_config *cc,
686 struct dm_crypt_request *dmreq)
687{
688 return (u8 *)ALIGN((unsigned long)(dmreq + 1),
689 crypto_ablkcipher_alignmask(any_tfm(cc)) + 1);
690}
691
403static int crypt_convert_block(struct crypt_config *cc, 692static int crypt_convert_block(struct crypt_config *cc,
404 struct convert_context *ctx, 693 struct convert_context *ctx,
405 struct ablkcipher_request *req) 694 struct ablkcipher_request *req)
@@ -411,9 +700,9 @@ static int crypt_convert_block(struct crypt_config *cc,
411 int r = 0; 700 int r = 0;
412 701
413 dmreq = dmreq_of_req(cc, req); 702 dmreq = dmreq_of_req(cc, req);
414 iv = (u8 *)ALIGN((unsigned long)(dmreq + 1), 703 iv = iv_of_dmreq(cc, dmreq);
415 crypto_ablkcipher_alignmask(cc->tfm) + 1);
416 704
705 dmreq->iv_sector = ctx->sector;
417 dmreq->ctx = ctx; 706 dmreq->ctx = ctx;
418 sg_init_table(&dmreq->sg_in, 1); 707 sg_init_table(&dmreq->sg_in, 1);
419 sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, 708 sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT,
@@ -436,7 +725,7 @@ static int crypt_convert_block(struct crypt_config *cc,
436 } 725 }
437 726
438 if (cc->iv_gen_ops) { 727 if (cc->iv_gen_ops) {
439 r = cc->iv_gen_ops->generator(cc, iv, ctx->sector); 728 r = cc->iv_gen_ops->generator(cc, iv, dmreq);
440 if (r < 0) 729 if (r < 0)
441 return r; 730 return r;
442 } 731 }
@@ -449,21 +738,28 @@ static int crypt_convert_block(struct crypt_config *cc,
449 else 738 else
450 r = crypto_ablkcipher_decrypt(req); 739 r = crypto_ablkcipher_decrypt(req);
451 740
741 if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
742 r = cc->iv_gen_ops->post(cc, iv, dmreq);
743
452 return r; 744 return r;
453} 745}
454 746
455static void kcryptd_async_done(struct crypto_async_request *async_req, 747static void kcryptd_async_done(struct crypto_async_request *async_req,
456 int error); 748 int error);
749
457static void crypt_alloc_req(struct crypt_config *cc, 750static void crypt_alloc_req(struct crypt_config *cc,
458 struct convert_context *ctx) 751 struct convert_context *ctx)
459{ 752{
460 if (!cc->req) 753 struct crypt_cpu *this_cc = this_crypt_config(cc);
461 cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); 754 unsigned key_index = ctx->sector & (cc->tfms_count - 1);
462 ablkcipher_request_set_tfm(cc->req, cc->tfm); 755
463 ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG | 756 if (!this_cc->req)
464 CRYPTO_TFM_REQ_MAY_SLEEP, 757 this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
465 kcryptd_async_done, 758
466 dmreq_of_req(cc, cc->req)); 759 ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]);
760 ablkcipher_request_set_callback(this_cc->req,
761 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
762 kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
467} 763}
468 764
469/* 765/*
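The key selection in crypt_alloc_req() above relies on cc->tfms_count being a power of two, so masking with (tfms_count - 1) is equivalent to taking the sector number modulo the key count. A minimal standalone C sketch of that arithmetic (illustrative only, not kernel code; names are made up):

	#include <stdio.h>

	/* Masking with (count - 1) equals sector % count when count is a
	 * power of two, which is why the constructor enforces that rule. */
	static unsigned key_index(unsigned long long sector, unsigned tfms_count)
	{
		return sector & (tfms_count - 1);
	}

	int main(void)
	{
		/* hypothetical 64-key (Loop-AES style) mapping */
		printf("%u %u %u\n", key_index(0, 64),
		       key_index(1, 64), key_index(65, 64)); /* prints: 0 1 1 */
		return 0;
	}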
@@ -472,6 +768,7 @@ static void crypt_alloc_req(struct crypt_config *cc,
472static int crypt_convert(struct crypt_config *cc, 768static int crypt_convert(struct crypt_config *cc,
473 struct convert_context *ctx) 769 struct convert_context *ctx)
474{ 770{
771 struct crypt_cpu *this_cc = this_crypt_config(cc);
475 int r; 772 int r;
476 773
477 atomic_set(&ctx->pending, 1); 774 atomic_set(&ctx->pending, 1);
@@ -483,7 +780,7 @@ static int crypt_convert(struct crypt_config *cc,
483 780
484 atomic_inc(&ctx->pending); 781 atomic_inc(&ctx->pending);
485 782
486 r = crypt_convert_block(cc, ctx, cc->req); 783 r = crypt_convert_block(cc, ctx, this_cc->req);
487 784
488 switch (r) { 785 switch (r) {
489 /* async */ 786 /* async */
@@ -492,7 +789,7 @@ static int crypt_convert(struct crypt_config *cc,
492 INIT_COMPLETION(ctx->restart); 789 INIT_COMPLETION(ctx->restart);
493 /* fall through*/ 790 /* fall through*/
494 case -EINPROGRESS: 791 case -EINPROGRESS:
495 cc->req = NULL; 792 this_cc->req = NULL;
496 ctx->sector++; 793 ctx->sector++;
497 continue; 794 continue;
498 795
@@ -651,6 +948,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
651 * They must be separated as otherwise the final stages could be 948 * They must be separated as otherwise the final stages could be
652 * starved by new requests which can block in the first stages due 949 * starved by new requests which can block in the first stages due
653 * to memory allocation. 950 * to memory allocation.
951 *
 952 * The work is done per CPU, globally for all dm-crypt instances.
 953 * The queues should not depend on each other and must not block.
654 */ 954 */
655static void crypt_endio(struct bio *clone, int error) 955static void crypt_endio(struct bio *clone, int error)
656{ 956{
@@ -691,26 +991,30 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
691 clone->bi_destructor = dm_crypt_bio_destructor; 991 clone->bi_destructor = dm_crypt_bio_destructor;
692} 992}
693 993
694static void kcryptd_io_read(struct dm_crypt_io *io) 994static void kcryptd_unplug(struct crypt_config *cc)
995{
996 blk_unplug(bdev_get_queue(cc->dev->bdev));
997}
998
999static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
695{ 1000{
696 struct crypt_config *cc = io->target->private; 1001 struct crypt_config *cc = io->target->private;
697 struct bio *base_bio = io->base_bio; 1002 struct bio *base_bio = io->base_bio;
698 struct bio *clone; 1003 struct bio *clone;
699 1004
700 crypt_inc_pending(io);
701
702 /* 1005 /*
703 * The block layer might modify the bvec array, so always 1006 * The block layer might modify the bvec array, so always
704 * copy the required bvecs because we need the original 1007 * copy the required bvecs because we need the original
705 * one in order to decrypt the whole bio data *afterwards*. 1008 * one in order to decrypt the whole bio data *afterwards*.
706 */ 1009 */
707 clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs); 1010 clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs);
708 if (unlikely(!clone)) { 1011 if (!clone) {
709 io->error = -ENOMEM; 1012 kcryptd_unplug(cc);
710 crypt_dec_pending(io); 1013 return 1;
711 return;
712 } 1014 }
713 1015
1016 crypt_inc_pending(io);
1017
714 clone_init(io, clone); 1018 clone_init(io, clone);
715 clone->bi_idx = 0; 1019 clone->bi_idx = 0;
716 clone->bi_vcnt = bio_segments(base_bio); 1020 clone->bi_vcnt = bio_segments(base_bio);
@@ -720,6 +1024,7 @@ static void kcryptd_io_read(struct dm_crypt_io *io)
720 sizeof(struct bio_vec) * clone->bi_vcnt); 1024 sizeof(struct bio_vec) * clone->bi_vcnt);
721 1025
722 generic_make_request(clone); 1026 generic_make_request(clone);
1027 return 0;
723} 1028}
724 1029
725static void kcryptd_io_write(struct dm_crypt_io *io) 1030static void kcryptd_io_write(struct dm_crypt_io *io)
@@ -732,9 +1037,12 @@ static void kcryptd_io(struct work_struct *work)
732{ 1037{
733 struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); 1038 struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
734 1039
735 if (bio_data_dir(io->base_bio) == READ) 1040 if (bio_data_dir(io->base_bio) == READ) {
736 kcryptd_io_read(io); 1041 crypt_inc_pending(io);
737 else 1042 if (kcryptd_io_read(io, GFP_NOIO))
1043 io->error = -ENOMEM;
1044 crypt_dec_pending(io);
1045 } else
738 kcryptd_io_write(io); 1046 kcryptd_io_write(io);
739} 1047}
740 1048
@@ -901,6 +1209,9 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
901 return; 1209 return;
902 } 1210 }
903 1211
1212 if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
1213 error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
1214
904 mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); 1215 mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
905 1216
906 if (!atomic_dec_and_test(&ctx->pending)) 1217 if (!atomic_dec_and_test(&ctx->pending))
@@ -971,34 +1282,84 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
971 } 1282 }
972} 1283}
973 1284
974static int crypt_set_key(struct crypt_config *cc, char *key) 1285static void crypt_free_tfms(struct crypt_config *cc, int cpu)
975{ 1286{
976 unsigned key_size = strlen(key) >> 1; 1287 struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
1288 unsigned i;
977 1289
978 if (cc->key_size && cc->key_size != key_size) 1290 for (i = 0; i < cc->tfms_count; i++)
1291 if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) {
1292 crypto_free_ablkcipher(cpu_cc->tfms[i]);
1293 cpu_cc->tfms[i] = NULL;
1294 }
1295}
1296
1297static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode)
1298{
1299 struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
1300 unsigned i;
1301 int err;
1302
1303 for (i = 0; i < cc->tfms_count; i++) {
1304 cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0);
1305 if (IS_ERR(cpu_cc->tfms[i])) {
1306 err = PTR_ERR(cpu_cc->tfms[i]);
1307 crypt_free_tfms(cc, cpu);
1308 return err;
1309 }
1310 }
1311
1312 return 0;
1313}
1314
1315static int crypt_setkey_allcpus(struct crypt_config *cc)
1316{
1317 unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count);
1318 int cpu, err = 0, i, r;
1319
1320 for_each_possible_cpu(cpu) {
1321 for (i = 0; i < cc->tfms_count; i++) {
1322 r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfms[i],
1323 cc->key + (i * subkey_size), subkey_size);
1324 if (r)
1325 err = r;
1326 }
1327 }
1328
1329 return err;
1330}
1331
1332static int crypt_set_key(struct crypt_config *cc, char *key)
1333{
1334 /* The key size may not be changed. */
1335 if (cc->key_size != (strlen(key) >> 1))
979 return -EINVAL; 1336 return -EINVAL;
980 1337
981 cc->key_size = key_size; /* initial settings */ 1338 /* Hyphen (which gives a key_size of zero) means there is no key. */
1339 if (!cc->key_size && strcmp(key, "-"))
1340 return -EINVAL;
982 1341
983 if ((!key_size && strcmp(key, "-")) || 1342 if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0)
984 (key_size && crypt_decode_key(cc->key, key, key_size) < 0))
985 return -EINVAL; 1343 return -EINVAL;
986 1344
987 set_bit(DM_CRYPT_KEY_VALID, &cc->flags); 1345 set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
988 1346
989 return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); 1347 return crypt_setkey_allcpus(cc);
990} 1348}
991 1349
992static int crypt_wipe_key(struct crypt_config *cc) 1350static int crypt_wipe_key(struct crypt_config *cc)
993{ 1351{
994 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); 1352 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
995 memset(&cc->key, 0, cc->key_size * sizeof(u8)); 1353 memset(&cc->key, 0, cc->key_size * sizeof(u8));
996 return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); 1354
1355 return crypt_setkey_allcpus(cc);
997} 1356}
998 1357
999static void crypt_dtr(struct dm_target *ti) 1358static void crypt_dtr(struct dm_target *ti)
1000{ 1359{
1001 struct crypt_config *cc = ti->private; 1360 struct crypt_config *cc = ti->private;
1361 struct crypt_cpu *cpu_cc;
1362 int cpu;
1002 1363
1003 ti->private = NULL; 1364 ti->private = NULL;
1004 1365
@@ -1010,6 +1371,14 @@ static void crypt_dtr(struct dm_target *ti)
1010 if (cc->crypt_queue) 1371 if (cc->crypt_queue)
1011 destroy_workqueue(cc->crypt_queue); 1372 destroy_workqueue(cc->crypt_queue);
1012 1373
1374 if (cc->cpu)
1375 for_each_possible_cpu(cpu) {
1376 cpu_cc = per_cpu_ptr(cc->cpu, cpu);
1377 if (cpu_cc->req)
1378 mempool_free(cpu_cc->req, cc->req_pool);
1379 crypt_free_tfms(cc, cpu);
1380 }
1381
1013 if (cc->bs) 1382 if (cc->bs)
1014 bioset_free(cc->bs); 1383 bioset_free(cc->bs);
1015 1384
@@ -1023,14 +1392,14 @@ static void crypt_dtr(struct dm_target *ti)
1023 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) 1392 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
1024 cc->iv_gen_ops->dtr(cc); 1393 cc->iv_gen_ops->dtr(cc);
1025 1394
1026 if (cc->tfm && !IS_ERR(cc->tfm))
1027 crypto_free_ablkcipher(cc->tfm);
1028
1029 if (cc->dev) 1395 if (cc->dev)
1030 dm_put_device(ti, cc->dev); 1396 dm_put_device(ti, cc->dev);
1031 1397
1398 if (cc->cpu)
1399 free_percpu(cc->cpu);
1400
1032 kzfree(cc->cipher); 1401 kzfree(cc->cipher);
1033 kzfree(cc->cipher_mode); 1402 kzfree(cc->cipher_string);
1034 1403
1035 /* Must zero key material before freeing */ 1404 /* Must zero key material before freeing */
1036 kzfree(cc); 1405 kzfree(cc);
@@ -1040,9 +1409,9 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1040 char *cipher_in, char *key) 1409 char *cipher_in, char *key)
1041{ 1410{
1042 struct crypt_config *cc = ti->private; 1411 struct crypt_config *cc = ti->private;
1043 char *tmp, *cipher, *chainmode, *ivmode, *ivopts; 1412 char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount;
1044 char *cipher_api = NULL; 1413 char *cipher_api = NULL;
1045 int ret = -EINVAL; 1414 int cpu, ret = -EINVAL;
1046 1415
1047 /* Convert to crypto api definition? */ 1416 /* Convert to crypto api definition? */
1048 if (strchr(cipher_in, '(')) { 1417 if (strchr(cipher_in, '(')) {
@@ -1050,23 +1419,31 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1050 return -EINVAL; 1419 return -EINVAL;
1051 } 1420 }
1052 1421
1422 cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL);
1423 if (!cc->cipher_string)
1424 goto bad_mem;
1425
1053 /* 1426 /*
1054 * Legacy dm-crypt cipher specification 1427 * Legacy dm-crypt cipher specification
1055 * cipher-mode-iv:ivopts 1428 * cipher[:keycount]-mode-iv:ivopts
1056 */ 1429 */
1057 tmp = cipher_in; 1430 tmp = cipher_in;
1058 cipher = strsep(&tmp, "-"); 1431 keycount = strsep(&tmp, "-");
1432 cipher = strsep(&keycount, ":");
1433
1434 if (!keycount)
1435 cc->tfms_count = 1;
1436 else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 ||
1437 !is_power_of_2(cc->tfms_count)) {
1438 ti->error = "Bad cipher key count specification";
1439 return -EINVAL;
1440 }
1441 cc->key_parts = cc->tfms_count;
1059 1442
1060 cc->cipher = kstrdup(cipher, GFP_KERNEL); 1443 cc->cipher = kstrdup(cipher, GFP_KERNEL);
1061 if (!cc->cipher) 1444 if (!cc->cipher)
1062 goto bad_mem; 1445 goto bad_mem;
1063 1446
1064 if (tmp) {
1065 cc->cipher_mode = kstrdup(tmp, GFP_KERNEL);
1066 if (!cc->cipher_mode)
1067 goto bad_mem;
1068 }
1069
1070 chainmode = strsep(&tmp, "-"); 1447 chainmode = strsep(&tmp, "-");
1071 ivopts = strsep(&tmp, "-"); 1448 ivopts = strsep(&tmp, "-");
1072 ivmode = strsep(&ivopts, ":"); 1449 ivmode = strsep(&ivopts, ":");
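To make the parsing above concrete, here is a minimal userspace sketch of how a cipher string such as "aes:64-cbc-lmk" (a hypothetical Loop-AES style multi-key mapping) is split with strsep(); it mirrors the kernel logic but is not the kernel code itself, and the final printf is purely illustrative:

	#define _DEFAULT_SOURCE	/* for strsep() on glibc */
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char buf[] = "aes:64-cbc-lmk";
		char *tmp = buf, *keycount, *cipher, *chainmode, *ivopts, *ivmode;

		keycount  = strsep(&tmp, "-");      /* "aes:64"                  */
		cipher    = strsep(&keycount, ":"); /* "aes", keycount -> "64"   */
		chainmode = strsep(&tmp, "-");      /* "cbc"                     */
		ivopts    = strsep(&tmp, "-");      /* "lmk"                     */
		ivmode    = strsep(&ivopts, ":");   /* "lmk", ivopts -> NULL     */

		printf("cipher=%s keycount=%s chainmode=%s ivmode=%s\n",
		       cipher, keycount ? keycount : "1", chainmode, ivmode);
		return 0;
	}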
@@ -1074,10 +1451,19 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1074 if (tmp) 1451 if (tmp)
1075 DMWARN("Ignoring unexpected additional cipher options"); 1452 DMWARN("Ignoring unexpected additional cipher options");
1076 1453
1077 /* Compatibility mode for old dm-crypt mappings */ 1454 cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)) +
1455 cc->tfms_count * sizeof(*(cc->cpu->tfms)),
1456 __alignof__(struct crypt_cpu));
1457 if (!cc->cpu) {
1458 ti->error = "Cannot allocate per cpu state";
1459 goto bad_mem;
1460 }
1461
1462 /*
1463 * For compatibility with the original dm-crypt mapping format, if
1464 * only the cipher name is supplied, use cbc-plain.
1465 */
1078 if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { 1466 if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) {
1079 kfree(cc->cipher_mode);
1080 cc->cipher_mode = kstrdup("cbc-plain", GFP_KERNEL);
1081 chainmode = "cbc"; 1467 chainmode = "cbc";
1082 ivmode = "plain"; 1468 ivmode = "plain";
1083 } 1469 }
@@ -1099,11 +1485,12 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1099 } 1485 }
1100 1486
1101 /* Allocate cipher */ 1487 /* Allocate cipher */
1102 cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0); 1488 for_each_possible_cpu(cpu) {
1103 if (IS_ERR(cc->tfm)) { 1489 ret = crypt_alloc_tfms(cc, cpu, cipher_api);
1104 ret = PTR_ERR(cc->tfm); 1490 if (ret < 0) {
1105 ti->error = "Error allocating crypto tfm"; 1491 ti->error = "Error allocating crypto tfm";
1106 goto bad; 1492 goto bad;
1493 }
1107 } 1494 }
1108 1495
1109 /* Initialize and set key */ 1496 /* Initialize and set key */
@@ -1114,7 +1501,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1114 } 1501 }
1115 1502
1116 /* Initialize IV */ 1503 /* Initialize IV */
1117 cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm); 1504 cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc));
1118 if (cc->iv_size) 1505 if (cc->iv_size)
1119 /* at least a 64 bit sector number should fit in our buffer */ 1506 /* at least a 64 bit sector number should fit in our buffer */
1120 cc->iv_size = max(cc->iv_size, 1507 cc->iv_size = max(cc->iv_size,
@@ -1137,7 +1524,15 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1137 cc->iv_gen_ops = &crypt_iv_benbi_ops; 1524 cc->iv_gen_ops = &crypt_iv_benbi_ops;
1138 else if (strcmp(ivmode, "null") == 0) 1525 else if (strcmp(ivmode, "null") == 0)
1139 cc->iv_gen_ops = &crypt_iv_null_ops; 1526 cc->iv_gen_ops = &crypt_iv_null_ops;
1140 else { 1527 else if (strcmp(ivmode, "lmk") == 0) {
1528 cc->iv_gen_ops = &crypt_iv_lmk_ops;
 1529 /* Versions 2 and 3 are recognised according
 1530 * to the length of the provided multi-key string.
 1531 * If present (version 3), the last key is used as the IV seed.
1532 */
1533 if (cc->key_size % cc->key_parts)
1534 cc->key_parts++;
1535 } else {
1141 ret = -EINVAL; 1536 ret = -EINVAL;
1142 ti->error = "Invalid IV mode"; 1537 ti->error = "Invalid IV mode";
1143 goto bad; 1538 goto bad;
@@ -1194,6 +1589,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1194 ti->error = "Cannot allocate encryption context"; 1589 ti->error = "Cannot allocate encryption context";
1195 return -ENOMEM; 1590 return -ENOMEM;
1196 } 1591 }
1592 cc->key_size = key_size;
1197 1593
1198 ti->private = cc; 1594 ti->private = cc;
1199 ret = crypt_ctr_cipher(ti, argv[0], argv[1]); 1595 ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
@@ -1208,9 +1604,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1208 } 1604 }
1209 1605
1210 cc->dmreq_start = sizeof(struct ablkcipher_request); 1606 cc->dmreq_start = sizeof(struct ablkcipher_request);
1211 cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm); 1607 cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc));
1212 cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); 1608 cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment());
1213 cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) & 1609 cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) &
1214 ~(crypto_tfm_ctx_alignment() - 1); 1610 ~(crypto_tfm_ctx_alignment() - 1);
1215 1611
1216 cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + 1612 cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
@@ -1219,7 +1615,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1219 ti->error = "Cannot allocate crypt request mempool"; 1615 ti->error = "Cannot allocate crypt request mempool";
1220 goto bad; 1616 goto bad;
1221 } 1617 }
1222 cc->req = NULL;
1223 1618
1224 cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); 1619 cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
1225 if (!cc->page_pool) { 1620 if (!cc->page_pool) {
@@ -1252,13 +1647,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1252 cc->start = tmpll; 1647 cc->start = tmpll;
1253 1648
1254 ret = -ENOMEM; 1649 ret = -ENOMEM;
1255 cc->io_queue = create_singlethread_workqueue("kcryptd_io"); 1650 cc->io_queue = alloc_workqueue("kcryptd_io",
1651 WQ_NON_REENTRANT|
1652 WQ_MEM_RECLAIM,
1653 1);
1256 if (!cc->io_queue) { 1654 if (!cc->io_queue) {
1257 ti->error = "Couldn't create kcryptd io queue"; 1655 ti->error = "Couldn't create kcryptd io queue";
1258 goto bad; 1656 goto bad;
1259 } 1657 }
1260 1658
1261 cc->crypt_queue = create_singlethread_workqueue("kcryptd"); 1659 cc->crypt_queue = alloc_workqueue("kcryptd",
1660 WQ_NON_REENTRANT|
1661 WQ_CPU_INTENSIVE|
1662 WQ_MEM_RECLAIM,
1663 1);
1262 if (!cc->crypt_queue) { 1664 if (!cc->crypt_queue) {
1263 ti->error = "Couldn't create kcryptd queue"; 1665 ti->error = "Couldn't create kcryptd queue";
1264 goto bad; 1666 goto bad;
@@ -1286,9 +1688,10 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
1286 1688
1287 io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector)); 1689 io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector));
1288 1690
1289 if (bio_data_dir(io->base_bio) == READ) 1691 if (bio_data_dir(io->base_bio) == READ) {
1290 kcryptd_queue_io(io); 1692 if (kcryptd_io_read(io, GFP_NOWAIT))
1291 else 1693 kcryptd_queue_io(io);
1694 } else
1292 kcryptd_queue_crypt(io); 1695 kcryptd_queue_crypt(io);
1293 1696
1294 return DM_MAPIO_SUBMITTED; 1697 return DM_MAPIO_SUBMITTED;
@@ -1306,10 +1709,7 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
1306 break; 1709 break;
1307 1710
1308 case STATUSTYPE_TABLE: 1711 case STATUSTYPE_TABLE:
1309 if (cc->cipher_mode) 1712 DMEMIT("%s ", cc->cipher_string);
1310 DMEMIT("%s-%s ", cc->cipher, cc->cipher_mode);
1311 else
1312 DMEMIT("%s ", cc->cipher);
1313 1713
1314 if (cc->key_size > 0) { 1714 if (cc->key_size > 0) {
1315 if ((maxlen - sz) < ((cc->key_size << 1) + 1)) 1715 if ((maxlen - sz) < ((cc->key_size << 1) + 1))
@@ -1421,7 +1821,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
1421 1821
1422static struct target_type crypt_target = { 1822static struct target_type crypt_target = {
1423 .name = "crypt", 1823 .name = "crypt",
1424 .version = {1, 7, 0}, 1824 .version = {1, 10, 0},
1425 .module = THIS_MODULE, 1825 .module = THIS_MODULE,
1426 .ctr = crypt_ctr, 1826 .ctr = crypt_ctr,
1427 .dtr = crypt_dtr, 1827 .dtr = crypt_dtr,
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index baa11912cc94..f18375dcedd9 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -352,7 +352,7 @@ static int __init dm_delay_init(void)
352{ 352{
353 int r = -ENOMEM; 353 int r = -ENOMEM;
354 354
355 kdelayd_wq = create_workqueue("kdelayd"); 355 kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
356 if (!kdelayd_wq) { 356 if (!kdelayd_wq) {
357 DMERR("Couldn't start kdelayd"); 357 DMERR("Couldn't start kdelayd");
358 goto bad_queue; 358 goto bad_queue;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 4b54618b4159..6d12775a1061 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -295,19 +295,55 @@ retry:
295 DMWARN("remove_all left %d open device(s)", dev_skipped); 295 DMWARN("remove_all left %d open device(s)", dev_skipped);
296} 296}
297 297
298/*
299 * Set the uuid of a hash_cell that isn't already set.
300 */
301static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid)
302{
303 mutex_lock(&dm_hash_cells_mutex);
304 hc->uuid = new_uuid;
305 mutex_unlock(&dm_hash_cells_mutex);
306
307 list_add(&hc->uuid_list, _uuid_buckets + hash_str(new_uuid));
308}
309
310/*
311 * Changes the name of a hash_cell and returns the old name for
312 * the caller to free.
313 */
314static char *__change_cell_name(struct hash_cell *hc, char *new_name)
315{
316 char *old_name;
317
318 /*
319 * Rename and move the name cell.
320 */
321 list_del(&hc->name_list);
322 old_name = hc->name;
323
324 mutex_lock(&dm_hash_cells_mutex);
325 hc->name = new_name;
326 mutex_unlock(&dm_hash_cells_mutex);
327
328 list_add(&hc->name_list, _name_buckets + hash_str(new_name));
329
330 return old_name;
331}
332
298static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, 333static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
299 const char *new) 334 const char *new)
300{ 335{
301 char *new_name, *old_name; 336 char *new_data, *old_name = NULL;
302 struct hash_cell *hc; 337 struct hash_cell *hc;
303 struct dm_table *table; 338 struct dm_table *table;
304 struct mapped_device *md; 339 struct mapped_device *md;
340 unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
305 341
306 /* 342 /*
307 * duplicate new. 343 * duplicate new.
308 */ 344 */
309 new_name = kstrdup(new, GFP_KERNEL); 345 new_data = kstrdup(new, GFP_KERNEL);
310 if (!new_name) 346 if (!new_data)
311 return ERR_PTR(-ENOMEM); 347 return ERR_PTR(-ENOMEM);
312 348
313 down_write(&_hash_lock); 349 down_write(&_hash_lock);
@@ -315,13 +351,19 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
315 /* 351 /*
316 * Is new free ? 352 * Is new free ?
317 */ 353 */
318 hc = __get_name_cell(new); 354 if (change_uuid)
355 hc = __get_uuid_cell(new);
356 else
357 hc = __get_name_cell(new);
358
319 if (hc) { 359 if (hc) {
320 DMWARN("asked to rename to an already-existing name %s -> %s", 360 DMWARN("Unable to change %s on mapped device %s to one that "
361 "already exists: %s",
362 change_uuid ? "uuid" : "name",
321 param->name, new); 363 param->name, new);
322 dm_put(hc->md); 364 dm_put(hc->md);
323 up_write(&_hash_lock); 365 up_write(&_hash_lock);
324 kfree(new_name); 366 kfree(new_data);
325 return ERR_PTR(-EBUSY); 367 return ERR_PTR(-EBUSY);
326 } 368 }
327 369
@@ -330,22 +372,30 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
330 */ 372 */
331 hc = __get_name_cell(param->name); 373 hc = __get_name_cell(param->name);
332 if (!hc) { 374 if (!hc) {
333 DMWARN("asked to rename a non-existent device %s -> %s", 375 DMWARN("Unable to rename non-existent device, %s to %s%s",
334 param->name, new); 376 param->name, change_uuid ? "uuid " : "", new);
335 up_write(&_hash_lock); 377 up_write(&_hash_lock);
336 kfree(new_name); 378 kfree(new_data);
337 return ERR_PTR(-ENXIO); 379 return ERR_PTR(-ENXIO);
338 } 380 }
339 381
340 /* 382 /*
341 * rename and move the name cell. 383 * Does this device already have a uuid?
342 */ 384 */
343 list_del(&hc->name_list); 385 if (change_uuid && hc->uuid) {
344 old_name = hc->name; 386 DMWARN("Unable to change uuid of mapped device %s to %s "
345 mutex_lock(&dm_hash_cells_mutex); 387 "because uuid is already set to %s",
346 hc->name = new_name; 388 param->name, new, hc->uuid);
347 mutex_unlock(&dm_hash_cells_mutex); 389 dm_put(hc->md);
348 list_add(&hc->name_list, _name_buckets + hash_str(new_name)); 390 up_write(&_hash_lock);
391 kfree(new_data);
392 return ERR_PTR(-EINVAL);
393 }
394
395 if (change_uuid)
396 __set_cell_uuid(hc, new_data);
397 else
398 old_name = __change_cell_name(hc, new_data);
349 399
350 /* 400 /*
351 * Wake up any dm event waiters. 401 * Wake up any dm event waiters.
@@ -729,7 +779,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
729 hc = __find_device_hash_cell(param); 779 hc = __find_device_hash_cell(param);
730 780
731 if (!hc) { 781 if (!hc) {
732 DMWARN("device doesn't appear to be in the dev hash table."); 782 DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
733 up_write(&_hash_lock); 783 up_write(&_hash_lock);
734 return -ENXIO; 784 return -ENXIO;
735 } 785 }
@@ -741,7 +791,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
741 */ 791 */
742 r = dm_lock_for_deletion(md); 792 r = dm_lock_for_deletion(md);
743 if (r) { 793 if (r) {
744 DMWARN("unable to remove open device %s", hc->name); 794 DMDEBUG_LIMIT("unable to remove open device %s", hc->name);
745 up_write(&_hash_lock); 795 up_write(&_hash_lock);
746 dm_put(md); 796 dm_put(md);
747 return r; 797 return r;
@@ -774,21 +824,24 @@ static int invalid_str(char *str, void *end)
774static int dev_rename(struct dm_ioctl *param, size_t param_size) 824static int dev_rename(struct dm_ioctl *param, size_t param_size)
775{ 825{
776 int r; 826 int r;
777 char *new_name = (char *) param + param->data_start; 827 char *new_data = (char *) param + param->data_start;
778 struct mapped_device *md; 828 struct mapped_device *md;
829 unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
779 830
780 if (new_name < param->data || 831 if (new_data < param->data ||
781 invalid_str(new_name, (void *) param + param_size) || 832 invalid_str(new_data, (void *) param + param_size) ||
782 strlen(new_name) > DM_NAME_LEN - 1) { 833 strlen(new_data) > (change_uuid ? DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) {
783 DMWARN("Invalid new logical volume name supplied."); 834 DMWARN("Invalid new mapped device name or uuid string supplied.");
784 return -EINVAL; 835 return -EINVAL;
785 } 836 }
786 837
787 r = check_name(new_name); 838 if (!change_uuid) {
788 if (r) 839 r = check_name(new_data);
789 return r; 840 if (r)
841 return r;
842 }
790 843
791 md = dm_hash_rename(param, new_name); 844 md = dm_hash_rename(param, new_data);
792 if (IS_ERR(md)) 845 if (IS_ERR(md))
793 return PTR_ERR(md); 846 return PTR_ERR(md);
794 847
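
The dev_rename() changes above fold uuid assignment into the existing rename path: the payload is validated against DM_NAME_LEN or DM_UUID_LEN depending on DM_UUID_FLAG, and dm_hash_rename() refuses to overwrite a uuid that has already been set. A minimal user-space sketch of that dispatch rule follows; the buffer sizes and the set-once flag are illustrative stand-ins for the kernel structures, not the real ones.

#include <stdio.h>
#include <string.h>

#define SK_NAME_LEN 128    /* stand-in for DM_NAME_LEN */
#define SK_UUID_LEN 129    /* stand-in for DM_UUID_LEN */

struct sk_cell {                 /* toy model of a hash cell */
    char name[SK_NAME_LEN];
    char uuid[SK_UUID_LEN];
    int  uuid_set;               /* a uuid may only be assigned once */
};

/* Apply a rename request: new_data is a name or a uuid depending on
 * the change_uuid flag, mirroring the DM_UUID_FLAG dispatch above. */
static int sk_rename(struct sk_cell *hc, const char *new_data, int change_uuid)
{
    size_t max = change_uuid ? SK_UUID_LEN - 1 : SK_NAME_LEN - 1;

    if (strlen(new_data) > max)
        return -1;               /* invalid name or uuid string */

    if (change_uuid) {
        if (hc->uuid_set)
            return -1;           /* uuid is already set */
        strcpy(hc->uuid, new_data);
        hc->uuid_set = 1;
        return 0;
    }

    strcpy(hc->name, new_data);  /* length already checked above */
    return 0;
}

int main(void)
{
    struct sk_cell hc = { .name = "olddev" };

    printf("%d\n", sk_rename(&hc, "LVM-1234", 1));  /* 0: uuid assigned  */
    printf("%d\n", sk_rename(&hc, "LVM-5678", 1));  /* -1: only set once */
    printf("%d\n", sk_rename(&hc, "newdev", 0));    /* 0: renamed        */
    return 0;
}
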
@@ -885,7 +938,7 @@ static int do_resume(struct dm_ioctl *param)
885 938
886 hc = __find_device_hash_cell(param); 939 hc = __find_device_hash_cell(param);
887 if (!hc) { 940 if (!hc) {
888 DMWARN("device doesn't appear to be in the dev hash table."); 941 DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
889 up_write(&_hash_lock); 942 up_write(&_hash_lock);
890 return -ENXIO; 943 return -ENXIO;
891 } 944 }
@@ -1212,7 +1265,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
1212 1265
1213 hc = __find_device_hash_cell(param); 1266 hc = __find_device_hash_cell(param);
1214 if (!hc) { 1267 if (!hc) {
1215 DMWARN("device doesn't appear to be in the dev hash table."); 1268 DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
1216 up_write(&_hash_lock); 1269 up_write(&_hash_lock);
1217 return -ENXIO; 1270 return -ENXIO;
1218 } 1271 }
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index d8587bac5682..924f5f0084c2 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -37,6 +37,13 @@ struct dm_kcopyd_client {
37 unsigned int nr_pages; 37 unsigned int nr_pages;
38 unsigned int nr_free_pages; 38 unsigned int nr_free_pages;
39 39
40 /*
41 * Block devices to unplug.
42 * Non-NULL pointer means that a block device has some pending requests
43 * and needs to be unplugged.
44 */
45 struct block_device *unplug[2];
46
40 struct dm_io_client *io_client; 47 struct dm_io_client *io_client;
41 48
42 wait_queue_head_t destroyq; 49 wait_queue_head_t destroyq;
@@ -308,6 +315,31 @@ static int run_complete_job(struct kcopyd_job *job)
308 return 0; 315 return 0;
309} 316}
310 317
318/*
319 * Unplug the block device at the specified index.
320 */
321static void unplug(struct dm_kcopyd_client *kc, int rw)
322{
323 if (kc->unplug[rw] != NULL) {
324 blk_unplug(bdev_get_queue(kc->unplug[rw]));
325 kc->unplug[rw] = NULL;
326 }
327}
328
329/*
330 * Prepare block device unplug. If there's another device
331 * to be unplugged at the same array index, we unplug that
332 * device first.
333 */
334static void prepare_unplug(struct dm_kcopyd_client *kc, int rw,
335 struct block_device *bdev)
336{
337 if (likely(kc->unplug[rw] == bdev))
338 return;
339 unplug(kc, rw);
340 kc->unplug[rw] = bdev;
341}
342
311static void complete_io(unsigned long error, void *context) 343static void complete_io(unsigned long error, void *context)
312{ 344{
313 struct kcopyd_job *job = (struct kcopyd_job *) context; 345 struct kcopyd_job *job = (struct kcopyd_job *) context;
@@ -345,7 +377,7 @@ static int run_io_job(struct kcopyd_job *job)
345{ 377{
346 int r; 378 int r;
347 struct dm_io_request io_req = { 379 struct dm_io_request io_req = {
348 .bi_rw = job->rw | REQ_SYNC | REQ_UNPLUG, 380 .bi_rw = job->rw,
349 .mem.type = DM_IO_PAGE_LIST, 381 .mem.type = DM_IO_PAGE_LIST,
350 .mem.ptr.pl = job->pages, 382 .mem.ptr.pl = job->pages,
351 .mem.offset = job->offset, 383 .mem.offset = job->offset,
@@ -354,10 +386,16 @@ static int run_io_job(struct kcopyd_job *job)
354 .client = job->kc->io_client, 386 .client = job->kc->io_client,
355 }; 387 };
356 388
357 if (job->rw == READ) 389 if (job->rw == READ) {
358 r = dm_io(&io_req, 1, &job->source, NULL); 390 r = dm_io(&io_req, 1, &job->source, NULL);
359 else 391 prepare_unplug(job->kc, READ, job->source.bdev);
392 } else {
393 if (job->num_dests > 1)
394 io_req.bi_rw |= REQ_UNPLUG;
360 r = dm_io(&io_req, job->num_dests, job->dests, NULL); 395 r = dm_io(&io_req, job->num_dests, job->dests, NULL);
396 if (!(io_req.bi_rw & REQ_UNPLUG))
397 prepare_unplug(job->kc, WRITE, job->dests[0].bdev);
398 }
361 399
362 return r; 400 return r;
363} 401}
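
The kcopyd hunks above drop the unconditional REQ_SYNC | REQ_UNPLUG submission in favour of a small per-direction cache: the last block device used for READ and for WRITE is remembered, switching to a different device flushes the previously remembered one, and the work function drains both slots once per batch. A stand-alone sketch of that coalescing pattern (a plain string standing in for the block device, printf standing in for blk_unplug) might look like:

#include <stdio.h>

/* One pending "unplug" slot per direction, as in the hunks above. */
enum { SK_READ = 0, SK_WRITE = 1 };

static const char *pending[2];   /* NULL means nothing to flush */

static void flush_pending(int rw)
{
    if (pending[rw]) {
        printf("unplug %s (%s)\n", pending[rw],
               rw == SK_READ ? "read" : "write");
        pending[rw] = NULL;
    }
}

/* Remember the device for a later flush; if a different device is
 * already pending in this slot, flush that one first. */
static void prepare_flush(int rw, const char *dev)
{
    if (pending[rw] == dev)
        return;
    flush_pending(rw);
    pending[rw] = dev;
}

int main(void)
{
    const char *sda = "sda", *sdb = "sdb";

    prepare_flush(SK_READ, sda);   /* remembered, nothing printed */
    prepare_flush(SK_READ, sda);   /* same device: coalesced      */
    prepare_flush(SK_READ, sdb);   /* flushes sda, remembers sdb  */

    /* end of a work batch: drain both directions */
    flush_pending(SK_READ);
    flush_pending(SK_WRITE);
    return 0;
}

The comment added to do_work() pins the ordering for the same reason: if completions could run between io_jobs and the final unplug calls, a device still parked in the array might already have been torn down by a completion callback.
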
@@ -435,10 +473,18 @@ static void do_work(struct work_struct *work)
435 * Pages jobs when successful will jump onto the io jobs 473 * Pages jobs when successful will jump onto the io jobs
436 * list. io jobs call wake when they complete and it all 474 * list. io jobs call wake when they complete and it all
437 * starts again. 475 * starts again.
476 *
477 * Note that io_jobs add block devices to the unplug array,
478 * this array is cleared with "unplug" calls. It is thus
479 * forbidden to run complete_jobs after io_jobs and before
480 * unplug because the block device could be destroyed in
481 * job completion callback.
438 */ 482 */
439 process_jobs(&kc->complete_jobs, kc, run_complete_job); 483 process_jobs(&kc->complete_jobs, kc, run_complete_job);
440 process_jobs(&kc->pages_jobs, kc, run_pages_job); 484 process_jobs(&kc->pages_jobs, kc, run_pages_job);
441 process_jobs(&kc->io_jobs, kc, run_io_job); 485 process_jobs(&kc->io_jobs, kc, run_io_job);
486 unplug(kc, READ);
487 unplug(kc, WRITE);
442} 488}
443 489
444/* 490/*
@@ -619,12 +665,15 @@ int dm_kcopyd_client_create(unsigned int nr_pages,
619 INIT_LIST_HEAD(&kc->io_jobs); 665 INIT_LIST_HEAD(&kc->io_jobs);
620 INIT_LIST_HEAD(&kc->pages_jobs); 666 INIT_LIST_HEAD(&kc->pages_jobs);
621 667
668 memset(kc->unplug, 0, sizeof(kc->unplug));
669
622 kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache); 670 kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
623 if (!kc->job_pool) 671 if (!kc->job_pool)
624 goto bad_slab; 672 goto bad_slab;
625 673
626 INIT_WORK(&kc->kcopyd_work, do_work); 674 INIT_WORK(&kc->kcopyd_work, do_work);
627 kc->kcopyd_wq = create_singlethread_workqueue("kcopyd"); 675 kc->kcopyd_wq = alloc_workqueue("kcopyd",
676 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
628 if (!kc->kcopyd_wq) 677 if (!kc->kcopyd_wq)
629 goto bad_workqueue; 678 goto bad_workqueue;
630 679
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 1ed0094f064b..aa2e0c374ab3 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -12,12 +12,22 @@
12 12
13#include "dm-log-userspace-transfer.h" 13#include "dm-log-userspace-transfer.h"
14 14
15#define DM_LOG_USERSPACE_VSN "1.1.0"
16
15struct flush_entry { 17struct flush_entry {
16 int type; 18 int type;
17 region_t region; 19 region_t region;
18 struct list_head list; 20 struct list_head list;
19}; 21};
20 22
23/*
24 * This limit on the number of mark and clear request is, to a degree,
25 * arbitrary. However, there is some basis for the choice in the limits
26 * imposed on the size of data payload by dm-log-userspace-transfer.c:
27 * dm_consult_userspace().
28 */
29#define MAX_FLUSH_GROUP_COUNT 32
30
21struct log_c { 31struct log_c {
22 struct dm_target *ti; 32 struct dm_target *ti;
23 uint32_t region_size; 33 uint32_t region_size;
@@ -37,8 +47,15 @@ struct log_c {
37 */ 47 */
38 uint64_t in_sync_hint; 48 uint64_t in_sync_hint;
39 49
50 /*
51 * Mark and clear requests are held until a flush is issued
52 * so that we can group, and thereby limit, the amount of
53 * network traffic between kernel and userspace. The 'flush_lock'
54 * is used to protect these lists.
55 */
40 spinlock_t flush_lock; 56 spinlock_t flush_lock;
41 struct list_head flush_list; /* only for clear and mark requests */ 57 struct list_head mark_list;
58 struct list_head clear_list;
42}; 59};
43 60
44static mempool_t *flush_entry_pool; 61static mempool_t *flush_entry_pool;
@@ -169,7 +186,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
169 186
170 strncpy(lc->uuid, argv[0], DM_UUID_LEN); 187 strncpy(lc->uuid, argv[0], DM_UUID_LEN);
171 spin_lock_init(&lc->flush_lock); 188 spin_lock_init(&lc->flush_lock);
172 INIT_LIST_HEAD(&lc->flush_list); 189 INIT_LIST_HEAD(&lc->mark_list);
190 INIT_LIST_HEAD(&lc->clear_list);
173 191
174 str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); 192 str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
175 if (str_size < 0) { 193 if (str_size < 0) {
@@ -181,8 +199,11 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
181 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, 199 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR,
182 ctr_str, str_size, NULL, NULL); 200 ctr_str, str_size, NULL, NULL);
183 201
184 if (r == -ESRCH) { 202 if (r < 0) {
185 DMERR("Userspace log server not found"); 203 if (r == -ESRCH)
204 DMERR("Userspace log server not found");
205 else
206 DMERR("Userspace log server failed to create log");
186 goto out; 207 goto out;
187 } 208 }
188 209
@@ -214,10 +235,9 @@ out:
214 235
215static void userspace_dtr(struct dm_dirty_log *log) 236static void userspace_dtr(struct dm_dirty_log *log)
216{ 237{
217 int r;
218 struct log_c *lc = log->context; 238 struct log_c *lc = log->context;
219 239
220 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, 240 (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
221 NULL, 0, 241 NULL, 0,
222 NULL, NULL); 242 NULL, NULL);
223 243
@@ -338,6 +358,71 @@ static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
338 return (r) ? 0 : (int)in_sync; 358 return (r) ? 0 : (int)in_sync;
339} 359}
340 360
361static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
362{
363 int r = 0;
364 struct flush_entry *fe;
365
366 list_for_each_entry(fe, flush_list, list) {
367 r = userspace_do_request(lc, lc->uuid, fe->type,
368 (char *)&fe->region,
369 sizeof(fe->region),
370 NULL, NULL);
371 if (r)
372 break;
373 }
374
375 return r;
376}
377
378static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
379{
380 int r = 0;
381 int count;
382 uint32_t type = 0;
383 struct flush_entry *fe, *tmp_fe;
384 LIST_HEAD(tmp_list);
385 uint64_t group[MAX_FLUSH_GROUP_COUNT];
386
387 /*
388 * Group process the requests
389 */
390 while (!list_empty(flush_list)) {
391 count = 0;
392
393 list_for_each_entry_safe(fe, tmp_fe, flush_list, list) {
394 group[count] = fe->region;
395 count++;
396
397 list_del(&fe->list);
398 list_add(&fe->list, &tmp_list);
399
400 type = fe->type;
401 if (count >= MAX_FLUSH_GROUP_COUNT)
402 break;
403 }
404
405 r = userspace_do_request(lc, lc->uuid, type,
406 (char *)(group),
407 count * sizeof(uint64_t),
408 NULL, NULL);
409 if (r) {
410 /* Group send failed. Attempt one-by-one. */
411 list_splice_init(&tmp_list, flush_list);
412 r = flush_one_by_one(lc, flush_list);
413 break;
414 }
415 }
416
417 /*
418 * Must collect flush_entrys that were successfully processed
419 * as a group so that they will be free'd by the caller.
420 */
421 list_splice_init(&tmp_list, flush_list);
422
423 return r;
424}
425
341/* 426/*
342 * userspace_flush 427 * userspace_flush
343 * 428 *
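
flush_by_group() above drains the queued mark/clear requests in batches of at most MAX_FLUSH_GROUP_COUNT region numbers per round trip to userspace, and falls back to the old one-request-per-message path if a grouped send fails. A compact user-space sketch of that batching-with-fallback shape (arrays instead of list_heads, a stub standing in for dm_consult_userspace) is below.

#include <stdio.h>
#include <stdint.h>

#define GROUP_MAX 32    /* mirrors MAX_FLUSH_GROUP_COUNT */

/* Stub transport; a real implementation could fail, which diverts
 * the remaining regions to the one-by-one path. */
static int send_regions(const uint64_t *regions, int count)
{
    (void)regions;
    printf("sending %d region(s)\n", count);
    return 0;
}

static int flush_one_by_one(const uint64_t *regions, int count)
{
    int i, r;

    for (i = 0; i < count; i++) {
        r = send_regions(&regions[i], 1);
        if (r)
            return r;
    }
    return 0;
}

/* Send 'count' queued regions in groups of at most GROUP_MAX; on a
 * grouped failure, retry everything still outstanding one at a time. */
static int flush_by_group(const uint64_t *regions, int count)
{
    int sent = 0, n, r;

    while (sent < count) {
        n = count - sent;
        if (n > GROUP_MAX)
            n = GROUP_MAX;
        r = send_regions(&regions[sent], n);
        if (r)
            return flush_one_by_one(&regions[sent], count - sent);
        sent += n;
    }
    return 0;
}

int main(void)
{
    uint64_t regions[70];
    int i;

    for (i = 0; i < 70; i++)
        regions[i] = i;
    return flush_by_group(regions, 70);   /* batches of 32, 32, 6 */
}
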
@@ -360,31 +445,25 @@ static int userspace_flush(struct dm_dirty_log *log)
360 int r = 0; 445 int r = 0;
361 unsigned long flags; 446 unsigned long flags;
362 struct log_c *lc = log->context; 447 struct log_c *lc = log->context;
363 LIST_HEAD(flush_list); 448 LIST_HEAD(mark_list);
449 LIST_HEAD(clear_list);
364 struct flush_entry *fe, *tmp_fe; 450 struct flush_entry *fe, *tmp_fe;
365 451
366 spin_lock_irqsave(&lc->flush_lock, flags); 452 spin_lock_irqsave(&lc->flush_lock, flags);
367 list_splice_init(&lc->flush_list, &flush_list); 453 list_splice_init(&lc->mark_list, &mark_list);
454 list_splice_init(&lc->clear_list, &clear_list);
368 spin_unlock_irqrestore(&lc->flush_lock, flags); 455 spin_unlock_irqrestore(&lc->flush_lock, flags);
369 456
370 if (list_empty(&flush_list)) 457 if (list_empty(&mark_list) && list_empty(&clear_list))
371 return 0; 458 return 0;
372 459
373 /* 460 r = flush_by_group(lc, &mark_list);
374 * FIXME: Count up requests, group request types, 461 if (r)
375 * allocate memory to stick all requests in and 462 goto fail;
376 * send to server in one go. Failing the allocation,
377 * do it one by one.
378 */
379 463
380 list_for_each_entry(fe, &flush_list, list) { 464 r = flush_by_group(lc, &clear_list);
381 r = userspace_do_request(lc, lc->uuid, fe->type, 465 if (r)
382 (char *)&fe->region, 466 goto fail;
383 sizeof(fe->region),
384 NULL, NULL);
385 if (r)
386 goto fail;
387 }
388 467
389 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, 468 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
390 NULL, 0, NULL, NULL); 469 NULL, 0, NULL, NULL);
@@ -395,7 +474,11 @@ fail:
395 * Calling code will receive an error and will know that 474 * Calling code will receive an error and will know that
396 * the log facility has failed. 475 * the log facility has failed.
397 */ 476 */
398 list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) { 477 list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) {
478 list_del(&fe->list);
479 mempool_free(fe, flush_entry_pool);
480 }
481 list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) {
399 list_del(&fe->list); 482 list_del(&fe->list);
400 mempool_free(fe, flush_entry_pool); 483 mempool_free(fe, flush_entry_pool);
401 } 484 }
@@ -425,7 +508,7 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
425 spin_lock_irqsave(&lc->flush_lock, flags); 508 spin_lock_irqsave(&lc->flush_lock, flags);
426 fe->type = DM_ULOG_MARK_REGION; 509 fe->type = DM_ULOG_MARK_REGION;
427 fe->region = region; 510 fe->region = region;
428 list_add(&fe->list, &lc->flush_list); 511 list_add(&fe->list, &lc->mark_list);
429 spin_unlock_irqrestore(&lc->flush_lock, flags); 512 spin_unlock_irqrestore(&lc->flush_lock, flags);
430 513
431 return; 514 return;
@@ -462,7 +545,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
462 spin_lock_irqsave(&lc->flush_lock, flags); 545 spin_lock_irqsave(&lc->flush_lock, flags);
463 fe->type = DM_ULOG_CLEAR_REGION; 546 fe->type = DM_ULOG_CLEAR_REGION;
464 fe->region = region; 547 fe->region = region;
465 list_add(&fe->list, &lc->flush_list); 548 list_add(&fe->list, &lc->clear_list);
466 spin_unlock_irqrestore(&lc->flush_lock, flags); 549 spin_unlock_irqrestore(&lc->flush_lock, flags);
467 550
468 return; 551 return;
@@ -684,7 +767,7 @@ static int __init userspace_dirty_log_init(void)
684 return r; 767 return r;
685 } 768 }
686 769
687 DMINFO("version 1.0.0 loaded"); 770 DMINFO("version " DM_LOG_USERSPACE_VSN " loaded");
688 return 0; 771 return 0;
689} 772}
690 773
@@ -694,7 +777,7 @@ static void __exit userspace_dirty_log_exit(void)
694 dm_ulog_tfr_exit(); 777 dm_ulog_tfr_exit();
695 mempool_destroy(flush_entry_pool); 778 mempool_destroy(flush_entry_pool);
696 779
697 DMINFO("version 1.0.0 unloaded"); 780 DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded");
698 return; 781 return;
699} 782}
700 783
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 075cbcf8a9f5..049eaf12aaab 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -198,6 +198,7 @@ resend:
198 198
199 memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); 199 memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg));
200 memcpy(tfr->uuid, uuid, DM_UUID_LEN); 200 memcpy(tfr->uuid, uuid, DM_UUID_LEN);
201 tfr->version = DM_ULOG_REQUEST_VERSION;
201 tfr->luid = luid; 202 tfr->luid = luid;
202 tfr->seq = dm_ulog_seq++; 203 tfr->seq = dm_ulog_seq++;
203 204
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 33420e68d153..6951536ea29c 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -455,7 +455,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
455 r = PTR_ERR(lc->io_req.client); 455 r = PTR_ERR(lc->io_req.client);
456 DMWARN("couldn't allocate disk io client"); 456 DMWARN("couldn't allocate disk io client");
457 kfree(lc); 457 kfree(lc);
458 return -ENOMEM; 458 return r;
459 } 459 }
460 460
461 lc->disk_header = vmalloc(buf_size); 461 lc->disk_header = vmalloc(buf_size);
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 487ecda90ad4..b82d28819e2a 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -23,6 +23,8 @@
23 23
24#define DM_MSG_PREFIX "multipath" 24#define DM_MSG_PREFIX "multipath"
25#define MESG_STR(x) x, sizeof(x) 25#define MESG_STR(x) x, sizeof(x)
26#define DM_PG_INIT_DELAY_MSECS 2000
27#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
26 28
27/* Path properties */ 29/* Path properties */
28struct pgpath { 30struct pgpath {
@@ -33,8 +35,7 @@ struct pgpath {
33 unsigned fail_count; /* Cumulative failure count */ 35 unsigned fail_count; /* Cumulative failure count */
34 36
35 struct dm_path path; 37 struct dm_path path;
36 struct work_struct deactivate_path; 38 struct delayed_work activate_path;
37 struct work_struct activate_path;
38}; 39};
39 40
40#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) 41#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
@@ -65,11 +66,15 @@ struct multipath {
65 66
66 const char *hw_handler_name; 67 const char *hw_handler_name;
67 char *hw_handler_params; 68 char *hw_handler_params;
69
68 unsigned nr_priority_groups; 70 unsigned nr_priority_groups;
69 struct list_head priority_groups; 71 struct list_head priority_groups;
72
73 wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
74
70 unsigned pg_init_required; /* pg_init needs calling? */ 75 unsigned pg_init_required; /* pg_init needs calling? */
71 unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ 76 unsigned pg_init_in_progress; /* Only one pg_init allowed at once */
72 wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ 77 unsigned pg_init_delay_retry; /* Delay pg_init retry? */
73 78
74 unsigned nr_valid_paths; /* Total number of usable paths */ 79 unsigned nr_valid_paths; /* Total number of usable paths */
75 struct pgpath *current_pgpath; 80 struct pgpath *current_pgpath;
@@ -82,6 +87,7 @@ struct multipath {
82 unsigned saved_queue_if_no_path;/* Saved state during suspension */ 87 unsigned saved_queue_if_no_path;/* Saved state during suspension */
83 unsigned pg_init_retries; /* Number of times to retry pg_init */ 88 unsigned pg_init_retries; /* Number of times to retry pg_init */
84 unsigned pg_init_count; /* Number of times pg_init called */ 89 unsigned pg_init_count; /* Number of times pg_init called */
90 unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */
85 91
86 struct work_struct process_queued_ios; 92 struct work_struct process_queued_ios;
87 struct list_head queued_ios; 93 struct list_head queued_ios;
@@ -116,7 +122,6 @@ static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
116static void process_queued_ios(struct work_struct *work); 122static void process_queued_ios(struct work_struct *work);
117static void trigger_event(struct work_struct *work); 123static void trigger_event(struct work_struct *work);
118static void activate_path(struct work_struct *work); 124static void activate_path(struct work_struct *work);
119static void deactivate_path(struct work_struct *work);
120 125
121 126
122/*----------------------------------------------- 127/*-----------------------------------------------
@@ -129,8 +134,7 @@ static struct pgpath *alloc_pgpath(void)
129 134
130 if (pgpath) { 135 if (pgpath) {
131 pgpath->is_active = 1; 136 pgpath->is_active = 1;
132 INIT_WORK(&pgpath->deactivate_path, deactivate_path); 137 INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
133 INIT_WORK(&pgpath->activate_path, activate_path);
134 } 138 }
135 139
136 return pgpath; 140 return pgpath;
@@ -141,14 +145,6 @@ static void free_pgpath(struct pgpath *pgpath)
141 kfree(pgpath); 145 kfree(pgpath);
142} 146}
143 147
144static void deactivate_path(struct work_struct *work)
145{
146 struct pgpath *pgpath =
147 container_of(work, struct pgpath, deactivate_path);
148
149 blk_abort_queue(pgpath->path.dev->bdev->bd_disk->queue);
150}
151
152static struct priority_group *alloc_priority_group(void) 148static struct priority_group *alloc_priority_group(void)
153{ 149{
154 struct priority_group *pg; 150 struct priority_group *pg;
@@ -199,6 +195,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
199 INIT_LIST_HEAD(&m->queued_ios); 195 INIT_LIST_HEAD(&m->queued_ios);
200 spin_lock_init(&m->lock); 196 spin_lock_init(&m->lock);
201 m->queue_io = 1; 197 m->queue_io = 1;
198 m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
202 INIT_WORK(&m->process_queued_ios, process_queued_ios); 199 INIT_WORK(&m->process_queued_ios, process_queued_ios);
203 INIT_WORK(&m->trigger_event, trigger_event); 200 INIT_WORK(&m->trigger_event, trigger_event);
204 init_waitqueue_head(&m->pg_init_wait); 201 init_waitqueue_head(&m->pg_init_wait);
@@ -238,14 +235,19 @@ static void free_multipath(struct multipath *m)
238static void __pg_init_all_paths(struct multipath *m) 235static void __pg_init_all_paths(struct multipath *m)
239{ 236{
240 struct pgpath *pgpath; 237 struct pgpath *pgpath;
238 unsigned long pg_init_delay = 0;
241 239
242 m->pg_init_count++; 240 m->pg_init_count++;
243 m->pg_init_required = 0; 241 m->pg_init_required = 0;
242 if (m->pg_init_delay_retry)
243 pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
244 m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
244 list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { 245 list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
245 /* Skip failed paths */ 246 /* Skip failed paths */
246 if (!pgpath->is_active) 247 if (!pgpath->is_active)
247 continue; 248 continue;
248 if (queue_work(kmpath_handlerd, &pgpath->activate_path)) 249 if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
250 pg_init_delay))
249 m->pg_init_in_progress++; 251 m->pg_init_in_progress++;
250 } 252 }
251} 253}
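
The multipath hunks above convert activate_path into delayed work so that a SCSI_DH_RETRY from the hardware handler can be retried after a pause instead of immediately. The delay chosen in __pg_init_all_paths() is zero unless the previous pg_init asked for a delayed retry, in which case it is the table's pg_init_delay_msecs if one was given, or the built-in 2000 ms fallback. A tiny sketch of just that selection logic (plain milliseconds instead of jiffies, names are illustrative):

#include <stdio.h>

#define DELAY_DEFAULT ((unsigned)-1)   /* "not configured" sentinel        */
#define DELAY_FALLBACK_MSECS 2000U     /* mirrors DM_PG_INIT_DELAY_MSECS   */

/* Pick the retry delay for the next path-activation round. */
static unsigned pg_init_delay(int delay_retry, unsigned configured_msecs)
{
    if (!delay_retry)
        return 0;
    return configured_msecs != DELAY_DEFAULT ? configured_msecs
                                             : DELAY_FALLBACK_MSECS;
}

int main(void)
{
    printf("%u\n", pg_init_delay(0, DELAY_DEFAULT));   /* 0    */
    printf("%u\n", pg_init_delay(1, DELAY_DEFAULT));   /* 2000 */
    printf("%u\n", pg_init_delay(1, 500));             /* 500  */
    return 0;
}
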
@@ -793,8 +795,9 @@ static int parse_features(struct arg_set *as, struct multipath *m)
793 const char *param_name; 795 const char *param_name;
794 796
795 static struct param _params[] = { 797 static struct param _params[] = {
796 {0, 3, "invalid number of feature args"}, 798 {0, 5, "invalid number of feature args"},
797 {1, 50, "pg_init_retries must be between 1 and 50"}, 799 {1, 50, "pg_init_retries must be between 1 and 50"},
800 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
798 }; 801 };
799 802
800 r = read_param(_params, shift(as), &argc, &ti->error); 803 r = read_param(_params, shift(as), &argc, &ti->error);
@@ -821,6 +824,14 @@ static int parse_features(struct arg_set *as, struct multipath *m)
821 continue; 824 continue;
822 } 825 }
823 826
827 if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) &&
828 (argc >= 1)) {
829 r = read_param(_params + 2, shift(as),
830 &m->pg_init_delay_msecs, &ti->error);
831 argc--;
832 continue;
833 }
834
824 ti->error = "Unrecognised multipath feature request"; 835 ti->error = "Unrecognised multipath feature request";
825 r = -EINVAL; 836 r = -EINVAL;
826 } while (argc && !r); 837 } while (argc && !r);
@@ -931,7 +942,7 @@ static void flush_multipath_work(struct multipath *m)
931 flush_workqueue(kmpath_handlerd); 942 flush_workqueue(kmpath_handlerd);
932 multipath_wait_for_pg_init_completion(m); 943 multipath_wait_for_pg_init_completion(m);
933 flush_workqueue(kmultipathd); 944 flush_workqueue(kmultipathd);
934 flush_scheduled_work(); 945 flush_work_sync(&m->trigger_event);
935} 946}
936 947
937static void multipath_dtr(struct dm_target *ti) 948static void multipath_dtr(struct dm_target *ti)
@@ -995,7 +1006,6 @@ static int fail_path(struct pgpath *pgpath)
995 pgpath->path.dev->name, m->nr_valid_paths); 1006 pgpath->path.dev->name, m->nr_valid_paths);
996 1007
997 schedule_work(&m->trigger_event); 1008 schedule_work(&m->trigger_event);
998 queue_work(kmultipathd, &pgpath->deactivate_path);
999 1009
1000out: 1010out:
1001 spin_unlock_irqrestore(&m->lock, flags); 1011 spin_unlock_irqrestore(&m->lock, flags);
@@ -1034,7 +1044,7 @@ static int reinstate_path(struct pgpath *pgpath)
1034 m->current_pgpath = NULL; 1044 m->current_pgpath = NULL;
1035 queue_work(kmultipathd, &m->process_queued_ios); 1045 queue_work(kmultipathd, &m->process_queued_ios);
1036 } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { 1046 } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
1037 if (queue_work(kmpath_handlerd, &pgpath->activate_path)) 1047 if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
1038 m->pg_init_in_progress++; 1048 m->pg_init_in_progress++;
1039 } 1049 }
1040 1050
@@ -1169,6 +1179,7 @@ static void pg_init_done(void *data, int errors)
1169 struct priority_group *pg = pgpath->pg; 1179 struct priority_group *pg = pgpath->pg;
1170 struct multipath *m = pg->m; 1180 struct multipath *m = pg->m;
1171 unsigned long flags; 1181 unsigned long flags;
1182 unsigned delay_retry = 0;
1172 1183
1173 /* device or driver problems */ 1184 /* device or driver problems */
1174 switch (errors) { 1185 switch (errors) {
@@ -1193,8 +1204,9 @@ static void pg_init_done(void *data, int errors)
1193 */ 1204 */
1194 bypass_pg(m, pg, 1); 1205 bypass_pg(m, pg, 1);
1195 break; 1206 break;
1196 /* TODO: For SCSI_DH_RETRY we should wait a couple seconds */
1197 case SCSI_DH_RETRY: 1207 case SCSI_DH_RETRY:
1208 /* Wait before retrying. */
1209 delay_retry = 1;
1198 case SCSI_DH_IMM_RETRY: 1210 case SCSI_DH_IMM_RETRY:
1199 case SCSI_DH_RES_TEMP_UNAVAIL: 1211 case SCSI_DH_RES_TEMP_UNAVAIL:
1200 if (pg_init_limit_reached(m, pgpath)) 1212 if (pg_init_limit_reached(m, pgpath))
@@ -1227,6 +1239,7 @@ static void pg_init_done(void *data, int errors)
1227 if (!m->pg_init_required) 1239 if (!m->pg_init_required)
1228 m->queue_io = 0; 1240 m->queue_io = 0;
1229 1241
1242 m->pg_init_delay_retry = delay_retry;
1230 queue_work(kmultipathd, &m->process_queued_ios); 1243 queue_work(kmultipathd, &m->process_queued_ios);
1231 1244
1232 /* 1245 /*
@@ -1241,7 +1254,7 @@ out:
1241static void activate_path(struct work_struct *work) 1254static void activate_path(struct work_struct *work)
1242{ 1255{
1243 struct pgpath *pgpath = 1256 struct pgpath *pgpath =
1244 container_of(work, struct pgpath, activate_path); 1257 container_of(work, struct pgpath, activate_path.work);
1245 1258
1246 scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), 1259 scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
1247 pg_init_done, pgpath); 1260 pg_init_done, pgpath);
@@ -1382,11 +1395,14 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
1382 DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count); 1395 DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count);
1383 else { 1396 else {
1384 DMEMIT("%u ", m->queue_if_no_path + 1397 DMEMIT("%u ", m->queue_if_no_path +
1385 (m->pg_init_retries > 0) * 2); 1398 (m->pg_init_retries > 0) * 2 +
1399 (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2);
1386 if (m->queue_if_no_path) 1400 if (m->queue_if_no_path)
1387 DMEMIT("queue_if_no_path "); 1401 DMEMIT("queue_if_no_path ");
1388 if (m->pg_init_retries) 1402 if (m->pg_init_retries)
1389 DMEMIT("pg_init_retries %u ", m->pg_init_retries); 1403 DMEMIT("pg_init_retries %u ", m->pg_init_retries);
1404 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
1405 DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
1390 } 1406 }
1391 1407
1392 if (!m->hw_handler_name || type == STATUSTYPE_INFO) 1408 if (!m->hw_handler_name || type == STATUSTYPE_INFO)
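
The status hunk above has to keep the leading feature count in step with the words that follow it: queue_if_no_path contributes one word, while pg_init_retries and the new pg_init_delay_msecs each contribute a key/value pair. A small sketch of that bookkeeping, with the feature state flattened into plain variables:

#include <stdio.h>

#define DELAY_DEFAULT ((unsigned)-1)

/* Emit the multipath "features" section: count first, then the words. */
static void emit_features(int queue_if_no_path, unsigned retries,
                          unsigned delay_msecs)
{
    unsigned count = (queue_if_no_path ? 1 : 0) +
                     (retries > 0 ? 2 : 0) +
                     (delay_msecs != DELAY_DEFAULT ? 2 : 0);

    printf("%u", count);
    if (queue_if_no_path)
        printf(" queue_if_no_path");
    if (retries)
        printf(" pg_init_retries %u", retries);
    if (delay_msecs != DELAY_DEFAULT)
        printf(" pg_init_delay_msecs %u", delay_msecs);
    printf("\n");
}

int main(void)
{
    emit_features(1, 0, DELAY_DEFAULT);   /* "1 queue_if_no_path"       */
    emit_features(0, 5, 2000);            /* "4 pg_init_retries 5 ..."  */
    return 0;
}
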
@@ -1655,7 +1671,7 @@ out:
1655 *---------------------------------------------------------------*/ 1671 *---------------------------------------------------------------*/
1656static struct target_type multipath_target = { 1672static struct target_type multipath_target = {
1657 .name = "multipath", 1673 .name = "multipath",
1658 .version = {1, 1, 1}, 1674 .version = {1, 2, 0},
1659 .module = THIS_MODULE, 1675 .module = THIS_MODULE,
1660 .ctr = multipath_ctr, 1676 .ctr = multipath_ctr,
1661 .dtr = multipath_dtr, 1677 .dtr = multipath_dtr,
@@ -1687,7 +1703,7 @@ static int __init dm_multipath_init(void)
1687 return -EINVAL; 1703 return -EINVAL;
1688 } 1704 }
1689 1705
1690 kmultipathd = create_workqueue("kmpathd"); 1706 kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
1691 if (!kmultipathd) { 1707 if (!kmultipathd) {
1692 DMERR("failed to create workqueue kmpathd"); 1708 DMERR("failed to create workqueue kmpathd");
1693 dm_unregister_target(&multipath_target); 1709 dm_unregister_target(&multipath_target);
@@ -1701,7 +1717,8 @@ static int __init dm_multipath_init(void)
1701 * old workqueue would also create a bottleneck in the 1717 * old workqueue would also create a bottleneck in the
1702 * path of the storage hardware device activation. 1718 * path of the storage hardware device activation.
1703 */ 1719 */
1704 kmpath_handlerd = create_singlethread_workqueue("kmpath_handlerd"); 1720 kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
1721 WQ_MEM_RECLAIM);
1705 if (!kmpath_handlerd) { 1722 if (!kmpath_handlerd) {
1706 DMERR("failed to create workqueue kmpath_handlerd"); 1723 DMERR("failed to create workqueue kmpath_handlerd");
1707 destroy_workqueue(kmultipathd); 1724 destroy_workqueue(kmultipathd);
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
new file mode 100644
index 000000000000..b9e1e15ef11c
--- /dev/null
+++ b/drivers/md/dm-raid.c
@@ -0,0 +1,697 @@
1/*
2 * Copyright (C) 2010-2011 Neil Brown
3 * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved.
4 *
5 * This file is released under the GPL.
6 */
7
8#include <linux/slab.h>
9
10#include "md.h"
11#include "raid5.h"
12#include "dm.h"
13#include "bitmap.h"
14
15#define DM_MSG_PREFIX "raid"
16
17/*
18 * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then
19 * make it so the flag doesn't set anything.
20 */
21#ifndef MD_SYNC_STATE_FORCED
22#define MD_SYNC_STATE_FORCED 0
23#endif
24
25struct raid_dev {
26 /*
27 * Two DM devices, one to hold metadata and one to hold the
28 * actual data/parity. The reason for this is to not confuse
29 * ti->len and give more flexibility in altering size and
30 * characteristics.
31 *
32 * While it is possible for this device to be associated
33 * with a different physical device than the data_dev, it
34 * is intended for it to be the same.
35 * |--------- Physical Device ---------|
36 * |- meta_dev -|------ data_dev ------|
37 */
38 struct dm_dev *meta_dev;
39 struct dm_dev *data_dev;
40 struct mdk_rdev_s rdev;
41};
42
43/*
44 * Flags for rs->print_flags field.
45 */
46#define DMPF_DAEMON_SLEEP 0x1
47#define DMPF_MAX_WRITE_BEHIND 0x2
48#define DMPF_SYNC 0x4
49#define DMPF_NOSYNC 0x8
50#define DMPF_STRIPE_CACHE 0x10
51#define DMPF_MIN_RECOVERY_RATE 0x20
52#define DMPF_MAX_RECOVERY_RATE 0x40
53
54struct raid_set {
55 struct dm_target *ti;
56
57 uint64_t print_flags;
58
59 struct mddev_s md;
60 struct raid_type *raid_type;
61 struct dm_target_callbacks callbacks;
62
63 struct raid_dev dev[0];
64};
65
66/* Supported raid types and properties. */
67static struct raid_type {
68 const char *name; /* RAID algorithm. */
69 const char *descr; /* Descriptor text for logging. */
70 const unsigned parity_devs; /* # of parity devices. */
71 const unsigned minimal_devs; /* minimal # of devices in set. */
72 const unsigned level; /* RAID level. */
73 const unsigned algorithm; /* RAID algorithm. */
74} raid_types[] = {
75 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
76 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
77 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
78 {"raid5_ls", "RAID5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC},
79 {"raid5_rs", "RAID5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC},
80 {"raid6_zr", "RAID6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART},
81 {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART},
82 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
83};
84
85static struct raid_type *get_raid_type(char *name)
86{
87 int i;
88
89 for (i = 0; i < ARRAY_SIZE(raid_types); i++)
90 if (!strcmp(raid_types[i].name, name))
91 return &raid_types[i];
92
93 return NULL;
94}
95
96static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs)
97{
98 unsigned i;
99 struct raid_set *rs;
100 sector_t sectors_per_dev;
101
102 if (raid_devs <= raid_type->parity_devs) {
103 ti->error = "Insufficient number of devices";
104 return ERR_PTR(-EINVAL);
105 }
106
107 sectors_per_dev = ti->len;
108 if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
109 ti->error = "Target length not divisible by number of data devices";
110 return ERR_PTR(-EINVAL);
111 }
112
113 rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
114 if (!rs) {
115 ti->error = "Cannot allocate raid context";
116 return ERR_PTR(-ENOMEM);
117 }
118
119 mddev_init(&rs->md);
120
121 rs->ti = ti;
122 rs->raid_type = raid_type;
123 rs->md.raid_disks = raid_devs;
124 rs->md.level = raid_type->level;
125 rs->md.new_level = rs->md.level;
126 rs->md.dev_sectors = sectors_per_dev;
127 rs->md.layout = raid_type->algorithm;
128 rs->md.new_layout = rs->md.layout;
129 rs->md.delta_disks = 0;
130 rs->md.recovery_cp = 0;
131
132 for (i = 0; i < raid_devs; i++)
133 md_rdev_init(&rs->dev[i].rdev);
134
135 /*
136 * Remaining items to be initialized by further RAID params:
137 * rs->md.persistent
138 * rs->md.external
139 * rs->md.chunk_sectors
140 * rs->md.new_chunk_sectors
141 */
142
143 return rs;
144}
145
146static void context_free(struct raid_set *rs)
147{
148 int i;
149
150 for (i = 0; i < rs->md.raid_disks; i++)
151 if (rs->dev[i].data_dev)
152 dm_put_device(rs->ti, rs->dev[i].data_dev);
153
154 kfree(rs);
155}
156
157/*
158 * For every device we have two words
159 * <meta_dev>: meta device name or '-' if missing
160 * <data_dev>: data device name or '-' if missing
161 *
162 * This code parses those words.
163 */
164static int dev_parms(struct raid_set *rs, char **argv)
165{
166 int i;
167 int rebuild = 0;
168 int metadata_available = 0;
169 int ret = 0;
170
171 for (i = 0; i < rs->md.raid_disks; i++, argv += 2) {
172 rs->dev[i].rdev.raid_disk = i;
173
174 rs->dev[i].meta_dev = NULL;
175 rs->dev[i].data_dev = NULL;
176
177 /*
178 * There are no offsets, since there is a separate device
179 * for data and metadata.
180 */
181 rs->dev[i].rdev.data_offset = 0;
182 rs->dev[i].rdev.mddev = &rs->md;
183
184 if (strcmp(argv[0], "-")) {
185 rs->ti->error = "Metadata devices not supported";
186 return -EINVAL;
187 }
188
189 if (!strcmp(argv[1], "-")) {
190 if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
191 (!rs->dev[i].rdev.recovery_offset)) {
192 rs->ti->error = "Drive designated for rebuild not specified";
193 return -EINVAL;
194 }
195
196 continue;
197 }
198
199 ret = dm_get_device(rs->ti, argv[1],
200 dm_table_get_mode(rs->ti->table),
201 &rs->dev[i].data_dev);
202 if (ret) {
203 rs->ti->error = "RAID device lookup failure";
204 return ret;
205 }
206
207 rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
208 list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
209 if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
210 rebuild++;
211 }
212
213 if (metadata_available) {
214 rs->md.external = 0;
215 rs->md.persistent = 1;
216 rs->md.major_version = 2;
217 } else if (rebuild && !rs->md.recovery_cp) {
218 /*
219 * Without metadata, we will not be able to tell if the array
220 * is in-sync or not - we must assume it is not. Therefore,
221 * it is impossible to rebuild a drive.
222 *
223 * Even if there is metadata, the on-disk information may
224 * indicate that the array is not in-sync and it will then
225 * fail at that time.
226 *
227 * User could specify 'nosync' option if desperate.
228 */
229 DMERR("Unable to rebuild drive while array is not in-sync");
230 rs->ti->error = "RAID device lookup failure";
231 return -EINVAL;
232 }
233
234 return 0;
235}
236
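
dev_parms() above consumes the device list two words at a time: the first word of each pair is the (not yet supported) metadata device and the second the data device, with '-' meaning "none". A stand-alone sketch of walking such a pair list with the same '-' convention is below; the pair layout is taken from the comment that follows, the device paths are hypothetical.

#include <stdio.h>
#include <string.h>

/* Walk pairs of <meta_dev> <data_dev> words, '-' meaning absent. */
static int parse_dev_pairs(int ndevs, char **argv)
{
    int i;

    for (i = 0; i < ndevs; i++, argv += 2) {
        if (strcmp(argv[0], "-")) {
            fprintf(stderr, "metadata devices not supported\n");
            return -1;
        }
        if (!strcmp(argv[1], "-"))
            printf("slot %d: no data device\n", i);
        else
            printf("slot %d: data device %s\n", i, argv[1]);
    }
    return 0;
}

int main(void)
{
    /* hypothetical table fragment: three raid devices */
    char *words[] = { "-", "/dev/sda1", "-", "/dev/sdb1", "-", "-" };

    return parse_dev_pairs(3, words);
}
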
237/*
238 * Possible arguments are...
239 * RAID456:
240 * <chunk_size> [optional_args]
241 *
242 * Optional args:
243 * [[no]sync] Force or prevent recovery of the entire array
244 * [rebuild <idx>] Rebuild the drive indicated by the index
245 * [daemon_sleep <ms>] Time between bitmap daemon work to clear bits
246 * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization
247 * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization
248 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
249 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs
250 */
251static int parse_raid_params(struct raid_set *rs, char **argv,
252 unsigned num_raid_params)
253{
254 unsigned i, rebuild_cnt = 0;
255 unsigned long value;
256 char *key;
257
258 /*
259 * First, parse the in-order required arguments
260 */
261 if ((strict_strtoul(argv[0], 10, &value) < 0) ||
262 !is_power_of_2(value) || (value < 8)) {
263 rs->ti->error = "Bad chunk size";
264 return -EINVAL;
265 }
266
267 rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
268 argv++;
269 num_raid_params--;
270
271 /*
272 * Second, parse the unordered optional arguments
273 */
274 for (i = 0; i < rs->md.raid_disks; i++)
275 set_bit(In_sync, &rs->dev[i].rdev.flags);
276
277 for (i = 0; i < num_raid_params; i++) {
278 if (!strcmp(argv[i], "nosync")) {
279 rs->md.recovery_cp = MaxSector;
280 rs->print_flags |= DMPF_NOSYNC;
281 rs->md.flags |= MD_SYNC_STATE_FORCED;
282 continue;
283 }
284 if (!strcmp(argv[i], "sync")) {
285 rs->md.recovery_cp = 0;
286 rs->print_flags |= DMPF_SYNC;
287 rs->md.flags |= MD_SYNC_STATE_FORCED;
288 continue;
289 }
290
291 /* The rest of the optional arguments come in key/value pairs */
292 if ((i + 1) >= num_raid_params) {
293 rs->ti->error = "Wrong number of raid parameters given";
294 return -EINVAL;
295 }
296
297 key = argv[i++];
298 if (strict_strtoul(argv[i], 10, &value) < 0) {
299 rs->ti->error = "Bad numerical argument given in raid params";
300 return -EINVAL;
301 }
302
303 if (!strcmp(key, "rebuild")) {
304 if (++rebuild_cnt > rs->raid_type->parity_devs) {
305 rs->ti->error = "Too many rebuild drives given";
306 return -EINVAL;
307 }
308 if (value > rs->md.raid_disks) {
309 rs->ti->error = "Invalid rebuild index given";
310 return -EINVAL;
311 }
312 clear_bit(In_sync, &rs->dev[value].rdev.flags);
313 rs->dev[value].rdev.recovery_offset = 0;
314 } else if (!strcmp(key, "max_write_behind")) {
315 rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
316
317 /*
318 * In device-mapper, we specify things in sectors, but
319 * MD records this value in kB
320 */
321 value /= 2;
322 if (value > COUNTER_MAX) {
323 rs->ti->error = "Max write-behind limit out of range";
324 return -EINVAL;
325 }
326 rs->md.bitmap_info.max_write_behind = value;
327 } else if (!strcmp(key, "daemon_sleep")) {
328 rs->print_flags |= DMPF_DAEMON_SLEEP;
329 if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
330 rs->ti->error = "daemon sleep period out of range";
331 return -EINVAL;
332 }
333 rs->md.bitmap_info.daemon_sleep = value;
334 } else if (!strcmp(key, "stripe_cache")) {
335 rs->print_flags |= DMPF_STRIPE_CACHE;
336
337 /*
338 * In device-mapper, we specify things in sectors, but
339 * MD records this value in kB
340 */
341 value /= 2;
342
343 if (rs->raid_type->level < 5) {
344 rs->ti->error = "Inappropriate argument: stripe_cache";
345 return -EINVAL;
346 }
347 if (raid5_set_cache_size(&rs->md, (int)value)) {
348 rs->ti->error = "Bad stripe_cache size";
349 return -EINVAL;
350 }
351 } else if (!strcmp(key, "min_recovery_rate")) {
352 rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
353 if (value > INT_MAX) {
354 rs->ti->error = "min_recovery_rate out of range";
355 return -EINVAL;
356 }
357 rs->md.sync_speed_min = (int)value;
358 } else if (!strcmp(key, "max_recovery_rate")) {
359 rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
360 if (value > INT_MAX) {
361 rs->ti->error = "max_recovery_rate out of range";
362 return -EINVAL;
363 }
364 rs->md.sync_speed_max = (int)value;
365 } else {
366 DMERR("Unable to parse RAID parameter: %s", key);
367 rs->ti->error = "Unable to parse RAID parameters";
368 return -EINVAL;
369 }
370 }
371
372 /* Assume there are no metadata devices until the drives are parsed */
373 rs->md.persistent = 0;
374 rs->md.external = 1;
375
376 return 0;
377}
378
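
parse_raid_params() above takes one required positional value (the chunk size, a power of two of at least 8 sectors) followed by a free mix of bare flags ('sync', 'nosync') and key/value pairs. The sketch below captures just that argument shape in user-space C; the accepted keys are a reduced, illustrative subset, not the full list handled above.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int is_power_of_2(unsigned long v)
{
    return v && !(v & (v - 1));
}

/* Parse "<chunk_size> [flags and key/value pairs]". */
static int parse_params(int argc, char **argv)
{
    unsigned long chunk, value;
    char *end;
    int i;

    if (argc < 1)
        return -1;

    chunk = strtoul(argv[0], &end, 10);
    if (*end || !is_power_of_2(chunk) || chunk < 8) {
        fprintf(stderr, "bad chunk size\n");
        return -1;
    }
    printf("chunk_size %lu\n", chunk);

    for (i = 1; i < argc; i++) {
        if (!strcmp(argv[i], "sync") || !strcmp(argv[i], "nosync")) {
            printf("flag %s\n", argv[i]);        /* bare flag */
            continue;
        }
        if (i + 1 >= argc) {                     /* key without a value */
            fprintf(stderr, "missing value for %s\n", argv[i]);
            return -1;
        }
        value = strtoul(argv[i + 1], &end, 10);
        if (*end) {
            fprintf(stderr, "bad value for %s\n", argv[i]);
            return -1;
        }
        printf("key %s = %lu\n", argv[i], value);
        i++;                                     /* consume the value */
    }
    return 0;
}

int main(void)
{
    char *example[] = { "64", "nosync", "rebuild", "2", "daemon_sleep", "5" };

    return parse_params(6, example);
}
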
379static void do_table_event(struct work_struct *ws)
380{
381 struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
382
383 dm_table_event(rs->ti->table);
384}
385
386static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
387{
388 struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
389
390 return md_raid5_congested(&rs->md, bits);
391}
392
393static void raid_unplug(struct dm_target_callbacks *cb)
394{
395 struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
396
397 md_raid5_unplug_device(rs->md.private);
398}
399
400/*
401 * Construct a RAID4/5/6 mapping:
402 * Args:
403 * <raid_type> <#raid_params> <raid_params> \
404 * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
405 *
406 * ** metadata devices are not supported yet, use '-' instead **
407 *
408 * <raid_params> varies by <raid_type>. See 'parse_raid_params' for
409 * details on possible <raid_params>.
410 */
411static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
412{
413 int ret;
414 struct raid_type *rt;
415 unsigned long num_raid_params, num_raid_devs;
416 struct raid_set *rs = NULL;
417
418 /* Must have at least <raid_type> <#raid_params> */
419 if (argc < 2) {
420 ti->error = "Too few arguments";
421 return -EINVAL;
422 }
423
424 /* raid type */
425 rt = get_raid_type(argv[0]);
426 if (!rt) {
427 ti->error = "Unrecognised raid_type";
428 return -EINVAL;
429 }
430 argc--;
431 argv++;
432
433 /* number of RAID parameters */
434 if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) {
435 ti->error = "Cannot understand number of RAID parameters";
436 return -EINVAL;
437 }
438 argc--;
439 argv++;
440
441 /* Skip over RAID params for now and find out # of devices */
442 if (num_raid_params + 1 > argc) {
443 ti->error = "Arguments do not agree with counts given";
444 return -EINVAL;
445 }
446
447 if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
448 (num_raid_devs >= INT_MAX)) {
449 ti->error = "Cannot understand number of raid devices";
450 return -EINVAL;
451 }
452
453 rs = context_alloc(ti, rt, (unsigned)num_raid_devs);
454 if (IS_ERR(rs))
455 return PTR_ERR(rs);
456
457 ret = parse_raid_params(rs, argv, (unsigned)num_raid_params);
458 if (ret)
459 goto bad;
460
461 ret = -EINVAL;
462
463 argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
464 argv += num_raid_params + 1;
465
466 if (argc != (num_raid_devs * 2)) {
467 ti->error = "Supplied RAID devices does not match the count given";
468 goto bad;
469 }
470
471 ret = dev_parms(rs, argv);
472 if (ret)
473 goto bad;
474
475 INIT_WORK(&rs->md.event_work, do_table_event);
476 ti->split_io = rs->md.chunk_sectors;
477 ti->private = rs;
478
479 mutex_lock(&rs->md.reconfig_mutex);
480 ret = md_run(&rs->md);
481 rs->md.in_sync = 0; /* Assume already marked dirty */
482 mutex_unlock(&rs->md.reconfig_mutex);
483
484 if (ret) {
485 ti->error = "Fail to run raid array";
486 goto bad;
487 }
488
489 rs->callbacks.congested_fn = raid_is_congested;
490 rs->callbacks.unplug_fn = raid_unplug;
491 dm_table_add_target_callbacks(ti->table, &rs->callbacks);
492
493 return 0;
494
495bad:
496 context_free(rs);
497
498 return ret;
499}
500
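
raid_ctr() above is largely argument-count bookkeeping: after <raid_type> and <#raid_params> are consumed there must still be room for the parameter block plus the device count, and whatever remains after those must be exactly two words per raid device. A sketch of that arithmetic over a hypothetical two-disk table line (the argument values are made up for illustration):

#include <stdio.h>
#include <stdlib.h>

/* Check the count structure of a "raid" target line:
 * <raid_type> <#raid_params> <raid_params...> <#raid_devs> <pairs...> */
static int check_counts(int argc, char **argv)
{
    unsigned long nparams, ndevs;

    if (argc < 2)
        return -1;
    argc -= 2;                      /* <raid_type> <#raid_params> */
    nparams = strtoul(argv[1], NULL, 10);

    if (nparams + 1 > (unsigned long)argc)
        return -1;                  /* no room for params + #raid_devs */
    ndevs = strtoul(argv[2 + nparams], NULL, 10);

    argc -= nparams + 1;            /* skip params and the device count */
    if ((unsigned long)argc != ndevs * 2)
        return -1;                  /* need <meta_dev> <data_dev> per device */

    printf("%lu raid params, %lu devices\n", nparams, ndevs);
    return 0;
}

int main(void)
{
    /* hypothetical arguments for a two-disk raid4 set */
    char *example[] = { "raid4", "3", "64", "rebuild", "1",
                        "2", "-", "/dev/sda1", "-", "/dev/sdb1" };

    return check_counts(10, example);
}
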
501static void raid_dtr(struct dm_target *ti)
502{
503 struct raid_set *rs = ti->private;
504
505 list_del_init(&rs->callbacks.list);
506 md_stop(&rs->md);
507 context_free(rs);
508}
509
510static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context)
511{
512 struct raid_set *rs = ti->private;
513 mddev_t *mddev = &rs->md;
514
515 mddev->pers->make_request(mddev, bio);
516
517 return DM_MAPIO_SUBMITTED;
518}
519
520static int raid_status(struct dm_target *ti, status_type_t type,
521 char *result, unsigned maxlen)
522{
523 struct raid_set *rs = ti->private;
524 unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
525 unsigned sz = 0;
526 int i;
527 sector_t sync;
528
529 switch (type) {
530 case STATUSTYPE_INFO:
531 DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
532
533 for (i = 0; i < rs->md.raid_disks; i++) {
534 if (test_bit(Faulty, &rs->dev[i].rdev.flags))
535 DMEMIT("D");
536 else if (test_bit(In_sync, &rs->dev[i].rdev.flags))
537 DMEMIT("A");
538 else
539 DMEMIT("a");
540 }
541
542 if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
543 sync = rs->md.curr_resync_completed;
544 else
545 sync = rs->md.recovery_cp;
546
547 if (sync > rs->md.resync_max_sectors)
548 sync = rs->md.resync_max_sectors;
549
550 DMEMIT(" %llu/%llu",
551 (unsigned long long) sync,
552 (unsigned long long) rs->md.resync_max_sectors);
553
554 break;
555 case STATUSTYPE_TABLE:
556 /* The string you would use to construct this array */
557 for (i = 0; i < rs->md.raid_disks; i++)
558 if (rs->dev[i].data_dev &&
559 !test_bit(In_sync, &rs->dev[i].rdev.flags))
560 raid_param_cnt++; /* for rebuilds */
561
562 raid_param_cnt += (hweight64(rs->print_flags) * 2);
563 if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
564 raid_param_cnt--;
565
566 DMEMIT("%s %u %u", rs->raid_type->name,
567 raid_param_cnt, rs->md.chunk_sectors);
568
569 if ((rs->print_flags & DMPF_SYNC) &&
570 (rs->md.recovery_cp == MaxSector))
571 DMEMIT(" sync");
572 if (rs->print_flags & DMPF_NOSYNC)
573 DMEMIT(" nosync");
574
575 for (i = 0; i < rs->md.raid_disks; i++)
576 if (rs->dev[i].data_dev &&
577 !test_bit(In_sync, &rs->dev[i].rdev.flags))
578 DMEMIT(" rebuild %u", i);
579
580 if (rs->print_flags & DMPF_DAEMON_SLEEP)
581 DMEMIT(" daemon_sleep %lu",
582 rs->md.bitmap_info.daemon_sleep);
583
584 if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
585 DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);
586
587 if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
588 DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
589
590 if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
591 DMEMIT(" max_write_behind %lu",
592 rs->md.bitmap_info.max_write_behind);
593
594 if (rs->print_flags & DMPF_STRIPE_CACHE) {
595 raid5_conf_t *conf = rs->md.private;
596
597 /* convert from kiB to sectors */
598 DMEMIT(" stripe_cache %d",
599 conf ? conf->max_nr_stripes * 2 : 0);
600 }
601
602 DMEMIT(" %d", rs->md.raid_disks);
603 for (i = 0; i < rs->md.raid_disks; i++) {
604 DMEMIT(" -"); /* metadata device */
605
606 if (rs->dev[i].data_dev)
607 DMEMIT(" %s", rs->dev[i].data_dev->name);
608 else
609 DMEMIT(" -");
610 }
611 }
612
613 return 0;
614}
615
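
In the table status above, the parameter count that precedes the replayed constructor arguments is derived from the print_flags bitmask: one word for the chunk size, two per recorded key/value option (hweight64() counts the set bits), and one fewer when the recorded option is a bare 'sync'/'nosync' flag rather than a pair. The same counting trick in portable C, with the GCC/Clang __builtin_popcountll standing in for hweight64() and a reduced set of flag bits:

#include <stdio.h>
#include <stdint.h>

#define PF_DAEMON_SLEEP 0x1ULL   /* a key/value option */
#define PF_SYNC         0x4ULL   /* bare flags         */
#define PF_NOSYNC       0x8ULL

/* Words needed for "<chunk_size> <options...>" given the option bitmask. */
static unsigned table_param_count(uint64_t print_flags)
{
    unsigned count = 1;          /* chunk size is always emitted */

    count += 2 * __builtin_popcountll(print_flags);
    if (print_flags & (PF_SYNC | PF_NOSYNC))
        count--;                 /* sync/nosync are single words, not pairs */

    return count;
}

int main(void)
{
    printf("%u\n", table_param_count(0));                            /* 1 */
    printf("%u\n", table_param_count(PF_DAEMON_SLEEP));              /* 3 */
    printf("%u\n", table_param_count(PF_NOSYNC | PF_DAEMON_SLEEP));  /* 4 */
    return 0;
}
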
616static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
617{
618 struct raid_set *rs = ti->private;
619 unsigned i;
620 int ret = 0;
621
622 for (i = 0; !ret && i < rs->md.raid_disks; i++)
623 if (rs->dev[i].data_dev)
624 ret = fn(ti,
625 rs->dev[i].data_dev,
626 0, /* No offset on data devs */
627 rs->md.dev_sectors,
628 data);
629
630 return ret;
631}
632
633static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
634{
635 struct raid_set *rs = ti->private;
636 unsigned chunk_size = rs->md.chunk_sectors << 9;
637 raid5_conf_t *conf = rs->md.private;
638
639 blk_limits_io_min(limits, chunk_size);
640 blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded));
641}
642
643static void raid_presuspend(struct dm_target *ti)
644{
645 struct raid_set *rs = ti->private;
646
647 md_stop_writes(&rs->md);
648}
649
650static void raid_postsuspend(struct dm_target *ti)
651{
652 struct raid_set *rs = ti->private;
653
654 mddev_suspend(&rs->md);
655}
656
657static void raid_resume(struct dm_target *ti)
658{
659 struct raid_set *rs = ti->private;
660
661 mddev_resume(&rs->md);
662}
663
664static struct target_type raid_target = {
665 .name = "raid",
666 .version = {1, 0, 0},
667 .module = THIS_MODULE,
668 .ctr = raid_ctr,
669 .dtr = raid_dtr,
670 .map = raid_map,
671 .status = raid_status,
672 .iterate_devices = raid_iterate_devices,
673 .io_hints = raid_io_hints,
674 .presuspend = raid_presuspend,
675 .postsuspend = raid_postsuspend,
676 .resume = raid_resume,
677};
678
679static int __init dm_raid_init(void)
680{
681 return dm_register_target(&raid_target);
682}
683
684static void __exit dm_raid_exit(void)
685{
686 dm_unregister_target(&raid_target);
687}
688
689module_init(dm_raid_init);
690module_exit(dm_raid_exit);
691
692MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
693MODULE_ALIAS("dm-raid4");
694MODULE_ALIAS("dm-raid5");
695MODULE_ALIAS("dm-raid6");
696MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>");
697MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 19a59b041c27..dee326775c60 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -261,7 +261,7 @@ static int mirror_flush(struct dm_target *ti)
261 struct dm_io_request io_req = { 261 struct dm_io_request io_req = {
262 .bi_rw = WRITE_FLUSH, 262 .bi_rw = WRITE_FLUSH,
263 .mem.type = DM_IO_KMEM, 263 .mem.type = DM_IO_KMEM,
264 .mem.ptr.bvec = NULL, 264 .mem.ptr.addr = NULL,
265 .client = ms->io_client, 265 .client = ms->io_client,
266 }; 266 };
267 267
@@ -637,6 +637,12 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
637 .client = ms->io_client, 637 .client = ms->io_client,
638 }; 638 };
639 639
640 if (bio->bi_rw & REQ_DISCARD) {
641 io_req.bi_rw |= REQ_DISCARD;
642 io_req.mem.type = DM_IO_KMEM;
643 io_req.mem.ptr.addr = NULL;
644 }
645
640 for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) 646 for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
641 map_region(dest++, m, bio); 647 map_region(dest++, m, bio);
642 648
@@ -670,7 +676,8 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
670 bio_list_init(&requeue); 676 bio_list_init(&requeue);
671 677
672 while ((bio = bio_list_pop(writes))) { 678 while ((bio = bio_list_pop(writes))) {
673 if (bio->bi_rw & REQ_FLUSH) { 679 if ((bio->bi_rw & REQ_FLUSH) ||
680 (bio->bi_rw & REQ_DISCARD)) {
674 bio_list_add(&sync, bio); 681 bio_list_add(&sync, bio);
675 continue; 682 continue;
676 } 683 }
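
The mirror hunks above extend the write sorting in do_writes() so that discards join flushes on the synchronous list, and do_write() submits them with a NULL DM_IO_KMEM payload since a discard carries no data pages. A minimal sketch of that sort-by-flag routing, with illustrative bit values rather than the kernel's REQ_* flags:

#include <stdio.h>

#define RQ_FLUSH   (1u << 0)   /* illustrative stand-ins, not the */
#define RQ_DISCARD (1u << 1)   /* kernel's REQ_* bit values       */

/* Route a write into the synchronous or regular queue by its flags. */
static const char *classify(unsigned rw_flags)
{
    if (rw_flags & (RQ_FLUSH | RQ_DISCARD))
        return "sync";
    return "normal";
}

int main(void)
{
    printf("plain write -> %s\n", classify(0));
    printf("flush       -> %s\n", classify(RQ_FLUSH));
    printf("discard     -> %s\n", classify(RQ_DISCARD));
    return 0;
}
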
@@ -1076,8 +1083,10 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1076 ti->private = ms; 1083 ti->private = ms;
1077 ti->split_io = dm_rh_get_region_size(ms->rh); 1084 ti->split_io = dm_rh_get_region_size(ms->rh);
1078 ti->num_flush_requests = 1; 1085 ti->num_flush_requests = 1;
1086 ti->num_discard_requests = 1;
1079 1087
1080 ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); 1088 ms->kmirrord_wq = alloc_workqueue("kmirrord",
1089 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
1081 if (!ms->kmirrord_wq) { 1090 if (!ms->kmirrord_wq) {
1082 DMERR("couldn't start kmirrord"); 1091 DMERR("couldn't start kmirrord");
1083 r = -ENOMEM; 1092 r = -ENOMEM;
@@ -1130,7 +1139,7 @@ static void mirror_dtr(struct dm_target *ti)
1130 1139
1131 del_timer_sync(&ms->timer); 1140 del_timer_sync(&ms->timer);
1132 flush_workqueue(ms->kmirrord_wq); 1141 flush_workqueue(ms->kmirrord_wq);
1133 flush_scheduled_work(); 1142 flush_work_sync(&ms->trigger_event);
1134 dm_kcopyd_client_destroy(ms->kcopyd_client); 1143 dm_kcopyd_client_destroy(ms->kcopyd_client);
1135 destroy_workqueue(ms->kmirrord_wq); 1144 destroy_workqueue(ms->kmirrord_wq);
1136 free_context(ms, ti, ms->nr_mirrors); 1145 free_context(ms, ti, ms->nr_mirrors);
@@ -1406,7 +1415,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
1406 1415
1407static struct target_type mirror_target = { 1416static struct target_type mirror_target = {
1408 .name = "mirror", 1417 .name = "mirror",
1409 .version = {1, 12, 0}, 1418 .version = {1, 12, 1},
1410 .module = THIS_MODULE, 1419 .module = THIS_MODULE,
1411 .ctr = mirror_ctr, 1420 .ctr = mirror_ctr,
1412 .dtr = mirror_dtr, 1421 .dtr = mirror_dtr,
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 2129cdb115dc..95891dfcbca0 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -256,7 +256,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
256 */ 256 */
257 INIT_WORK_ONSTACK(&req.work, do_metadata); 257 INIT_WORK_ONSTACK(&req.work, do_metadata);
258 queue_work(ps->metadata_wq, &req.work); 258 queue_work(ps->metadata_wq, &req.work);
259 flush_workqueue(ps->metadata_wq); 259 flush_work(&req.work);
260 260
261 return req.result; 261 return req.result;
262} 262}
@@ -818,7 +818,7 @@ static int persistent_ctr(struct dm_exception_store *store,
818 atomic_set(&ps->pending_count, 0); 818 atomic_set(&ps->pending_count, 0);
819 ps->callbacks = NULL; 819 ps->callbacks = NULL;
820 820
821 ps->metadata_wq = create_singlethread_workqueue("ksnaphd"); 821 ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0);
822 if (!ps->metadata_wq) { 822 if (!ps->metadata_wq) {
823 kfree(ps); 823 kfree(ps);
824 DMERR("couldn't start header metadata update thread"); 824 DMERR("couldn't start header metadata update thread");
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 53cf79d8bcbc..fdde53cd12b7 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -19,7 +19,6 @@
19#include <linux/vmalloc.h> 19#include <linux/vmalloc.h>
20#include <linux/log2.h> 20#include <linux/log2.h>
21#include <linux/dm-kcopyd.h> 21#include <linux/dm-kcopyd.h>
22#include <linux/workqueue.h>
23 22
24#include "dm-exception-store.h" 23#include "dm-exception-store.h"
25 24
@@ -80,9 +79,6 @@ struct dm_snapshot {
80 /* Origin writes don't trigger exceptions until this is set */ 79 /* Origin writes don't trigger exceptions until this is set */
81 int active; 80 int active;
82 81
83 /* Whether or not owning mapped_device is suspended */
84 int suspended;
85
86 atomic_t pending_exceptions_count; 82 atomic_t pending_exceptions_count;
87 83
88 mempool_t *pending_pool; 84 mempool_t *pending_pool;
@@ -106,10 +102,6 @@ struct dm_snapshot {
106 102
107 struct dm_kcopyd_client *kcopyd_client; 103 struct dm_kcopyd_client *kcopyd_client;
108 104
109 /* Queue of snapshot writes for ksnapd to flush */
110 struct bio_list queued_bios;
111 struct work_struct queued_bios_work;
112
113 /* Wait for events based on state_bits */ 105 /* Wait for events based on state_bits */
114 unsigned long state_bits; 106 unsigned long state_bits;
115 107
@@ -160,9 +152,6 @@ struct dm_dev *dm_snap_cow(struct dm_snapshot *s)
160} 152}
161EXPORT_SYMBOL(dm_snap_cow); 153EXPORT_SYMBOL(dm_snap_cow);
162 154
163static struct workqueue_struct *ksnapd;
164static void flush_queued_bios(struct work_struct *work);
165
166static sector_t chunk_to_sector(struct dm_exception_store *store, 155static sector_t chunk_to_sector(struct dm_exception_store *store,
167 chunk_t chunk) 156 chunk_t chunk)
168{ 157{
@@ -1110,7 +1099,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1110 s->ti = ti; 1099 s->ti = ti;
1111 s->valid = 1; 1100 s->valid = 1;
1112 s->active = 0; 1101 s->active = 0;
1113 s->suspended = 0;
1114 atomic_set(&s->pending_exceptions_count, 0); 1102 atomic_set(&s->pending_exceptions_count, 0);
1115 init_rwsem(&s->lock); 1103 init_rwsem(&s->lock);
1116 INIT_LIST_HEAD(&s->list); 1104 INIT_LIST_HEAD(&s->list);
@@ -1153,9 +1141,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1153 1141
1154 spin_lock_init(&s->tracked_chunk_lock); 1142 spin_lock_init(&s->tracked_chunk_lock);
1155 1143
1156 bio_list_init(&s->queued_bios);
1157 INIT_WORK(&s->queued_bios_work, flush_queued_bios);
1158
1159 ti->private = s; 1144 ti->private = s;
1160 ti->num_flush_requests = num_flush_requests; 1145 ti->num_flush_requests = num_flush_requests;
1161 1146
@@ -1279,8 +1264,6 @@ static void snapshot_dtr(struct dm_target *ti)
1279 struct dm_snapshot *s = ti->private; 1264 struct dm_snapshot *s = ti->private;
1280 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; 1265 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
1281 1266
1282 flush_workqueue(ksnapd);
1283
1284 down_read(&_origins_lock); 1267 down_read(&_origins_lock);
1285 /* Check whether exception handover must be cancelled */ 1268 /* Check whether exception handover must be cancelled */
1286 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); 1269 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
@@ -1342,20 +1325,6 @@ static void flush_bios(struct bio *bio)
1342 } 1325 }
1343} 1326}
1344 1327
1345static void flush_queued_bios(struct work_struct *work)
1346{
1347 struct dm_snapshot *s =
1348 container_of(work, struct dm_snapshot, queued_bios_work);
1349 struct bio *queued_bios;
1350 unsigned long flags;
1351
1352 spin_lock_irqsave(&s->pe_lock, flags);
1353 queued_bios = bio_list_get(&s->queued_bios);
1354 spin_unlock_irqrestore(&s->pe_lock, flags);
1355
1356 flush_bios(queued_bios);
1357}
1358
1359static int do_origin(struct dm_dev *origin, struct bio *bio); 1328static int do_origin(struct dm_dev *origin, struct bio *bio);
1360 1329
1361/* 1330/*
@@ -1760,15 +1729,6 @@ static void snapshot_merge_presuspend(struct dm_target *ti)
1760 stop_merge(s); 1729 stop_merge(s);
1761} 1730}
1762 1731
1763static void snapshot_postsuspend(struct dm_target *ti)
1764{
1765 struct dm_snapshot *s = ti->private;
1766
1767 down_write(&s->lock);
1768 s->suspended = 1;
1769 up_write(&s->lock);
1770}
1771
1772static int snapshot_preresume(struct dm_target *ti) 1732static int snapshot_preresume(struct dm_target *ti)
1773{ 1733{
1774 int r = 0; 1734 int r = 0;
@@ -1783,7 +1743,7 @@ static int snapshot_preresume(struct dm_target *ti)
1783 DMERR("Unable to resume snapshot source until " 1743 DMERR("Unable to resume snapshot source until "
1784 "handover completes."); 1744 "handover completes.");
1785 r = -EINVAL; 1745 r = -EINVAL;
1786 } else if (!snap_src->suspended) { 1746 } else if (!dm_suspended(snap_src->ti)) {
1787 DMERR("Unable to perform snapshot handover until " 1747 DMERR("Unable to perform snapshot handover until "
1788 "source is suspended."); 1748 "source is suspended.");
1789 r = -EINVAL; 1749 r = -EINVAL;
@@ -1816,7 +1776,6 @@ static void snapshot_resume(struct dm_target *ti)
1816 1776
1817 down_write(&s->lock); 1777 down_write(&s->lock);
1818 s->active = 1; 1778 s->active = 1;
1819 s->suspended = 0;
1820 up_write(&s->lock); 1779 up_write(&s->lock);
1821} 1780}
1822 1781
@@ -2194,7 +2153,7 @@ static int origin_iterate_devices(struct dm_target *ti,
2194 2153
2195static struct target_type origin_target = { 2154static struct target_type origin_target = {
2196 .name = "snapshot-origin", 2155 .name = "snapshot-origin",
2197 .version = {1, 7, 0}, 2156 .version = {1, 7, 1},
2198 .module = THIS_MODULE, 2157 .module = THIS_MODULE,
2199 .ctr = origin_ctr, 2158 .ctr = origin_ctr,
2200 .dtr = origin_dtr, 2159 .dtr = origin_dtr,
@@ -2207,13 +2166,12 @@ static struct target_type origin_target = {
2207 2166
2208static struct target_type snapshot_target = { 2167static struct target_type snapshot_target = {
2209 .name = "snapshot", 2168 .name = "snapshot",
2210 .version = {1, 9, 0}, 2169 .version = {1, 10, 0},
2211 .module = THIS_MODULE, 2170 .module = THIS_MODULE,
2212 .ctr = snapshot_ctr, 2171 .ctr = snapshot_ctr,
2213 .dtr = snapshot_dtr, 2172 .dtr = snapshot_dtr,
2214 .map = snapshot_map, 2173 .map = snapshot_map,
2215 .end_io = snapshot_end_io, 2174 .end_io = snapshot_end_io,
2216 .postsuspend = snapshot_postsuspend,
2217 .preresume = snapshot_preresume, 2175 .preresume = snapshot_preresume,
2218 .resume = snapshot_resume, 2176 .resume = snapshot_resume,
2219 .status = snapshot_status, 2177 .status = snapshot_status,
@@ -2222,14 +2180,13 @@ static struct target_type snapshot_target = {
2222 2180
2223static struct target_type merge_target = { 2181static struct target_type merge_target = {
2224 .name = dm_snapshot_merge_target_name, 2182 .name = dm_snapshot_merge_target_name,
2225 .version = {1, 0, 0}, 2183 .version = {1, 1, 0},
2226 .module = THIS_MODULE, 2184 .module = THIS_MODULE,
2227 .ctr = snapshot_ctr, 2185 .ctr = snapshot_ctr,
2228 .dtr = snapshot_dtr, 2186 .dtr = snapshot_dtr,
2229 .map = snapshot_merge_map, 2187 .map = snapshot_merge_map,
2230 .end_io = snapshot_end_io, 2188 .end_io = snapshot_end_io,
2231 .presuspend = snapshot_merge_presuspend, 2189 .presuspend = snapshot_merge_presuspend,
2232 .postsuspend = snapshot_postsuspend,
2233 .preresume = snapshot_preresume, 2190 .preresume = snapshot_preresume,
2234 .resume = snapshot_merge_resume, 2191 .resume = snapshot_merge_resume,
2235 .status = snapshot_status, 2192 .status = snapshot_status,
@@ -2291,17 +2248,8 @@ static int __init dm_snapshot_init(void)
2291 goto bad_tracked_chunk_cache; 2248 goto bad_tracked_chunk_cache;
2292 } 2249 }
2293 2250
2294 ksnapd = create_singlethread_workqueue("ksnapd");
2295 if (!ksnapd) {
2296 DMERR("Failed to create ksnapd workqueue.");
2297 r = -ENOMEM;
2298 goto bad_pending_pool;
2299 }
2300
2301 return 0; 2251 return 0;
2302 2252
2303bad_pending_pool:
2304 kmem_cache_destroy(tracked_chunk_cache);
2305bad_tracked_chunk_cache: 2253bad_tracked_chunk_cache:
2306 kmem_cache_destroy(pending_cache); 2254 kmem_cache_destroy(pending_cache);
2307bad_pending_cache: 2255bad_pending_cache:
@@ -2322,8 +2270,6 @@ bad_register_snapshot_target:
2322 2270
2323static void __exit dm_snapshot_exit(void) 2271static void __exit dm_snapshot_exit(void)
2324{ 2272{
2325 destroy_workqueue(ksnapd);
2326
2327 dm_unregister_target(&snapshot_target); 2273 dm_unregister_target(&snapshot_target);
2328 dm_unregister_target(&origin_target); 2274 dm_unregister_target(&origin_target);
2329 dm_unregister_target(&merge_target); 2275 dm_unregister_target(&merge_target);
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index f0371b4c4fbf..dddfa14f2982 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -39,23 +39,20 @@ struct stripe_c {
39 struct dm_target *ti; 39 struct dm_target *ti;
40 40
41 /* Work struct used for triggering events*/ 41 /* Work struct used for triggering events*/
42 struct work_struct kstriped_ws; 42 struct work_struct trigger_event;
43 43
44 struct stripe stripe[0]; 44 struct stripe stripe[0];
45}; 45};
46 46
47static struct workqueue_struct *kstriped;
48
49/* 47/*
50 * An event is triggered whenever a drive 48 * An event is triggered whenever a drive
51 * drops out of a stripe volume. 49 * drops out of a stripe volume.
52 */ 50 */
53static void trigger_event(struct work_struct *work) 51static void trigger_event(struct work_struct *work)
54{ 52{
55 struct stripe_c *sc = container_of(work, struct stripe_c, kstriped_ws); 53 struct stripe_c *sc = container_of(work, struct stripe_c,
56 54 trigger_event);
57 dm_table_event(sc->ti->table); 55 dm_table_event(sc->ti->table);
58
59} 56}
60 57
61static inline struct stripe_c *alloc_context(unsigned int stripes) 58static inline struct stripe_c *alloc_context(unsigned int stripes)
@@ -160,7 +157,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
160 return -ENOMEM; 157 return -ENOMEM;
161 } 158 }
162 159
163 INIT_WORK(&sc->kstriped_ws, trigger_event); 160 INIT_WORK(&sc->trigger_event, trigger_event);
164 161
165 /* Set pointer to dm target; used in trigger_event */ 162 /* Set pointer to dm target; used in trigger_event */
166 sc->ti = ti; 163 sc->ti = ti;
@@ -211,7 +208,7 @@ static void stripe_dtr(struct dm_target *ti)
211 for (i = 0; i < sc->stripes; i++) 208 for (i = 0; i < sc->stripes; i++)
212 dm_put_device(ti, sc->stripe[i].dev); 209 dm_put_device(ti, sc->stripe[i].dev);
213 210
214 flush_workqueue(kstriped); 211 flush_work_sync(&sc->trigger_event);
215 kfree(sc); 212 kfree(sc);
216} 213}
217 214
@@ -367,7 +364,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
367 atomic_inc(&(sc->stripe[i].error_count)); 364 atomic_inc(&(sc->stripe[i].error_count));
368 if (atomic_read(&(sc->stripe[i].error_count)) < 365 if (atomic_read(&(sc->stripe[i].error_count)) <
369 DM_IO_ERROR_THRESHOLD) 366 DM_IO_ERROR_THRESHOLD)
370 queue_work(kstriped, &sc->kstriped_ws); 367 schedule_work(&sc->trigger_event);
371 } 368 }
372 369
373 return error; 370 return error;
@@ -401,7 +398,7 @@ static void stripe_io_hints(struct dm_target *ti,
401 398
402static struct target_type stripe_target = { 399static struct target_type stripe_target = {
403 .name = "striped", 400 .name = "striped",
404 .version = {1, 3, 0}, 401 .version = {1, 3, 1},
405 .module = THIS_MODULE, 402 .module = THIS_MODULE,
406 .ctr = stripe_ctr, 403 .ctr = stripe_ctr,
407 .dtr = stripe_dtr, 404 .dtr = stripe_dtr,
@@ -422,20 +419,10 @@ int __init dm_stripe_init(void)
422 return r; 419 return r;
423 } 420 }
424 421
425 kstriped = create_singlethread_workqueue("kstriped");
426 if (!kstriped) {
427 DMERR("failed to create workqueue kstriped");
428 dm_unregister_target(&stripe_target);
429 return -ENOMEM;
430 }
431
432 return r; 422 return r;
433} 423}
434 424
435void dm_stripe_exit(void) 425void dm_stripe_exit(void)
436{ 426{
437 dm_unregister_target(&stripe_target); 427 dm_unregister_target(&stripe_target);
438 destroy_workqueue(kstriped);
439
440 return;
441} 428}
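The dm-stripe hunks above follow the standard conversion from a dedicated single-threaded workqueue to a per-object work item that is flushed with flush_work_sync() at teardown. A minimal sketch of that pattern against the workqueue API used in this tree; the names my_dev, my_event_fn and the create/destroy helpers are illustrative only, not part of the patch:

#include <linux/slab.h>
#include <linux/workqueue.h>

/* Hypothetical per-object context; only the embedded work item matters here. */
struct my_dev {
	struct work_struct trigger_event;
};

static void my_event_fn(struct work_struct *work)
{
	struct my_dev *d = container_of(work, struct my_dev, trigger_event);

	/* ... react to the event using 'd' ... */
	(void)d;
}

static struct my_dev *my_dev_create(void)
{
	struct my_dev *d = kzalloc(sizeof(*d), GFP_KERNEL);

	if (d)
		INIT_WORK(&d->trigger_event, my_event_fn);  /* no private workqueue needed */
	return d;
}

static void my_dev_event(struct my_dev *d)
{
	schedule_work(&d->trigger_event);  /* system workqueue replaces the private one */
}

static void my_dev_destroy(struct my_dev *d)
{
	flush_work_sync(&d->trigger_event);  /* wait out a possibly running handler */
	kfree(d);
}

With one work_struct per object there is nothing left for a module-wide workqueue to do, which is why kstriped and its create/destroy calls can be removed.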
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 985c20a4f30e..dffa0ac7c4f0 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -71,6 +71,8 @@ struct dm_table {
71 void *event_context; 71 void *event_context;
72 72
73 struct dm_md_mempools *mempools; 73 struct dm_md_mempools *mempools;
74
75 struct list_head target_callbacks;
74}; 76};
75 77
76/* 78/*
@@ -204,6 +206,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
204 return -ENOMEM; 206 return -ENOMEM;
205 207
206 INIT_LIST_HEAD(&t->devices); 208 INIT_LIST_HEAD(&t->devices);
209 INIT_LIST_HEAD(&t->target_callbacks);
207 atomic_set(&t->holders, 0); 210 atomic_set(&t->holders, 0);
208 t->discards_supported = 1; 211 t->discards_supported = 1;
209 212
@@ -1225,10 +1228,17 @@ int dm_table_resume_targets(struct dm_table *t)
1225 return 0; 1228 return 0;
1226} 1229}
1227 1230
1231void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb)
1232{
1233 list_add(&cb->list, &t->target_callbacks);
1234}
1235EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks);
1236
1228int dm_table_any_congested(struct dm_table *t, int bdi_bits) 1237int dm_table_any_congested(struct dm_table *t, int bdi_bits)
1229{ 1238{
1230 struct dm_dev_internal *dd; 1239 struct dm_dev_internal *dd;
1231 struct list_head *devices = dm_table_get_devices(t); 1240 struct list_head *devices = dm_table_get_devices(t);
1241 struct dm_target_callbacks *cb;
1232 int r = 0; 1242 int r = 0;
1233 1243
1234 list_for_each_entry(dd, devices, list) { 1244 list_for_each_entry(dd, devices, list) {
@@ -1243,6 +1253,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
1243 bdevname(dd->dm_dev.bdev, b)); 1253 bdevname(dd->dm_dev.bdev, b));
1244 } 1254 }
1245 1255
1256 list_for_each_entry(cb, &t->target_callbacks, list)
1257 if (cb->congested_fn)
1258 r |= cb->congested_fn(cb, bdi_bits);
1259
1246 return r; 1260 return r;
1247} 1261}
1248 1262
@@ -1264,6 +1278,7 @@ void dm_table_unplug_all(struct dm_table *t)
1264{ 1278{
1265 struct dm_dev_internal *dd; 1279 struct dm_dev_internal *dd;
1266 struct list_head *devices = dm_table_get_devices(t); 1280 struct list_head *devices = dm_table_get_devices(t);
1281 struct dm_target_callbacks *cb;
1267 1282
1268 list_for_each_entry(dd, devices, list) { 1283 list_for_each_entry(dd, devices, list) {
1269 struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); 1284 struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev);
@@ -1276,6 +1291,10 @@ void dm_table_unplug_all(struct dm_table *t)
1276 dm_device_name(t->md), 1291 dm_device_name(t->md),
1277 bdevname(dd->dm_dev.bdev, b)); 1292 bdevname(dd->dm_dev.bdev, b));
1278 } 1293 }
1294
1295 list_for_each_entry(cb, &t->target_callbacks, list)
1296 if (cb->unplug_fn)
1297 cb->unplug_fn(cb);
1279} 1298}
1280 1299
1281struct mapped_device *dm_table_get_md(struct dm_table *t) 1300struct mapped_device *dm_table_get_md(struct dm_table *t)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f48a2f359ac4..eaa3af0e0632 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -32,7 +32,6 @@
32#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 32#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
33#define DM_COOKIE_LENGTH 24 33#define DM_COOKIE_LENGTH 24
34 34
35static DEFINE_MUTEX(dm_mutex);
36static const char *_name = DM_NAME; 35static const char *_name = DM_NAME;
37 36
38static unsigned int major = 0; 37static unsigned int major = 0;
@@ -328,7 +327,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
328{ 327{
329 struct mapped_device *md; 328 struct mapped_device *md;
330 329
331 mutex_lock(&dm_mutex);
332 spin_lock(&_minor_lock); 330 spin_lock(&_minor_lock);
333 331
334 md = bdev->bd_disk->private_data; 332 md = bdev->bd_disk->private_data;
@@ -346,7 +344,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
346 344
347out: 345out:
348 spin_unlock(&_minor_lock); 346 spin_unlock(&_minor_lock);
349 mutex_unlock(&dm_mutex);
350 347
351 return md ? 0 : -ENXIO; 348 return md ? 0 : -ENXIO;
352} 349}
@@ -355,10 +352,12 @@ static int dm_blk_close(struct gendisk *disk, fmode_t mode)
355{ 352{
356 struct mapped_device *md = disk->private_data; 353 struct mapped_device *md = disk->private_data;
357 354
358 mutex_lock(&dm_mutex); 355 spin_lock(&_minor_lock);
356
359 atomic_dec(&md->open_count); 357 atomic_dec(&md->open_count);
360 dm_put(md); 358 dm_put(md);
361 mutex_unlock(&dm_mutex); 359
360 spin_unlock(&_minor_lock);
362 361
363 return 0; 362 return 0;
364} 363}
@@ -1638,13 +1637,15 @@ static void dm_request_fn(struct request_queue *q)
1638 if (map_request(ti, clone, md)) 1637 if (map_request(ti, clone, md))
1639 goto requeued; 1638 goto requeued;
1640 1639
1641 spin_lock_irq(q->queue_lock); 1640 BUG_ON(!irqs_disabled());
1641 spin_lock(q->queue_lock);
1642 } 1642 }
1643 1643
1644 goto out; 1644 goto out;
1645 1645
1646requeued: 1646requeued:
1647 spin_lock_irq(q->queue_lock); 1647 BUG_ON(!irqs_disabled());
1648 spin_lock(q->queue_lock);
1648 1649
1649plug_and_out: 1650plug_and_out:
1650 if (!elv_queue_empty(q)) 1651 if (!elv_queue_empty(q))
@@ -1884,7 +1885,8 @@ static struct mapped_device *alloc_dev(int minor)
1884 add_disk(md->disk); 1885 add_disk(md->disk);
1885 format_dev_t(md->name, MKDEV(_major, minor)); 1886 format_dev_t(md->name, MKDEV(_major, minor));
1886 1887
1887 md->wq = create_singlethread_workqueue("kdmflush"); 1888 md->wq = alloc_workqueue("kdmflush",
1889 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
1888 if (!md->wq) 1890 if (!md->wq)
1889 goto bad_thread; 1891 goto bad_thread;
1890 1892
@@ -1992,13 +1994,14 @@ static void event_callback(void *context)
1992 wake_up(&md->eventq); 1994 wake_up(&md->eventq);
1993} 1995}
1994 1996
1997/*
1998 * Protected by md->suspend_lock obtained by dm_swap_table().
1999 */
1995static void __set_size(struct mapped_device *md, sector_t size) 2000static void __set_size(struct mapped_device *md, sector_t size)
1996{ 2001{
1997 set_capacity(md->disk, size); 2002 set_capacity(md->disk, size);
1998 2003
1999 mutex_lock(&md->bdev->bd_inode->i_mutex);
2000 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 2004 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2001 mutex_unlock(&md->bdev->bd_inode->i_mutex);
2002} 2005}
2003 2006
2004/* 2007/*
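The kdmflush change above switches from create_singlethread_workqueue() to alloc_workqueue() so the queue can be flagged WQ_MEM_RECLAIM (it sits in the I/O path and must make forward progress under memory pressure) and WQ_NON_REENTRANT. A hedged sketch of the same call pattern, with my_md and the "my_flush" name as assumptions:

#include <linux/workqueue.h>

/* Hypothetical driver context owning a flush workqueue. */
struct my_md {
	struct workqueue_struct *wq;
};

static int my_md_init_wq(struct my_md *md)
{
	/*
	 * WQ_MEM_RECLAIM: a rescuer thread guarantees progress during reclaim.
	 * WQ_NON_REENTRANT: the same work item is not run concurrently on
	 * another CPU.  max_active of 0 means the default limit.
	 */
	md->wq = alloc_workqueue("my_flush", WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
	return md->wq ? 0 : -ENOMEM;
}

static void my_md_exit_wq(struct my_md *md)
{
	destroy_workqueue(md->wq);
}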
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 2970022faa63..272496d1fae4 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -193,6 +193,13 @@ struct dm_target {
193 char *error; 193 char *error;
194}; 194};
195 195
196/* Each target can link one of these into the table */
197struct dm_target_callbacks {
198 struct list_head list;
199 int (*congested_fn) (struct dm_target_callbacks *, int);
200 void (*unplug_fn)(struct dm_target_callbacks *);
201};
202
196int dm_register_target(struct target_type *t); 203int dm_register_target(struct target_type *t);
197void dm_unregister_target(struct target_type *t); 204void dm_unregister_target(struct target_type *t);
198 205
@@ -269,6 +276,11 @@ int dm_table_add_target(struct dm_table *t, const char *type,
269 sector_t start, sector_t len, char *params); 276 sector_t start, sector_t len, char *params);
270 277
271/* 278/*
279 * Target_ctr should call this if it needs to add any callbacks.
280 */
281void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb);
282
283/*
272 * Finally call this to make the table ready for use. 284 * Finally call this to make the table ready for use.
273 */ 285 */
274int dm_table_complete(struct dm_table *t); 286int dm_table_complete(struct dm_table *t);
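struct dm_target_callbacks and dm_table_add_target_callbacks() give a target a way to have its own congestion and unplug handling consulted by dm_table_any_congested() and dm_table_unplug_all(), as the dm-table.c hunks earlier show. A hedged sketch of the producer side, i.e. how a target constructor might register callbacks; my_target_ctx, my_congested and my_unplug are illustrative names, not part of this patch:

#include <linux/slab.h>
#include <linux/device-mapper.h>

/* Illustrative per-target context embedding the callbacks object. */
struct my_target_ctx {
	struct dm_target_callbacks callbacks;
	/* ... target state ... */
};

static int my_congested(struct dm_target_callbacks *cb, int bdi_bits)
{
	struct my_target_ctx *ctx = container_of(cb, struct my_target_ctx, callbacks);

	/* Return non-zero bdi bits while the target's internal queues are backed up. */
	(void)ctx;
	return 0;
}

static void my_unplug(struct dm_target_callbacks *cb)
{
	/* Kick any internal queues when the table is unplugged. */
}

static int my_target_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct my_target_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

	if (!ctx)
		return -ENOMEM;

	ctx->callbacks.congested_fn = my_congested;
	ctx->callbacks.unplug_fn = my_unplug;
	dm_table_add_target_callbacks(ti->table, &ctx->callbacks);

	ti->private = ctx;
	return 0;
}

The callbacks live for the lifetime of the table, so embedding them in the target's private context (freed in the destructor) is the natural layout; dm-raid uses this interface to route MD's congestion and unplug hooks through the DM table.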
diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h
index 49eab360d5d4..78bbf47bbb96 100644
--- a/include/linux/dm-ioctl.h
+++ b/include/linux/dm-ioctl.h
@@ -44,7 +44,7 @@
44 * Remove a device, destroy any tables. 44 * Remove a device, destroy any tables.
45 * 45 *
46 * DM_DEV_RENAME: 46 * DM_DEV_RENAME:
47 * Rename a device. 47 * Rename a device or set its uuid if none was previously supplied.
48 * 48 *
49 * DM_SUSPEND: 49 * DM_SUSPEND:
50 * This performs both suspend and resume, depending which flag is 50 * This performs both suspend and resume, depending which flag is
@@ -267,9 +267,9 @@ enum {
267#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) 267#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
268 268
269#define DM_VERSION_MAJOR 4 269#define DM_VERSION_MAJOR 4
270#define DM_VERSION_MINOR 18 270#define DM_VERSION_MINOR 19
271#define DM_VERSION_PATCHLEVEL 0 271#define DM_VERSION_PATCHLEVEL 1
272#define DM_VERSION_EXTRA "-ioctl (2010-06-29)" 272#define DM_VERSION_EXTRA "-ioctl (2011-01-07)"
273 273
274/* Status bits */ 274/* Status bits */
275#define DM_READONLY_FLAG (1 << 0) /* In/Out */ 275#define DM_READONLY_FLAG (1 << 0) /* In/Out */
@@ -322,4 +322,10 @@ enum {
322 */ 322 */
323#define DM_UEVENT_GENERATED_FLAG (1 << 13) /* Out */ 323#define DM_UEVENT_GENERATED_FLAG (1 << 13) /* Out */
324 324
325/*
326 * If set, rename changes the uuid not the name. Only permitted
327 * if no uuid was previously supplied: an existing uuid cannot be changed.
328 */
329#define DM_UUID_FLAG (1 << 14) /* In */
330
325#endif /* _LINUX_DM_IOCTL_H */ 331#endif /* _LINUX_DM_IOCTL_H */
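DM_UUID_FLAG extends DM_DEV_RENAME: with the flag set, the string in the ioctl data area is taken as a new uuid instead of a new name, and the kernel accepts it only if the device has no uuid yet. A hedged userspace sketch of the raw ioctl path (libdevmapper would normally be used instead); the device name "mydev", the uuid value, and the buffer size are assumptions, and error handling is minimal:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/dm-ioctl.h>

int main(void)
{
	char buf[16384];
	struct dm_ioctl *io = (struct dm_ioctl *) buf;
	const char *new_uuid = "example-uuid";            /* assumed value */
	int fd;

	memset(buf, 0, sizeof(buf));
	io->version[0] = DM_VERSION_MAJOR;
	io->version[1] = DM_VERSION_MINOR;
	io->version[2] = DM_VERSION_PATCHLEVEL;
	io->data_size = sizeof(buf);
	io->data_start = sizeof(*io);
	io->flags = DM_UUID_FLAG;                         /* change the uuid, not the name */
	strncpy(io->name, "mydev", sizeof(io->name) - 1); /* existing device name (assumed) */
	strcpy(buf + io->data_start, new_uuid);           /* new uuid goes in the data area */

	fd = open("/dev/mapper/control", O_RDWR);
	if (fd < 0 || ioctl(fd, DM_DEV_RENAME, io) < 0) {
		perror("DM_DEV_RENAME(DM_UUID_FLAG)");
		return 1;
	}
	close(fd);
	printf("uuid set to %s\n", new_uuid);
	return 0;
}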
diff --git a/include/linux/dm-log-userspace.h b/include/linux/dm-log-userspace.h
index 0c3c3a2110c4..eeace7d3ff15 100644
--- a/include/linux/dm-log-userspace.h
+++ b/include/linux/dm-log-userspace.h
@@ -370,6 +370,16 @@
370#define DM_ULOG_REQUEST_TYPE(request_type) \ 370#define DM_ULOG_REQUEST_TYPE(request_type) \
371 (DM_ULOG_REQUEST_MASK & (request_type)) 371 (DM_ULOG_REQUEST_MASK & (request_type))
372 372
373/*
374 * DM_ULOG_REQUEST_VERSION is incremented when there is a
375 * change to the way information is passed between kernel
376 * and userspace. This could be a structure change of
377 * dm_ulog_request or a change in the way requests are
378 * issued/handled. Changes are outlined here:
379 * version 1: Initial implementation
380 */
381#define DM_ULOG_REQUEST_VERSION 1
382
373struct dm_ulog_request { 383struct dm_ulog_request {
374 /* 384 /*
375 * The local unique identifier (luid) and the universally unique 385 * The local unique identifier (luid) and the universally unique
@@ -383,8 +393,9 @@ struct dm_ulog_request {
383 */ 393 */
384 uint64_t luid; 394 uint64_t luid;
385 char uuid[DM_UUID_LEN]; 395 char uuid[DM_UUID_LEN];
386 char padding[7]; /* Padding because DM_UUID_LEN = 129 */ 396 char padding[3]; /* Padding because DM_UUID_LEN = 129 */
387 397
398 uint32_t version; /* See DM_ULOG_REQUEST_VERSION */
388 int32_t error; /* Used to report back processing errors */ 399 int32_t error; /* Used to report back processing errors */
389 400
390 uint32_t seq; /* Sequence number for request */ 401 uint32_t seq; /* Sequence number for request */
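The new version field lets a userspace log server detect a kernel built against a different revision of this interface instead of misparsing the request. A hedged sketch of such a check on the receiving side; handle_request is an assumed entry point in the server, not part of dm-log-userspace:

#include <errno.h>
#include <linux/dm-log-userspace.h>

/* Assumed handler invoked with a request received from the kernel. */
int handle_request(struct dm_ulog_request *rq)
{
	if (rq->version != DM_ULOG_REQUEST_VERSION) {
		/* Report the mismatch back rather than guessing at the layout. */
		rq->error = -EPROTO;
		rq->data_size = 0;
		return -EPROTO;
	}

	switch (DM_ULOG_REQUEST_TYPE(rq->request_type)) {
	case DM_ULOG_CTR:
		/* ... set up log state for rq->uuid ... */
		rq->error = 0;
		break;
	default:
		rq->error = -ENOSYS;
		break;
	}
	return 0;
}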