author     Linus Torvalds <torvalds@linux-foundation.org>   2011-01-13 20:30:47 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2011-01-13 20:30:47 -0500
commit     f6bcfd94c0a97c11ce9197ade93a08bc8af6e057 (patch)
tree       83d867565b4f2a7627b3288f9e000eaf2b217be9 /drivers/md
parent     509e4aef44eb10e4aef1f81c3c3ff1214671503b (diff)
parent     9d09e663d5502c46f2d9481c04c1087e1c2da698 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm
* git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm: (32 commits)
  dm: raid456 basic support
  dm: per target unplug callback support
  dm: introduce target callbacks and congestion callback
  dm mpath: delay activate_path retry on SCSI_DH_RETRY
  dm: remove superfluous irq disablement in dm_request_fn
  dm log: use PTR_ERR value instead of ENOMEM
  dm snapshot: avoid storing private suspended state
  dm snapshot: persistent make metadata_wq multithreaded
  dm: use non reentrant workqueues if equivalent
  dm: convert workqueues to alloc_ordered
  dm stripe: switch from local workqueue to system_wq
  dm: dont use flush_scheduled_work
  dm snapshot: remove unused dm_snapshot queued_bios_work
  dm ioctl: suppress needless warning messages
  dm crypt: add loop aes iv generator
  dm crypt: add multi key capability
  dm crypt: add post iv call to iv generator
  dm crypt: use io thread for reads only if mempool exhausted
  dm crypt: scale to multiple cpus
  dm crypt: simplify compatible table output
  ...
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig                      |  24
-rw-r--r--  drivers/md/Makefile                     |   1
-rw-r--r--  drivers/md/dm-crypt.c                   | 618
-rw-r--r--  drivers/md/dm-delay.c                   |   2
-rw-r--r--  drivers/md/dm-ioctl.c                   | 111
-rw-r--r--  drivers/md/dm-kcopyd.c                  |  57
-rw-r--r--  drivers/md/dm-log-userspace-base.c      | 139
-rw-r--r--  drivers/md/dm-log-userspace-transfer.c  |   1
-rw-r--r--  drivers/md/dm-log.c                     |   2
-rw-r--r--  drivers/md/dm-mpath.c                   |  67
-rw-r--r--  drivers/md/dm-raid.c                    | 697
-rw-r--r--  drivers/md/dm-raid1.c                   |  19
-rw-r--r--  drivers/md/dm-snap-persistent.c         |   4
-rw-r--r--  drivers/md/dm-snap.c                    |  62
-rw-r--r--  drivers/md/dm-stripe.c                  |  27
-rw-r--r--  drivers/md/dm-table.c                   |  19
-rw-r--r--  drivers/md/dm.c                         |  23
17 files changed, 1581 insertions(+), 292 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index bf1a95e31559..98d9ec85e0eb 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -240,6 +240,30 @@ config DM_MIRROR
240 Allow volume managers to mirror logical volumes, also 240 Allow volume managers to mirror logical volumes, also
241 needed for live data migration tools such as 'pvmove'. 241 needed for live data migration tools such as 'pvmove'.
242 242
243config DM_RAID
244 tristate "RAID 4/5/6 target (EXPERIMENTAL)"
245 depends on BLK_DEV_DM && EXPERIMENTAL
246 select MD_RAID456
247 select BLK_DEV_MD
248 ---help---
249 A dm target that supports RAID4, RAID5 and RAID6 mappings
250
251 A RAID-5 set of N drives with a capacity of C MB per drive provides
252 the capacity of C * (N - 1) MB, and protects against a failure
253 of a single drive. For a given sector (row) number, (N - 1) drives
254 contain data sectors, and one drive contains the parity protection.
255 For a RAID-4 set, the parity blocks are present on a single drive,
256 while a RAID-5 set distributes the parity across the drives in one
257 of the available parity distribution methods.
258
259 A RAID-6 set of N drives with a capacity of C MB per drive
260 provides the capacity of C * (N - 2) MB, and protects
261 against a failure of any two drives. For a given sector
262 (row) number, (N - 2) drives contain data sectors, and two
263 drives contains two independent redundancy syndromes. Like
264 RAID-5, RAID-6 distributes the syndromes across the drives
265 in one of the available parity distribution methods.
266
243config DM_LOG_USERSPACE 267config DM_LOG_USERSPACE
244 tristate "Mirror userspace logging (EXPERIMENTAL)" 268 tristate "Mirror userspace logging (EXPERIMENTAL)"
245 depends on DM_MIRROR && EXPERIMENTAL && NET 269 depends on DM_MIRROR && EXPERIMENTAL && NET
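
[Editor's note] The capacity figures in the DM_RAID help text above follow a simple rule: one drive's worth of space goes to parity for RAID-4/5, two drives' worth for RAID-6. A minimal userspace sketch of that arithmetic (the helper name raid_capacity_mb is hypothetical, purely for illustration, not part of the kernel):

#include <stdio.h>

/* Usable capacity of n drives of c_mb each, losing 'parity' drives' worth. */
static unsigned long raid_capacity_mb(unsigned int n, unsigned long c_mb,
				      unsigned int parity)
{
	return (unsigned long)(n - parity) * c_mb;
}

int main(void)
{
	/* Five 1000 MB drives: RAID-5 keeps 4 drives' worth, RAID-6 keeps 3. */
	printf("RAID-5: %lu MB\n", raid_capacity_mb(5, 1000, 1)); /* 4000 */
	printf("RAID-6: %lu MB\n", raid_capacity_mb(5, 1000, 2)); /* 3000 */
	return 0;
}
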
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 5e3aac41919d..d0138606c2e8 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
36obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o 36obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
37obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o 37obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
38obj-$(CONFIG_DM_ZERO) += dm-zero.o 38obj-$(CONFIG_DM_ZERO) += dm-zero.o
39obj-$(CONFIG_DM_RAID) += dm-raid.o
39 40
40ifeq ($(CONFIG_DM_UEVENT),y) 41ifeq ($(CONFIG_DM_UEVENT),y)
41dm-mod-objs += dm-uevent.o 42dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index d5b0e4c0e702..4e054bd91664 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -18,10 +18,14 @@
18#include <linux/crypto.h> 18#include <linux/crypto.h>
19#include <linux/workqueue.h> 19#include <linux/workqueue.h>
20#include <linux/backing-dev.h> 20#include <linux/backing-dev.h>
21#include <linux/percpu.h>
21#include <asm/atomic.h> 22#include <asm/atomic.h>
22#include <linux/scatterlist.h> 23#include <linux/scatterlist.h>
23#include <asm/page.h> 24#include <asm/page.h>
24#include <asm/unaligned.h> 25#include <asm/unaligned.h>
26#include <crypto/hash.h>
27#include <crypto/md5.h>
28#include <crypto/algapi.h>
25 29
26#include <linux/device-mapper.h> 30#include <linux/device-mapper.h>
27 31
@@ -63,6 +67,7 @@ struct dm_crypt_request {
63 struct convert_context *ctx; 67 struct convert_context *ctx;
64 struct scatterlist sg_in; 68 struct scatterlist sg_in;
65 struct scatterlist sg_out; 69 struct scatterlist sg_out;
70 sector_t iv_sector;
66}; 71};
67 72
68struct crypt_config; 73struct crypt_config;
@@ -73,11 +78,13 @@ struct crypt_iv_operations {
73 void (*dtr)(struct crypt_config *cc); 78 void (*dtr)(struct crypt_config *cc);
74 int (*init)(struct crypt_config *cc); 79 int (*init)(struct crypt_config *cc);
75 int (*wipe)(struct crypt_config *cc); 80 int (*wipe)(struct crypt_config *cc);
76 int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); 81 int (*generator)(struct crypt_config *cc, u8 *iv,
82 struct dm_crypt_request *dmreq);
83 int (*post)(struct crypt_config *cc, u8 *iv,
84 struct dm_crypt_request *dmreq);
77}; 85};
78 86
79struct iv_essiv_private { 87struct iv_essiv_private {
80 struct crypto_cipher *tfm;
81 struct crypto_hash *hash_tfm; 88 struct crypto_hash *hash_tfm;
82 u8 *salt; 89 u8 *salt;
83}; 90};
@@ -86,11 +93,32 @@ struct iv_benbi_private {
86 int shift; 93 int shift;
87}; 94};
88 95
96#define LMK_SEED_SIZE 64 /* hash + 0 */
97struct iv_lmk_private {
98 struct crypto_shash *hash_tfm;
99 u8 *seed;
100};
101
89/* 102/*
90 * Crypt: maps a linear range of a block device 103 * Crypt: maps a linear range of a block device
91 * and encrypts / decrypts at the same time. 104 * and encrypts / decrypts at the same time.
92 */ 105 */
93enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; 106enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
107
108/*
109 * Duplicated per-CPU state for cipher.
110 */
111struct crypt_cpu {
112 struct ablkcipher_request *req;
113 /* ESSIV: struct crypto_cipher *essiv_tfm */
114 void *iv_private;
115 struct crypto_ablkcipher *tfms[0];
116};
117
118/*
119 * The fields in here must be read only after initialization,
120 * changing state should be in crypt_cpu.
121 */
94struct crypt_config { 122struct crypt_config {
95 struct dm_dev *dev; 123 struct dm_dev *dev;
96 sector_t start; 124 sector_t start;
@@ -108,17 +136,25 @@ struct crypt_config {
108 struct workqueue_struct *crypt_queue; 136 struct workqueue_struct *crypt_queue;
109 137
110 char *cipher; 138 char *cipher;
111 char *cipher_mode; 139 char *cipher_string;
112 140
113 struct crypt_iv_operations *iv_gen_ops; 141 struct crypt_iv_operations *iv_gen_ops;
114 union { 142 union {
115 struct iv_essiv_private essiv; 143 struct iv_essiv_private essiv;
116 struct iv_benbi_private benbi; 144 struct iv_benbi_private benbi;
145 struct iv_lmk_private lmk;
117 } iv_gen_private; 146 } iv_gen_private;
118 sector_t iv_offset; 147 sector_t iv_offset;
119 unsigned int iv_size; 148 unsigned int iv_size;
120 149
121 /* 150 /*
151 * Duplicated per cpu state. Access through
152 * per_cpu_ptr() only.
153 */
154 struct crypt_cpu __percpu *cpu;
155 unsigned tfms_count;
156
157 /*
122 * Layout of each crypto request: 158 * Layout of each crypto request:
123 * 159 *
124 * struct ablkcipher_request 160 * struct ablkcipher_request
@@ -132,11 +168,10 @@ struct crypt_config {
132 * correctly aligned. 168 * correctly aligned.
133 */ 169 */
134 unsigned int dmreq_start; 170 unsigned int dmreq_start;
135 struct ablkcipher_request *req;
136 171
137 struct crypto_ablkcipher *tfm;
138 unsigned long flags; 172 unsigned long flags;
139 unsigned int key_size; 173 unsigned int key_size;
174 unsigned int key_parts;
140 u8 key[0]; 175 u8 key[0];
141}; 176};
142 177
@@ -148,6 +183,20 @@ static struct kmem_cache *_crypt_io_pool;
148 183
149static void clone_init(struct dm_crypt_io *, struct bio *); 184static void clone_init(struct dm_crypt_io *, struct bio *);
150static void kcryptd_queue_crypt(struct dm_crypt_io *io); 185static void kcryptd_queue_crypt(struct dm_crypt_io *io);
186static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
187
188static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
189{
190 return this_cpu_ptr(cc->cpu);
191}
192
193/*
194 * Use this to access cipher attributes that are the same for each CPU.
195 */
196static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
197{
198 return __this_cpu_ptr(cc->cpu)->tfms[0];
199}
151 200
152/* 201/*
153 * Different IV generation algorithms: 202 * Different IV generation algorithms:
@@ -168,23 +217,38 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io);
168 * null: the initial vector is always zero. Provides compatibility with 217 * null: the initial vector is always zero. Provides compatibility with
169 * obsolete loop_fish2 devices. Do not use for new devices. 218 * obsolete loop_fish2 devices. Do not use for new devices.
170 * 219 *
220 * lmk: Compatible implementation of the block chaining mode used
221 * by the Loop-AES block device encryption system
222 * designed by Jari Ruusu. See http://loop-aes.sourceforge.net/
223 * It operates on full 512 byte sectors and uses CBC
224 * with an IV derived from the sector number, the data and
225 * optionally extra IV seed.
226 * This means that after decryption the first block
227 * of sector must be tweaked according to decrypted data.
228 * Loop-AES can use three encryption schemes:
229 * version 1: is plain aes-cbc mode
230 * version 2: uses 64 multikey scheme with lmk IV generator
231 * version 3: the same as version 2 with additional IV seed
232 * (it uses 65 keys, last key is used as IV seed)
233 *
171 * plumb: unimplemented, see: 234 * plumb: unimplemented, see:
172 * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 235 * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
173 */ 236 */
174 237
175static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 238static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
239 struct dm_crypt_request *dmreq)
176{ 240{
177 memset(iv, 0, cc->iv_size); 241 memset(iv, 0, cc->iv_size);
178 *(u32 *)iv = cpu_to_le32(sector & 0xffffffff); 242 *(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);
179 243
180 return 0; 244 return 0;
181} 245}
182 246
183static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, 247static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
184 sector_t sector) 248 struct dm_crypt_request *dmreq)
185{ 249{
186 memset(iv, 0, cc->iv_size); 250 memset(iv, 0, cc->iv_size);
187 *(u64 *)iv = cpu_to_le64(sector); 251 *(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
188 252
189 return 0; 253 return 0;
190} 254}
@@ -195,7 +259,8 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
195 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; 259 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
196 struct hash_desc desc; 260 struct hash_desc desc;
197 struct scatterlist sg; 261 struct scatterlist sg;
198 int err; 262 struct crypto_cipher *essiv_tfm;
263 int err, cpu;
199 264
200 sg_init_one(&sg, cc->key, cc->key_size); 265 sg_init_one(&sg, cc->key, cc->key_size);
201 desc.tfm = essiv->hash_tfm; 266 desc.tfm = essiv->hash_tfm;
@@ -205,8 +270,16 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
205 if (err) 270 if (err)
206 return err; 271 return err;
207 272
208 return crypto_cipher_setkey(essiv->tfm, essiv->salt, 273 for_each_possible_cpu(cpu) {
274 essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private,
275
276 err = crypto_cipher_setkey(essiv_tfm, essiv->salt,
209 crypto_hash_digestsize(essiv->hash_tfm)); 277 crypto_hash_digestsize(essiv->hash_tfm));
278 if (err)
279 return err;
280 }
281
282 return 0;
210} 283}
211 284
212/* Wipe salt and reset key derived from volume key */ 285/* Wipe salt and reset key derived from volume key */
@@ -214,24 +287,76 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc)
214{ 287{
215 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; 288 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
216 unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); 289 unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
290 struct crypto_cipher *essiv_tfm;
291 int cpu, r, err = 0;
217 292
218 memset(essiv->salt, 0, salt_size); 293 memset(essiv->salt, 0, salt_size);
219 294
220 return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size); 295 for_each_possible_cpu(cpu) {
296 essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private;
297 r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size);
298 if (r)
299 err = r;
300 }
301
302 return err;
303}
304
305/* Set up per cpu cipher state */
306static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
307 struct dm_target *ti,
308 u8 *salt, unsigned saltsize)
309{
310 struct crypto_cipher *essiv_tfm;
311 int err;
312
313 /* Setup the essiv_tfm with the given salt */
314 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
315 if (IS_ERR(essiv_tfm)) {
316 ti->error = "Error allocating crypto tfm for ESSIV";
317 return essiv_tfm;
318 }
319
320 if (crypto_cipher_blocksize(essiv_tfm) !=
321 crypto_ablkcipher_ivsize(any_tfm(cc))) {
322 ti->error = "Block size of ESSIV cipher does "
323 "not match IV size of block cipher";
324 crypto_free_cipher(essiv_tfm);
325 return ERR_PTR(-EINVAL);
326 }
327
328 err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
329 if (err) {
330 ti->error = "Failed to set key for ESSIV cipher";
331 crypto_free_cipher(essiv_tfm);
332 return ERR_PTR(err);
333 }
334
335 return essiv_tfm;
221} 336}
222 337
223static void crypt_iv_essiv_dtr(struct crypt_config *cc) 338static void crypt_iv_essiv_dtr(struct crypt_config *cc)
224{ 339{
340 int cpu;
341 struct crypt_cpu *cpu_cc;
342 struct crypto_cipher *essiv_tfm;
225 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; 343 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
226 344
227 crypto_free_cipher(essiv->tfm);
228 essiv->tfm = NULL;
229
230 crypto_free_hash(essiv->hash_tfm); 345 crypto_free_hash(essiv->hash_tfm);
231 essiv->hash_tfm = NULL; 346 essiv->hash_tfm = NULL;
232 347
233 kzfree(essiv->salt); 348 kzfree(essiv->salt);
234 essiv->salt = NULL; 349 essiv->salt = NULL;
350
351 for_each_possible_cpu(cpu) {
352 cpu_cc = per_cpu_ptr(cc->cpu, cpu);
353 essiv_tfm = cpu_cc->iv_private;
354
355 if (essiv_tfm)
356 crypto_free_cipher(essiv_tfm);
357
358 cpu_cc->iv_private = NULL;
359 }
235} 360}
236 361
237static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, 362static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
@@ -240,7 +365,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
240 struct crypto_cipher *essiv_tfm = NULL; 365 struct crypto_cipher *essiv_tfm = NULL;
241 struct crypto_hash *hash_tfm = NULL; 366 struct crypto_hash *hash_tfm = NULL;
242 u8 *salt = NULL; 367 u8 *salt = NULL;
243 int err; 368 int err, cpu;
244 369
245 if (!opts) { 370 if (!opts) {
246 ti->error = "Digest algorithm missing for ESSIV mode"; 371 ti->error = "Digest algorithm missing for ESSIV mode";
@@ -262,48 +387,44 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
262 goto bad; 387 goto bad;
263 } 388 }
264 389
265 /* Allocate essiv_tfm */
266 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
267 if (IS_ERR(essiv_tfm)) {
268 ti->error = "Error allocating crypto tfm for ESSIV";
269 err = PTR_ERR(essiv_tfm);
270 goto bad;
271 }
272 if (crypto_cipher_blocksize(essiv_tfm) !=
273 crypto_ablkcipher_ivsize(cc->tfm)) {
274 ti->error = "Block size of ESSIV cipher does "
275 "not match IV size of block cipher";
276 err = -EINVAL;
277 goto bad;
278 }
279
280 cc->iv_gen_private.essiv.salt = salt; 390 cc->iv_gen_private.essiv.salt = salt;
281 cc->iv_gen_private.essiv.tfm = essiv_tfm;
282 cc->iv_gen_private.essiv.hash_tfm = hash_tfm; 391 cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
283 392
393 for_each_possible_cpu(cpu) {
394 essiv_tfm = setup_essiv_cpu(cc, ti, salt,
395 crypto_hash_digestsize(hash_tfm));
396 if (IS_ERR(essiv_tfm)) {
397 crypt_iv_essiv_dtr(cc);
398 return PTR_ERR(essiv_tfm);
399 }
400 per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm;
401 }
402
284 return 0; 403 return 0;
285 404
286bad: 405bad:
287 if (essiv_tfm && !IS_ERR(essiv_tfm))
288 crypto_free_cipher(essiv_tfm);
289 if (hash_tfm && !IS_ERR(hash_tfm)) 406 if (hash_tfm && !IS_ERR(hash_tfm))
290 crypto_free_hash(hash_tfm); 407 crypto_free_hash(hash_tfm);
291 kfree(salt); 408 kfree(salt);
292 return err; 409 return err;
293} 410}
294 411
295static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 412static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
413 struct dm_crypt_request *dmreq)
296{ 414{
415 struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private;
416
297 memset(iv, 0, cc->iv_size); 417 memset(iv, 0, cc->iv_size);
298 *(u64 *)iv = cpu_to_le64(sector); 418 *(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
299 crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv); 419 crypto_cipher_encrypt_one(essiv_tfm, iv, iv);
420
300 return 0; 421 return 0;
301} 422}
302 423
303static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, 424static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
304 const char *opts) 425 const char *opts)
305{ 426{
306 unsigned bs = crypto_ablkcipher_blocksize(cc->tfm); 427 unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc));
307 int log = ilog2(bs); 428 int log = ilog2(bs);
308 429
309 /* we need to calculate how far we must shift the sector count 430 /* we need to calculate how far we must shift the sector count
@@ -328,25 +449,177 @@ static void crypt_iv_benbi_dtr(struct crypt_config *cc)
328{ 449{
329} 450}
330 451
331static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 452static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv,
453 struct dm_crypt_request *dmreq)
332{ 454{
333 __be64 val; 455 __be64 val;
334 456
335 memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ 457 memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
336 458
337 val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1); 459 val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1);
338 put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); 460 put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
339 461
340 return 0; 462 return 0;
341} 463}
342 464
343static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 465static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv,
466 struct dm_crypt_request *dmreq)
344{ 467{
345 memset(iv, 0, cc->iv_size); 468 memset(iv, 0, cc->iv_size);
346 469
347 return 0; 470 return 0;
348} 471}
349 472
473static void crypt_iv_lmk_dtr(struct crypt_config *cc)
474{
475 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
476
477 if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm))
478 crypto_free_shash(lmk->hash_tfm);
479 lmk->hash_tfm = NULL;
480
481 kzfree(lmk->seed);
482 lmk->seed = NULL;
483}
484
485static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti,
486 const char *opts)
487{
488 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
489
490 lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0);
491 if (IS_ERR(lmk->hash_tfm)) {
492 ti->error = "Error initializing LMK hash";
493 return PTR_ERR(lmk->hash_tfm);
494 }
495
496 /* No seed in LMK version 2 */
497 if (cc->key_parts == cc->tfms_count) {
498 lmk->seed = NULL;
499 return 0;
500 }
501
502 lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL);
503 if (!lmk->seed) {
504 crypt_iv_lmk_dtr(cc);
505 ti->error = "Error kmallocing seed storage in LMK";
506 return -ENOMEM;
507 }
508
509 return 0;
510}
511
512static int crypt_iv_lmk_init(struct crypt_config *cc)
513{
514 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
515 int subkey_size = cc->key_size / cc->key_parts;
516
517 /* LMK seed is on the position of LMK_KEYS + 1 key */
518 if (lmk->seed)
519 memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size),
520 crypto_shash_digestsize(lmk->hash_tfm));
521
522 return 0;
523}
524
525static int crypt_iv_lmk_wipe(struct crypt_config *cc)
526{
527 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
528
529 if (lmk->seed)
530 memset(lmk->seed, 0, LMK_SEED_SIZE);
531
532 return 0;
533}
534
535static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
536 struct dm_crypt_request *dmreq,
537 u8 *data)
538{
539 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
540 struct {
541 struct shash_desc desc;
542 char ctx[crypto_shash_descsize(lmk->hash_tfm)];
543 } sdesc;
544 struct md5_state md5state;
545 u32 buf[4];
546 int i, r;
547
548 sdesc.desc.tfm = lmk->hash_tfm;
549 sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
550
551 r = crypto_shash_init(&sdesc.desc);
552 if (r)
553 return r;
554
555 if (lmk->seed) {
556 r = crypto_shash_update(&sdesc.desc, lmk->seed, LMK_SEED_SIZE);
557 if (r)
558 return r;
559 }
560
561 /* Sector is always 512B, block size 16, add data of blocks 1-31 */
562 r = crypto_shash_update(&sdesc.desc, data + 16, 16 * 31);
563 if (r)
564 return r;
565
566 /* Sector is cropped to 56 bits here */
567 buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF);
568 buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000);
569 buf[2] = cpu_to_le32(4024);
570 buf[3] = 0;
571 r = crypto_shash_update(&sdesc.desc, (u8 *)buf, sizeof(buf));
572 if (r)
573 return r;
574
575 /* No MD5 padding here */
576 r = crypto_shash_export(&sdesc.desc, &md5state);
577 if (r)
578 return r;
579
580 for (i = 0; i < MD5_HASH_WORDS; i++)
581 __cpu_to_le32s(&md5state.hash[i]);
582 memcpy(iv, &md5state.hash, cc->iv_size);
583
584 return 0;
585}
586
587static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv,
588 struct dm_crypt_request *dmreq)
589{
590 u8 *src;
591 int r = 0;
592
593 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
594 src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0);
595 r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset);
596 kunmap_atomic(src, KM_USER0);
597 } else
598 memset(iv, 0, cc->iv_size);
599
600 return r;
601}
602
603static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
604 struct dm_crypt_request *dmreq)
605{
606 u8 *dst;
607 int r;
608
609 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE)
610 return 0;
611
612 dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0);
613 r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset);
614
615 /* Tweak the first block of plaintext sector */
616 if (!r)
617 crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size);
618
619 kunmap_atomic(dst, KM_USER0);
620 return r;
621}
622
350static struct crypt_iv_operations crypt_iv_plain_ops = { 623static struct crypt_iv_operations crypt_iv_plain_ops = {
351 .generator = crypt_iv_plain_gen 624 .generator = crypt_iv_plain_gen
352}; 625};
@@ -373,6 +646,15 @@ static struct crypt_iv_operations crypt_iv_null_ops = {
373 .generator = crypt_iv_null_gen 646 .generator = crypt_iv_null_gen
374}; 647};
375 648
649static struct crypt_iv_operations crypt_iv_lmk_ops = {
650 .ctr = crypt_iv_lmk_ctr,
651 .dtr = crypt_iv_lmk_dtr,
652 .init = crypt_iv_lmk_init,
653 .wipe = crypt_iv_lmk_wipe,
654 .generator = crypt_iv_lmk_gen,
655 .post = crypt_iv_lmk_post
656};
657
376static void crypt_convert_init(struct crypt_config *cc, 658static void crypt_convert_init(struct crypt_config *cc,
377 struct convert_context *ctx, 659 struct convert_context *ctx,
378 struct bio *bio_out, struct bio *bio_in, 660 struct bio *bio_out, struct bio *bio_in,
@@ -400,6 +682,13 @@ static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc,
400 return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); 682 return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start);
401} 683}
402 684
685static u8 *iv_of_dmreq(struct crypt_config *cc,
686 struct dm_crypt_request *dmreq)
687{
688 return (u8 *)ALIGN((unsigned long)(dmreq + 1),
689 crypto_ablkcipher_alignmask(any_tfm(cc)) + 1);
690}
691
403static int crypt_convert_block(struct crypt_config *cc, 692static int crypt_convert_block(struct crypt_config *cc,
404 struct convert_context *ctx, 693 struct convert_context *ctx,
405 struct ablkcipher_request *req) 694 struct ablkcipher_request *req)
@@ -411,9 +700,9 @@ static int crypt_convert_block(struct crypt_config *cc,
411 int r = 0; 700 int r = 0;
412 701
413 dmreq = dmreq_of_req(cc, req); 702 dmreq = dmreq_of_req(cc, req);
414 iv = (u8 *)ALIGN((unsigned long)(dmreq + 1), 703 iv = iv_of_dmreq(cc, dmreq);
415 crypto_ablkcipher_alignmask(cc->tfm) + 1);
416 704
705 dmreq->iv_sector = ctx->sector;
417 dmreq->ctx = ctx; 706 dmreq->ctx = ctx;
418 sg_init_table(&dmreq->sg_in, 1); 707 sg_init_table(&dmreq->sg_in, 1);
419 sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, 708 sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT,
@@ -436,7 +725,7 @@ static int crypt_convert_block(struct crypt_config *cc,
436 } 725 }
437 726
438 if (cc->iv_gen_ops) { 727 if (cc->iv_gen_ops) {
439 r = cc->iv_gen_ops->generator(cc, iv, ctx->sector); 728 r = cc->iv_gen_ops->generator(cc, iv, dmreq);
440 if (r < 0) 729 if (r < 0)
441 return r; 730 return r;
442 } 731 }
@@ -449,21 +738,28 @@ static int crypt_convert_block(struct crypt_config *cc,
449 else 738 else
450 r = crypto_ablkcipher_decrypt(req); 739 r = crypto_ablkcipher_decrypt(req);
451 740
741 if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
742 r = cc->iv_gen_ops->post(cc, iv, dmreq);
743
452 return r; 744 return r;
453} 745}
454 746
455static void kcryptd_async_done(struct crypto_async_request *async_req, 747static void kcryptd_async_done(struct crypto_async_request *async_req,
456 int error); 748 int error);
749
457static void crypt_alloc_req(struct crypt_config *cc, 750static void crypt_alloc_req(struct crypt_config *cc,
458 struct convert_context *ctx) 751 struct convert_context *ctx)
459{ 752{
460 if (!cc->req) 753 struct crypt_cpu *this_cc = this_crypt_config(cc);
461 cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); 754 unsigned key_index = ctx->sector & (cc->tfms_count - 1);
462 ablkcipher_request_set_tfm(cc->req, cc->tfm); 755
463 ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG | 756 if (!this_cc->req)
464 CRYPTO_TFM_REQ_MAY_SLEEP, 757 this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
465 kcryptd_async_done, 758
466 dmreq_of_req(cc, cc->req)); 759 ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]);
760 ablkcipher_request_set_callback(this_cc->req,
761 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
762 kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
467} 763}
468 764
469/* 765/*
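
[Editor's note] crypt_alloc_req() above selects the per-sector key with ctx->sector & (cc->tfms_count - 1); because the constructor requires the key count to be a power of two, the mask cycles sectors evenly through the available tfms. A small userspace demonstration of that selection (illustrative only, not kernel code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const unsigned tfms_count = 64;	/* e.g. a 64-key multikey mapping */
	uint64_t sector;

	/* With a power-of-two count, masking is equivalent to sector % count. */
	for (sector = 0; sector < 6; sector++)
		printf("sector %llu -> key index %llu\n",
		       (unsigned long long)sector,
		       (unsigned long long)(sector & (tfms_count - 1)));
	return 0;
}
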
@@ -472,6 +768,7 @@ static void crypt_alloc_req(struct crypt_config *cc,
472static int crypt_convert(struct crypt_config *cc, 768static int crypt_convert(struct crypt_config *cc,
473 struct convert_context *ctx) 769 struct convert_context *ctx)
474{ 770{
771 struct crypt_cpu *this_cc = this_crypt_config(cc);
475 int r; 772 int r;
476 773
477 atomic_set(&ctx->pending, 1); 774 atomic_set(&ctx->pending, 1);
@@ -483,7 +780,7 @@ static int crypt_convert(struct crypt_config *cc,
483 780
484 atomic_inc(&ctx->pending); 781 atomic_inc(&ctx->pending);
485 782
486 r = crypt_convert_block(cc, ctx, cc->req); 783 r = crypt_convert_block(cc, ctx, this_cc->req);
487 784
488 switch (r) { 785 switch (r) {
489 /* async */ 786 /* async */
@@ -492,7 +789,7 @@ static int crypt_convert(struct crypt_config *cc,
492 INIT_COMPLETION(ctx->restart); 789 INIT_COMPLETION(ctx->restart);
493 /* fall through*/ 790 /* fall through*/
494 case -EINPROGRESS: 791 case -EINPROGRESS:
495 cc->req = NULL; 792 this_cc->req = NULL;
496 ctx->sector++; 793 ctx->sector++;
497 continue; 794 continue;
498 795
@@ -651,6 +948,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
651 * They must be separated as otherwise the final stages could be 948 * They must be separated as otherwise the final stages could be
652 * starved by new requests which can block in the first stages due 949 * starved by new requests which can block in the first stages due
653 * to memory allocation. 950 * to memory allocation.
951 *
952 * The work is done per CPU global for all dm-crypt instances.
953 * They should not depend on each other and do not block.
654 */ 954 */
655static void crypt_endio(struct bio *clone, int error) 955static void crypt_endio(struct bio *clone, int error)
656{ 956{
@@ -691,26 +991,30 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
691 clone->bi_destructor = dm_crypt_bio_destructor; 991 clone->bi_destructor = dm_crypt_bio_destructor;
692} 992}
693 993
694static void kcryptd_io_read(struct dm_crypt_io *io) 994static void kcryptd_unplug(struct crypt_config *cc)
995{
996 blk_unplug(bdev_get_queue(cc->dev->bdev));
997}
998
999static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
695{ 1000{
696 struct crypt_config *cc = io->target->private; 1001 struct crypt_config *cc = io->target->private;
697 struct bio *base_bio = io->base_bio; 1002 struct bio *base_bio = io->base_bio;
698 struct bio *clone; 1003 struct bio *clone;
699 1004
700 crypt_inc_pending(io);
701
702 /* 1005 /*
703 * The block layer might modify the bvec array, so always 1006 * The block layer might modify the bvec array, so always
704 * copy the required bvecs because we need the original 1007 * copy the required bvecs because we need the original
705 * one in order to decrypt the whole bio data *afterwards*. 1008 * one in order to decrypt the whole bio data *afterwards*.
706 */ 1009 */
707 clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs); 1010 clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs);
708 if (unlikely(!clone)) { 1011 if (!clone) {
709 io->error = -ENOMEM; 1012 kcryptd_unplug(cc);
710 crypt_dec_pending(io); 1013 return 1;
711 return;
712 } 1014 }
713 1015
1016 crypt_inc_pending(io);
1017
714 clone_init(io, clone); 1018 clone_init(io, clone);
715 clone->bi_idx = 0; 1019 clone->bi_idx = 0;
716 clone->bi_vcnt = bio_segments(base_bio); 1020 clone->bi_vcnt = bio_segments(base_bio);
@@ -720,6 +1024,7 @@ static void kcryptd_io_read(struct dm_crypt_io *io)
720 sizeof(struct bio_vec) * clone->bi_vcnt); 1024 sizeof(struct bio_vec) * clone->bi_vcnt);
721 1025
722 generic_make_request(clone); 1026 generic_make_request(clone);
1027 return 0;
723} 1028}
724 1029
725static void kcryptd_io_write(struct dm_crypt_io *io) 1030static void kcryptd_io_write(struct dm_crypt_io *io)
@@ -732,9 +1037,12 @@ static void kcryptd_io(struct work_struct *work)
732{ 1037{
733 struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); 1038 struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
734 1039
735 if (bio_data_dir(io->base_bio) == READ) 1040 if (bio_data_dir(io->base_bio) == READ) {
736 kcryptd_io_read(io); 1041 crypt_inc_pending(io);
737 else 1042 if (kcryptd_io_read(io, GFP_NOIO))
1043 io->error = -ENOMEM;
1044 crypt_dec_pending(io);
1045 } else
738 kcryptd_io_write(io); 1046 kcryptd_io_write(io);
739} 1047}
740 1048
@@ -901,6 +1209,9 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
901 return; 1209 return;
902 } 1210 }
903 1211
1212 if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
1213 error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
1214
904 mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); 1215 mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
905 1216
906 if (!atomic_dec_and_test(&ctx->pending)) 1217 if (!atomic_dec_and_test(&ctx->pending))
@@ -971,34 +1282,84 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
971 } 1282 }
972} 1283}
973 1284
974static int crypt_set_key(struct crypt_config *cc, char *key) 1285static void crypt_free_tfms(struct crypt_config *cc, int cpu)
975{ 1286{
976 unsigned key_size = strlen(key) >> 1; 1287 struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
1288 unsigned i;
977 1289
978 if (cc->key_size && cc->key_size != key_size) 1290 for (i = 0; i < cc->tfms_count; i++)
1291 if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) {
1292 crypto_free_ablkcipher(cpu_cc->tfms[i]);
1293 cpu_cc->tfms[i] = NULL;
1294 }
1295}
1296
1297static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode)
1298{
1299 struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
1300 unsigned i;
1301 int err;
1302
1303 for (i = 0; i < cc->tfms_count; i++) {
1304 cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0);
1305 if (IS_ERR(cpu_cc->tfms[i])) {
1306 err = PTR_ERR(cpu_cc->tfms[i]);
1307 crypt_free_tfms(cc, cpu);
1308 return err;
1309 }
1310 }
1311
1312 return 0;
1313}
1314
1315static int crypt_setkey_allcpus(struct crypt_config *cc)
1316{
1317 unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count);
1318 int cpu, err = 0, i, r;
1319
1320 for_each_possible_cpu(cpu) {
1321 for (i = 0; i < cc->tfms_count; i++) {
1322 r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfms[i],
1323 cc->key + (i * subkey_size), subkey_size);
1324 if (r)
1325 err = r;
1326 }
1327 }
1328
1329 return err;
1330}
1331
1332static int crypt_set_key(struct crypt_config *cc, char *key)
1333{
1334 /* The key size may not be changed. */
1335 if (cc->key_size != (strlen(key) >> 1))
979 return -EINVAL; 1336 return -EINVAL;
980 1337
981 cc->key_size = key_size; /* initial settings */ 1338 /* Hyphen (which gives a key_size of zero) means there is no key. */
1339 if (!cc->key_size && strcmp(key, "-"))
1340 return -EINVAL;
982 1341
983 if ((!key_size && strcmp(key, "-")) || 1342 if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0)
984 (key_size && crypt_decode_key(cc->key, key, key_size) < 0))
985 return -EINVAL; 1343 return -EINVAL;
986 1344
987 set_bit(DM_CRYPT_KEY_VALID, &cc->flags); 1345 set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
988 1346
989 return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); 1347 return crypt_setkey_allcpus(cc);
990} 1348}
991 1349
992static int crypt_wipe_key(struct crypt_config *cc) 1350static int crypt_wipe_key(struct crypt_config *cc)
993{ 1351{
994 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); 1352 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
995 memset(&cc->key, 0, cc->key_size * sizeof(u8)); 1353 memset(&cc->key, 0, cc->key_size * sizeof(u8));
996 return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); 1354
1355 return crypt_setkey_allcpus(cc);
997} 1356}
998 1357
999static void crypt_dtr(struct dm_target *ti) 1358static void crypt_dtr(struct dm_target *ti)
1000{ 1359{
1001 struct crypt_config *cc = ti->private; 1360 struct crypt_config *cc = ti->private;
1361 struct crypt_cpu *cpu_cc;
1362 int cpu;
1002 1363
1003 ti->private = NULL; 1364 ti->private = NULL;
1004 1365
@@ -1010,6 +1371,14 @@ static void crypt_dtr(struct dm_target *ti)
1010 if (cc->crypt_queue) 1371 if (cc->crypt_queue)
1011 destroy_workqueue(cc->crypt_queue); 1372 destroy_workqueue(cc->crypt_queue);
1012 1373
1374 if (cc->cpu)
1375 for_each_possible_cpu(cpu) {
1376 cpu_cc = per_cpu_ptr(cc->cpu, cpu);
1377 if (cpu_cc->req)
1378 mempool_free(cpu_cc->req, cc->req_pool);
1379 crypt_free_tfms(cc, cpu);
1380 }
1381
1013 if (cc->bs) 1382 if (cc->bs)
1014 bioset_free(cc->bs); 1383 bioset_free(cc->bs);
1015 1384
@@ -1023,14 +1392,14 @@ static void crypt_dtr(struct dm_target *ti)
1023 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) 1392 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
1024 cc->iv_gen_ops->dtr(cc); 1393 cc->iv_gen_ops->dtr(cc);
1025 1394
1026 if (cc->tfm && !IS_ERR(cc->tfm))
1027 crypto_free_ablkcipher(cc->tfm);
1028
1029 if (cc->dev) 1395 if (cc->dev)
1030 dm_put_device(ti, cc->dev); 1396 dm_put_device(ti, cc->dev);
1031 1397
1398 if (cc->cpu)
1399 free_percpu(cc->cpu);
1400
1032 kzfree(cc->cipher); 1401 kzfree(cc->cipher);
1033 kzfree(cc->cipher_mode); 1402 kzfree(cc->cipher_string);
1034 1403
1035 /* Must zero key material before freeing */ 1404 /* Must zero key material before freeing */
1036 kzfree(cc); 1405 kzfree(cc);
@@ -1040,9 +1409,9 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1040 char *cipher_in, char *key) 1409 char *cipher_in, char *key)
1041{ 1410{
1042 struct crypt_config *cc = ti->private; 1411 struct crypt_config *cc = ti->private;
1043 char *tmp, *cipher, *chainmode, *ivmode, *ivopts; 1412 char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount;
1044 char *cipher_api = NULL; 1413 char *cipher_api = NULL;
1045 int ret = -EINVAL; 1414 int cpu, ret = -EINVAL;
1046 1415
1047 /* Convert to crypto api definition? */ 1416 /* Convert to crypto api definition? */
1048 if (strchr(cipher_in, '(')) { 1417 if (strchr(cipher_in, '(')) {
@@ -1050,23 +1419,31 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1050 return -EINVAL; 1419 return -EINVAL;
1051 } 1420 }
1052 1421
1422 cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL);
1423 if (!cc->cipher_string)
1424 goto bad_mem;
1425
1053 /* 1426 /*
1054 * Legacy dm-crypt cipher specification 1427 * Legacy dm-crypt cipher specification
1055 * cipher-mode-iv:ivopts 1428 * cipher[:keycount]-mode-iv:ivopts
1056 */ 1429 */
1057 tmp = cipher_in; 1430 tmp = cipher_in;
1058 cipher = strsep(&tmp, "-"); 1431 keycount = strsep(&tmp, "-");
1432 cipher = strsep(&keycount, ":");
1433
1434 if (!keycount)
1435 cc->tfms_count = 1;
1436 else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 ||
1437 !is_power_of_2(cc->tfms_count)) {
1438 ti->error = "Bad cipher key count specification";
1439 return -EINVAL;
1440 }
1441 cc->key_parts = cc->tfms_count;
1059 1442
1060 cc->cipher = kstrdup(cipher, GFP_KERNEL); 1443 cc->cipher = kstrdup(cipher, GFP_KERNEL);
1061 if (!cc->cipher) 1444 if (!cc->cipher)
1062 goto bad_mem; 1445 goto bad_mem;
1063 1446
1064 if (tmp) {
1065 cc->cipher_mode = kstrdup(tmp, GFP_KERNEL);
1066 if (!cc->cipher_mode)
1067 goto bad_mem;
1068 }
1069
1070 chainmode = strsep(&tmp, "-"); 1447 chainmode = strsep(&tmp, "-");
1071 ivopts = strsep(&tmp, "-"); 1448 ivopts = strsep(&tmp, "-");
1072 ivmode = strsep(&ivopts, ":"); 1449 ivmode = strsep(&ivopts, ":");
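
[Editor's note] The extended mapping specification parsed above is cipher[:keycount]-mode-iv[:ivopts], e.g. "aes:64-cbc-lmk" for a 64-key Loop-AES-compatible mapping. A userspace sketch of the same strsep()-based split (an approximation of the flow, not the kernel code itself; the example string is hypothetical):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char spec[] = "aes:64-cbc-lmk";
	char *tmp = spec, *keycount, *cipher, *chainmode, *ivopts, *ivmode;
	unsigned tfms_count = 1;

	keycount = strsep(&tmp, "-");		/* "aes:64"              */
	cipher   = strsep(&keycount, ":");	/* "aes", keycount = "64" */
	if (keycount)
		tfms_count = strtoul(keycount, NULL, 10);

	chainmode = strsep(&tmp, "-");		/* "cbc" */
	ivopts    = strsep(&tmp, "-");		/* "lmk" */
	ivmode    = strsep(&ivopts, ":");	/* "lmk", no ivopts left */

	printf("cipher=%s keys=%u chainmode=%s ivmode=%s ivopts=%s\n",
	       cipher, tfms_count, chainmode, ivmode,
	       ivopts ? ivopts : "(none)");
	return 0;
}
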
@@ -1074,10 +1451,19 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1074 if (tmp) 1451 if (tmp)
1075 DMWARN("Ignoring unexpected additional cipher options"); 1452 DMWARN("Ignoring unexpected additional cipher options");
1076 1453
1077 /* Compatibility mode for old dm-crypt mappings */ 1454 cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)) +
1455 cc->tfms_count * sizeof(*(cc->cpu->tfms)),
1456 __alignof__(struct crypt_cpu));
1457 if (!cc->cpu) {
1458 ti->error = "Cannot allocate per cpu state";
1459 goto bad_mem;
1460 }
1461
1462 /*
1463 * For compatibility with the original dm-crypt mapping format, if
1464 * only the cipher name is supplied, use cbc-plain.
1465 */
1078 if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { 1466 if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) {
1079 kfree(cc->cipher_mode);
1080 cc->cipher_mode = kstrdup("cbc-plain", GFP_KERNEL);
1081 chainmode = "cbc"; 1467 chainmode = "cbc";
1082 ivmode = "plain"; 1468 ivmode = "plain";
1083 } 1469 }
@@ -1099,11 +1485,12 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1099 } 1485 }
1100 1486
1101 /* Allocate cipher */ 1487 /* Allocate cipher */
1102 cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0); 1488 for_each_possible_cpu(cpu) {
1103 if (IS_ERR(cc->tfm)) { 1489 ret = crypt_alloc_tfms(cc, cpu, cipher_api);
1104 ret = PTR_ERR(cc->tfm); 1490 if (ret < 0) {
1105 ti->error = "Error allocating crypto tfm"; 1491 ti->error = "Error allocating crypto tfm";
1106 goto bad; 1492 goto bad;
1493 }
1107 } 1494 }
1108 1495
1109 /* Initialize and set key */ 1496 /* Initialize and set key */
@@ -1114,7 +1501,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1114 } 1501 }
1115 1502
1116 /* Initialize IV */ 1503 /* Initialize IV */
1117 cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm); 1504 cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc));
1118 if (cc->iv_size) 1505 if (cc->iv_size)
1119 /* at least a 64 bit sector number should fit in our buffer */ 1506 /* at least a 64 bit sector number should fit in our buffer */
1120 cc->iv_size = max(cc->iv_size, 1507 cc->iv_size = max(cc->iv_size,
@@ -1137,7 +1524,15 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1137 cc->iv_gen_ops = &crypt_iv_benbi_ops; 1524 cc->iv_gen_ops = &crypt_iv_benbi_ops;
1138 else if (strcmp(ivmode, "null") == 0) 1525 else if (strcmp(ivmode, "null") == 0)
1139 cc->iv_gen_ops = &crypt_iv_null_ops; 1526 cc->iv_gen_ops = &crypt_iv_null_ops;
1140 else { 1527 else if (strcmp(ivmode, "lmk") == 0) {
1528 cc->iv_gen_ops = &crypt_iv_lmk_ops;
1529 /* Version 2 and 3 is recognised according
1530 * to length of provided multi-key string.
1531 * If present (version 3), last key is used as IV seed.
1532 */
1533 if (cc->key_size % cc->key_parts)
1534 cc->key_parts++;
1535 } else {
1141 ret = -EINVAL; 1536 ret = -EINVAL;
1142 ti->error = "Invalid IV mode"; 1537 ti->error = "Invalid IV mode";
1143 goto bad; 1538 goto bad;
@@ -1194,6 +1589,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1194 ti->error = "Cannot allocate encryption context"; 1589 ti->error = "Cannot allocate encryption context";
1195 return -ENOMEM; 1590 return -ENOMEM;
1196 } 1591 }
1592 cc->key_size = key_size;
1197 1593
1198 ti->private = cc; 1594 ti->private = cc;
1199 ret = crypt_ctr_cipher(ti, argv[0], argv[1]); 1595 ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
@@ -1208,9 +1604,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1208 } 1604 }
1209 1605
1210 cc->dmreq_start = sizeof(struct ablkcipher_request); 1606 cc->dmreq_start = sizeof(struct ablkcipher_request);
1211 cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm); 1607 cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc));
1212 cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); 1608 cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment());
1213 cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) & 1609 cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) &
1214 ~(crypto_tfm_ctx_alignment() - 1); 1610 ~(crypto_tfm_ctx_alignment() - 1);
1215 1611
1216 cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + 1612 cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
@@ -1219,7 +1615,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1219 ti->error = "Cannot allocate crypt request mempool"; 1615 ti->error = "Cannot allocate crypt request mempool";
1220 goto bad; 1616 goto bad;
1221 } 1617 }
1222 cc->req = NULL;
1223 1618
1224 cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); 1619 cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
1225 if (!cc->page_pool) { 1620 if (!cc->page_pool) {
@@ -1252,13 +1647,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1252 cc->start = tmpll; 1647 cc->start = tmpll;
1253 1648
1254 ret = -ENOMEM; 1649 ret = -ENOMEM;
1255 cc->io_queue = create_singlethread_workqueue("kcryptd_io"); 1650 cc->io_queue = alloc_workqueue("kcryptd_io",
1651 WQ_NON_REENTRANT|
1652 WQ_MEM_RECLAIM,
1653 1);
1256 if (!cc->io_queue) { 1654 if (!cc->io_queue) {
1257 ti->error = "Couldn't create kcryptd io queue"; 1655 ti->error = "Couldn't create kcryptd io queue";
1258 goto bad; 1656 goto bad;
1259 } 1657 }
1260 1658
1261 cc->crypt_queue = create_singlethread_workqueue("kcryptd"); 1659 cc->crypt_queue = alloc_workqueue("kcryptd",
1660 WQ_NON_REENTRANT|
1661 WQ_CPU_INTENSIVE|
1662 WQ_MEM_RECLAIM,
1663 1);
1262 if (!cc->crypt_queue) { 1664 if (!cc->crypt_queue) {
1263 ti->error = "Couldn't create kcryptd queue"; 1665 ti->error = "Couldn't create kcryptd queue";
1264 goto bad; 1666 goto bad;
@@ -1286,9 +1688,10 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
1286 1688
1287 io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector)); 1689 io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector));
1288 1690
1289 if (bio_data_dir(io->base_bio) == READ) 1691 if (bio_data_dir(io->base_bio) == READ) {
1290 kcryptd_queue_io(io); 1692 if (kcryptd_io_read(io, GFP_NOWAIT))
1291 else 1693 kcryptd_queue_io(io);
1694 } else
1292 kcryptd_queue_crypt(io); 1695 kcryptd_queue_crypt(io);
1293 1696
1294 return DM_MAPIO_SUBMITTED; 1697 return DM_MAPIO_SUBMITTED;
@@ -1306,10 +1709,7 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
1306 break; 1709 break;
1307 1710
1308 case STATUSTYPE_TABLE: 1711 case STATUSTYPE_TABLE:
1309 if (cc->cipher_mode) 1712 DMEMIT("%s ", cc->cipher_string);
1310 DMEMIT("%s-%s ", cc->cipher, cc->cipher_mode);
1311 else
1312 DMEMIT("%s ", cc->cipher);
1313 1713
1314 if (cc->key_size > 0) { 1714 if (cc->key_size > 0) {
1315 if ((maxlen - sz) < ((cc->key_size << 1) + 1)) 1715 if ((maxlen - sz) < ((cc->key_size << 1) + 1))
@@ -1421,7 +1821,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
1421 1821
1422static struct target_type crypt_target = { 1822static struct target_type crypt_target = {
1423 .name = "crypt", 1823 .name = "crypt",
1424 .version = {1, 7, 0}, 1824 .version = {1, 10, 0},
1425 .module = THIS_MODULE, 1825 .module = THIS_MODULE,
1426 .ctr = crypt_ctr, 1826 .ctr = crypt_ctr,
1427 .dtr = crypt_dtr, 1827 .dtr = crypt_dtr,
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index baa11912cc94..f18375dcedd9 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -352,7 +352,7 @@ static int __init dm_delay_init(void)
352{ 352{
353 int r = -ENOMEM; 353 int r = -ENOMEM;
354 354
355 kdelayd_wq = create_workqueue("kdelayd"); 355 kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
356 if (!kdelayd_wq) { 356 if (!kdelayd_wq) {
357 DMERR("Couldn't start kdelayd"); 357 DMERR("Couldn't start kdelayd");
358 goto bad_queue; 358 goto bad_queue;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 4b54618b4159..6d12775a1061 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -295,19 +295,55 @@ retry:
295 DMWARN("remove_all left %d open device(s)", dev_skipped); 295 DMWARN("remove_all left %d open device(s)", dev_skipped);
296} 296}
297 297
298/*
299 * Set the uuid of a hash_cell that isn't already set.
300 */
301static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid)
302{
303 mutex_lock(&dm_hash_cells_mutex);
304 hc->uuid = new_uuid;
305 mutex_unlock(&dm_hash_cells_mutex);
306
307 list_add(&hc->uuid_list, _uuid_buckets + hash_str(new_uuid));
308}
309
310/*
311 * Changes the name of a hash_cell and returns the old name for
312 * the caller to free.
313 */
314static char *__change_cell_name(struct hash_cell *hc, char *new_name)
315{
316 char *old_name;
317
318 /*
319 * Rename and move the name cell.
320 */
321 list_del(&hc->name_list);
322 old_name = hc->name;
323
324 mutex_lock(&dm_hash_cells_mutex);
325 hc->name = new_name;
326 mutex_unlock(&dm_hash_cells_mutex);
327
328 list_add(&hc->name_list, _name_buckets + hash_str(new_name));
329
330 return old_name;
331}
332
298static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, 333static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
299 const char *new) 334 const char *new)
300{ 335{
301 char *new_name, *old_name; 336 char *new_data, *old_name = NULL;
302 struct hash_cell *hc; 337 struct hash_cell *hc;
303 struct dm_table *table; 338 struct dm_table *table;
304 struct mapped_device *md; 339 struct mapped_device *md;
340 unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
305 341
306 /* 342 /*
307 * duplicate new. 343 * duplicate new.
308 */ 344 */
309 new_name = kstrdup(new, GFP_KERNEL); 345 new_data = kstrdup(new, GFP_KERNEL);
310 if (!new_name) 346 if (!new_data)
311 return ERR_PTR(-ENOMEM); 347 return ERR_PTR(-ENOMEM);
312 348
313 down_write(&_hash_lock); 349 down_write(&_hash_lock);
@@ -315,13 +351,19 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
315 /* 351 /*
316 * Is new free ? 352 * Is new free ?
317 */ 353 */
318 hc = __get_name_cell(new); 354 if (change_uuid)
355 hc = __get_uuid_cell(new);
356 else
357 hc = __get_name_cell(new);
358
319 if (hc) { 359 if (hc) {
320 DMWARN("asked to rename to an already-existing name %s -> %s", 360 DMWARN("Unable to change %s on mapped device %s to one that "
361 "already exists: %s",
362 change_uuid ? "uuid" : "name",
321 param->name, new); 363 param->name, new);
322 dm_put(hc->md); 364 dm_put(hc->md);
323 up_write(&_hash_lock); 365 up_write(&_hash_lock);
324 kfree(new_name); 366 kfree(new_data);
325 return ERR_PTR(-EBUSY); 367 return ERR_PTR(-EBUSY);
326 } 368 }
327 369
@@ -330,22 +372,30 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
330 */ 372 */
331 hc = __get_name_cell(param->name); 373 hc = __get_name_cell(param->name);
332 if (!hc) { 374 if (!hc) {
333 DMWARN("asked to rename a non-existent device %s -> %s", 375 DMWARN("Unable to rename non-existent device, %s to %s%s",
334 param->name, new); 376 param->name, change_uuid ? "uuid " : "", new);
335 up_write(&_hash_lock); 377 up_write(&_hash_lock);
336 kfree(new_name); 378 kfree(new_data);
337 return ERR_PTR(-ENXIO); 379 return ERR_PTR(-ENXIO);
338 } 380 }
339 381
340 /* 382 /*
341 * rename and move the name cell. 383 * Does this device already have a uuid?
342 */ 384 */
343 list_del(&hc->name_list); 385 if (change_uuid && hc->uuid) {
344 old_name = hc->name; 386 DMWARN("Unable to change uuid of mapped device %s to %s "
345 mutex_lock(&dm_hash_cells_mutex); 387 "because uuid is already set to %s",
346 hc->name = new_name; 388 param->name, new, hc->uuid);
347 mutex_unlock(&dm_hash_cells_mutex); 389 dm_put(hc->md);
348 list_add(&hc->name_list, _name_buckets + hash_str(new_name)); 390 up_write(&_hash_lock);
391 kfree(new_data);
392 return ERR_PTR(-EINVAL);
393 }
394
395 if (change_uuid)
396 __set_cell_uuid(hc, new_data);
397 else
398 old_name = __change_cell_name(hc, new_data);
349 399
350 /* 400 /*
351 * Wake up any dm event waiters. 401 * Wake up any dm event waiters.
@@ -729,7 +779,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
729 hc = __find_device_hash_cell(param); 779 hc = __find_device_hash_cell(param);
730 780
731 if (!hc) { 781 if (!hc) {
732 DMWARN("device doesn't appear to be in the dev hash table."); 782 DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
733 up_write(&_hash_lock); 783 up_write(&_hash_lock);
734 return -ENXIO; 784 return -ENXIO;
735 } 785 }
@@ -741,7 +791,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
741 */ 791 */
742 r = dm_lock_for_deletion(md); 792 r = dm_lock_for_deletion(md);
743 if (r) { 793 if (r) {
744 DMWARN("unable to remove open device %s", hc->name); 794 DMDEBUG_LIMIT("unable to remove open device %s", hc->name);
745 up_write(&_hash_lock); 795 up_write(&_hash_lock);
746 dm_put(md); 796 dm_put(md);
747 return r; 797 return r;
@@ -774,21 +824,24 @@ static int invalid_str(char *str, void *end)
774static int dev_rename(struct dm_ioctl *param, size_t param_size) 824static int dev_rename(struct dm_ioctl *param, size_t param_size)
775{ 825{
776 int r; 826 int r;
777 char *new_name = (char *) param + param->data_start; 827 char *new_data = (char *) param + param->data_start;
778 struct mapped_device *md; 828 struct mapped_device *md;
829 unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
779 830
780 if (new_name < param->data || 831 if (new_data < param->data ||
781 invalid_str(new_name, (void *) param + param_size) || 832 invalid_str(new_data, (void *) param + param_size) ||
782 strlen(new_name) > DM_NAME_LEN - 1) { 833 strlen(new_data) > (change_uuid ? DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) {
783 DMWARN("Invalid new logical volume name supplied."); 834 DMWARN("Invalid new mapped device name or uuid string supplied.");
784 return -EINVAL; 835 return -EINVAL;
785 } 836 }
786 837
787 r = check_name(new_name); 838 if (!change_uuid) {
788 if (r) 839 r = check_name(new_data);
789 return r; 840 if (r)
841 return r;
842 }
790 843
791 md = dm_hash_rename(param, new_name); 844 md = dm_hash_rename(param, new_data);
792 if (IS_ERR(md)) 845 if (IS_ERR(md))
793 return PTR_ERR(md); 846 return PTR_ERR(md);
794 847
@@ -885,7 +938,7 @@ static int do_resume(struct dm_ioctl *param)
885 938
886 hc = __find_device_hash_cell(param); 939 hc = __find_device_hash_cell(param);
887 if (!hc) { 940 if (!hc) {
888 DMWARN("device doesn't appear to be in the dev hash table."); 941 DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
889 up_write(&_hash_lock); 942 up_write(&_hash_lock);
890 return -ENXIO; 943 return -ENXIO;
891 } 944 }
@@ -1212,7 +1265,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
1212 1265
1213 hc = __find_device_hash_cell(param); 1266 hc = __find_device_hash_cell(param);
1214 if (!hc) { 1267 if (!hc) {
1215 DMWARN("device doesn't appear to be in the dev hash table."); 1268 DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
1216 up_write(&_hash_lock); 1269 up_write(&_hash_lock);
1217 return -ENXIO; 1270 return -ENXIO;
1218 } 1271 }
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index d8587bac5682..924f5f0084c2 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -37,6 +37,13 @@ struct dm_kcopyd_client {
37 unsigned int nr_pages; 37 unsigned int nr_pages;
38 unsigned int nr_free_pages; 38 unsigned int nr_free_pages;
39 39
40 /*
41 * Block devices to unplug.
42 * Non-NULL pointer means that a block device has some pending requests
43 * and needs to be unplugged.
44 */
45 struct block_device *unplug[2];
46
40 struct dm_io_client *io_client; 47 struct dm_io_client *io_client;
41 48
42 wait_queue_head_t destroyq; 49 wait_queue_head_t destroyq;
@@ -308,6 +315,31 @@ static int run_complete_job(struct kcopyd_job *job)
308 return 0; 315 return 0;
309} 316}
310 317
318/*
319 * Unplug the block device at the specified index.
320 */
321static void unplug(struct dm_kcopyd_client *kc, int rw)
322{
323 if (kc->unplug[rw] != NULL) {
324 blk_unplug(bdev_get_queue(kc->unplug[rw]));
325 kc->unplug[rw] = NULL;
326 }
327}
328
329/*
330 * Prepare block device unplug. If there's another device
331 * to be unplugged at the same array index, we unplug that
332 * device first.
333 */
334static void prepare_unplug(struct dm_kcopyd_client *kc, int rw,
335 struct block_device *bdev)
336{
337 if (likely(kc->unplug[rw] == bdev))
338 return;
339 unplug(kc, rw);
340 kc->unplug[rw] = bdev;
341}
342
311static void complete_io(unsigned long error, void *context) 343static void complete_io(unsigned long error, void *context)
312{ 344{
313 struct kcopyd_job *job = (struct kcopyd_job *) context; 345 struct kcopyd_job *job = (struct kcopyd_job *) context;
@@ -345,7 +377,7 @@ static int run_io_job(struct kcopyd_job *job)
345{ 377{
346 int r; 378 int r;
347 struct dm_io_request io_req = { 379 struct dm_io_request io_req = {
348 .bi_rw = job->rw | REQ_SYNC | REQ_UNPLUG, 380 .bi_rw = job->rw,
349 .mem.type = DM_IO_PAGE_LIST, 381 .mem.type = DM_IO_PAGE_LIST,
350 .mem.ptr.pl = job->pages, 382 .mem.ptr.pl = job->pages,
351 .mem.offset = job->offset, 383 .mem.offset = job->offset,
@@ -354,10 +386,16 @@ static int run_io_job(struct kcopyd_job *job)
354 .client = job->kc->io_client, 386 .client = job->kc->io_client,
355 }; 387 };
356 388
357 if (job->rw == READ) 389 if (job->rw == READ) {
358 r = dm_io(&io_req, 1, &job->source, NULL); 390 r = dm_io(&io_req, 1, &job->source, NULL);
359 else 391 prepare_unplug(job->kc, READ, job->source.bdev);
392 } else {
393 if (job->num_dests > 1)
394 io_req.bi_rw |= REQ_UNPLUG;
360 r = dm_io(&io_req, job->num_dests, job->dests, NULL); 395 r = dm_io(&io_req, job->num_dests, job->dests, NULL);
396 if (!(io_req.bi_rw & REQ_UNPLUG))
397 prepare_unplug(job->kc, WRITE, job->dests[0].bdev);
398 }
361 399
362 return r; 400 return r;
363} 401}
@@ -435,10 +473,18 @@ static void do_work(struct work_struct *work)
435 * Pages jobs when successful will jump onto the io jobs 473 * Pages jobs when successful will jump onto the io jobs
436 * list. io jobs call wake when they complete and it all 474 * list. io jobs call wake when they complete and it all
437 * starts again. 475 * starts again.
476 *
 477 * Note that io_jobs add block devices to the unplug array;
 478 * this array is cleared with "unplug" calls. It is thus
 479 * forbidden to run complete_jobs after io_jobs and before
 480 * unplug because the block device could be destroyed in
 481 * the job completion callback.
438 */ 482 */
439 process_jobs(&kc->complete_jobs, kc, run_complete_job); 483 process_jobs(&kc->complete_jobs, kc, run_complete_job);
440 process_jobs(&kc->pages_jobs, kc, run_pages_job); 484 process_jobs(&kc->pages_jobs, kc, run_pages_job);
441 process_jobs(&kc->io_jobs, kc, run_io_job); 485 process_jobs(&kc->io_jobs, kc, run_io_job);
486 unplug(kc, READ);
487 unplug(kc, WRITE);
442} 488}
443 489
444/* 490/*
@@ -619,12 +665,15 @@ int dm_kcopyd_client_create(unsigned int nr_pages,
619 INIT_LIST_HEAD(&kc->io_jobs); 665 INIT_LIST_HEAD(&kc->io_jobs);
620 INIT_LIST_HEAD(&kc->pages_jobs); 666 INIT_LIST_HEAD(&kc->pages_jobs);
621 667
668 memset(kc->unplug, 0, sizeof(kc->unplug));
669
622 kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache); 670 kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
623 if (!kc->job_pool) 671 if (!kc->job_pool)
624 goto bad_slab; 672 goto bad_slab;
625 673
626 INIT_WORK(&kc->kcopyd_work, do_work); 674 INIT_WORK(&kc->kcopyd_work, do_work);
627 kc->kcopyd_wq = create_singlethread_workqueue("kcopyd"); 675 kc->kcopyd_wq = alloc_workqueue("kcopyd",
676 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
628 if (!kc->kcopyd_wq) 677 if (!kc->kcopyd_wq)
629 goto bad_workqueue; 678 goto bad_workqueue;
630 679
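
For orientation, here is a minimal sketch (not part of the patch) of the per-direction unplug batching that the dm-kcopyd hunks above introduce: one pending block_device is remembered per READ/WRITE slot, the previously remembered device is unplugged whenever a different one turns up, and both slots are drained at the end of each work-queue pass. Only the blk_unplug()/bdev_get_queue() calls that the patch itself uses are assumed; struct unplug_cache and the helper names are hypothetical.

/* Illustrative only -- mirrors prepare_unplug()/unplug() above. */
#include <linux/blkdev.h>

struct unplug_cache {
        struct block_device *pending[2];        /* indexed by READ / WRITE */
};

static void cache_unplug_one(struct unplug_cache *c, int rw)
{
        if (c->pending[rw]) {
                blk_unplug(bdev_get_queue(c->pending[rw]));
                c->pending[rw] = NULL;
        }
}

static void cache_note_io(struct unplug_cache *c, int rw,
                          struct block_device *bdev)
{
        if (c->pending[rw] == bdev)
                return;                 /* same device: keep batching */
        cache_unplug_one(c, rw);        /* new device: flush the old one */
        c->pending[rw] = bdev;
}

Draining both slots right after the io_jobs pass, before further completion callbacks can run, is what keeps the cached pointers from dangling.
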
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 1ed0094f064b..aa2e0c374ab3 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -12,12 +12,22 @@
12 12
13#include "dm-log-userspace-transfer.h" 13#include "dm-log-userspace-transfer.h"
14 14
15#define DM_LOG_USERSPACE_VSN "1.1.0"
16
15struct flush_entry { 17struct flush_entry {
16 int type; 18 int type;
17 region_t region; 19 region_t region;
18 struct list_head list; 20 struct list_head list;
19}; 21};
20 22
23/*
 24 * This limit on the number of mark and clear requests is, to a degree,
25 * arbitrary. However, there is some basis for the choice in the limits
26 * imposed on the size of data payload by dm-log-userspace-transfer.c:
27 * dm_consult_userspace().
28 */
29#define MAX_FLUSH_GROUP_COUNT 32
30
21struct log_c { 31struct log_c {
22 struct dm_target *ti; 32 struct dm_target *ti;
23 uint32_t region_size; 33 uint32_t region_size;
@@ -37,8 +47,15 @@ struct log_c {
37 */ 47 */
38 uint64_t in_sync_hint; 48 uint64_t in_sync_hint;
39 49
50 /*
51 * Mark and clear requests are held until a flush is issued
52 * so that we can group, and thereby limit, the amount of
53 * network traffic between kernel and userspace. The 'flush_lock'
54 * is used to protect these lists.
55 */
40 spinlock_t flush_lock; 56 spinlock_t flush_lock;
41 struct list_head flush_list; /* only for clear and mark requests */ 57 struct list_head mark_list;
58 struct list_head clear_list;
42}; 59};
43 60
44static mempool_t *flush_entry_pool; 61static mempool_t *flush_entry_pool;
@@ -169,7 +186,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
169 186
170 strncpy(lc->uuid, argv[0], DM_UUID_LEN); 187 strncpy(lc->uuid, argv[0], DM_UUID_LEN);
171 spin_lock_init(&lc->flush_lock); 188 spin_lock_init(&lc->flush_lock);
172 INIT_LIST_HEAD(&lc->flush_list); 189 INIT_LIST_HEAD(&lc->mark_list);
190 INIT_LIST_HEAD(&lc->clear_list);
173 191
174 str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); 192 str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
175 if (str_size < 0) { 193 if (str_size < 0) {
@@ -181,8 +199,11 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
181 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, 199 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR,
182 ctr_str, str_size, NULL, NULL); 200 ctr_str, str_size, NULL, NULL);
183 201
184 if (r == -ESRCH) { 202 if (r < 0) {
185 DMERR("Userspace log server not found"); 203 if (r == -ESRCH)
204 DMERR("Userspace log server not found");
205 else
206 DMERR("Userspace log server failed to create log");
186 goto out; 207 goto out;
187 } 208 }
188 209
@@ -214,10 +235,9 @@ out:
214 235
215static void userspace_dtr(struct dm_dirty_log *log) 236static void userspace_dtr(struct dm_dirty_log *log)
216{ 237{
217 int r;
218 struct log_c *lc = log->context; 238 struct log_c *lc = log->context;
219 239
220 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, 240 (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
221 NULL, 0, 241 NULL, 0,
222 NULL, NULL); 242 NULL, NULL);
223 243
@@ -338,6 +358,71 @@ static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
338 return (r) ? 0 : (int)in_sync; 358 return (r) ? 0 : (int)in_sync;
339} 359}
340 360
361static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
362{
363 int r = 0;
364 struct flush_entry *fe;
365
366 list_for_each_entry(fe, flush_list, list) {
367 r = userspace_do_request(lc, lc->uuid, fe->type,
368 (char *)&fe->region,
369 sizeof(fe->region),
370 NULL, NULL);
371 if (r)
372 break;
373 }
374
375 return r;
376}
377
378static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
379{
380 int r = 0;
381 int count;
382 uint32_t type = 0;
383 struct flush_entry *fe, *tmp_fe;
384 LIST_HEAD(tmp_list);
385 uint64_t group[MAX_FLUSH_GROUP_COUNT];
386
387 /*
388 * Group process the requests
389 */
390 while (!list_empty(flush_list)) {
391 count = 0;
392
393 list_for_each_entry_safe(fe, tmp_fe, flush_list, list) {
394 group[count] = fe->region;
395 count++;
396
397 list_del(&fe->list);
398 list_add(&fe->list, &tmp_list);
399
400 type = fe->type;
401 if (count >= MAX_FLUSH_GROUP_COUNT)
402 break;
403 }
404
405 r = userspace_do_request(lc, lc->uuid, type,
406 (char *)(group),
407 count * sizeof(uint64_t),
408 NULL, NULL);
409 if (r) {
410 /* Group send failed. Attempt one-by-one. */
411 list_splice_init(&tmp_list, flush_list);
412 r = flush_one_by_one(lc, flush_list);
413 break;
414 }
415 }
416
417 /*
418 * Must collect flush_entrys that were successfully processed
419 * as a group so that they will be free'd by the caller.
420 */
421 list_splice_init(&tmp_list, flush_list);
422
423 return r;
424}
425
341/* 426/*
342 * userspace_flush 427 * userspace_flush
343 * 428 *
@@ -360,31 +445,25 @@ static int userspace_flush(struct dm_dirty_log *log)
360 int r = 0; 445 int r = 0;
361 unsigned long flags; 446 unsigned long flags;
362 struct log_c *lc = log->context; 447 struct log_c *lc = log->context;
363 LIST_HEAD(flush_list); 448 LIST_HEAD(mark_list);
449 LIST_HEAD(clear_list);
364 struct flush_entry *fe, *tmp_fe; 450 struct flush_entry *fe, *tmp_fe;
365 451
366 spin_lock_irqsave(&lc->flush_lock, flags); 452 spin_lock_irqsave(&lc->flush_lock, flags);
367 list_splice_init(&lc->flush_list, &flush_list); 453 list_splice_init(&lc->mark_list, &mark_list);
454 list_splice_init(&lc->clear_list, &clear_list);
368 spin_unlock_irqrestore(&lc->flush_lock, flags); 455 spin_unlock_irqrestore(&lc->flush_lock, flags);
369 456
370 if (list_empty(&flush_list)) 457 if (list_empty(&mark_list) && list_empty(&clear_list))
371 return 0; 458 return 0;
372 459
373 /* 460 r = flush_by_group(lc, &mark_list);
374 * FIXME: Count up requests, group request types, 461 if (r)
375 * allocate memory to stick all requests in and 462 goto fail;
376 * send to server in one go. Failing the allocation,
377 * do it one by one.
378 */
379 463
380 list_for_each_entry(fe, &flush_list, list) { 464 r = flush_by_group(lc, &clear_list);
381 r = userspace_do_request(lc, lc->uuid, fe->type, 465 if (r)
382 (char *)&fe->region, 466 goto fail;
383 sizeof(fe->region),
384 NULL, NULL);
385 if (r)
386 goto fail;
387 }
388 467
389 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, 468 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
390 NULL, 0, NULL, NULL); 469 NULL, 0, NULL, NULL);
@@ -395,7 +474,11 @@ fail:
395 * Calling code will receive an error and will know that 474 * Calling code will receive an error and will know that
396 * the log facility has failed. 475 * the log facility has failed.
397 */ 476 */
398 list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) { 477 list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) {
478 list_del(&fe->list);
479 mempool_free(fe, flush_entry_pool);
480 }
481 list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) {
399 list_del(&fe->list); 482 list_del(&fe->list);
400 mempool_free(fe, flush_entry_pool); 483 mempool_free(fe, flush_entry_pool);
401 } 484 }
@@ -425,7 +508,7 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
425 spin_lock_irqsave(&lc->flush_lock, flags); 508 spin_lock_irqsave(&lc->flush_lock, flags);
426 fe->type = DM_ULOG_MARK_REGION; 509 fe->type = DM_ULOG_MARK_REGION;
427 fe->region = region; 510 fe->region = region;
428 list_add(&fe->list, &lc->flush_list); 511 list_add(&fe->list, &lc->mark_list);
429 spin_unlock_irqrestore(&lc->flush_lock, flags); 512 spin_unlock_irqrestore(&lc->flush_lock, flags);
430 513
431 return; 514 return;
@@ -462,7 +545,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
462 spin_lock_irqsave(&lc->flush_lock, flags); 545 spin_lock_irqsave(&lc->flush_lock, flags);
463 fe->type = DM_ULOG_CLEAR_REGION; 546 fe->type = DM_ULOG_CLEAR_REGION;
464 fe->region = region; 547 fe->region = region;
465 list_add(&fe->list, &lc->flush_list); 548 list_add(&fe->list, &lc->clear_list);
466 spin_unlock_irqrestore(&lc->flush_lock, flags); 549 spin_unlock_irqrestore(&lc->flush_lock, flags);
467 550
468 return; 551 return;
@@ -684,7 +767,7 @@ static int __init userspace_dirty_log_init(void)
684 return r; 767 return r;
685 } 768 }
686 769
687 DMINFO("version 1.0.0 loaded"); 770 DMINFO("version " DM_LOG_USERSPACE_VSN " loaded");
688 return 0; 771 return 0;
689} 772}
690 773
@@ -694,7 +777,7 @@ static void __exit userspace_dirty_log_exit(void)
694 dm_ulog_tfr_exit(); 777 dm_ulog_tfr_exit();
695 mempool_destroy(flush_entry_pool); 778 mempool_destroy(flush_entry_pool);
696 779
697 DMINFO("version 1.0.0 unloaded"); 780 DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded");
698 return; 781 return;
699} 782}
700 783
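
To illustrate what the grouped mark/clear traffic looks like from the receiving side: because flush_by_group() above packs count * sizeof(uint64_t) bytes into a single request, a userspace log server sees one DM_ULOG_MARK_REGION or DM_ULOG_CLEAR_REGION request whose payload is simply a packed array of up to MAX_FLUSH_GROUP_COUNT region numbers. The handle_grouped_request() entry point and its (request_type, data, data_size) arguments are assumptions about the server's transport wrapper, not part of the patch.

#include <stdint.h>
#include <string.h>

static void handle_grouped_request(uint32_t request_type,
                                   const char *data, size_t data_size)
{
        uint64_t region;
        size_t i, count = data_size / sizeof(uint64_t);

        for (i = 0; i < count; i++) {
                memcpy(&region, data + i * sizeof(region), sizeof(region));
                /* mark or clear 'region' depending on request_type */
        }
        (void)request_type;     /* DM_ULOG_MARK_REGION or DM_ULOG_CLEAR_REGION */
}
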
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 075cbcf8a9f5..049eaf12aaab 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -198,6 +198,7 @@ resend:
198 198
199 memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); 199 memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg));
200 memcpy(tfr->uuid, uuid, DM_UUID_LEN); 200 memcpy(tfr->uuid, uuid, DM_UUID_LEN);
201 tfr->version = DM_ULOG_REQUEST_VERSION;
201 tfr->luid = luid; 202 tfr->luid = luid;
202 tfr->seq = dm_ulog_seq++; 203 tfr->seq = dm_ulog_seq++;
203 204
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 33420e68d153..6951536ea29c 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -455,7 +455,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
455 r = PTR_ERR(lc->io_req.client); 455 r = PTR_ERR(lc->io_req.client);
456 DMWARN("couldn't allocate disk io client"); 456 DMWARN("couldn't allocate disk io client");
457 kfree(lc); 457 kfree(lc);
458 return -ENOMEM; 458 return r;
459 } 459 }
460 460
461 lc->disk_header = vmalloc(buf_size); 461 lc->disk_header = vmalloc(buf_size);
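
The dm-log hunk above is the usual ERR_PTR propagation idiom: when a constructor hands back an encoded error pointer, return that specific error instead of assuming -ENOMEM. A minimal sketch of the pattern follows; struct foo and foo_create() are placeholders, while IS_ERR()/PTR_ERR() from <linux/err.h> are the real interfaces.

#include <linux/err.h>

static int open_foo(struct foo **out)
{
        struct foo *f = foo_create();           /* may return ERR_PTR(-E...) */

        if (IS_ERR(f))
                return PTR_ERR(f);              /* keep the real reason */

        *out = f;
        return 0;
}
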
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 487ecda90ad4..b82d28819e2a 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -23,6 +23,8 @@
23 23
24#define DM_MSG_PREFIX "multipath" 24#define DM_MSG_PREFIX "multipath"
25#define MESG_STR(x) x, sizeof(x) 25#define MESG_STR(x) x, sizeof(x)
26#define DM_PG_INIT_DELAY_MSECS 2000
27#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
26 28
27/* Path properties */ 29/* Path properties */
28struct pgpath { 30struct pgpath {
@@ -33,8 +35,7 @@ struct pgpath {
33 unsigned fail_count; /* Cumulative failure count */ 35 unsigned fail_count; /* Cumulative failure count */
34 36
35 struct dm_path path; 37 struct dm_path path;
36 struct work_struct deactivate_path; 38 struct delayed_work activate_path;
37 struct work_struct activate_path;
38}; 39};
39 40
40#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) 41#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
@@ -65,11 +66,15 @@ struct multipath {
65 66
66 const char *hw_handler_name; 67 const char *hw_handler_name;
67 char *hw_handler_params; 68 char *hw_handler_params;
69
68 unsigned nr_priority_groups; 70 unsigned nr_priority_groups;
69 struct list_head priority_groups; 71 struct list_head priority_groups;
72
73 wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
74
70 unsigned pg_init_required; /* pg_init needs calling? */ 75 unsigned pg_init_required; /* pg_init needs calling? */
71 unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ 76 unsigned pg_init_in_progress; /* Only one pg_init allowed at once */
72 wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ 77 unsigned pg_init_delay_retry; /* Delay pg_init retry? */
73 78
74 unsigned nr_valid_paths; /* Total number of usable paths */ 79 unsigned nr_valid_paths; /* Total number of usable paths */
75 struct pgpath *current_pgpath; 80 struct pgpath *current_pgpath;
@@ -82,6 +87,7 @@ struct multipath {
82 unsigned saved_queue_if_no_path;/* Saved state during suspension */ 87 unsigned saved_queue_if_no_path;/* Saved state during suspension */
83 unsigned pg_init_retries; /* Number of times to retry pg_init */ 88 unsigned pg_init_retries; /* Number of times to retry pg_init */
84 unsigned pg_init_count; /* Number of times pg_init called */ 89 unsigned pg_init_count; /* Number of times pg_init called */
90 unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */
85 91
86 struct work_struct process_queued_ios; 92 struct work_struct process_queued_ios;
87 struct list_head queued_ios; 93 struct list_head queued_ios;
@@ -116,7 +122,6 @@ static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
116static void process_queued_ios(struct work_struct *work); 122static void process_queued_ios(struct work_struct *work);
117static void trigger_event(struct work_struct *work); 123static void trigger_event(struct work_struct *work);
118static void activate_path(struct work_struct *work); 124static void activate_path(struct work_struct *work);
119static void deactivate_path(struct work_struct *work);
120 125
121 126
122/*----------------------------------------------- 127/*-----------------------------------------------
@@ -129,8 +134,7 @@ static struct pgpath *alloc_pgpath(void)
129 134
130 if (pgpath) { 135 if (pgpath) {
131 pgpath->is_active = 1; 136 pgpath->is_active = 1;
132 INIT_WORK(&pgpath->deactivate_path, deactivate_path); 137 INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
133 INIT_WORK(&pgpath->activate_path, activate_path);
134 } 138 }
135 139
136 return pgpath; 140 return pgpath;
@@ -141,14 +145,6 @@ static void free_pgpath(struct pgpath *pgpath)
141 kfree(pgpath); 145 kfree(pgpath);
142} 146}
143 147
144static void deactivate_path(struct work_struct *work)
145{
146 struct pgpath *pgpath =
147 container_of(work, struct pgpath, deactivate_path);
148
149 blk_abort_queue(pgpath->path.dev->bdev->bd_disk->queue);
150}
151
152static struct priority_group *alloc_priority_group(void) 148static struct priority_group *alloc_priority_group(void)
153{ 149{
154 struct priority_group *pg; 150 struct priority_group *pg;
@@ -199,6 +195,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
199 INIT_LIST_HEAD(&m->queued_ios); 195 INIT_LIST_HEAD(&m->queued_ios);
200 spin_lock_init(&m->lock); 196 spin_lock_init(&m->lock);
201 m->queue_io = 1; 197 m->queue_io = 1;
198 m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
202 INIT_WORK(&m->process_queued_ios, process_queued_ios); 199 INIT_WORK(&m->process_queued_ios, process_queued_ios);
203 INIT_WORK(&m->trigger_event, trigger_event); 200 INIT_WORK(&m->trigger_event, trigger_event);
204 init_waitqueue_head(&m->pg_init_wait); 201 init_waitqueue_head(&m->pg_init_wait);
@@ -238,14 +235,19 @@ static void free_multipath(struct multipath *m)
238static void __pg_init_all_paths(struct multipath *m) 235static void __pg_init_all_paths(struct multipath *m)
239{ 236{
240 struct pgpath *pgpath; 237 struct pgpath *pgpath;
238 unsigned long pg_init_delay = 0;
241 239
242 m->pg_init_count++; 240 m->pg_init_count++;
243 m->pg_init_required = 0; 241 m->pg_init_required = 0;
242 if (m->pg_init_delay_retry)
243 pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
244 m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
244 list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { 245 list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
245 /* Skip failed paths */ 246 /* Skip failed paths */
246 if (!pgpath->is_active) 247 if (!pgpath->is_active)
247 continue; 248 continue;
248 if (queue_work(kmpath_handlerd, &pgpath->activate_path)) 249 if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
250 pg_init_delay))
249 m->pg_init_in_progress++; 251 m->pg_init_in_progress++;
250 } 252 }
251} 253}
@@ -793,8 +795,9 @@ static int parse_features(struct arg_set *as, struct multipath *m)
793 const char *param_name; 795 const char *param_name;
794 796
795 static struct param _params[] = { 797 static struct param _params[] = {
796 {0, 3, "invalid number of feature args"}, 798 {0, 5, "invalid number of feature args"},
797 {1, 50, "pg_init_retries must be between 1 and 50"}, 799 {1, 50, "pg_init_retries must be between 1 and 50"},
800 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
798 }; 801 };
799 802
800 r = read_param(_params, shift(as), &argc, &ti->error); 803 r = read_param(_params, shift(as), &argc, &ti->error);
@@ -821,6 +824,14 @@ static int parse_features(struct arg_set *as, struct multipath *m)
821 continue; 824 continue;
822 } 825 }
823 826
827 if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) &&
828 (argc >= 1)) {
829 r = read_param(_params + 2, shift(as),
830 &m->pg_init_delay_msecs, &ti->error);
831 argc--;
832 continue;
833 }
834
824 ti->error = "Unrecognised multipath feature request"; 835 ti->error = "Unrecognised multipath feature request";
825 r = -EINVAL; 836 r = -EINVAL;
826 } while (argc && !r); 837 } while (argc && !r);
@@ -931,7 +942,7 @@ static void flush_multipath_work(struct multipath *m)
931 flush_workqueue(kmpath_handlerd); 942 flush_workqueue(kmpath_handlerd);
932 multipath_wait_for_pg_init_completion(m); 943 multipath_wait_for_pg_init_completion(m);
933 flush_workqueue(kmultipathd); 944 flush_workqueue(kmultipathd);
934 flush_scheduled_work(); 945 flush_work_sync(&m->trigger_event);
935} 946}
936 947
937static void multipath_dtr(struct dm_target *ti) 948static void multipath_dtr(struct dm_target *ti)
@@ -995,7 +1006,6 @@ static int fail_path(struct pgpath *pgpath)
995 pgpath->path.dev->name, m->nr_valid_paths); 1006 pgpath->path.dev->name, m->nr_valid_paths);
996 1007
997 schedule_work(&m->trigger_event); 1008 schedule_work(&m->trigger_event);
998 queue_work(kmultipathd, &pgpath->deactivate_path);
999 1009
1000out: 1010out:
1001 spin_unlock_irqrestore(&m->lock, flags); 1011 spin_unlock_irqrestore(&m->lock, flags);
@@ -1034,7 +1044,7 @@ static int reinstate_path(struct pgpath *pgpath)
1034 m->current_pgpath = NULL; 1044 m->current_pgpath = NULL;
1035 queue_work(kmultipathd, &m->process_queued_ios); 1045 queue_work(kmultipathd, &m->process_queued_ios);
1036 } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { 1046 } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
1037 if (queue_work(kmpath_handlerd, &pgpath->activate_path)) 1047 if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
1038 m->pg_init_in_progress++; 1048 m->pg_init_in_progress++;
1039 } 1049 }
1040 1050
@@ -1169,6 +1179,7 @@ static void pg_init_done(void *data, int errors)
1169 struct priority_group *pg = pgpath->pg; 1179 struct priority_group *pg = pgpath->pg;
1170 struct multipath *m = pg->m; 1180 struct multipath *m = pg->m;
1171 unsigned long flags; 1181 unsigned long flags;
1182 unsigned delay_retry = 0;
1172 1183
1173 /* device or driver problems */ 1184 /* device or driver problems */
1174 switch (errors) { 1185 switch (errors) {
@@ -1193,8 +1204,9 @@ static void pg_init_done(void *data, int errors)
1193 */ 1204 */
1194 bypass_pg(m, pg, 1); 1205 bypass_pg(m, pg, 1);
1195 break; 1206 break;
1196 /* TODO: For SCSI_DH_RETRY we should wait a couple seconds */
1197 case SCSI_DH_RETRY: 1207 case SCSI_DH_RETRY:
1208 /* Wait before retrying. */
1209 delay_retry = 1;
1198 case SCSI_DH_IMM_RETRY: 1210 case SCSI_DH_IMM_RETRY:
1199 case SCSI_DH_RES_TEMP_UNAVAIL: 1211 case SCSI_DH_RES_TEMP_UNAVAIL:
1200 if (pg_init_limit_reached(m, pgpath)) 1212 if (pg_init_limit_reached(m, pgpath))
@@ -1227,6 +1239,7 @@ static void pg_init_done(void *data, int errors)
1227 if (!m->pg_init_required) 1239 if (!m->pg_init_required)
1228 m->queue_io = 0; 1240 m->queue_io = 0;
1229 1241
1242 m->pg_init_delay_retry = delay_retry;
1230 queue_work(kmultipathd, &m->process_queued_ios); 1243 queue_work(kmultipathd, &m->process_queued_ios);
1231 1244
1232 /* 1245 /*
@@ -1241,7 +1254,7 @@ out:
1241static void activate_path(struct work_struct *work) 1254static void activate_path(struct work_struct *work)
1242{ 1255{
1243 struct pgpath *pgpath = 1256 struct pgpath *pgpath =
1244 container_of(work, struct pgpath, activate_path); 1257 container_of(work, struct pgpath, activate_path.work);
1245 1258
1246 scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), 1259 scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
1247 pg_init_done, pgpath); 1260 pg_init_done, pgpath);
@@ -1382,11 +1395,14 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
1382 DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count); 1395 DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count);
1383 else { 1396 else {
1384 DMEMIT("%u ", m->queue_if_no_path + 1397 DMEMIT("%u ", m->queue_if_no_path +
1385 (m->pg_init_retries > 0) * 2); 1398 (m->pg_init_retries > 0) * 2 +
1399 (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2);
1386 if (m->queue_if_no_path) 1400 if (m->queue_if_no_path)
1387 DMEMIT("queue_if_no_path "); 1401 DMEMIT("queue_if_no_path ");
1388 if (m->pg_init_retries) 1402 if (m->pg_init_retries)
1389 DMEMIT("pg_init_retries %u ", m->pg_init_retries); 1403 DMEMIT("pg_init_retries %u ", m->pg_init_retries);
1404 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
1405 DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
1390 } 1406 }
1391 1407
1392 if (!m->hw_handler_name || type == STATUSTYPE_INFO) 1408 if (!m->hw_handler_name || type == STATUSTYPE_INFO)
@@ -1655,7 +1671,7 @@ out:
1655 *---------------------------------------------------------------*/ 1671 *---------------------------------------------------------------*/
1656static struct target_type multipath_target = { 1672static struct target_type multipath_target = {
1657 .name = "multipath", 1673 .name = "multipath",
1658 .version = {1, 1, 1}, 1674 .version = {1, 2, 0},
1659 .module = THIS_MODULE, 1675 .module = THIS_MODULE,
1660 .ctr = multipath_ctr, 1676 .ctr = multipath_ctr,
1661 .dtr = multipath_dtr, 1677 .dtr = multipath_dtr,
@@ -1687,7 +1703,7 @@ static int __init dm_multipath_init(void)
1687 return -EINVAL; 1703 return -EINVAL;
1688 } 1704 }
1689 1705
1690 kmultipathd = create_workqueue("kmpathd"); 1706 kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
1691 if (!kmultipathd) { 1707 if (!kmultipathd) {
1692 DMERR("failed to create workqueue kmpathd"); 1708 DMERR("failed to create workqueue kmpathd");
1693 dm_unregister_target(&multipath_target); 1709 dm_unregister_target(&multipath_target);
@@ -1701,7 +1717,8 @@ static int __init dm_multipath_init(void)
1701 * old workqueue would also create a bottleneck in the 1717 * old workqueue would also create a bottleneck in the
1702 * path of the storage hardware device activation. 1718 * path of the storage hardware device activation.
1703 */ 1719 */
1704 kmpath_handlerd = create_singlethread_workqueue("kmpath_handlerd"); 1720 kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
1721 WQ_MEM_RECLAIM);
1705 if (!kmpath_handlerd) { 1722 if (!kmpath_handlerd) {
1706 DMERR("failed to create workqueue kmpath_handlerd"); 1723 DMERR("failed to create workqueue kmpath_handlerd");
1707 destroy_workqueue(kmultipathd); 1724 destroy_workqueue(kmultipathd);
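
A compact sketch of the delayed-retry mechanics the dm-mpath hunks above rely on: a delayed_work is initialised once, re-queued with a jiffies delay derived from pg_init_delay_msecs, and the handler recovers its container through the embedded .work member. struct my_path, my_activate() and the helpers are hypothetical; INIT_DELAYED_WORK(), queue_delayed_work() and msecs_to_jiffies() are the interfaces the patch itself uses.

#include <linux/workqueue.h>
#include <linux/jiffies.h>

struct my_path {
        struct delayed_work activate;
};

static void my_activate(struct work_struct *ws)
{
        struct my_path *p = container_of(ws, struct my_path, activate.work);

        /* ... kick off (or retry) hardware handler activation for p ... */
}

static void my_path_init(struct my_path *p)
{
        INIT_DELAYED_WORK(&p->activate, my_activate);
}

static void my_path_queue(struct workqueue_struct *wq, struct my_path *p,
                          unsigned delay_msecs)
{
        /* a delay of 0 behaves like plain queue_work() */
        queue_delayed_work(wq, &p->activate, msecs_to_jiffies(delay_msecs));
}
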
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
new file mode 100644
index 000000000000..b9e1e15ef11c
--- /dev/null
+++ b/drivers/md/dm-raid.c
@@ -0,0 +1,697 @@
1/*
2 * Copyright (C) 2010-2011 Neil Brown
3 * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved.
4 *
5 * This file is released under the GPL.
6 */
7
8#include <linux/slab.h>
9
10#include "md.h"
11#include "raid5.h"
12#include "dm.h"
13#include "bitmap.h"
14
15#define DM_MSG_PREFIX "raid"
16
17/*
18 * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then
19 * make it so the flag doesn't set anything.
20 */
21#ifndef MD_SYNC_STATE_FORCED
22#define MD_SYNC_STATE_FORCED 0
23#endif
24
25struct raid_dev {
26 /*
27 * Two DM devices, one to hold metadata and one to hold the
28 * actual data/parity. The reason for this is to not confuse
29 * ti->len and give more flexibility in altering size and
30 * characteristics.
31 *
32 * While it is possible for this device to be associated
33 * with a different physical device than the data_dev, it
34 * is intended for it to be the same.
35 * |--------- Physical Device ---------|
36 * |- meta_dev -|------ data_dev ------|
37 */
38 struct dm_dev *meta_dev;
39 struct dm_dev *data_dev;
40 struct mdk_rdev_s rdev;
41};
42
43/*
44 * Flags for rs->print_flags field.
45 */
46#define DMPF_DAEMON_SLEEP 0x1
47#define DMPF_MAX_WRITE_BEHIND 0x2
48#define DMPF_SYNC 0x4
49#define DMPF_NOSYNC 0x8
50#define DMPF_STRIPE_CACHE 0x10
51#define DMPF_MIN_RECOVERY_RATE 0x20
52#define DMPF_MAX_RECOVERY_RATE 0x40
53
54struct raid_set {
55 struct dm_target *ti;
56
57 uint64_t print_flags;
58
59 struct mddev_s md;
60 struct raid_type *raid_type;
61 struct dm_target_callbacks callbacks;
62
63 struct raid_dev dev[0];
64};
65
66/* Supported raid types and properties. */
67static struct raid_type {
68 const char *name; /* RAID algorithm. */
69 const char *descr; /* Descriptor text for logging. */
70 const unsigned parity_devs; /* # of parity devices. */
71 const unsigned minimal_devs; /* minimal # of devices in set. */
72 const unsigned level; /* RAID level. */
73 const unsigned algorithm; /* RAID algorithm. */
74} raid_types[] = {
75 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
76 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
77 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
78 {"raid5_ls", "RAID5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC},
79 {"raid5_rs", "RAID5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC},
80 {"raid6_zr", "RAID6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART},
81 {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART},
82 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
83};
84
85static struct raid_type *get_raid_type(char *name)
86{
87 int i;
88
89 for (i = 0; i < ARRAY_SIZE(raid_types); i++)
90 if (!strcmp(raid_types[i].name, name))
91 return &raid_types[i];
92
93 return NULL;
94}
95
96static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs)
97{
98 unsigned i;
99 struct raid_set *rs;
100 sector_t sectors_per_dev;
101
102 if (raid_devs <= raid_type->parity_devs) {
103 ti->error = "Insufficient number of devices";
104 return ERR_PTR(-EINVAL);
105 }
106
107 sectors_per_dev = ti->len;
108 if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
109 ti->error = "Target length not divisible by number of data devices";
110 return ERR_PTR(-EINVAL);
111 }
112
113 rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
114 if (!rs) {
115 ti->error = "Cannot allocate raid context";
116 return ERR_PTR(-ENOMEM);
117 }
118
119 mddev_init(&rs->md);
120
121 rs->ti = ti;
122 rs->raid_type = raid_type;
123 rs->md.raid_disks = raid_devs;
124 rs->md.level = raid_type->level;
125 rs->md.new_level = rs->md.level;
126 rs->md.dev_sectors = sectors_per_dev;
127 rs->md.layout = raid_type->algorithm;
128 rs->md.new_layout = rs->md.layout;
129 rs->md.delta_disks = 0;
130 rs->md.recovery_cp = 0;
131
132 for (i = 0; i < raid_devs; i++)
133 md_rdev_init(&rs->dev[i].rdev);
134
135 /*
136 * Remaining items to be initialized by further RAID params:
137 * rs->md.persistent
138 * rs->md.external
139 * rs->md.chunk_sectors
140 * rs->md.new_chunk_sectors
141 */
142
143 return rs;
144}
145
146static void context_free(struct raid_set *rs)
147{
148 int i;
149
150 for (i = 0; i < rs->md.raid_disks; i++)
151 if (rs->dev[i].data_dev)
152 dm_put_device(rs->ti, rs->dev[i].data_dev);
153
154 kfree(rs);
155}
156
157/*
158 * For every device we have two words
159 * <meta_dev>: meta device name or '-' if missing
160 * <data_dev>: data device name or '-' if missing
161 *
162 * This code parses those words.
163 */
164static int dev_parms(struct raid_set *rs, char **argv)
165{
166 int i;
167 int rebuild = 0;
168 int metadata_available = 0;
169 int ret = 0;
170
171 for (i = 0; i < rs->md.raid_disks; i++, argv += 2) {
172 rs->dev[i].rdev.raid_disk = i;
173
174 rs->dev[i].meta_dev = NULL;
175 rs->dev[i].data_dev = NULL;
176
177 /*
178 * There are no offsets, since there is a separate device
179 * for data and metadata.
180 */
181 rs->dev[i].rdev.data_offset = 0;
182 rs->dev[i].rdev.mddev = &rs->md;
183
184 if (strcmp(argv[0], "-")) {
185 rs->ti->error = "Metadata devices not supported";
186 return -EINVAL;
187 }
188
189 if (!strcmp(argv[1], "-")) {
190 if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
191 (!rs->dev[i].rdev.recovery_offset)) {
192 rs->ti->error = "Drive designated for rebuild not specified";
193 return -EINVAL;
194 }
195
196 continue;
197 }
198
199 ret = dm_get_device(rs->ti, argv[1],
200 dm_table_get_mode(rs->ti->table),
201 &rs->dev[i].data_dev);
202 if (ret) {
203 rs->ti->error = "RAID device lookup failure";
204 return ret;
205 }
206
207 rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
208 list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
209 if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
210 rebuild++;
211 }
212
213 if (metadata_available) {
214 rs->md.external = 0;
215 rs->md.persistent = 1;
216 rs->md.major_version = 2;
217 } else if (rebuild && !rs->md.recovery_cp) {
218 /*
219 * Without metadata, we will not be able to tell if the array
220 * is in-sync or not - we must assume it is not. Therefore,
221 * it is impossible to rebuild a drive.
222 *
223 * Even if there is metadata, the on-disk information may
224 * indicate that the array is not in-sync and it will then
225 * fail at that time.
226 *
227 * User could specify 'nosync' option if desperate.
228 */
229 DMERR("Unable to rebuild drive while array is not in-sync");
230 rs->ti->error = "RAID device lookup failure";
231 return -EINVAL;
232 }
233
234 return 0;
235}
236
237/*
238 * Possible arguments are...
239 * RAID456:
240 * <chunk_size> [optional_args]
241 *
242 * Optional args:
243 * [[no]sync] Force or prevent recovery of the entire array
244 * [rebuild <idx>] Rebuild the drive indicated by the index
245 * [daemon_sleep <ms>] Time between bitmap daemon work to clear bits
246 * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization
247 * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization
248 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
249 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs
250 */
251static int parse_raid_params(struct raid_set *rs, char **argv,
252 unsigned num_raid_params)
253{
254 unsigned i, rebuild_cnt = 0;
255 unsigned long value;
256 char *key;
257
258 /*
259 * First, parse the in-order required arguments
260 */
261 if ((strict_strtoul(argv[0], 10, &value) < 0) ||
262 !is_power_of_2(value) || (value < 8)) {
263 rs->ti->error = "Bad chunk size";
264 return -EINVAL;
265 }
266
267 rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
268 argv++;
269 num_raid_params--;
270
271 /*
272 * Second, parse the unordered optional arguments
273 */
274 for (i = 0; i < rs->md.raid_disks; i++)
275 set_bit(In_sync, &rs->dev[i].rdev.flags);
276
277 for (i = 0; i < num_raid_params; i++) {
278 if (!strcmp(argv[i], "nosync")) {
279 rs->md.recovery_cp = MaxSector;
280 rs->print_flags |= DMPF_NOSYNC;
281 rs->md.flags |= MD_SYNC_STATE_FORCED;
282 continue;
283 }
284 if (!strcmp(argv[i], "sync")) {
285 rs->md.recovery_cp = 0;
286 rs->print_flags |= DMPF_SYNC;
287 rs->md.flags |= MD_SYNC_STATE_FORCED;
288 continue;
289 }
290
291 /* The rest of the optional arguments come in key/value pairs */
292 if ((i + 1) >= num_raid_params) {
293 rs->ti->error = "Wrong number of raid parameters given";
294 return -EINVAL;
295 }
296
297 key = argv[i++];
298 if (strict_strtoul(argv[i], 10, &value) < 0) {
299 rs->ti->error = "Bad numerical argument given in raid params";
300 return -EINVAL;
301 }
302
303 if (!strcmp(key, "rebuild")) {
304 if (++rebuild_cnt > rs->raid_type->parity_devs) {
305 rs->ti->error = "Too many rebuild drives given";
306 return -EINVAL;
307 }
308 if (value > rs->md.raid_disks) {
309 rs->ti->error = "Invalid rebuild index given";
310 return -EINVAL;
311 }
312 clear_bit(In_sync, &rs->dev[value].rdev.flags);
313 rs->dev[value].rdev.recovery_offset = 0;
314 } else if (!strcmp(key, "max_write_behind")) {
315 rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
316
317 /*
318 * In device-mapper, we specify things in sectors, but
319 * MD records this value in kB
320 */
321 value /= 2;
322 if (value > COUNTER_MAX) {
323 rs->ti->error = "Max write-behind limit out of range";
324 return -EINVAL;
325 }
326 rs->md.bitmap_info.max_write_behind = value;
327 } else if (!strcmp(key, "daemon_sleep")) {
328 rs->print_flags |= DMPF_DAEMON_SLEEP;
329 if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
330 rs->ti->error = "daemon sleep period out of range";
331 return -EINVAL;
332 }
333 rs->md.bitmap_info.daemon_sleep = value;
334 } else if (!strcmp(key, "stripe_cache")) {
335 rs->print_flags |= DMPF_STRIPE_CACHE;
336
337 /*
338 * In device-mapper, we specify things in sectors, but
339 * MD records this value in kB
340 */
341 value /= 2;
342
343 if (rs->raid_type->level < 5) {
344 rs->ti->error = "Inappropriate argument: stripe_cache";
345 return -EINVAL;
346 }
347 if (raid5_set_cache_size(&rs->md, (int)value)) {
348 rs->ti->error = "Bad stripe_cache size";
349 return -EINVAL;
350 }
351 } else if (!strcmp(key, "min_recovery_rate")) {
352 rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
353 if (value > INT_MAX) {
354 rs->ti->error = "min_recovery_rate out of range";
355 return -EINVAL;
356 }
357 rs->md.sync_speed_min = (int)value;
358 } else if (!strcmp(key, "max_recovery_rate")) {
359 rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
360 if (value > INT_MAX) {
361 rs->ti->error = "max_recovery_rate out of range";
362 return -EINVAL;
363 }
364 rs->md.sync_speed_max = (int)value;
365 } else {
366 DMERR("Unable to parse RAID parameter: %s", key);
367 rs->ti->error = "Unable to parse RAID parameters";
368 return -EINVAL;
369 }
370 }
371
372 /* Assume there are no metadata devices until the drives are parsed */
373 rs->md.persistent = 0;
374 rs->md.external = 1;
375
376 return 0;
377}
378
379static void do_table_event(struct work_struct *ws)
380{
381 struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
382
383 dm_table_event(rs->ti->table);
384}
385
386static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
387{
388 struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
389
390 return md_raid5_congested(&rs->md, bits);
391}
392
393static void raid_unplug(struct dm_target_callbacks *cb)
394{
395 struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
396
397 md_raid5_unplug_device(rs->md.private);
398}
399
400/*
401 * Construct a RAID4/5/6 mapping:
402 * Args:
403 * <raid_type> <#raid_params> <raid_params> \
404 * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
405 *
406 * ** metadata devices are not supported yet, use '-' instead **
407 *
408 * <raid_params> varies by <raid_type>. See 'parse_raid_params' for
409 * details on possible <raid_params>.
410 */
411static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
412{
413 int ret;
414 struct raid_type *rt;
415 unsigned long num_raid_params, num_raid_devs;
416 struct raid_set *rs = NULL;
417
418 /* Must have at least <raid_type> <#raid_params> */
419 if (argc < 2) {
420 ti->error = "Too few arguments";
421 return -EINVAL;
422 }
423
424 /* raid type */
425 rt = get_raid_type(argv[0]);
426 if (!rt) {
427 ti->error = "Unrecognised raid_type";
428 return -EINVAL;
429 }
430 argc--;
431 argv++;
432
433 /* number of RAID parameters */
434 if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) {
435 ti->error = "Cannot understand number of RAID parameters";
436 return -EINVAL;
437 }
438 argc--;
439 argv++;
440
441 /* Skip over RAID params for now and find out # of devices */
442 if (num_raid_params + 1 > argc) {
443 ti->error = "Arguments do not agree with counts given";
444 return -EINVAL;
445 }
446
447 if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
448 (num_raid_devs >= INT_MAX)) {
449 ti->error = "Cannot understand number of raid devices";
450 return -EINVAL;
451 }
452
453 rs = context_alloc(ti, rt, (unsigned)num_raid_devs);
454 if (IS_ERR(rs))
455 return PTR_ERR(rs);
456
457 ret = parse_raid_params(rs, argv, (unsigned)num_raid_params);
458 if (ret)
459 goto bad;
460
461 ret = -EINVAL;
462
463 argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
464 argv += num_raid_params + 1;
465
466 if (argc != (num_raid_devs * 2)) {
467 ti->error = "Supplied RAID devices does not match the count given";
468 goto bad;
469 }
470
471 ret = dev_parms(rs, argv);
472 if (ret)
473 goto bad;
474
475 INIT_WORK(&rs->md.event_work, do_table_event);
476 ti->split_io = rs->md.chunk_sectors;
477 ti->private = rs;
478
479 mutex_lock(&rs->md.reconfig_mutex);
480 ret = md_run(&rs->md);
481 rs->md.in_sync = 0; /* Assume already marked dirty */
482 mutex_unlock(&rs->md.reconfig_mutex);
483
484 if (ret) {
485 ti->error = "Fail to run raid array";
486 goto bad;
487 }
488
489 rs->callbacks.congested_fn = raid_is_congested;
490 rs->callbacks.unplug_fn = raid_unplug;
491 dm_table_add_target_callbacks(ti->table, &rs->callbacks);
492
493 return 0;
494
495bad:
496 context_free(rs);
497
498 return ret;
499}
500
501static void raid_dtr(struct dm_target *ti)
502{
503 struct raid_set *rs = ti->private;
504
505 list_del_init(&rs->callbacks.list);
506 md_stop(&rs->md);
507 context_free(rs);
508}
509
510static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context)
511{
512 struct raid_set *rs = ti->private;
513 mddev_t *mddev = &rs->md;
514
515 mddev->pers->make_request(mddev, bio);
516
517 return DM_MAPIO_SUBMITTED;
518}
519
520static int raid_status(struct dm_target *ti, status_type_t type,
521 char *result, unsigned maxlen)
522{
523 struct raid_set *rs = ti->private;
524 unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
525 unsigned sz = 0;
526 int i;
527 sector_t sync;
528
529 switch (type) {
530 case STATUSTYPE_INFO:
531 DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
532
533 for (i = 0; i < rs->md.raid_disks; i++) {
534 if (test_bit(Faulty, &rs->dev[i].rdev.flags))
535 DMEMIT("D");
536 else if (test_bit(In_sync, &rs->dev[i].rdev.flags))
537 DMEMIT("A");
538 else
539 DMEMIT("a");
540 }
541
542 if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
543 sync = rs->md.curr_resync_completed;
544 else
545 sync = rs->md.recovery_cp;
546
547 if (sync > rs->md.resync_max_sectors)
548 sync = rs->md.resync_max_sectors;
549
550 DMEMIT(" %llu/%llu",
551 (unsigned long long) sync,
552 (unsigned long long) rs->md.resync_max_sectors);
553
554 break;
555 case STATUSTYPE_TABLE:
556 /* The string you would use to construct this array */
557 for (i = 0; i < rs->md.raid_disks; i++)
558 if (rs->dev[i].data_dev &&
559 !test_bit(In_sync, &rs->dev[i].rdev.flags))
560 raid_param_cnt++; /* for rebuilds */
561
562 raid_param_cnt += (hweight64(rs->print_flags) * 2);
563 if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
564 raid_param_cnt--;
565
566 DMEMIT("%s %u %u", rs->raid_type->name,
567 raid_param_cnt, rs->md.chunk_sectors);
568
569 if ((rs->print_flags & DMPF_SYNC) &&
570 (rs->md.recovery_cp == MaxSector))
571 DMEMIT(" sync");
572 if (rs->print_flags & DMPF_NOSYNC)
573 DMEMIT(" nosync");
574
575 for (i = 0; i < rs->md.raid_disks; i++)
576 if (rs->dev[i].data_dev &&
577 !test_bit(In_sync, &rs->dev[i].rdev.flags))
578 DMEMIT(" rebuild %u", i);
579
580 if (rs->print_flags & DMPF_DAEMON_SLEEP)
581 DMEMIT(" daemon_sleep %lu",
582 rs->md.bitmap_info.daemon_sleep);
583
584 if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
585 DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);
586
587 if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
588 DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
589
590 if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
591 DMEMIT(" max_write_behind %lu",
592 rs->md.bitmap_info.max_write_behind);
593
594 if (rs->print_flags & DMPF_STRIPE_CACHE) {
595 raid5_conf_t *conf = rs->md.private;
596
597 /* convert from kiB to sectors */
598 DMEMIT(" stripe_cache %d",
599 conf ? conf->max_nr_stripes * 2 : 0);
600 }
601
602 DMEMIT(" %d", rs->md.raid_disks);
603 for (i = 0; i < rs->md.raid_disks; i++) {
604 DMEMIT(" -"); /* metadata device */
605
606 if (rs->dev[i].data_dev)
607 DMEMIT(" %s", rs->dev[i].data_dev->name);
608 else
609 DMEMIT(" -");
610 }
611 }
612
613 return 0;
614}
615
616static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
617{
618 struct raid_set *rs = ti->private;
619 unsigned i;
620 int ret = 0;
621
622 for (i = 0; !ret && i < rs->md.raid_disks; i++)
623 if (rs->dev[i].data_dev)
624 ret = fn(ti,
625 rs->dev[i].data_dev,
626 0, /* No offset on data devs */
627 rs->md.dev_sectors,
628 data);
629
630 return ret;
631}
632
633static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
634{
635 struct raid_set *rs = ti->private;
636 unsigned chunk_size = rs->md.chunk_sectors << 9;
637 raid5_conf_t *conf = rs->md.private;
638
639 blk_limits_io_min(limits, chunk_size);
640 blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded));
641}
642
643static void raid_presuspend(struct dm_target *ti)
644{
645 struct raid_set *rs = ti->private;
646
647 md_stop_writes(&rs->md);
648}
649
650static void raid_postsuspend(struct dm_target *ti)
651{
652 struct raid_set *rs = ti->private;
653
654 mddev_suspend(&rs->md);
655}
656
657static void raid_resume(struct dm_target *ti)
658{
659 struct raid_set *rs = ti->private;
660
661 mddev_resume(&rs->md);
662}
663
664static struct target_type raid_target = {
665 .name = "raid",
666 .version = {1, 0, 0},
667 .module = THIS_MODULE,
668 .ctr = raid_ctr,
669 .dtr = raid_dtr,
670 .map = raid_map,
671 .status = raid_status,
672 .iterate_devices = raid_iterate_devices,
673 .io_hints = raid_io_hints,
674 .presuspend = raid_presuspend,
675 .postsuspend = raid_postsuspend,
676 .resume = raid_resume,
677};
678
679static int __init dm_raid_init(void)
680{
681 return dm_register_target(&raid_target);
682}
683
684static void __exit dm_raid_exit(void)
685{
686 dm_unregister_target(&raid_target);
687}
688
689module_init(dm_raid_init);
690module_exit(dm_raid_exit);
691
692MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
693MODULE_ALIAS("dm-raid4");
694MODULE_ALIAS("dm-raid5");
695MODULE_ALIAS("dm-raid6");
696MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>");
697MODULE_LICENSE("GPL");
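
For a concrete, purely illustrative feel for the constructor interface documented above in raid_ctr(), this is what the argv it receives might look like for a three-device raid5_ls set with 64-sector chunks and no optional parameters. The device paths are invented, and the metadata slots must currently be '-' as the code notes.

/* Hypothetical example only -- not taken from the patch. */
static char *example_raid_argv[] = {
        "raid5_ls",             /* <raid_type>                           */
        "1", "64",              /* <#raid_params>, chunk size in sectors */
        "3",                    /* <#raid_devs>                          */
        "-", "/dev/sda1",       /* <meta_dev1> <data_dev1>               */
        "-", "/dev/sdb1",       /* <meta_dev2> <data_dev2>               */
        "-", "/dev/sdc1",       /* <meta_dev3> <data_dev3>               */
};
/* i.e. a table line of the form:
 *      <start> <len> raid raid5_ls 1 64 3 - /dev/sda1 - /dev/sdb1 - /dev/sdc1
 */
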
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 19a59b041c27..dee326775c60 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -261,7 +261,7 @@ static int mirror_flush(struct dm_target *ti)
261 struct dm_io_request io_req = { 261 struct dm_io_request io_req = {
262 .bi_rw = WRITE_FLUSH, 262 .bi_rw = WRITE_FLUSH,
263 .mem.type = DM_IO_KMEM, 263 .mem.type = DM_IO_KMEM,
264 .mem.ptr.bvec = NULL, 264 .mem.ptr.addr = NULL,
265 .client = ms->io_client, 265 .client = ms->io_client,
266 }; 266 };
267 267
@@ -637,6 +637,12 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
637 .client = ms->io_client, 637 .client = ms->io_client,
638 }; 638 };
639 639
640 if (bio->bi_rw & REQ_DISCARD) {
641 io_req.bi_rw |= REQ_DISCARD;
642 io_req.mem.type = DM_IO_KMEM;
643 io_req.mem.ptr.addr = NULL;
644 }
645
640 for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) 646 for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
641 map_region(dest++, m, bio); 647 map_region(dest++, m, bio);
642 648
@@ -670,7 +676,8 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
670 bio_list_init(&requeue); 676 bio_list_init(&requeue);
671 677
672 while ((bio = bio_list_pop(writes))) { 678 while ((bio = bio_list_pop(writes))) {
673 if (bio->bi_rw & REQ_FLUSH) { 679 if ((bio->bi_rw & REQ_FLUSH) ||
680 (bio->bi_rw & REQ_DISCARD)) {
674 bio_list_add(&sync, bio); 681 bio_list_add(&sync, bio);
675 continue; 682 continue;
676 } 683 }
@@ -1076,8 +1083,10 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1076 ti->private = ms; 1083 ti->private = ms;
1077 ti->split_io = dm_rh_get_region_size(ms->rh); 1084 ti->split_io = dm_rh_get_region_size(ms->rh);
1078 ti->num_flush_requests = 1; 1085 ti->num_flush_requests = 1;
1086 ti->num_discard_requests = 1;
1079 1087
1080 ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); 1088 ms->kmirrord_wq = alloc_workqueue("kmirrord",
1089 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
1081 if (!ms->kmirrord_wq) { 1090 if (!ms->kmirrord_wq) {
1082 DMERR("couldn't start kmirrord"); 1091 DMERR("couldn't start kmirrord");
1083 r = -ENOMEM; 1092 r = -ENOMEM;
@@ -1130,7 +1139,7 @@ static void mirror_dtr(struct dm_target *ti)
1130 1139
1131 del_timer_sync(&ms->timer); 1140 del_timer_sync(&ms->timer);
1132 flush_workqueue(ms->kmirrord_wq); 1141 flush_workqueue(ms->kmirrord_wq);
1133 flush_scheduled_work(); 1142 flush_work_sync(&ms->trigger_event);
1134 dm_kcopyd_client_destroy(ms->kcopyd_client); 1143 dm_kcopyd_client_destroy(ms->kcopyd_client);
1135 destroy_workqueue(ms->kmirrord_wq); 1144 destroy_workqueue(ms->kmirrord_wq);
1136 free_context(ms, ti, ms->nr_mirrors); 1145 free_context(ms, ti, ms->nr_mirrors);
@@ -1406,7 +1415,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
1406 1415
1407static struct target_type mirror_target = { 1416static struct target_type mirror_target = {
1408 .name = "mirror", 1417 .name = "mirror",
1409 .version = {1, 12, 0}, 1418 .version = {1, 12, 1},
1410 .module = THIS_MODULE, 1419 .module = THIS_MODULE,
1411 .ctr = mirror_ctr, 1420 .ctr = mirror_ctr,
1412 .dtr = mirror_dtr, 1421 .dtr = mirror_dtr,
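
The dm-raid1 discard support above reuses the flush plumbing: a request that carries no data payload is pushed through dm-io with DM_IO_KMEM and a NULL address. A hedged sketch of that call shape; issue_dataless() and its client/regions arguments are hypothetical, while the dm_io_request fields and dm_io() call are exactly those visible in the hunks above.

static int issue_dataless(struct dm_io_client *client,
                          struct dm_io_region *regions, unsigned num_regions,
                          int discard)
{
        struct dm_io_request io_req = {
                .bi_rw = discard ? (WRITE | REQ_DISCARD) : WRITE_FLUSH,
                .mem.type = DM_IO_KMEM,
                .mem.ptr.addr = NULL,   /* no pages travel with the request */
                .client = client,
        };

        /* no .notify.fn is set, so dm_io() completes synchronously */
        return dm_io(&io_req, num_regions, regions, NULL);
}
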
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 2129cdb115dc..95891dfcbca0 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -256,7 +256,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
256 */ 256 */
257 INIT_WORK_ONSTACK(&req.work, do_metadata); 257 INIT_WORK_ONSTACK(&req.work, do_metadata);
258 queue_work(ps->metadata_wq, &req.work); 258 queue_work(ps->metadata_wq, &req.work);
259 flush_workqueue(ps->metadata_wq); 259 flush_work(&req.work);
260 260
261 return req.result; 261 return req.result;
262} 262}
@@ -818,7 +818,7 @@ static int persistent_ctr(struct dm_exception_store *store,
818 atomic_set(&ps->pending_count, 0); 818 atomic_set(&ps->pending_count, 0);
819 ps->callbacks = NULL; 819 ps->callbacks = NULL;
820 820
821 ps->metadata_wq = create_singlethread_workqueue("ksnaphd"); 821 ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0);
822 if (!ps->metadata_wq) { 822 if (!ps->metadata_wq) {
823 kfree(ps); 823 kfree(ps);
824 DMERR("couldn't start header metadata update thread"); 824 DMERR("couldn't start header metadata update thread");
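
What makes a multithreaded metadata_wq safe in the dm-snap-persistent hunk above is that the caller now waits on its single on-stack work item rather than flushing the whole workqueue. A minimal sketch of that pattern; struct my_req, do_one() and submit_and_wait() are hypothetical, while INIT_WORK_ONSTACK(), queue_work() and flush_work() are the interfaces the patch uses.

#include <linux/workqueue.h>

struct my_req {
        struct work_struct work;
        int result;
};

static void do_one(struct work_struct *ws)
{
        struct my_req *req = container_of(ws, struct my_req, work);

        req->result = 0;        /* ... perform the synchronous metadata I/O ... */
}

static int submit_and_wait(struct workqueue_struct *wq)
{
        struct my_req req;

        INIT_WORK_ONSTACK(&req.work, do_one);
        queue_work(wq, &req.work);
        flush_work(&req.work);  /* waits for this item only */

        return req.result;
}
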
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 53cf79d8bcbc..fdde53cd12b7 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -19,7 +19,6 @@
19#include <linux/vmalloc.h> 19#include <linux/vmalloc.h>
20#include <linux/log2.h> 20#include <linux/log2.h>
21#include <linux/dm-kcopyd.h> 21#include <linux/dm-kcopyd.h>
22#include <linux/workqueue.h>
23 22
24#include "dm-exception-store.h" 23#include "dm-exception-store.h"
25 24
@@ -80,9 +79,6 @@ struct dm_snapshot {
80 /* Origin writes don't trigger exceptions until this is set */ 79 /* Origin writes don't trigger exceptions until this is set */
81 int active; 80 int active;
82 81
83 /* Whether or not owning mapped_device is suspended */
84 int suspended;
85
86 atomic_t pending_exceptions_count; 82 atomic_t pending_exceptions_count;
87 83
88 mempool_t *pending_pool; 84 mempool_t *pending_pool;
@@ -106,10 +102,6 @@ struct dm_snapshot {
106 102
107 struct dm_kcopyd_client *kcopyd_client; 103 struct dm_kcopyd_client *kcopyd_client;
108 104
109 /* Queue of snapshot writes for ksnapd to flush */
110 struct bio_list queued_bios;
111 struct work_struct queued_bios_work;
112
113 /* Wait for events based on state_bits */ 105 /* Wait for events based on state_bits */
114 unsigned long state_bits; 106 unsigned long state_bits;
115 107
@@ -160,9 +152,6 @@ struct dm_dev *dm_snap_cow(struct dm_snapshot *s)
160} 152}
161EXPORT_SYMBOL(dm_snap_cow); 153EXPORT_SYMBOL(dm_snap_cow);
162 154
163static struct workqueue_struct *ksnapd;
164static void flush_queued_bios(struct work_struct *work);
165
166static sector_t chunk_to_sector(struct dm_exception_store *store, 155static sector_t chunk_to_sector(struct dm_exception_store *store,
167 chunk_t chunk) 156 chunk_t chunk)
168{ 157{
@@ -1110,7 +1099,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1110 s->ti = ti; 1099 s->ti = ti;
1111 s->valid = 1; 1100 s->valid = 1;
1112 s->active = 0; 1101 s->active = 0;
1113 s->suspended = 0;
1114 atomic_set(&s->pending_exceptions_count, 0); 1102 atomic_set(&s->pending_exceptions_count, 0);
1115 init_rwsem(&s->lock); 1103 init_rwsem(&s->lock);
1116 INIT_LIST_HEAD(&s->list); 1104 INIT_LIST_HEAD(&s->list);
@@ -1153,9 +1141,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1153 1141
1154 spin_lock_init(&s->tracked_chunk_lock); 1142 spin_lock_init(&s->tracked_chunk_lock);
1155 1143
1156 bio_list_init(&s->queued_bios);
1157 INIT_WORK(&s->queued_bios_work, flush_queued_bios);
1158
1159 ti->private = s; 1144 ti->private = s;
1160 ti->num_flush_requests = num_flush_requests; 1145 ti->num_flush_requests = num_flush_requests;
1161 1146
@@ -1279,8 +1264,6 @@ static void snapshot_dtr(struct dm_target *ti)
1279 struct dm_snapshot *s = ti->private; 1264 struct dm_snapshot *s = ti->private;
1280 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; 1265 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
1281 1266
1282 flush_workqueue(ksnapd);
1283
1284 down_read(&_origins_lock); 1267 down_read(&_origins_lock);
1285 /* Check whether exception handover must be cancelled */ 1268 /* Check whether exception handover must be cancelled */
1286 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); 1269 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
@@ -1342,20 +1325,6 @@ static void flush_bios(struct bio *bio)
1342 } 1325 }
1343} 1326}
1344 1327
1345static void flush_queued_bios(struct work_struct *work)
1346{
1347 struct dm_snapshot *s =
1348 container_of(work, struct dm_snapshot, queued_bios_work);
1349 struct bio *queued_bios;
1350 unsigned long flags;
1351
1352 spin_lock_irqsave(&s->pe_lock, flags);
1353 queued_bios = bio_list_get(&s->queued_bios);
1354 spin_unlock_irqrestore(&s->pe_lock, flags);
1355
1356 flush_bios(queued_bios);
1357}
1358
1359static int do_origin(struct dm_dev *origin, struct bio *bio); 1328static int do_origin(struct dm_dev *origin, struct bio *bio);
1360 1329
1361/* 1330/*
@@ -1760,15 +1729,6 @@ static void snapshot_merge_presuspend(struct dm_target *ti)
1760 stop_merge(s); 1729 stop_merge(s);
1761} 1730}
1762 1731
1763static void snapshot_postsuspend(struct dm_target *ti)
1764{
1765 struct dm_snapshot *s = ti->private;
1766
1767 down_write(&s->lock);
1768 s->suspended = 1;
1769 up_write(&s->lock);
1770}
1771
1772static int snapshot_preresume(struct dm_target *ti) 1732static int snapshot_preresume(struct dm_target *ti)
1773{ 1733{
1774 int r = 0; 1734 int r = 0;
@@ -1783,7 +1743,7 @@ static int snapshot_preresume(struct dm_target *ti)
1783 DMERR("Unable to resume snapshot source until " 1743 DMERR("Unable to resume snapshot source until "
1784 "handover completes."); 1744 "handover completes.");
1785 r = -EINVAL; 1745 r = -EINVAL;
1786 } else if (!snap_src->suspended) { 1746 } else if (!dm_suspended(snap_src->ti)) {
1787 DMERR("Unable to perform snapshot handover until " 1747 DMERR("Unable to perform snapshot handover until "
1788 "source is suspended."); 1748 "source is suspended.");
1789 r = -EINVAL; 1749 r = -EINVAL;
@@ -1816,7 +1776,6 @@ static void snapshot_resume(struct dm_target *ti)
1816 1776
1817 down_write(&s->lock); 1777 down_write(&s->lock);
1818 s->active = 1; 1778 s->active = 1;
1819 s->suspended = 0;
1820 up_write(&s->lock); 1779 up_write(&s->lock);
1821} 1780}
1822 1781
@@ -2194,7 +2153,7 @@ static int origin_iterate_devices(struct dm_target *ti,
2194 2153
2195static struct target_type origin_target = { 2154static struct target_type origin_target = {
2196 .name = "snapshot-origin", 2155 .name = "snapshot-origin",
2197 .version = {1, 7, 0}, 2156 .version = {1, 7, 1},
2198 .module = THIS_MODULE, 2157 .module = THIS_MODULE,
2199 .ctr = origin_ctr, 2158 .ctr = origin_ctr,
2200 .dtr = origin_dtr, 2159 .dtr = origin_dtr,
@@ -2207,13 +2166,12 @@ static struct target_type origin_target = {
2207 2166
2208static struct target_type snapshot_target = { 2167static struct target_type snapshot_target = {
2209 .name = "snapshot", 2168 .name = "snapshot",
2210 .version = {1, 9, 0}, 2169 .version = {1, 10, 0},
2211 .module = THIS_MODULE, 2170 .module = THIS_MODULE,
2212 .ctr = snapshot_ctr, 2171 .ctr = snapshot_ctr,
2213 .dtr = snapshot_dtr, 2172 .dtr = snapshot_dtr,
2214 .map = snapshot_map, 2173 .map = snapshot_map,
2215 .end_io = snapshot_end_io, 2174 .end_io = snapshot_end_io,
2216 .postsuspend = snapshot_postsuspend,
2217 .preresume = snapshot_preresume, 2175 .preresume = snapshot_preresume,
2218 .resume = snapshot_resume, 2176 .resume = snapshot_resume,
2219 .status = snapshot_status, 2177 .status = snapshot_status,
@@ -2222,14 +2180,13 @@ static struct target_type snapshot_target = {
2222 2180
2223static struct target_type merge_target = { 2181static struct target_type merge_target = {
2224 .name = dm_snapshot_merge_target_name, 2182 .name = dm_snapshot_merge_target_name,
2225 .version = {1, 0, 0}, 2183 .version = {1, 1, 0},
2226 .module = THIS_MODULE, 2184 .module = THIS_MODULE,
2227 .ctr = snapshot_ctr, 2185 .ctr = snapshot_ctr,
2228 .dtr = snapshot_dtr, 2186 .dtr = snapshot_dtr,
2229 .map = snapshot_merge_map, 2187 .map = snapshot_merge_map,
2230 .end_io = snapshot_end_io, 2188 .end_io = snapshot_end_io,
2231 .presuspend = snapshot_merge_presuspend, 2189 .presuspend = snapshot_merge_presuspend,
2232 .postsuspend = snapshot_postsuspend,
2233 .preresume = snapshot_preresume, 2190 .preresume = snapshot_preresume,
2234 .resume = snapshot_merge_resume, 2191 .resume = snapshot_merge_resume,
2235 .status = snapshot_status, 2192 .status = snapshot_status,
@@ -2291,17 +2248,8 @@ static int __init dm_snapshot_init(void)
2291 goto bad_tracked_chunk_cache; 2248 goto bad_tracked_chunk_cache;
2292 } 2249 }
2293 2250
2294 ksnapd = create_singlethread_workqueue("ksnapd");
2295 if (!ksnapd) {
2296 DMERR("Failed to create ksnapd workqueue.");
2297 r = -ENOMEM;
2298 goto bad_pending_pool;
2299 }
2300
2301 return 0; 2251 return 0;
2302 2252
2303bad_pending_pool:
2304 kmem_cache_destroy(tracked_chunk_cache);
2305bad_tracked_chunk_cache: 2253bad_tracked_chunk_cache:
2306 kmem_cache_destroy(pending_cache); 2254 kmem_cache_destroy(pending_cache);
2307bad_pending_cache: 2255bad_pending_cache:
@@ -2322,8 +2270,6 @@ bad_register_snapshot_target:
2322 2270
2323static void __exit dm_snapshot_exit(void) 2271static void __exit dm_snapshot_exit(void)
2324{ 2272{
2325 destroy_workqueue(ksnapd);
2326
2327 dm_unregister_target(&snapshot_target); 2273 dm_unregister_target(&snapshot_target);
2328 dm_unregister_target(&origin_target); 2274 dm_unregister_target(&origin_target);
2329 dm_unregister_target(&merge_target); 2275 dm_unregister_target(&merge_target);
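The dm-snap.c hunks above drop the target-private 'suspended' flag (dm core already tracks suspend state, queried through dm_suspended()) and remove the now-unused ksnapd workqueue. A minimal sketch of the handover readiness check this enables, assuming snap_src has already been looked up as in the preresume hunk; the helper name here is purely illustrative, the real check lives inline in snapshot_preresume():

/*
 * Sketch only: rely on dm core's suspend tracking instead of a
 * target-private flag.  snapshot_handover_source_ready() is a
 * hypothetical helper, not a function in dm-snap.c.
 */
static int snapshot_handover_source_ready(struct dm_snapshot *snap_src)
{
        if (!dm_suspended(snap_src->ti)) {
                DMERR("Unable to perform snapshot handover until "
                      "source is suspended.");
                return -EINVAL;
        }
        return 0;
}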
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index f0371b4c4fbf..dddfa14f2982 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -39,23 +39,20 @@ struct stripe_c {
39 struct dm_target *ti; 39 struct dm_target *ti;
40 40
41 /* Work struct used for triggering events*/ 41 /* Work struct used for triggering events*/
42 struct work_struct kstriped_ws; 42 struct work_struct trigger_event;
43 43
44 struct stripe stripe[0]; 44 struct stripe stripe[0];
45}; 45};
46 46
47static struct workqueue_struct *kstriped;
48
49/* 47/*
50 * An event is triggered whenever a drive 48 * An event is triggered whenever a drive
51 * drops out of a stripe volume. 49 * drops out of a stripe volume.
52 */ 50 */
53static void trigger_event(struct work_struct *work) 51static void trigger_event(struct work_struct *work)
54{ 52{
55 struct stripe_c *sc = container_of(work, struct stripe_c, kstriped_ws); 53 struct stripe_c *sc = container_of(work, struct stripe_c,
56 54 trigger_event);
57 dm_table_event(sc->ti->table); 55 dm_table_event(sc->ti->table);
58
59} 56}
60 57
61static inline struct stripe_c *alloc_context(unsigned int stripes) 58static inline struct stripe_c *alloc_context(unsigned int stripes)
@@ -160,7 +157,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
160 return -ENOMEM; 157 return -ENOMEM;
161 } 158 }
162 159
163 INIT_WORK(&sc->kstriped_ws, trigger_event); 160 INIT_WORK(&sc->trigger_event, trigger_event);
164 161
165 /* Set pointer to dm target; used in trigger_event */ 162 /* Set pointer to dm target; used in trigger_event */
166 sc->ti = ti; 163 sc->ti = ti;
@@ -211,7 +208,7 @@ static void stripe_dtr(struct dm_target *ti)
211 for (i = 0; i < sc->stripes; i++) 208 for (i = 0; i < sc->stripes; i++)
212 dm_put_device(ti, sc->stripe[i].dev); 209 dm_put_device(ti, sc->stripe[i].dev);
213 210
214 flush_workqueue(kstriped); 211 flush_work_sync(&sc->trigger_event);
215 kfree(sc); 212 kfree(sc);
216} 213}
217 214
@@ -367,7 +364,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
367 atomic_inc(&(sc->stripe[i].error_count)); 364 atomic_inc(&(sc->stripe[i].error_count));
368 if (atomic_read(&(sc->stripe[i].error_count)) < 365 if (atomic_read(&(sc->stripe[i].error_count)) <
369 DM_IO_ERROR_THRESHOLD) 366 DM_IO_ERROR_THRESHOLD)
370 queue_work(kstriped, &sc->kstriped_ws); 367 schedule_work(&sc->trigger_event);
371 } 368 }
372 369
373 return error; 370 return error;
@@ -401,7 +398,7 @@ static void stripe_io_hints(struct dm_target *ti,
401 398
402static struct target_type stripe_target = { 399static struct target_type stripe_target = {
403 .name = "striped", 400 .name = "striped",
404 .version = {1, 3, 0}, 401 .version = {1, 3, 1},
405 .module = THIS_MODULE, 402 .module = THIS_MODULE,
406 .ctr = stripe_ctr, 403 .ctr = stripe_ctr,
407 .dtr = stripe_dtr, 404 .dtr = stripe_dtr,
@@ -422,20 +419,10 @@ int __init dm_stripe_init(void)
422 return r; 419 return r;
423 } 420 }
424 421
425 kstriped = create_singlethread_workqueue("kstriped");
426 if (!kstriped) {
427 DMERR("failed to create workqueue kstriped");
428 dm_unregister_target(&stripe_target);
429 return -ENOMEM;
430 }
431
432 return r; 422 return r;
433} 423}
434 424
435void dm_stripe_exit(void) 425void dm_stripe_exit(void)
436{ 426{
437 dm_unregister_target(&stripe_target); 427 dm_unregister_target(&stripe_target);
438 destroy_workqueue(kstriped);
439
440 return;
441} 428}
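dm-stripe above switches from a driver-private single-threaded "kstriped" workqueue to the shared system workqueue: the work item is queued with schedule_work() and flushed per item with flush_work_sync() before the context is freed, so no dedicated thread has to be created or destroyed. A sketch of that conversion pattern with an illustrative context structure (the example_* names are not from the patch):

#include <linux/workqueue.h>
#include <linux/slab.h>

struct example_ctx {                            /* illustrative */
        struct work_struct trigger_event;
};

static void example_trigger_event(struct work_struct *work)
{
        struct example_ctx *ctx =
                container_of(work, struct example_ctx, trigger_event);
        /* report the event; runs from the shared system workqueue */
}

static void example_ctr(struct example_ctx *ctx)
{
        INIT_WORK(&ctx->trigger_event, example_trigger_event);
}

static void example_notify(struct example_ctx *ctx)
{
        schedule_work(&ctx->trigger_event);     /* no private workqueue */
}

static void example_dtr(struct example_ctx *ctx)
{
        flush_work_sync(&ctx->trigger_event);   /* wait for this item only */
        kfree(ctx);
}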
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 985c20a4f30e..dffa0ac7c4f0 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -71,6 +71,8 @@ struct dm_table {
71 void *event_context; 71 void *event_context;
72 72
73 struct dm_md_mempools *mempools; 73 struct dm_md_mempools *mempools;
74
75 struct list_head target_callbacks;
74}; 76};
75 77
76/* 78/*
@@ -204,6 +206,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
204 return -ENOMEM; 206 return -ENOMEM;
205 207
206 INIT_LIST_HEAD(&t->devices); 208 INIT_LIST_HEAD(&t->devices);
209 INIT_LIST_HEAD(&t->target_callbacks);
207 atomic_set(&t->holders, 0); 210 atomic_set(&t->holders, 0);
208 t->discards_supported = 1; 211 t->discards_supported = 1;
209 212
@@ -1225,10 +1228,17 @@ int dm_table_resume_targets(struct dm_table *t)
1225 return 0; 1228 return 0;
1226} 1229}
1227 1230
1231void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb)
1232{
1233 list_add(&cb->list, &t->target_callbacks);
1234}
1235EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks);
1236
1228int dm_table_any_congested(struct dm_table *t, int bdi_bits) 1237int dm_table_any_congested(struct dm_table *t, int bdi_bits)
1229{ 1238{
1230 struct dm_dev_internal *dd; 1239 struct dm_dev_internal *dd;
1231 struct list_head *devices = dm_table_get_devices(t); 1240 struct list_head *devices = dm_table_get_devices(t);
1241 struct dm_target_callbacks *cb;
1232 int r = 0; 1242 int r = 0;
1233 1243
1234 list_for_each_entry(dd, devices, list) { 1244 list_for_each_entry(dd, devices, list) {
@@ -1243,6 +1253,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
1243 bdevname(dd->dm_dev.bdev, b)); 1253 bdevname(dd->dm_dev.bdev, b));
1244 } 1254 }
1245 1255
1256 list_for_each_entry(cb, &t->target_callbacks, list)
1257 if (cb->congested_fn)
1258 r |= cb->congested_fn(cb, bdi_bits);
1259
1246 return r; 1260 return r;
1247} 1261}
1248 1262
@@ -1264,6 +1278,7 @@ void dm_table_unplug_all(struct dm_table *t)
1264{ 1278{
1265 struct dm_dev_internal *dd; 1279 struct dm_dev_internal *dd;
1266 struct list_head *devices = dm_table_get_devices(t); 1280 struct list_head *devices = dm_table_get_devices(t);
1281 struct dm_target_callbacks *cb;
1267 1282
1268 list_for_each_entry(dd, devices, list) { 1283 list_for_each_entry(dd, devices, list) {
1269 struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); 1284 struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev);
@@ -1276,6 +1291,10 @@ void dm_table_unplug_all(struct dm_table *t)
1276 dm_device_name(t->md), 1291 dm_device_name(t->md),
1277 bdevname(dd->dm_dev.bdev, b)); 1292 bdevname(dd->dm_dev.bdev, b));
1278 } 1293 }
1294
1295 list_for_each_entry(cb, &t->target_callbacks, list)
1296 if (cb->unplug_fn)
1297 cb->unplug_fn(cb);
1279} 1298}
1280 1299
1281struct mapped_device *dm_table_get_md(struct dm_table *t) 1300struct mapped_device *dm_table_get_md(struct dm_table *t)
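dm-table above gains a per-table list of dm_target_callbacks so that a target (dm-raid in this series) can have its congestion and unplug hooks consulted by dm_table_any_congested() and dm_table_unplug_all(). A sketch of how a target constructor might register such callbacks; the example_* names are illustrative, and only the list/congested_fn/unplug_fn members exercised above are assumed:

#include <linux/device-mapper.h>
#include <linux/slab.h>

struct example_ctx {
        struct dm_target_callbacks callbacks;
        /* target-private state ... */
};

static int example_congested(struct dm_target_callbacks *cb, int bdi_bits)
{
        /* container_of(cb, struct example_ctx, callbacks) recovers the
         * target context; return non-zero bits if its queues are congested */
        return 0;
}

static void example_unplug(struct dm_target_callbacks *cb)
{
        /* kick any plugged internal queues */
}

static int example_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
        struct example_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

        if (!ctx)
                return -ENOMEM;

        ctx->callbacks.congested_fn = example_congested;
        ctx->callbacks.unplug_fn = example_unplug;
        dm_table_add_target_callbacks(ti->table, &ctx->callbacks);

        ti->private = ctx;
        return 0;
}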
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f48a2f359ac4..eaa3af0e0632 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -32,7 +32,6 @@
32#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 32#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
33#define DM_COOKIE_LENGTH 24 33#define DM_COOKIE_LENGTH 24
34 34
35static DEFINE_MUTEX(dm_mutex);
36static const char *_name = DM_NAME; 35static const char *_name = DM_NAME;
37 36
38static unsigned int major = 0; 37static unsigned int major = 0;
@@ -328,7 +327,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
328{ 327{
329 struct mapped_device *md; 328 struct mapped_device *md;
330 329
331 mutex_lock(&dm_mutex);
332 spin_lock(&_minor_lock); 330 spin_lock(&_minor_lock);
333 331
334 md = bdev->bd_disk->private_data; 332 md = bdev->bd_disk->private_data;
@@ -346,7 +344,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
346 344
347out: 345out:
348 spin_unlock(&_minor_lock); 346 spin_unlock(&_minor_lock);
349 mutex_unlock(&dm_mutex);
350 347
351 return md ? 0 : -ENXIO; 348 return md ? 0 : -ENXIO;
352} 349}
@@ -355,10 +352,12 @@ static int dm_blk_close(struct gendisk *disk, fmode_t mode)
355{ 352{
356 struct mapped_device *md = disk->private_data; 353 struct mapped_device *md = disk->private_data;
357 354
358 mutex_lock(&dm_mutex); 355 spin_lock(&_minor_lock);
356
359 atomic_dec(&md->open_count); 357 atomic_dec(&md->open_count);
360 dm_put(md); 358 dm_put(md);
361 mutex_unlock(&dm_mutex); 359
360 spin_unlock(&_minor_lock);
362 361
363 return 0; 362 return 0;
364} 363}
@@ -1638,13 +1637,15 @@ static void dm_request_fn(struct request_queue *q)
1638 if (map_request(ti, clone, md)) 1637 if (map_request(ti, clone, md))
1639 goto requeued; 1638 goto requeued;
1640 1639
1641 spin_lock_irq(q->queue_lock); 1640 BUG_ON(!irqs_disabled());
1641 spin_lock(q->queue_lock);
1642 } 1642 }
1643 1643
1644 goto out; 1644 goto out;
1645 1645
1646requeued: 1646requeued:
1647 spin_lock_irq(q->queue_lock); 1647 BUG_ON(!irqs_disabled());
1648 spin_lock(q->queue_lock);
1648 1649
1649plug_and_out: 1650plug_and_out:
1650 if (!elv_queue_empty(q)) 1651 if (!elv_queue_empty(q))
@@ -1884,7 +1885,8 @@ static struct mapped_device *alloc_dev(int minor)
1884 add_disk(md->disk); 1885 add_disk(md->disk);
1885 format_dev_t(md->name, MKDEV(_major, minor)); 1886 format_dev_t(md->name, MKDEV(_major, minor));
1886 1887
1887 md->wq = create_singlethread_workqueue("kdmflush"); 1888 md->wq = alloc_workqueue("kdmflush",
1889 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
1888 if (!md->wq) 1890 if (!md->wq)
1889 goto bad_thread; 1891 goto bad_thread;
1890 1892
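The per-device "kdmflush" workqueue in dm.c is now created with alloc_workqueue() rather than create_singlethread_workqueue(), keeping only the properties the code actually relies on: WQ_MEM_RECLAIM (a rescuer thread, so flush work can make forward progress under memory pressure) and WQ_NON_REENTRANT. A sketch of the same conversion for a generic driver; the "example" naming is illustrative:

#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

static int example_init(void)
{
        /* was: example_wq = create_singlethread_workqueue("example"); */
        example_wq = alloc_workqueue("example",
                                     WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
        if (!example_wq)
                return -ENOMEM;
        return 0;
}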
@@ -1992,13 +1994,14 @@ static void event_callback(void *context)
1992 wake_up(&md->eventq); 1994 wake_up(&md->eventq);
1993} 1995}
1994 1996
1997/*
1998 * Protected by md->suspend_lock obtained by dm_swap_table().
1999 */
1995static void __set_size(struct mapped_device *md, sector_t size) 2000static void __set_size(struct mapped_device *md, sector_t size)
1996{ 2001{
1997 set_capacity(md->disk, size); 2002 set_capacity(md->disk, size);
1998 2003
1999 mutex_lock(&md->bdev->bd_inode->i_mutex);
2000 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 2004 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2001 mutex_unlock(&md->bdev->bd_inode->i_mutex);
2002} 2005}
2003 2006
2004/* 2007/*