Diffstat (limited to 'drivers/md')
36 files changed, 3682 insertions, 2487 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index bf1a95e31559..8420129fc5ee 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -240,6 +240,30 @@ config DM_MIRROR | |||
240 | Allow volume managers to mirror logical volumes, also | 240 | Allow volume managers to mirror logical volumes, also |
241 | needed for live data migration tools such as 'pvmove'. | 241 | needed for live data migration tools such as 'pvmove'. |
242 | 242 | ||
243 | config DM_RAID | ||
244 | tristate "RAID 4/5/6 target (EXPERIMENTAL)" | ||
245 | depends on BLK_DEV_DM && EXPERIMENTAL | ||
246 | select MD_RAID456 | ||
247 | select BLK_DEV_MD | ||
248 | ---help--- | ||
249 | A dm target that supports RAID4, RAID5 and RAID6 mappings | ||
250 | |||
251 | A RAID-5 set of N drives with a capacity of C MB per drive provides | ||
252 | the capacity of C * (N - 1) MB, and protects against a failure | ||
253 | of a single drive. For a given sector (row) number, (N - 1) drives | ||
254 | contain data sectors, and one drive contains the parity protection. | ||
255 | For a RAID-4 set, the parity blocks are present on a single drive, | ||
256 | while a RAID-5 set distributes the parity across the drives in one | ||
257 | of the available parity distribution methods. | ||
258 | |||
259 | A RAID-6 set of N drives with a capacity of C MB per drive | ||
260 | provides the capacity of C * (N - 2) MB, and protects | ||
261 | against a failure of any two drives. For a given sector | ||
262 | (row) number, (N - 2) drives contain data sectors, and two | ||
263 | drives contains two independent redundancy syndromes. Like | ||
264 | RAID-5, RAID-6 distributes the syndromes across the drives | ||
265 | in one of the available parity distribution methods. | ||
266 | |||
243 | config DM_LOG_USERSPACE | 267 | config DM_LOG_USERSPACE |
244 | tristate "Mirror userspace logging (EXPERIMENTAL)" | 268 | tristate "Mirror userspace logging (EXPERIMENTAL)" |
245 | depends on DM_MIRROR && EXPERIMENTAL && NET | 269 | depends on DM_MIRROR && EXPERIMENTAL && NET |
@@ -303,4 +327,10 @@ config DM_UEVENT | |||
303 | ---help--- | 327 | ---help--- |
304 | Generate udev events for DM events. | 328 | Generate udev events for DM events. |
305 | 329 | ||
330 | config DM_FLAKEY | ||
331 | tristate "Flakey target (EXPERIMENTAL)" | ||
332 | depends on BLK_DEV_DM && EXPERIMENTAL | ||
333 | ---help--- | ||
334 | A target that intermittently fails I/O for debugging purposes. | ||
335 | |||
306 | endif # MD | 336 | endif # MD |
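
To make the capacity arithmetic in the DM_RAID help text above concrete, here is a small stand-alone illustration. The drive count and size are invented numbers and the snippet is editorial commentary, not part of the patch:

/* Illustration of the DM_RAID help-text arithmetic: N drives of C MB each. */
#include <stdio.h>

int main(void)
{
	unsigned long n = 6, c = 500;	/* assumed example: 6 drives, 500 MB each */

	/* RAID-4/5: one drive's worth of parity, survives one failure. */
	printf("raid4/5 usable: %lu MB\n", c * (n - 1));	/* 2500 MB */
	/* RAID-6: two redundancy syndromes, survives any two failures. */
	printf("raid6 usable:   %lu MB\n", c * (n - 2));	/* 2000 MB */
	return 0;
}
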
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 5e3aac41919d..448838b1f92a 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_BLK_DEV_MD) += md-mod.o | |||
29 | obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o | 29 | obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o |
30 | obj-$(CONFIG_DM_CRYPT) += dm-crypt.o | 30 | obj-$(CONFIG_DM_CRYPT) += dm-crypt.o |
31 | obj-$(CONFIG_DM_DELAY) += dm-delay.o | 31 | obj-$(CONFIG_DM_DELAY) += dm-delay.o |
32 | obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o | ||
32 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o | 33 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o |
33 | obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o | 34 | obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o |
34 | obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o | 35 | obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o |
@@ -36,6 +37,7 @@ obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o | |||
36 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o | 37 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o |
37 | obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o | 38 | obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o |
38 | obj-$(CONFIG_DM_ZERO) += dm-zero.o | 39 | obj-$(CONFIG_DM_ZERO) += dm-zero.o |
40 | obj-$(CONFIG_DM_RAID) += dm-raid.o | ||
39 | 41 | ||
40 | ifeq ($(CONFIG_DM_UEVENT),y) | 42 | ifeq ($(CONFIG_DM_UEVENT),y) |
41 | dm-mod-objs += dm-uevent.o | 43 | dm-mod-objs += dm-uevent.o |
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index e4fb58db5454..574b09afedd3 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -210,11 +210,11 @@ static struct page *read_sb_page(mddev_t *mddev, loff_t offset, | |||
210 | || test_bit(Faulty, &rdev->flags)) | 210 | || test_bit(Faulty, &rdev->flags)) |
211 | continue; | 211 | continue; |
212 | 212 | ||
213 | target = rdev->sb_start + offset + index * (PAGE_SIZE/512); | 213 | target = offset + index * (PAGE_SIZE/512); |
214 | 214 | ||
215 | if (sync_page_io(rdev->bdev, target, | 215 | if (sync_page_io(rdev, target, |
216 | roundup(size, bdev_logical_block_size(rdev->bdev)), | 216 | roundup(size, bdev_logical_block_size(rdev->bdev)), |
217 | page, READ)) { | 217 | page, READ, true)) { |
218 | page->index = index; | 218 | page->index = index; |
219 | attach_page_buffers(page, NULL); /* so that free_buffer will | 219 | attach_page_buffers(page, NULL); /* so that free_buffer will |
220 | * quietly no-op */ | 220 | * quietly no-op */ |
@@ -264,14 +264,18 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev) | |||
264 | static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | 264 | static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) |
265 | { | 265 | { |
266 | mdk_rdev_t *rdev = NULL; | 266 | mdk_rdev_t *rdev = NULL; |
267 | struct block_device *bdev; | ||
267 | mddev_t *mddev = bitmap->mddev; | 268 | mddev_t *mddev = bitmap->mddev; |
268 | 269 | ||
269 | while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { | 270 | while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { |
270 | int size = PAGE_SIZE; | 271 | int size = PAGE_SIZE; |
271 | loff_t offset = mddev->bitmap_info.offset; | 272 | loff_t offset = mddev->bitmap_info.offset; |
273 | |||
274 | bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; | ||
275 | |||
272 | if (page->index == bitmap->file_pages-1) | 276 | if (page->index == bitmap->file_pages-1) |
273 | size = roundup(bitmap->last_page_size, | 277 | size = roundup(bitmap->last_page_size, |
274 | bdev_logical_block_size(rdev->bdev)); | 278 | bdev_logical_block_size(bdev)); |
275 | /* Just make sure we aren't corrupting data or | 279 | /* Just make sure we aren't corrupting data or |
276 | * metadata | 280 | * metadata |
277 | */ | 281 | */ |
@@ -343,7 +347,7 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait) | |||
343 | atomic_inc(&bitmap->pending_writes); | 347 | atomic_inc(&bitmap->pending_writes); |
344 | set_buffer_locked(bh); | 348 | set_buffer_locked(bh); |
345 | set_buffer_mapped(bh); | 349 | set_buffer_mapped(bh); |
346 | submit_bh(WRITE, bh); | 350 | submit_bh(WRITE | REQ_SYNC, bh); |
347 | bh = bh->b_this_page; | 351 | bh = bh->b_this_page; |
348 | } | 352 | } |
349 | 353 | ||
@@ -489,11 +493,11 @@ void bitmap_update_sb(struct bitmap *bitmap) | |||
489 | spin_unlock_irqrestore(&bitmap->lock, flags); | 493 | spin_unlock_irqrestore(&bitmap->lock, flags); |
490 | sb = kmap_atomic(bitmap->sb_page, KM_USER0); | 494 | sb = kmap_atomic(bitmap->sb_page, KM_USER0); |
491 | sb->events = cpu_to_le64(bitmap->mddev->events); | 495 | sb->events = cpu_to_le64(bitmap->mddev->events); |
492 | if (bitmap->mddev->events < bitmap->events_cleared) { | 496 | if (bitmap->mddev->events < bitmap->events_cleared) |
493 | /* rocking back to read-only */ | 497 | /* rocking back to read-only */ |
494 | bitmap->events_cleared = bitmap->mddev->events; | 498 | bitmap->events_cleared = bitmap->mddev->events; |
495 | sb->events_cleared = cpu_to_le64(bitmap->events_cleared); | 499 | sb->events_cleared = cpu_to_le64(bitmap->events_cleared); |
496 | } | 500 | sb->state = cpu_to_le32(bitmap->flags); |
497 | /* Just in case these have been changed via sysfs: */ | 501 | /* Just in case these have been changed via sysfs: */ |
498 | sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); | 502 | sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); |
499 | sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); | 503 | sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); |
@@ -530,6 +534,82 @@ void bitmap_print_sb(struct bitmap *bitmap) | |||
530 | kunmap_atomic(sb, KM_USER0); | 534 | kunmap_atomic(sb, KM_USER0); |
531 | } | 535 | } |
532 | 536 | ||
537 | /* | ||
538 | * bitmap_new_disk_sb | ||
539 | * @bitmap | ||
540 | * | ||
541 | * This function is somewhat the reverse of bitmap_read_sb. bitmap_read_sb | ||
542 | * reads and verifies the on-disk bitmap superblock and populates bitmap_info. | ||
543 | * This function verifies 'bitmap_info' and populates the on-disk bitmap | ||
544 | * structure, which is to be written to disk. | ||
545 | * | ||
546 | * Returns: 0 on success, -Exxx on error | ||
547 | */ | ||
548 | static int bitmap_new_disk_sb(struct bitmap *bitmap) | ||
549 | { | ||
550 | bitmap_super_t *sb; | ||
551 | unsigned long chunksize, daemon_sleep, write_behind; | ||
552 | int err = -EINVAL; | ||
553 | |||
554 | bitmap->sb_page = alloc_page(GFP_KERNEL); | ||
555 | if (IS_ERR(bitmap->sb_page)) { | ||
556 | err = PTR_ERR(bitmap->sb_page); | ||
557 | bitmap->sb_page = NULL; | ||
558 | return err; | ||
559 | } | ||
560 | bitmap->sb_page->index = 0; | ||
561 | |||
562 | sb = kmap_atomic(bitmap->sb_page, KM_USER0); | ||
563 | |||
564 | sb->magic = cpu_to_le32(BITMAP_MAGIC); | ||
565 | sb->version = cpu_to_le32(BITMAP_MAJOR_HI); | ||
566 | |||
567 | chunksize = bitmap->mddev->bitmap_info.chunksize; | ||
568 | BUG_ON(!chunksize); | ||
569 | if (!is_power_of_2(chunksize)) { | ||
570 | kunmap_atomic(sb, KM_USER0); | ||
571 | printk(KERN_ERR "bitmap chunksize not a power of 2\n"); | ||
572 | return -EINVAL; | ||
573 | } | ||
574 | sb->chunksize = cpu_to_le32(chunksize); | ||
575 | |||
576 | daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep; | ||
577 | if (!daemon_sleep || | ||
578 | (daemon_sleep < 1) || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) { | ||
579 | printk(KERN_INFO "Choosing daemon_sleep default (5 sec)\n"); | ||
580 | daemon_sleep = 5 * HZ; | ||
581 | } | ||
582 | sb->daemon_sleep = cpu_to_le32(daemon_sleep); | ||
583 | bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; | ||
584 | |||
585 | /* | ||
586 | * FIXME: write_behind for RAID1. If not specified, what | ||
587 | * is a good choice? We choose COUNTER_MAX / 2 arbitrarily. | ||
588 | */ | ||
589 | write_behind = bitmap->mddev->bitmap_info.max_write_behind; | ||
590 | if (write_behind > COUNTER_MAX) | ||
591 | write_behind = COUNTER_MAX / 2; | ||
592 | sb->write_behind = cpu_to_le32(write_behind); | ||
593 | bitmap->mddev->bitmap_info.max_write_behind = write_behind; | ||
594 | |||
595 | /* keep the array size field of the bitmap superblock up to date */ | ||
596 | sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); | ||
597 | |||
598 | memcpy(sb->uuid, bitmap->mddev->uuid, 16); | ||
599 | |||
600 | bitmap->flags |= BITMAP_STALE; | ||
601 | sb->state |= cpu_to_le32(BITMAP_STALE); | ||
602 | bitmap->events_cleared = bitmap->mddev->events; | ||
603 | sb->events_cleared = cpu_to_le64(bitmap->mddev->events); | ||
604 | |||
605 | bitmap->flags |= BITMAP_HOSTENDIAN; | ||
606 | sb->version = cpu_to_le32(BITMAP_MAJOR_HOSTENDIAN); | ||
607 | |||
608 | kunmap_atomic(sb, KM_USER0); | ||
609 | |||
610 | return 0; | ||
611 | } | ||
612 | |||
533 | /* read the superblock from the bitmap file and initialize some bitmap fields */ | 613 | /* read the superblock from the bitmap file and initialize some bitmap fields */ |
534 | static int bitmap_read_sb(struct bitmap *bitmap) | 614 | static int bitmap_read_sb(struct bitmap *bitmap) |
535 | { | 615 | { |
@@ -571,7 +651,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
571 | reason = "unrecognized superblock version"; | 651 | reason = "unrecognized superblock version"; |
572 | else if (chunksize < 512) | 652 | else if (chunksize < 512) |
573 | reason = "bitmap chunksize too small"; | 653 | reason = "bitmap chunksize too small"; |
574 | else if ((1 << ffz(~chunksize)) != chunksize) | 654 | else if (!is_power_of_2(chunksize)) |
575 | reason = "bitmap chunksize not a power of 2"; | 655 | reason = "bitmap chunksize not a power of 2"; |
576 | else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT) | 656 | else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT) |
577 | reason = "daemon sleep period out of range"; | 657 | reason = "daemon sleep period out of range"; |
@@ -614,7 +694,7 @@ success: | |||
614 | if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) | 694 | if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) |
615 | bitmap->flags |= BITMAP_HOSTENDIAN; | 695 | bitmap->flags |= BITMAP_HOSTENDIAN; |
616 | bitmap->events_cleared = le64_to_cpu(sb->events_cleared); | 696 | bitmap->events_cleared = le64_to_cpu(sb->events_cleared); |
617 | if (sb->state & cpu_to_le32(BITMAP_STALE)) | 697 | if (bitmap->flags & BITMAP_STALE) |
618 | bitmap->events_cleared = bitmap->mddev->events; | 698 | bitmap->events_cleared = bitmap->mddev->events; |
619 | err = 0; | 699 | err = 0; |
620 | out: | 700 | out: |
@@ -648,9 +728,11 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, | |||
648 | switch (op) { | 728 | switch (op) { |
649 | case MASK_SET: | 729 | case MASK_SET: |
650 | sb->state |= cpu_to_le32(bits); | 730 | sb->state |= cpu_to_le32(bits); |
731 | bitmap->flags |= bits; | ||
651 | break; | 732 | break; |
652 | case MASK_UNSET: | 733 | case MASK_UNSET: |
653 | sb->state &= cpu_to_le32(~bits); | 734 | sb->state &= cpu_to_le32(~bits); |
735 | bitmap->flags &= ~bits; | ||
654 | break; | 736 | break; |
655 | default: | 737 | default: |
656 | BUG(); | 738 | BUG(); |
@@ -850,7 +932,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) | |||
850 | if (bitmap->flags & BITMAP_HOSTENDIAN) | 932 | if (bitmap->flags & BITMAP_HOSTENDIAN) |
851 | set_bit(bit, kaddr); | 933 | set_bit(bit, kaddr); |
852 | else | 934 | else |
853 | ext2_set_bit(bit, kaddr); | 935 | __test_and_set_bit_le(bit, kaddr); |
854 | kunmap_atomic(kaddr, KM_USER0); | 936 | kunmap_atomic(kaddr, KM_USER0); |
855 | PRINTK("set file bit %lu page %lu\n", bit, page->index); | 937 | PRINTK("set file bit %lu page %lu\n", bit, page->index); |
856 | } | 938 | } |
@@ -1046,7 +1128,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
1046 | if (bitmap->flags & BITMAP_HOSTENDIAN) | 1128 | if (bitmap->flags & BITMAP_HOSTENDIAN) |
1047 | b = test_bit(bit, paddr); | 1129 | b = test_bit(bit, paddr); |
1048 | else | 1130 | else |
1049 | b = ext2_test_bit(bit, paddr); | 1131 | b = test_bit_le(bit, paddr); |
1050 | kunmap_atomic(paddr, KM_USER0); | 1132 | kunmap_atomic(paddr, KM_USER0); |
1051 | if (b) { | 1133 | if (b) { |
1052 | /* if the disk bit is set, set the memory bit */ | 1134 | /* if the disk bit is set, set the memory bit */ |
@@ -1070,8 +1152,8 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
1070 | } | 1152 | } |
1071 | 1153 | ||
1072 | printk(KERN_INFO "%s: bitmap initialized from disk: " | 1154 | printk(KERN_INFO "%s: bitmap initialized from disk: " |
1073 | "read %lu/%lu pages, set %lu bits\n", | 1155 | "read %lu/%lu pages, set %lu of %lu bits\n", |
1074 | bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt); | 1156 | bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt, chunks); |
1075 | 1157 | ||
1076 | return 0; | 1158 | return 0; |
1077 | 1159 | ||
@@ -1101,7 +1183,7 @@ static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) | |||
1101 | bitmap_checkfree(bitmap, page); | 1183 | bitmap_checkfree(bitmap, page); |
1102 | } | 1184 | } |
1103 | static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, | 1185 | static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, |
1104 | sector_t offset, int *blocks, | 1186 | sector_t offset, sector_t *blocks, |
1105 | int create); | 1187 | int create); |
1106 | 1188 | ||
1107 | /* | 1189 | /* |
@@ -1115,7 +1197,7 @@ void bitmap_daemon_work(mddev_t *mddev) | |||
1115 | unsigned long j; | 1197 | unsigned long j; |
1116 | unsigned long flags; | 1198 | unsigned long flags; |
1117 | struct page *page = NULL, *lastpage = NULL; | 1199 | struct page *page = NULL, *lastpage = NULL; |
1118 | int blocks; | 1200 | sector_t blocks; |
1119 | void *paddr; | 1201 | void *paddr; |
1120 | struct dm_dirty_log *log = mddev->bitmap_info.log; | 1202 | struct dm_dirty_log *log = mddev->bitmap_info.log; |
1121 | 1203 | ||
@@ -1222,7 +1304,7 @@ void bitmap_daemon_work(mddev_t *mddev) | |||
1222 | clear_bit(file_page_offset(bitmap, j), | 1304 | clear_bit(file_page_offset(bitmap, j), |
1223 | paddr); | 1305 | paddr); |
1224 | else | 1306 | else |
1225 | ext2_clear_bit(file_page_offset(bitmap, j), | 1307 | __test_and_clear_bit_le(file_page_offset(bitmap, j), |
1226 | paddr); | 1308 | paddr); |
1227 | kunmap_atomic(paddr, KM_USER0); | 1309 | kunmap_atomic(paddr, KM_USER0); |
1228 | } else | 1310 | } else |
@@ -1258,7 +1340,7 @@ void bitmap_daemon_work(mddev_t *mddev) | |||
1258 | } | 1340 | } |
1259 | 1341 | ||
1260 | static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, | 1342 | static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, |
1261 | sector_t offset, int *blocks, | 1343 | sector_t offset, sector_t *blocks, |
1262 | int create) | 1344 | int create) |
1263 | __releases(bitmap->lock) | 1345 | __releases(bitmap->lock) |
1264 | __acquires(bitmap->lock) | 1346 | __acquires(bitmap->lock) |
@@ -1316,7 +1398,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect | |||
1316 | } | 1398 | } |
1317 | 1399 | ||
1318 | while (sectors) { | 1400 | while (sectors) { |
1319 | int blocks; | 1401 | sector_t blocks; |
1320 | bitmap_counter_t *bmc; | 1402 | bitmap_counter_t *bmc; |
1321 | 1403 | ||
1322 | spin_lock_irq(&bitmap->lock); | 1404 | spin_lock_irq(&bitmap->lock); |
@@ -1326,7 +1408,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect | |||
1326 | return 0; | 1408 | return 0; |
1327 | } | 1409 | } |
1328 | 1410 | ||
1329 | if (unlikely((*bmc & COUNTER_MAX) == COUNTER_MAX)) { | 1411 | if (unlikely(COUNTER(*bmc) == COUNTER_MAX)) { |
1330 | DEFINE_WAIT(__wait); | 1412 | DEFINE_WAIT(__wait); |
1331 | /* note that it is safe to do the prepare_to_wait | 1413 | /* note that it is safe to do the prepare_to_wait |
1332 | * after the test as long as we do it before dropping | 1414 | * after the test as long as we do it before dropping |
@@ -1335,8 +1417,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect | |||
1335 | prepare_to_wait(&bitmap->overflow_wait, &__wait, | 1417 | prepare_to_wait(&bitmap->overflow_wait, &__wait, |
1336 | TASK_UNINTERRUPTIBLE); | 1418 | TASK_UNINTERRUPTIBLE); |
1337 | spin_unlock_irq(&bitmap->lock); | 1419 | spin_unlock_irq(&bitmap->lock); |
1338 | md_unplug(bitmap->mddev); | 1420 | io_schedule(); |
1339 | schedule(); | ||
1340 | finish_wait(&bitmap->overflow_wait, &__wait); | 1421 | finish_wait(&bitmap->overflow_wait, &__wait); |
1341 | continue; | 1422 | continue; |
1342 | } | 1423 | } |
@@ -1381,7 +1462,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto | |||
1381 | success = 0; | 1462 | success = 0; |
1382 | 1463 | ||
1383 | while (sectors) { | 1464 | while (sectors) { |
1384 | int blocks; | 1465 | sector_t blocks; |
1385 | unsigned long flags; | 1466 | unsigned long flags; |
1386 | bitmap_counter_t *bmc; | 1467 | bitmap_counter_t *bmc; |
1387 | 1468 | ||
@@ -1399,10 +1480,10 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto | |||
1399 | sysfs_notify_dirent_safe(bitmap->sysfs_can_clear); | 1480 | sysfs_notify_dirent_safe(bitmap->sysfs_can_clear); |
1400 | } | 1481 | } |
1401 | 1482 | ||
1402 | if (!success && ! (*bmc & NEEDED_MASK)) | 1483 | if (!success && !NEEDED(*bmc)) |
1403 | *bmc |= NEEDED_MASK; | 1484 | *bmc |= NEEDED_MASK; |
1404 | 1485 | ||
1405 | if ((*bmc & COUNTER_MAX) == COUNTER_MAX) | 1486 | if (COUNTER(*bmc) == COUNTER_MAX) |
1406 | wake_up(&bitmap->overflow_wait); | 1487 | wake_up(&bitmap->overflow_wait); |
1407 | 1488 | ||
1408 | (*bmc)--; | 1489 | (*bmc)--; |
@@ -1423,7 +1504,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto | |||
1423 | } | 1504 | } |
1424 | EXPORT_SYMBOL(bitmap_endwrite); | 1505 | EXPORT_SYMBOL(bitmap_endwrite); |
1425 | 1506 | ||
1426 | static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, | 1507 | static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, |
1427 | int degraded) | 1508 | int degraded) |
1428 | { | 1509 | { |
1429 | bitmap_counter_t *bmc; | 1510 | bitmap_counter_t *bmc; |
@@ -1452,7 +1533,7 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *bloc | |||
1452 | return rv; | 1533 | return rv; |
1453 | } | 1534 | } |
1454 | 1535 | ||
1455 | int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, | 1536 | int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, |
1456 | int degraded) | 1537 | int degraded) |
1457 | { | 1538 | { |
1458 | /* bitmap_start_sync must always report on multiples of whole | 1539 | /* bitmap_start_sync must always report on multiples of whole |
@@ -1463,7 +1544,7 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, | |||
1463 | * Return the 'or' of the result. | 1544 | * Return the 'or' of the result. |
1464 | */ | 1545 | */ |
1465 | int rv = 0; | 1546 | int rv = 0; |
1466 | int blocks1; | 1547 | sector_t blocks1; |
1467 | 1548 | ||
1468 | *blocks = 0; | 1549 | *blocks = 0; |
1469 | while (*blocks < (PAGE_SIZE>>9)) { | 1550 | while (*blocks < (PAGE_SIZE>>9)) { |
@@ -1476,7 +1557,7 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, | |||
1476 | } | 1557 | } |
1477 | EXPORT_SYMBOL(bitmap_start_sync); | 1558 | EXPORT_SYMBOL(bitmap_start_sync); |
1478 | 1559 | ||
1479 | void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted) | 1560 | void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted) |
1480 | { | 1561 | { |
1481 | bitmap_counter_t *bmc; | 1562 | bitmap_counter_t *bmc; |
1482 | unsigned long flags; | 1563 | unsigned long flags; |
@@ -1515,7 +1596,7 @@ void bitmap_close_sync(struct bitmap *bitmap) | |||
1515 | * RESYNC bit wherever it is still on | 1596 | * RESYNC bit wherever it is still on |
1516 | */ | 1597 | */ |
1517 | sector_t sector = 0; | 1598 | sector_t sector = 0; |
1518 | int blocks; | 1599 | sector_t blocks; |
1519 | if (!bitmap) | 1600 | if (!bitmap) |
1520 | return; | 1601 | return; |
1521 | while (sector < bitmap->mddev->resync_max_sectors) { | 1602 | while (sector < bitmap->mddev->resync_max_sectors) { |
@@ -1528,7 +1609,7 @@ EXPORT_SYMBOL(bitmap_close_sync); | |||
1528 | void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) | 1609 | void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) |
1529 | { | 1610 | { |
1530 | sector_t s = 0; | 1611 | sector_t s = 0; |
1531 | int blocks; | 1612 | sector_t blocks; |
1532 | 1613 | ||
1533 | if (!bitmap) | 1614 | if (!bitmap) |
1534 | return; | 1615 | return; |
@@ -1542,7 +1623,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) | |||
1542 | wait_event(bitmap->mddev->recovery_wait, | 1623 | wait_event(bitmap->mddev->recovery_wait, |
1543 | atomic_read(&bitmap->mddev->recovery_active) == 0); | 1624 | atomic_read(&bitmap->mddev->recovery_active) == 0); |
1544 | 1625 | ||
1545 | bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync; | 1626 | bitmap->mddev->curr_resync_completed = sector; |
1546 | set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); | 1627 | set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); |
1547 | sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); | 1628 | sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); |
1548 | s = 0; | 1629 | s = 0; |
@@ -1562,7 +1643,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n | |||
1562 | * be 0 at this point | 1643 | * be 0 at this point |
1563 | */ | 1644 | */ |
1564 | 1645 | ||
1565 | int secs; | 1646 | sector_t secs; |
1566 | bitmap_counter_t *bmc; | 1647 | bitmap_counter_t *bmc; |
1567 | spin_lock_irq(&bitmap->lock); | 1648 | spin_lock_irq(&bitmap->lock); |
1568 | bmc = bitmap_get_counter(bitmap, offset, &secs, 1); | 1649 | bmc = bitmap_get_counter(bitmap, offset, &secs, 1); |
@@ -1723,9 +1804,16 @@ int bitmap_create(mddev_t *mddev) | |||
1723 | vfs_fsync(file, 1); | 1804 | vfs_fsync(file, 1); |
1724 | } | 1805 | } |
1725 | /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */ | 1806 | /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */ |
1726 | if (!mddev->bitmap_info.external) | 1807 | if (!mddev->bitmap_info.external) { |
1727 | err = bitmap_read_sb(bitmap); | 1808 | /* |
1728 | else { | 1809 | * If 'MD_ARRAY_FIRST_USE' is set, then device-mapper is |
1810 | * instructing us to create a new on-disk bitmap instance. | ||
1811 | */ | ||
1812 | if (test_and_clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags)) | ||
1813 | err = bitmap_new_disk_sb(bitmap); | ||
1814 | else | ||
1815 | err = bitmap_read_sb(bitmap); | ||
1816 | } else { | ||
1729 | err = 0; | 1817 | err = 0; |
1730 | if (mddev->bitmap_info.chunksize == 0 || | 1818 | if (mddev->bitmap_info.chunksize == 0 || |
1731 | mddev->bitmap_info.daemon_sleep == 0) | 1819 | mddev->bitmap_info.daemon_sleep == 0) |
@@ -1749,9 +1837,6 @@ int bitmap_create(mddev_t *mddev) | |||
1749 | bitmap->chunks = chunks; | 1837 | bitmap->chunks = chunks; |
1750 | bitmap->pages = pages; | 1838 | bitmap->pages = pages; |
1751 | bitmap->missing_pages = pages; | 1839 | bitmap->missing_pages = pages; |
1752 | bitmap->counter_bits = COUNTER_BITS; | ||
1753 | |||
1754 | bitmap->syncchunk = ~0UL; | ||
1755 | 1840 | ||
1756 | #ifdef INJECT_FATAL_FAULT_1 | 1841 | #ifdef INJECT_FATAL_FAULT_1 |
1757 | bitmap->bp = NULL; | 1842 | bitmap->bp = NULL; |
@@ -1790,7 +1875,7 @@ int bitmap_load(mddev_t *mddev) | |||
1790 | * All chunks should be clean, but some might need_sync. | 1875 | * All chunks should be clean, but some might need_sync. |
1791 | */ | 1876 | */ |
1792 | while (sector < mddev->resync_max_sectors) { | 1877 | while (sector < mddev->resync_max_sectors) { |
1793 | int blocks; | 1878 | sector_t blocks; |
1794 | bitmap_start_sync(bitmap, sector, &blocks, 0); | 1879 | bitmap_start_sync(bitmap, sector, &blocks, 0); |
1795 | sector += blocks; | 1880 | sector += blocks; |
1796 | } | 1881 | } |
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index e872a7bad6b8..b2a127e891ac 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -45,7 +45,7 @@ | |||
45 | * | 45 | * |
46 | * The counter counts pending write requests, plus the on-disk bit. | 46 | * The counter counts pending write requests, plus the on-disk bit. |
47 | * When the counter is '1' and the resync bits are clear, the on-disk | 47 | * When the counter is '1' and the resync bits are clear, the on-disk |
48 | * bit can be cleared aswell, thus setting the counter to 0. | 48 | * bit can be cleared as well, thus setting the counter to 0. |
49 | * When we set a bit, or in the counter (to start a write), if the fields is | 49 | * When we set a bit, or in the counter (to start a write), if the fields is |
50 | * 0, we first set the disk bit and set the counter to 1. | 50 | * 0, we first set the disk bit and set the counter to 1. |
51 | * | 51 | * |
@@ -85,7 +85,6 @@ | |||
85 | typedef __u16 bitmap_counter_t; | 85 | typedef __u16 bitmap_counter_t; |
86 | #define COUNTER_BITS 16 | 86 | #define COUNTER_BITS 16 |
87 | #define COUNTER_BIT_SHIFT 4 | 87 | #define COUNTER_BIT_SHIFT 4 |
88 | #define COUNTER_BYTE_RATIO (COUNTER_BITS / 8) | ||
89 | #define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) | 88 | #define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) |
90 | 89 | ||
91 | #define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) | 90 | #define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) |
@@ -196,19 +195,10 @@ struct bitmap { | |||
196 | 195 | ||
197 | mddev_t *mddev; /* the md device that the bitmap is for */ | 196 | mddev_t *mddev; /* the md device that the bitmap is for */ |
198 | 197 | ||
199 | int counter_bits; /* how many bits per block counter */ | ||
200 | |||
201 | /* bitmap chunksize -- how much data does each bit represent? */ | 198 | /* bitmap chunksize -- how much data does each bit represent? */ |
202 | unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ | 199 | unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ |
203 | unsigned long chunks; /* total number of data chunks for the array */ | 200 | unsigned long chunks; /* total number of data chunks for the array */ |
204 | 201 | ||
205 | /* We hold a count on the chunk currently being synced, and drop | ||
206 | * it when the last block is started. If the resync is aborted | ||
207 | * midway, we need to be able to drop that count, so we remember | ||
208 | * the counted chunk.. | ||
209 | */ | ||
210 | unsigned long syncchunk; | ||
211 | |||
212 | __u64 events_cleared; | 202 | __u64 events_cleared; |
213 | int need_sync; | 203 | int need_sync; |
214 | 204 | ||
@@ -271,8 +261,8 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, | |||
271 | unsigned long sectors, int behind); | 261 | unsigned long sectors, int behind); |
272 | void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, | 262 | void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, |
273 | unsigned long sectors, int success, int behind); | 263 | unsigned long sectors, int success, int behind); |
274 | int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded); | 264 | int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); |
275 | void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); | 265 | void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); |
276 | void bitmap_close_sync(struct bitmap *bitmap); | 266 | void bitmap_close_sync(struct bitmap *bitmap); |
277 | void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); | 267 | void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); |
278 | 268 | ||
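
The bitmap.c hunks above test counters with COUNTER(*bmc), NEEDED(*bmc) and COUNTER_MAX, but only NEEDED_MASK is visible in this excerpt. The following is a minimal sketch of the 16-bit layout those accessors imply; the resync bit and COUNTER_MAX definitions are assumptions for illustration, not quotes from the header:

/* Sketch of the counter layout implied above (illustration only). */
#include <linux/types.h>

typedef __u16 bitmap_counter_t;

#define COUNTER_BITS 16
#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))	/* assumed */
#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)		/* assumed */

#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)
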
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 368e8e98f705..c8827ffd85bb 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -18,10 +18,14 @@ | |||
18 | #include <linux/crypto.h> | 18 | #include <linux/crypto.h> |
19 | #include <linux/workqueue.h> | 19 | #include <linux/workqueue.h> |
20 | #include <linux/backing-dev.h> | 20 | #include <linux/backing-dev.h> |
21 | #include <linux/percpu.h> | ||
21 | #include <asm/atomic.h> | 22 | #include <asm/atomic.h> |
22 | #include <linux/scatterlist.h> | 23 | #include <linux/scatterlist.h> |
23 | #include <asm/page.h> | 24 | #include <asm/page.h> |
24 | #include <asm/unaligned.h> | 25 | #include <asm/unaligned.h> |
26 | #include <crypto/hash.h> | ||
27 | #include <crypto/md5.h> | ||
28 | #include <crypto/algapi.h> | ||
25 | 29 | ||
26 | #include <linux/device-mapper.h> | 30 | #include <linux/device-mapper.h> |
27 | 31 | ||
@@ -63,6 +67,7 @@ struct dm_crypt_request { | |||
63 | struct convert_context *ctx; | 67 | struct convert_context *ctx; |
64 | struct scatterlist sg_in; | 68 | struct scatterlist sg_in; |
65 | struct scatterlist sg_out; | 69 | struct scatterlist sg_out; |
70 | sector_t iv_sector; | ||
66 | }; | 71 | }; |
67 | 72 | ||
68 | struct crypt_config; | 73 | struct crypt_config; |
@@ -73,11 +78,13 @@ struct crypt_iv_operations { | |||
73 | void (*dtr)(struct crypt_config *cc); | 78 | void (*dtr)(struct crypt_config *cc); |
74 | int (*init)(struct crypt_config *cc); | 79 | int (*init)(struct crypt_config *cc); |
75 | int (*wipe)(struct crypt_config *cc); | 80 | int (*wipe)(struct crypt_config *cc); |
76 | int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); | 81 | int (*generator)(struct crypt_config *cc, u8 *iv, |
82 | struct dm_crypt_request *dmreq); | ||
83 | int (*post)(struct crypt_config *cc, u8 *iv, | ||
84 | struct dm_crypt_request *dmreq); | ||
77 | }; | 85 | }; |
78 | 86 | ||
79 | struct iv_essiv_private { | 87 | struct iv_essiv_private { |
80 | struct crypto_cipher *tfm; | ||
81 | struct crypto_hash *hash_tfm; | 88 | struct crypto_hash *hash_tfm; |
82 | u8 *salt; | 89 | u8 *salt; |
83 | }; | 90 | }; |
@@ -86,11 +93,32 @@ struct iv_benbi_private { | |||
86 | int shift; | 93 | int shift; |
87 | }; | 94 | }; |
88 | 95 | ||
96 | #define LMK_SEED_SIZE 64 /* hash + 0 */ | ||
97 | struct iv_lmk_private { | ||
98 | struct crypto_shash *hash_tfm; | ||
99 | u8 *seed; | ||
100 | }; | ||
101 | |||
89 | /* | 102 | /* |
90 | * Crypt: maps a linear range of a block device | 103 | * Crypt: maps a linear range of a block device |
91 | * and encrypts / decrypts at the same time. | 104 | * and encrypts / decrypts at the same time. |
92 | */ | 105 | */ |
93 | enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; | 106 | enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; |
107 | |||
108 | /* | ||
109 | * Duplicated per-CPU state for cipher. | ||
110 | */ | ||
111 | struct crypt_cpu { | ||
112 | struct ablkcipher_request *req; | ||
113 | /* ESSIV: struct crypto_cipher *essiv_tfm */ | ||
114 | void *iv_private; | ||
115 | struct crypto_ablkcipher *tfms[0]; | ||
116 | }; | ||
117 | |||
118 | /* | ||
119 | * The fields in here must be read only after initialization, | ||
120 | * changing state should be in crypt_cpu. | ||
121 | */ | ||
94 | struct crypt_config { | 122 | struct crypt_config { |
95 | struct dm_dev *dev; | 123 | struct dm_dev *dev; |
96 | sector_t start; | 124 | sector_t start; |
@@ -108,17 +136,25 @@ struct crypt_config { | |||
108 | struct workqueue_struct *crypt_queue; | 136 | struct workqueue_struct *crypt_queue; |
109 | 137 | ||
110 | char *cipher; | 138 | char *cipher; |
111 | char *cipher_mode; | 139 | char *cipher_string; |
112 | 140 | ||
113 | struct crypt_iv_operations *iv_gen_ops; | 141 | struct crypt_iv_operations *iv_gen_ops; |
114 | union { | 142 | union { |
115 | struct iv_essiv_private essiv; | 143 | struct iv_essiv_private essiv; |
116 | struct iv_benbi_private benbi; | 144 | struct iv_benbi_private benbi; |
145 | struct iv_lmk_private lmk; | ||
117 | } iv_gen_private; | 146 | } iv_gen_private; |
118 | sector_t iv_offset; | 147 | sector_t iv_offset; |
119 | unsigned int iv_size; | 148 | unsigned int iv_size; |
120 | 149 | ||
121 | /* | 150 | /* |
151 | * Duplicated per cpu state. Access through | ||
152 | * per_cpu_ptr() only. | ||
153 | */ | ||
154 | struct crypt_cpu __percpu *cpu; | ||
155 | unsigned tfms_count; | ||
156 | |||
157 | /* | ||
122 | * Layout of each crypto request: | 158 | * Layout of each crypto request: |
123 | * | 159 | * |
124 | * struct ablkcipher_request | 160 | * struct ablkcipher_request |
@@ -132,11 +168,10 @@ struct crypt_config { | |||
132 | * correctly aligned. | 168 | * correctly aligned. |
133 | */ | 169 | */ |
134 | unsigned int dmreq_start; | 170 | unsigned int dmreq_start; |
135 | struct ablkcipher_request *req; | ||
136 | 171 | ||
137 | struct crypto_ablkcipher *tfm; | ||
138 | unsigned long flags; | 172 | unsigned long flags; |
139 | unsigned int key_size; | 173 | unsigned int key_size; |
174 | unsigned int key_parts; | ||
140 | u8 key[0]; | 175 | u8 key[0]; |
141 | }; | 176 | }; |
142 | 177 | ||
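
The comment added above notes that the duplicated cipher state must be reached through per_cpu_ptr() only. For readers less familiar with the kernel's per-CPU API, here is a generic, minimal sketch of that pattern; the struct and function names are placeholders, not taken from the patch, and dm-crypt itself relies on per-CPU workqueues so that this_cpu_ptr() runs on a stable CPU:

/* Generic per-CPU state pattern (placeholder names; illustration only). */
#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/errno.h>

struct example_cpu_state {
	unsigned long uses;
};

static struct example_cpu_state __percpu *example_state;

static int example_init(void)
{
	int cpu;

	example_state = alloc_percpu(struct example_cpu_state);
	if (!example_state)
		return -ENOMEM;

	/* Slow path: visit every CPU's copy, as the ESSIV setup below does. */
	for_each_possible_cpu(cpu)
		per_cpu_ptr(example_state, cpu)->uses = 0;

	return 0;
}

static void example_fast_path(void)
{
	/* Touch only this CPU's copy; caller must not migrate meanwhile. */
	this_cpu_ptr(example_state)->uses++;
}

static void example_exit(void)
{
	free_percpu(example_state);
}
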
@@ -148,6 +183,20 @@ static struct kmem_cache *_crypt_io_pool; | |||
148 | 183 | ||
149 | static void clone_init(struct dm_crypt_io *, struct bio *); | 184 | static void clone_init(struct dm_crypt_io *, struct bio *); |
150 | static void kcryptd_queue_crypt(struct dm_crypt_io *io); | 185 | static void kcryptd_queue_crypt(struct dm_crypt_io *io); |
186 | static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq); | ||
187 | |||
188 | static struct crypt_cpu *this_crypt_config(struct crypt_config *cc) | ||
189 | { | ||
190 | return this_cpu_ptr(cc->cpu); | ||
191 | } | ||
192 | |||
193 | /* | ||
194 | * Use this to access cipher attributes that are the same for each CPU. | ||
195 | */ | ||
196 | static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) | ||
197 | { | ||
198 | return __this_cpu_ptr(cc->cpu)->tfms[0]; | ||
199 | } | ||
151 | 200 | ||
152 | /* | 201 | /* |
153 | * Different IV generation algorithms: | 202 | * Different IV generation algorithms: |
@@ -168,23 +217,38 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io); | |||
168 | * null: the initial vector is always zero. Provides compatibility with | 217 | * null: the initial vector is always zero. Provides compatibility with |
169 | * obsolete loop_fish2 devices. Do not use for new devices. | 218 | * obsolete loop_fish2 devices. Do not use for new devices. |
170 | * | 219 | * |
220 | * lmk: Compatible implementation of the block chaining mode used | ||
221 | * by the Loop-AES block device encryption system | ||
222 | * designed by Jari Ruusu. See http://loop-aes.sourceforge.net/ | ||
223 | * It operates on full 512 byte sectors and uses CBC | ||
224 | * with an IV derived from the sector number, the data and | ||
225 | * optionally extra IV seed. | ||
226 | * This means that after decryption the first block | ||
227 | * of sector must be tweaked according to decrypted data. | ||
228 | * Loop-AES can use three encryption schemes: | ||
229 | * version 1: is plain aes-cbc mode | ||
230 | * version 2: uses 64 multikey scheme with lmk IV generator | ||
231 | * version 3: the same as version 2 with additional IV seed | ||
232 | * (it uses 65 keys, last key is used as IV seed) | ||
233 | * | ||
171 | * plumb: unimplemented, see: | 234 | * plumb: unimplemented, see: |
172 | * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 | 235 | * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 |
173 | */ | 236 | */ |
174 | 237 | ||
175 | static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector) | 238 | static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, |
239 | struct dm_crypt_request *dmreq) | ||
176 | { | 240 | { |
177 | memset(iv, 0, cc->iv_size); | 241 | memset(iv, 0, cc->iv_size); |
178 | *(u32 *)iv = cpu_to_le32(sector & 0xffffffff); | 242 | *(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff); |
179 | 243 | ||
180 | return 0; | 244 | return 0; |
181 | } | 245 | } |
182 | 246 | ||
183 | static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, | 247 | static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, |
184 | sector_t sector) | 248 | struct dm_crypt_request *dmreq) |
185 | { | 249 | { |
186 | memset(iv, 0, cc->iv_size); | 250 | memset(iv, 0, cc->iv_size); |
187 | *(u64 *)iv = cpu_to_le64(sector); | 251 | *(u64 *)iv = cpu_to_le64(dmreq->iv_sector); |
188 | 252 | ||
189 | return 0; | 253 | return 0; |
190 | } | 254 | } |
@@ -195,7 +259,8 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) | |||
195 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; | 259 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; |
196 | struct hash_desc desc; | 260 | struct hash_desc desc; |
197 | struct scatterlist sg; | 261 | struct scatterlist sg; |
198 | int err; | 262 | struct crypto_cipher *essiv_tfm; |
263 | int err, cpu; | ||
199 | 264 | ||
200 | sg_init_one(&sg, cc->key, cc->key_size); | 265 | sg_init_one(&sg, cc->key, cc->key_size); |
201 | desc.tfm = essiv->hash_tfm; | 266 | desc.tfm = essiv->hash_tfm; |
@@ -205,8 +270,16 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) | |||
205 | if (err) | 270 | if (err) |
206 | return err; | 271 | return err; |
207 | 272 | ||
208 | return crypto_cipher_setkey(essiv->tfm, essiv->salt, | 273 | for_each_possible_cpu(cpu) { |
274 | essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private, | ||
275 | |||
276 | err = crypto_cipher_setkey(essiv_tfm, essiv->salt, | ||
209 | crypto_hash_digestsize(essiv->hash_tfm)); | 277 | crypto_hash_digestsize(essiv->hash_tfm)); |
278 | if (err) | ||
279 | return err; | ||
280 | } | ||
281 | |||
282 | return 0; | ||
210 | } | 283 | } |
211 | 284 | ||
212 | /* Wipe salt and reset key derived from volume key */ | 285 | /* Wipe salt and reset key derived from volume key */ |
@@ -214,24 +287,76 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc) | |||
214 | { | 287 | { |
215 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; | 288 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; |
216 | unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); | 289 | unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); |
290 | struct crypto_cipher *essiv_tfm; | ||
291 | int cpu, r, err = 0; | ||
217 | 292 | ||
218 | memset(essiv->salt, 0, salt_size); | 293 | memset(essiv->salt, 0, salt_size); |
219 | 294 | ||
220 | return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size); | 295 | for_each_possible_cpu(cpu) { |
296 | essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private; | ||
297 | r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size); | ||
298 | if (r) | ||
299 | err = r; | ||
300 | } | ||
301 | |||
302 | return err; | ||
303 | } | ||
304 | |||
305 | /* Set up per cpu cipher state */ | ||
306 | static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc, | ||
307 | struct dm_target *ti, | ||
308 | u8 *salt, unsigned saltsize) | ||
309 | { | ||
310 | struct crypto_cipher *essiv_tfm; | ||
311 | int err; | ||
312 | |||
313 | /* Setup the essiv_tfm with the given salt */ | ||
314 | essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); | ||
315 | if (IS_ERR(essiv_tfm)) { | ||
316 | ti->error = "Error allocating crypto tfm for ESSIV"; | ||
317 | return essiv_tfm; | ||
318 | } | ||
319 | |||
320 | if (crypto_cipher_blocksize(essiv_tfm) != | ||
321 | crypto_ablkcipher_ivsize(any_tfm(cc))) { | ||
322 | ti->error = "Block size of ESSIV cipher does " | ||
323 | "not match IV size of block cipher"; | ||
324 | crypto_free_cipher(essiv_tfm); | ||
325 | return ERR_PTR(-EINVAL); | ||
326 | } | ||
327 | |||
328 | err = crypto_cipher_setkey(essiv_tfm, salt, saltsize); | ||
329 | if (err) { | ||
330 | ti->error = "Failed to set key for ESSIV cipher"; | ||
331 | crypto_free_cipher(essiv_tfm); | ||
332 | return ERR_PTR(err); | ||
333 | } | ||
334 | |||
335 | return essiv_tfm; | ||
221 | } | 336 | } |
222 | 337 | ||
223 | static void crypt_iv_essiv_dtr(struct crypt_config *cc) | 338 | static void crypt_iv_essiv_dtr(struct crypt_config *cc) |
224 | { | 339 | { |
340 | int cpu; | ||
341 | struct crypt_cpu *cpu_cc; | ||
342 | struct crypto_cipher *essiv_tfm; | ||
225 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; | 343 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; |
226 | 344 | ||
227 | crypto_free_cipher(essiv->tfm); | ||
228 | essiv->tfm = NULL; | ||
229 | |||
230 | crypto_free_hash(essiv->hash_tfm); | 345 | crypto_free_hash(essiv->hash_tfm); |
231 | essiv->hash_tfm = NULL; | 346 | essiv->hash_tfm = NULL; |
232 | 347 | ||
233 | kzfree(essiv->salt); | 348 | kzfree(essiv->salt); |
234 | essiv->salt = NULL; | 349 | essiv->salt = NULL; |
350 | |||
351 | for_each_possible_cpu(cpu) { | ||
352 | cpu_cc = per_cpu_ptr(cc->cpu, cpu); | ||
353 | essiv_tfm = cpu_cc->iv_private; | ||
354 | |||
355 | if (essiv_tfm) | ||
356 | crypto_free_cipher(essiv_tfm); | ||
357 | |||
358 | cpu_cc->iv_private = NULL; | ||
359 | } | ||
235 | } | 360 | } |
236 | 361 | ||
237 | static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, | 362 | static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, |
@@ -240,7 +365,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, | |||
240 | struct crypto_cipher *essiv_tfm = NULL; | 365 | struct crypto_cipher *essiv_tfm = NULL; |
241 | struct crypto_hash *hash_tfm = NULL; | 366 | struct crypto_hash *hash_tfm = NULL; |
242 | u8 *salt = NULL; | 367 | u8 *salt = NULL; |
243 | int err; | 368 | int err, cpu; |
244 | 369 | ||
245 | if (!opts) { | 370 | if (!opts) { |
246 | ti->error = "Digest algorithm missing for ESSIV mode"; | 371 | ti->error = "Digest algorithm missing for ESSIV mode"; |
@@ -262,48 +387,44 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, | |||
262 | goto bad; | 387 | goto bad; |
263 | } | 388 | } |
264 | 389 | ||
265 | /* Allocate essiv_tfm */ | ||
266 | essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); | ||
267 | if (IS_ERR(essiv_tfm)) { | ||
268 | ti->error = "Error allocating crypto tfm for ESSIV"; | ||
269 | err = PTR_ERR(essiv_tfm); | ||
270 | goto bad; | ||
271 | } | ||
272 | if (crypto_cipher_blocksize(essiv_tfm) != | ||
273 | crypto_ablkcipher_ivsize(cc->tfm)) { | ||
274 | ti->error = "Block size of ESSIV cipher does " | ||
275 | "not match IV size of block cipher"; | ||
276 | err = -EINVAL; | ||
277 | goto bad; | ||
278 | } | ||
279 | |||
280 | cc->iv_gen_private.essiv.salt = salt; | 390 | cc->iv_gen_private.essiv.salt = salt; |
281 | cc->iv_gen_private.essiv.tfm = essiv_tfm; | ||
282 | cc->iv_gen_private.essiv.hash_tfm = hash_tfm; | 391 | cc->iv_gen_private.essiv.hash_tfm = hash_tfm; |
283 | 392 | ||
393 | for_each_possible_cpu(cpu) { | ||
394 | essiv_tfm = setup_essiv_cpu(cc, ti, salt, | ||
395 | crypto_hash_digestsize(hash_tfm)); | ||
396 | if (IS_ERR(essiv_tfm)) { | ||
397 | crypt_iv_essiv_dtr(cc); | ||
398 | return PTR_ERR(essiv_tfm); | ||
399 | } | ||
400 | per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm; | ||
401 | } | ||
402 | |||
284 | return 0; | 403 | return 0; |
285 | 404 | ||
286 | bad: | 405 | bad: |
287 | if (essiv_tfm && !IS_ERR(essiv_tfm)) | ||
288 | crypto_free_cipher(essiv_tfm); | ||
289 | if (hash_tfm && !IS_ERR(hash_tfm)) | 406 | if (hash_tfm && !IS_ERR(hash_tfm)) |
290 | crypto_free_hash(hash_tfm); | 407 | crypto_free_hash(hash_tfm); |
291 | kfree(salt); | 408 | kfree(salt); |
292 | return err; | 409 | return err; |
293 | } | 410 | } |
294 | 411 | ||
295 | static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) | 412 | static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, |
413 | struct dm_crypt_request *dmreq) | ||
296 | { | 414 | { |
415 | struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private; | ||
416 | |||
297 | memset(iv, 0, cc->iv_size); | 417 | memset(iv, 0, cc->iv_size); |
298 | *(u64 *)iv = cpu_to_le64(sector); | 418 | *(u64 *)iv = cpu_to_le64(dmreq->iv_sector); |
299 | crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv); | 419 | crypto_cipher_encrypt_one(essiv_tfm, iv, iv); |
420 | |||
300 | return 0; | 421 | return 0; |
301 | } | 422 | } |
302 | 423 | ||
303 | static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, | 424 | static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, |
304 | const char *opts) | 425 | const char *opts) |
305 | { | 426 | { |
306 | unsigned bs = crypto_ablkcipher_blocksize(cc->tfm); | 427 | unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc)); |
307 | int log = ilog2(bs); | 428 | int log = ilog2(bs); |
308 | 429 | ||
309 | /* we need to calculate how far we must shift the sector count | 430 | /* we need to calculate how far we must shift the sector count |
@@ -328,25 +449,177 @@ static void crypt_iv_benbi_dtr(struct crypt_config *cc) | |||
328 | { | 449 | { |
329 | } | 450 | } |
330 | 451 | ||
331 | static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector) | 452 | static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, |
453 | struct dm_crypt_request *dmreq) | ||
332 | { | 454 | { |
333 | __be64 val; | 455 | __be64 val; |
334 | 456 | ||
335 | memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ | 457 | memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ |
336 | 458 | ||
337 | val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1); | 459 | val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1); |
338 | put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); | 460 | put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); |
339 | 461 | ||
340 | return 0; | 462 | return 0; |
341 | } | 463 | } |
342 | 464 | ||
343 | static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector) | 465 | static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, |
466 | struct dm_crypt_request *dmreq) | ||
344 | { | 467 | { |
345 | memset(iv, 0, cc->iv_size); | 468 | memset(iv, 0, cc->iv_size); |
346 | 469 | ||
347 | return 0; | 470 | return 0; |
348 | } | 471 | } |
349 | 472 | ||
473 | static void crypt_iv_lmk_dtr(struct crypt_config *cc) | ||
474 | { | ||
475 | struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; | ||
476 | |||
477 | if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm)) | ||
478 | crypto_free_shash(lmk->hash_tfm); | ||
479 | lmk->hash_tfm = NULL; | ||
480 | |||
481 | kzfree(lmk->seed); | ||
482 | lmk->seed = NULL; | ||
483 | } | ||
484 | |||
485 | static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti, | ||
486 | const char *opts) | ||
487 | { | ||
488 | struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; | ||
489 | |||
490 | lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0); | ||
491 | if (IS_ERR(lmk->hash_tfm)) { | ||
492 | ti->error = "Error initializing LMK hash"; | ||
493 | return PTR_ERR(lmk->hash_tfm); | ||
494 | } | ||
495 | |||
496 | /* No seed in LMK version 2 */ | ||
497 | if (cc->key_parts == cc->tfms_count) { | ||
498 | lmk->seed = NULL; | ||
499 | return 0; | ||
500 | } | ||
501 | |||
502 | lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL); | ||
503 | if (!lmk->seed) { | ||
504 | crypt_iv_lmk_dtr(cc); | ||
505 | ti->error = "Error kmallocing seed storage in LMK"; | ||
506 | return -ENOMEM; | ||
507 | } | ||
508 | |||
509 | return 0; | ||
510 | } | ||
511 | |||
512 | static int crypt_iv_lmk_init(struct crypt_config *cc) | ||
513 | { | ||
514 | struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; | ||
515 | int subkey_size = cc->key_size / cc->key_parts; | ||
516 | |||
517 | /* LMK seed is on the position of LMK_KEYS + 1 key */ | ||
518 | if (lmk->seed) | ||
519 | memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size), | ||
520 | crypto_shash_digestsize(lmk->hash_tfm)); | ||
521 | |||
522 | return 0; | ||
523 | } | ||
524 | |||
525 | static int crypt_iv_lmk_wipe(struct crypt_config *cc) | ||
526 | { | ||
527 | struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; | ||
528 | |||
529 | if (lmk->seed) | ||
530 | memset(lmk->seed, 0, LMK_SEED_SIZE); | ||
531 | |||
532 | return 0; | ||
533 | } | ||
534 | |||
535 | static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, | ||
536 | struct dm_crypt_request *dmreq, | ||
537 | u8 *data) | ||
538 | { | ||
539 | struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; | ||
540 | struct { | ||
541 | struct shash_desc desc; | ||
542 | char ctx[crypto_shash_descsize(lmk->hash_tfm)]; | ||
543 | } sdesc; | ||
544 | struct md5_state md5state; | ||
545 | u32 buf[4]; | ||
546 | int i, r; | ||
547 | |||
548 | sdesc.desc.tfm = lmk->hash_tfm; | ||
549 | sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
550 | |||
551 | r = crypto_shash_init(&sdesc.desc); | ||
552 | if (r) | ||
553 | return r; | ||
554 | |||
555 | if (lmk->seed) { | ||
556 | r = crypto_shash_update(&sdesc.desc, lmk->seed, LMK_SEED_SIZE); | ||
557 | if (r) | ||
558 | return r; | ||
559 | } | ||
560 | |||
561 | /* Sector is always 512B, block size 16, add data of blocks 1-31 */ | ||
562 | r = crypto_shash_update(&sdesc.desc, data + 16, 16 * 31); | ||
563 | if (r) | ||
564 | return r; | ||
565 | |||
566 | /* Sector is cropped to 56 bits here */ | ||
567 | buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF); | ||
568 | buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000); | ||
569 | buf[2] = cpu_to_le32(4024); | ||
570 | buf[3] = 0; | ||
571 | r = crypto_shash_update(&sdesc.desc, (u8 *)buf, sizeof(buf)); | ||
572 | if (r) | ||
573 | return r; | ||
574 | |||
575 | /* No MD5 padding here */ | ||
576 | r = crypto_shash_export(&sdesc.desc, &md5state); | ||
577 | if (r) | ||
578 | return r; | ||
579 | |||
580 | for (i = 0; i < MD5_HASH_WORDS; i++) | ||
581 | __cpu_to_le32s(&md5state.hash[i]); | ||
582 | memcpy(iv, &md5state.hash, cc->iv_size); | ||
583 | |||
584 | return 0; | ||
585 | } | ||
586 | |||
587 | static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, | ||
588 | struct dm_crypt_request *dmreq) | ||
589 | { | ||
590 | u8 *src; | ||
591 | int r = 0; | ||
592 | |||
593 | if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { | ||
594 | src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0); | ||
595 | r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset); | ||
596 | kunmap_atomic(src, KM_USER0); | ||
597 | } else | ||
598 | memset(iv, 0, cc->iv_size); | ||
599 | |||
600 | return r; | ||
601 | } | ||
602 | |||
603 | static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, | ||
604 | struct dm_crypt_request *dmreq) | ||
605 | { | ||
606 | u8 *dst; | ||
607 | int r; | ||
608 | |||
609 | if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) | ||
610 | return 0; | ||
611 | |||
612 | dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0); | ||
613 | r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset); | ||
614 | |||
615 | /* Tweak the first block of plaintext sector */ | ||
616 | if (!r) | ||
617 | crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size); | ||
618 | |||
619 | kunmap_atomic(dst, KM_USER0); | ||
620 | return r; | ||
621 | } | ||
622 | |||
350 | static struct crypt_iv_operations crypt_iv_plain_ops = { | 623 | static struct crypt_iv_operations crypt_iv_plain_ops = { |
351 | .generator = crypt_iv_plain_gen | 624 | .generator = crypt_iv_plain_gen |
352 | }; | 625 | }; |
@@ -373,6 +646,15 @@ static struct crypt_iv_operations crypt_iv_null_ops = { | |||
373 | .generator = crypt_iv_null_gen | 646 | .generator = crypt_iv_null_gen |
374 | }; | 647 | }; |
375 | 648 | ||
649 | static struct crypt_iv_operations crypt_iv_lmk_ops = { | ||
650 | .ctr = crypt_iv_lmk_ctr, | ||
651 | .dtr = crypt_iv_lmk_dtr, | ||
652 | .init = crypt_iv_lmk_init, | ||
653 | .wipe = crypt_iv_lmk_wipe, | ||
654 | .generator = crypt_iv_lmk_gen, | ||
655 | .post = crypt_iv_lmk_post | ||
656 | }; | ||
657 | |||
376 | static void crypt_convert_init(struct crypt_config *cc, | 658 | static void crypt_convert_init(struct crypt_config *cc, |
377 | struct convert_context *ctx, | 659 | struct convert_context *ctx, |
378 | struct bio *bio_out, struct bio *bio_in, | 660 | struct bio *bio_out, struct bio *bio_in, |
@@ -400,6 +682,13 @@ static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc, | |||
400 | return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); | 682 | return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); |
401 | } | 683 | } |
402 | 684 | ||
685 | static u8 *iv_of_dmreq(struct crypt_config *cc, | ||
686 | struct dm_crypt_request *dmreq) | ||
687 | { | ||
688 | return (u8 *)ALIGN((unsigned long)(dmreq + 1), | ||
689 | crypto_ablkcipher_alignmask(any_tfm(cc)) + 1); | ||
690 | } | ||
691 | |||
403 | static int crypt_convert_block(struct crypt_config *cc, | 692 | static int crypt_convert_block(struct crypt_config *cc, |
404 | struct convert_context *ctx, | 693 | struct convert_context *ctx, |
405 | struct ablkcipher_request *req) | 694 | struct ablkcipher_request *req) |
@@ -411,9 +700,9 @@ static int crypt_convert_block(struct crypt_config *cc, | |||
411 | int r = 0; | 700 | int r = 0; |
412 | 701 | ||
413 | dmreq = dmreq_of_req(cc, req); | 702 | dmreq = dmreq_of_req(cc, req); |
414 | iv = (u8 *)ALIGN((unsigned long)(dmreq + 1), | 703 | iv = iv_of_dmreq(cc, dmreq); |
415 | crypto_ablkcipher_alignmask(cc->tfm) + 1); | ||
416 | 704 | ||
705 | dmreq->iv_sector = ctx->sector; | ||
417 | dmreq->ctx = ctx; | 706 | dmreq->ctx = ctx; |
418 | sg_init_table(&dmreq->sg_in, 1); | 707 | sg_init_table(&dmreq->sg_in, 1); |
419 | sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, | 708 | sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, |
@@ -436,7 +725,7 @@ static int crypt_convert_block(struct crypt_config *cc, | |||
436 | } | 725 | } |
437 | 726 | ||
438 | if (cc->iv_gen_ops) { | 727 | if (cc->iv_gen_ops) { |
439 | r = cc->iv_gen_ops->generator(cc, iv, ctx->sector); | 728 | r = cc->iv_gen_ops->generator(cc, iv, dmreq); |
440 | if (r < 0) | 729 | if (r < 0) |
441 | return r; | 730 | return r; |
442 | } | 731 | } |
@@ -449,21 +738,28 @@ static int crypt_convert_block(struct crypt_config *cc, | |||
449 | else | 738 | else |
450 | r = crypto_ablkcipher_decrypt(req); | 739 | r = crypto_ablkcipher_decrypt(req); |
451 | 740 | ||
741 | if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) | ||
742 | r = cc->iv_gen_ops->post(cc, iv, dmreq); | ||
743 | |||
452 | return r; | 744 | return r; |
453 | } | 745 | } |
454 | 746 | ||
455 | static void kcryptd_async_done(struct crypto_async_request *async_req, | 747 | static void kcryptd_async_done(struct crypto_async_request *async_req, |
456 | int error); | 748 | int error); |
749 | |||
457 | static void crypt_alloc_req(struct crypt_config *cc, | 750 | static void crypt_alloc_req(struct crypt_config *cc, |
458 | struct convert_context *ctx) | 751 | struct convert_context *ctx) |
459 | { | 752 | { |
460 | if (!cc->req) | 753 | struct crypt_cpu *this_cc = this_crypt_config(cc); |
461 | cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); | 754 | unsigned key_index = ctx->sector & (cc->tfms_count - 1); |
462 | ablkcipher_request_set_tfm(cc->req, cc->tfm); | 755 | |
463 | ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG | | 756 | if (!this_cc->req) |
464 | CRYPTO_TFM_REQ_MAY_SLEEP, | 757 | this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); |
465 | kcryptd_async_done, | 758 | |
466 | dmreq_of_req(cc, cc->req)); | 759 | ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]); |
760 | ablkcipher_request_set_callback(this_cc->req, | ||
761 | CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, | ||
762 | kcryptd_async_done, dmreq_of_req(cc, this_cc->req)); | ||
467 | } | 763 | } |
468 | 764 | ||
469 | /* | 765 | /* |
@@ -472,6 +768,7 @@ static void crypt_alloc_req(struct crypt_config *cc, | |||
472 | static int crypt_convert(struct crypt_config *cc, | 768 | static int crypt_convert(struct crypt_config *cc, |
473 | struct convert_context *ctx) | 769 | struct convert_context *ctx) |
474 | { | 770 | { |
771 | struct crypt_cpu *this_cc = this_crypt_config(cc); | ||
475 | int r; | 772 | int r; |
476 | 773 | ||
477 | atomic_set(&ctx->pending, 1); | 774 | atomic_set(&ctx->pending, 1); |
@@ -483,7 +780,7 @@ static int crypt_convert(struct crypt_config *cc, | |||
483 | 780 | ||
484 | atomic_inc(&ctx->pending); | 781 | atomic_inc(&ctx->pending); |
485 | 782 | ||
486 | r = crypt_convert_block(cc, ctx, cc->req); | 783 | r = crypt_convert_block(cc, ctx, this_cc->req); |
487 | 784 | ||
488 | switch (r) { | 785 | switch (r) { |
489 | /* async */ | 786 | /* async */ |
@@ -492,7 +789,7 @@ static int crypt_convert(struct crypt_config *cc, | |||
492 | INIT_COMPLETION(ctx->restart); | 789 | INIT_COMPLETION(ctx->restart); |
493 | /* fall through*/ | 790 | /* fall through*/ |
494 | case -EINPROGRESS: | 791 | case -EINPROGRESS: |
495 | cc->req = NULL; | 792 | this_cc->req = NULL; |
496 | ctx->sector++; | 793 | ctx->sector++; |
497 | continue; | 794 | continue; |
498 | 795 | ||
@@ -651,6 +948,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io) | |||
651 | * They must be separated as otherwise the final stages could be | 948 | * They must be separated as otherwise the final stages could be |
652 | * starved by new requests which can block in the first stages due | 949 | * starved by new requests which can block in the first stages due |
653 | * to memory allocation. | 950 | * to memory allocation. |
951 | * | ||
952 | * The work is done on per-CPU workqueues shared by all dm-crypt instances. | ||
953 | * The instances must not depend on each other and must not block. | ||
654 | */ | 954 | */ |
655 | static void crypt_endio(struct bio *clone, int error) | 955 | static void crypt_endio(struct bio *clone, int error) |
656 | { | 956 | { |
@@ -691,25 +991,22 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) | |||
691 | clone->bi_destructor = dm_crypt_bio_destructor; | 991 | clone->bi_destructor = dm_crypt_bio_destructor; |
692 | } | 992 | } |
693 | 993 | ||
694 | static void kcryptd_io_read(struct dm_crypt_io *io) | 994 | static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) |
695 | { | 995 | { |
696 | struct crypt_config *cc = io->target->private; | 996 | struct crypt_config *cc = io->target->private; |
697 | struct bio *base_bio = io->base_bio; | 997 | struct bio *base_bio = io->base_bio; |
698 | struct bio *clone; | 998 | struct bio *clone; |
699 | 999 | ||
700 | crypt_inc_pending(io); | ||
701 | |||
702 | /* | 1000 | /* |
703 | * The block layer might modify the bvec array, so always | 1001 | * The block layer might modify the bvec array, so always |
704 | * copy the required bvecs because we need the original | 1002 | * copy the required bvecs because we need the original |
705 | * one in order to decrypt the whole bio data *afterwards*. | 1003 | * one in order to decrypt the whole bio data *afterwards*. |
706 | */ | 1004 | */ |
707 | clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs); | 1005 | clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs); |
708 | if (unlikely(!clone)) { | 1006 | if (!clone) |
709 | io->error = -ENOMEM; | 1007 | return 1; |
710 | crypt_dec_pending(io); | 1008 | |
711 | return; | 1009 | crypt_inc_pending(io); |
712 | } | ||
713 | 1010 | ||
714 | clone_init(io, clone); | 1011 | clone_init(io, clone); |
715 | clone->bi_idx = 0; | 1012 | clone->bi_idx = 0; |
@@ -720,6 +1017,7 @@ static void kcryptd_io_read(struct dm_crypt_io *io) | |||
720 | sizeof(struct bio_vec) * clone->bi_vcnt); | 1017 | sizeof(struct bio_vec) * clone->bi_vcnt); |
721 | 1018 | ||
722 | generic_make_request(clone); | 1019 | generic_make_request(clone); |
1020 | return 0; | ||
723 | } | 1021 | } |
724 | 1022 | ||
725 | static void kcryptd_io_write(struct dm_crypt_io *io) | 1023 | static void kcryptd_io_write(struct dm_crypt_io *io) |
@@ -732,9 +1030,12 @@ static void kcryptd_io(struct work_struct *work) | |||
732 | { | 1030 | { |
733 | struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); | 1031 | struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); |
734 | 1032 | ||
735 | if (bio_data_dir(io->base_bio) == READ) | 1033 | if (bio_data_dir(io->base_bio) == READ) { |
736 | kcryptd_io_read(io); | 1034 | crypt_inc_pending(io); |
737 | else | 1035 | if (kcryptd_io_read(io, GFP_NOIO)) |
1036 | io->error = -ENOMEM; | ||
1037 | crypt_dec_pending(io); | ||
1038 | } else | ||
738 | kcryptd_io_write(io); | 1039 | kcryptd_io_write(io); |
739 | } | 1040 | } |
740 | 1041 | ||
@@ -901,6 +1202,9 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, | |||
901 | return; | 1202 | return; |
902 | } | 1203 | } |
903 | 1204 | ||
1205 | if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) | ||
1206 | error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); | ||
1207 | |||
904 | mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); | 1208 | mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); |
905 | 1209 | ||
906 | if (!atomic_dec_and_test(&ctx->pending)) | 1210 | if (!atomic_dec_and_test(&ctx->pending)) |
@@ -971,34 +1275,93 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size) | |||
971 | } | 1275 | } |
972 | } | 1276 | } |
973 | 1277 | ||
1278 | static void crypt_free_tfms(struct crypt_config *cc, int cpu) | ||
1279 | { | ||
1280 | struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu); | ||
1281 | unsigned i; | ||
1282 | |||
1283 | for (i = 0; i < cc->tfms_count; i++) | ||
1284 | if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) { | ||
1285 | crypto_free_ablkcipher(cpu_cc->tfms[i]); | ||
1286 | cpu_cc->tfms[i] = NULL; | ||
1287 | } | ||
1288 | } | ||
1289 | |||
1290 | static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode) | ||
1291 | { | ||
1292 | struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu); | ||
1293 | unsigned i; | ||
1294 | int err; | ||
1295 | |||
1296 | for (i = 0; i < cc->tfms_count; i++) { | ||
1297 | cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0); | ||
1298 | if (IS_ERR(cpu_cc->tfms[i])) { | ||
1299 | err = PTR_ERR(cpu_cc->tfms[i]); | ||
1300 | crypt_free_tfms(cc, cpu); | ||
1301 | return err; | ||
1302 | } | ||
1303 | } | ||
1304 | |||
1305 | return 0; | ||
1306 | } | ||
1307 | |||
1308 | static int crypt_setkey_allcpus(struct crypt_config *cc) | ||
1309 | { | ||
1310 | unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count); | ||
1311 | int cpu, err = 0, i, r; | ||
1312 | |||
1313 | for_each_possible_cpu(cpu) { | ||
1314 | for (i = 0; i < cc->tfms_count; i++) { | ||
1315 | r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfms[i], | ||
1316 | cc->key + (i * subkey_size), subkey_size); | ||
1317 | if (r) | ||
1318 | err = r; | ||
1319 | } | ||
1320 | } | ||
1321 | |||
1322 | return err; | ||
1323 | } | ||
1324 | |||
974 | static int crypt_set_key(struct crypt_config *cc, char *key) | 1325 | static int crypt_set_key(struct crypt_config *cc, char *key) |
975 | { | 1326 | { |
976 | unsigned key_size = strlen(key) >> 1; | 1327 | int r = -EINVAL; |
1328 | int key_string_len = strlen(key); | ||
977 | 1329 | ||
978 | if (cc->key_size && cc->key_size != key_size) | 1330 | /* The key size may not be changed. */ |
979 | return -EINVAL; | 1331 | if (cc->key_size != (key_string_len >> 1)) |
1332 | goto out; | ||
980 | 1333 | ||
981 | cc->key_size = key_size; /* initial settings */ | 1334 | /* Hyphen (which gives a key_size of zero) means there is no key. */ |
1335 | if (!cc->key_size && strcmp(key, "-")) | ||
1336 | goto out; | ||
982 | 1337 | ||
983 | if ((!key_size && strcmp(key, "-")) || | 1338 | if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0) |
984 | (key_size && crypt_decode_key(cc->key, key, key_size) < 0)) | 1339 | goto out; |
985 | return -EINVAL; | ||
986 | 1340 | ||
987 | set_bit(DM_CRYPT_KEY_VALID, &cc->flags); | 1341 | set_bit(DM_CRYPT_KEY_VALID, &cc->flags); |
988 | 1342 | ||
989 | return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); | 1343 | r = crypt_setkey_allcpus(cc); |
1344 | |||
1345 | out: | ||
1346 | /* Hex key string not needed after here, so wipe it. */ | ||
1347 | memset(key, '0', key_string_len); | ||
1348 | |||
1349 | return r; | ||
990 | } | 1350 | } |
991 | 1351 | ||
992 | static int crypt_wipe_key(struct crypt_config *cc) | 1352 | static int crypt_wipe_key(struct crypt_config *cc) |
993 | { | 1353 | { |
994 | clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); | 1354 | clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); |
995 | memset(&cc->key, 0, cc->key_size * sizeof(u8)); | 1355 | memset(&cc->key, 0, cc->key_size * sizeof(u8)); |
996 | return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); | 1356 | |
1357 | return crypt_setkey_allcpus(cc); | ||
997 | } | 1358 | } |
998 | 1359 | ||
999 | static void crypt_dtr(struct dm_target *ti) | 1360 | static void crypt_dtr(struct dm_target *ti) |
1000 | { | 1361 | { |
1001 | struct crypt_config *cc = ti->private; | 1362 | struct crypt_config *cc = ti->private; |
1363 | struct crypt_cpu *cpu_cc; | ||
1364 | int cpu; | ||
1002 | 1365 | ||
1003 | ti->private = NULL; | 1366 | ti->private = NULL; |
1004 | 1367 | ||
@@ -1010,6 +1373,14 @@ static void crypt_dtr(struct dm_target *ti) | |||
1010 | if (cc->crypt_queue) | 1373 | if (cc->crypt_queue) |
1011 | destroy_workqueue(cc->crypt_queue); | 1374 | destroy_workqueue(cc->crypt_queue); |
1012 | 1375 | ||
1376 | if (cc->cpu) | ||
1377 | for_each_possible_cpu(cpu) { | ||
1378 | cpu_cc = per_cpu_ptr(cc->cpu, cpu); | ||
1379 | if (cpu_cc->req) | ||
1380 | mempool_free(cpu_cc->req, cc->req_pool); | ||
1381 | crypt_free_tfms(cc, cpu); | ||
1382 | } | ||
1383 | |||
1013 | if (cc->bs) | 1384 | if (cc->bs) |
1014 | bioset_free(cc->bs); | 1385 | bioset_free(cc->bs); |
1015 | 1386 | ||
@@ -1023,14 +1394,14 @@ static void crypt_dtr(struct dm_target *ti) | |||
1023 | if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) | 1394 | if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) |
1024 | cc->iv_gen_ops->dtr(cc); | 1395 | cc->iv_gen_ops->dtr(cc); |
1025 | 1396 | ||
1026 | if (cc->tfm && !IS_ERR(cc->tfm)) | ||
1027 | crypto_free_ablkcipher(cc->tfm); | ||
1028 | |||
1029 | if (cc->dev) | 1397 | if (cc->dev) |
1030 | dm_put_device(ti, cc->dev); | 1398 | dm_put_device(ti, cc->dev); |
1031 | 1399 | ||
1400 | if (cc->cpu) | ||
1401 | free_percpu(cc->cpu); | ||
1402 | |||
1032 | kzfree(cc->cipher); | 1403 | kzfree(cc->cipher); |
1033 | kzfree(cc->cipher_mode); | 1404 | kzfree(cc->cipher_string); |
1034 | 1405 | ||
1035 | /* Must zero key material before freeing */ | 1406 | /* Must zero key material before freeing */ |
1036 | kzfree(cc); | 1407 | kzfree(cc); |
@@ -1040,9 +1411,9 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1040 | char *cipher_in, char *key) | 1411 | char *cipher_in, char *key) |
1041 | { | 1412 | { |
1042 | struct crypt_config *cc = ti->private; | 1413 | struct crypt_config *cc = ti->private; |
1043 | char *tmp, *cipher, *chainmode, *ivmode, *ivopts; | 1414 | char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; |
1044 | char *cipher_api = NULL; | 1415 | char *cipher_api = NULL; |
1045 | int ret = -EINVAL; | 1416 | int cpu, ret = -EINVAL; |
1046 | 1417 | ||
1047 | /* Convert to crypto api definition? */ | 1418 | /* Convert to crypto api definition? */ |
1048 | if (strchr(cipher_in, '(')) { | 1419 | if (strchr(cipher_in, '(')) { |
@@ -1050,23 +1421,31 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1050 | return -EINVAL; | 1421 | return -EINVAL; |
1051 | } | 1422 | } |
1052 | 1423 | ||
1424 | cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL); | ||
1425 | if (!cc->cipher_string) | ||
1426 | goto bad_mem; | ||
1427 | |||
1053 | /* | 1428 | /* |
1054 | * Legacy dm-crypt cipher specification | 1429 | * Legacy dm-crypt cipher specification |
1055 | * cipher-mode-iv:ivopts | 1430 | * cipher[:keycount]-mode-iv:ivopts |
1056 | */ | 1431 | */ |
1057 | tmp = cipher_in; | 1432 | tmp = cipher_in; |
1058 | cipher = strsep(&tmp, "-"); | 1433 | keycount = strsep(&tmp, "-"); |
1434 | cipher = strsep(&keycount, ":"); | ||
1435 | |||
1436 | if (!keycount) | ||
1437 | cc->tfms_count = 1; | ||
1438 | else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 || | ||
1439 | !is_power_of_2(cc->tfms_count)) { | ||
1440 | ti->error = "Bad cipher key count specification"; | ||
1441 | return -EINVAL; | ||
1442 | } | ||
1443 | cc->key_parts = cc->tfms_count; | ||
1059 | 1444 | ||
1060 | cc->cipher = kstrdup(cipher, GFP_KERNEL); | 1445 | cc->cipher = kstrdup(cipher, GFP_KERNEL); |
1061 | if (!cc->cipher) | 1446 | if (!cc->cipher) |
1062 | goto bad_mem; | 1447 | goto bad_mem; |
1063 | 1448 | ||
1064 | if (tmp) { | ||
1065 | cc->cipher_mode = kstrdup(tmp, GFP_KERNEL); | ||
1066 | if (!cc->cipher_mode) | ||
1067 | goto bad_mem; | ||
1068 | } | ||
1069 | |||
1070 | chainmode = strsep(&tmp, "-"); | 1449 | chainmode = strsep(&tmp, "-"); |
1071 | ivopts = strsep(&tmp, "-"); | 1450 | ivopts = strsep(&tmp, "-"); |
1072 | ivmode = strsep(&ivopts, ":"); | 1451 | ivmode = strsep(&ivopts, ":"); |
@@ -1074,10 +1453,19 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1074 | if (tmp) | 1453 | if (tmp) |
1075 | DMWARN("Ignoring unexpected additional cipher options"); | 1454 | DMWARN("Ignoring unexpected additional cipher options"); |
1076 | 1455 | ||
1077 | /* Compatibility mode for old dm-crypt mappings */ | 1456 | cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)) + |
1457 | cc->tfms_count * sizeof(*(cc->cpu->tfms)), | ||
1458 | __alignof__(struct crypt_cpu)); | ||
1459 | if (!cc->cpu) { | ||
1460 | ti->error = "Cannot allocate per cpu state"; | ||
1461 | goto bad_mem; | ||
1462 | } | ||
1463 | |||
1464 | /* | ||
1465 | * For compatibility with the original dm-crypt mapping format, if | ||
1466 | * only the cipher name is supplied, use cbc-plain. | ||
1467 | */ | ||
1078 | if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { | 1468 | if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { |
1079 | kfree(cc->cipher_mode); | ||
1080 | cc->cipher_mode = kstrdup("cbc-plain", GFP_KERNEL); | ||
1081 | chainmode = "cbc"; | 1469 | chainmode = "cbc"; |
1082 | ivmode = "plain"; | 1470 | ivmode = "plain"; |
1083 | } | 1471 | } |
@@ -1099,11 +1487,12 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1099 | } | 1487 | } |
1100 | 1488 | ||
1101 | /* Allocate cipher */ | 1489 | /* Allocate cipher */ |
1102 | cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0); | 1490 | for_each_possible_cpu(cpu) { |
1103 | if (IS_ERR(cc->tfm)) { | 1491 | ret = crypt_alloc_tfms(cc, cpu, cipher_api); |
1104 | ret = PTR_ERR(cc->tfm); | 1492 | if (ret < 0) { |
1105 | ti->error = "Error allocating crypto tfm"; | 1493 | ti->error = "Error allocating crypto tfm"; |
1106 | goto bad; | 1494 | goto bad; |
1495 | } | ||
1107 | } | 1496 | } |
1108 | 1497 | ||
1109 | /* Initialize and set key */ | 1498 | /* Initialize and set key */ |
@@ -1114,7 +1503,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1114 | } | 1503 | } |
1115 | 1504 | ||
1116 | /* Initialize IV */ | 1505 | /* Initialize IV */ |
1117 | cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm); | 1506 | cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc)); |
1118 | if (cc->iv_size) | 1507 | if (cc->iv_size) |
1119 | /* at least a 64 bit sector number should fit in our buffer */ | 1508 | /* at least a 64 bit sector number should fit in our buffer */ |
1120 | cc->iv_size = max(cc->iv_size, | 1509 | cc->iv_size = max(cc->iv_size, |
@@ -1137,7 +1526,15 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1137 | cc->iv_gen_ops = &crypt_iv_benbi_ops; | 1526 | cc->iv_gen_ops = &crypt_iv_benbi_ops; |
1138 | else if (strcmp(ivmode, "null") == 0) | 1527 | else if (strcmp(ivmode, "null") == 0) |
1139 | cc->iv_gen_ops = &crypt_iv_null_ops; | 1528 | cc->iv_gen_ops = &crypt_iv_null_ops; |
1140 | else { | 1529 | else if (strcmp(ivmode, "lmk") == 0) { |
1530 | cc->iv_gen_ops = &crypt_iv_lmk_ops; | ||
1531 | /* Versions 2 and 3 are recognised according | ||
1532 | * to the length of the provided multi-key string. | ||
1533 | * If present (version 3), the last key is used as the IV seed. | ||
1534 | */ | ||
1535 | if (cc->key_size % cc->key_parts) | ||
1536 | cc->key_parts++; | ||
1537 | } else { | ||
1141 | ret = -EINVAL; | 1538 | ret = -EINVAL; |
1142 | ti->error = "Invalid IV mode"; | 1539 | ti->error = "Invalid IV mode"; |
1143 | goto bad; | 1540 | goto bad; |
@@ -1194,6 +1591,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1194 | ti->error = "Cannot allocate encryption context"; | 1591 | ti->error = "Cannot allocate encryption context"; |
1195 | return -ENOMEM; | 1592 | return -ENOMEM; |
1196 | } | 1593 | } |
1594 | cc->key_size = key_size; | ||
1197 | 1595 | ||
1198 | ti->private = cc; | 1596 | ti->private = cc; |
1199 | ret = crypt_ctr_cipher(ti, argv[0], argv[1]); | 1597 | ret = crypt_ctr_cipher(ti, argv[0], argv[1]); |
@@ -1208,9 +1606,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1208 | } | 1606 | } |
1209 | 1607 | ||
1210 | cc->dmreq_start = sizeof(struct ablkcipher_request); | 1608 | cc->dmreq_start = sizeof(struct ablkcipher_request); |
1211 | cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm); | 1609 | cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc)); |
1212 | cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); | 1610 | cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); |
1213 | cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) & | 1611 | cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) & |
1214 | ~(crypto_tfm_ctx_alignment() - 1); | 1612 | ~(crypto_tfm_ctx_alignment() - 1); |
1215 | 1613 | ||
1216 | cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + | 1614 | cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + |
@@ -1219,7 +1617,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1219 | ti->error = "Cannot allocate crypt request mempool"; | 1617 | ti->error = "Cannot allocate crypt request mempool"; |
1220 | goto bad; | 1618 | goto bad; |
1221 | } | 1619 | } |
1222 | cc->req = NULL; | ||
1223 | 1620 | ||
1224 | cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); | 1621 | cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); |
1225 | if (!cc->page_pool) { | 1622 | if (!cc->page_pool) { |
@@ -1252,13 +1649,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1252 | cc->start = tmpll; | 1649 | cc->start = tmpll; |
1253 | 1650 | ||
1254 | ret = -ENOMEM; | 1651 | ret = -ENOMEM; |
1255 | cc->io_queue = create_singlethread_workqueue("kcryptd_io"); | 1652 | cc->io_queue = alloc_workqueue("kcryptd_io", |
1653 | WQ_NON_REENTRANT| | ||
1654 | WQ_MEM_RECLAIM, | ||
1655 | 1); | ||
1256 | if (!cc->io_queue) { | 1656 | if (!cc->io_queue) { |
1257 | ti->error = "Couldn't create kcryptd io queue"; | 1657 | ti->error = "Couldn't create kcryptd io queue"; |
1258 | goto bad; | 1658 | goto bad; |
1259 | } | 1659 | } |
1260 | 1660 | ||
1261 | cc->crypt_queue = create_singlethread_workqueue("kcryptd"); | 1661 | cc->crypt_queue = alloc_workqueue("kcryptd", |
1662 | WQ_NON_REENTRANT| | ||
1663 | WQ_CPU_INTENSIVE| | ||
1664 | WQ_MEM_RECLAIM, | ||
1665 | 1); | ||
1262 | if (!cc->crypt_queue) { | 1666 | if (!cc->crypt_queue) { |
1263 | ti->error = "Couldn't create kcryptd queue"; | 1667 | ti->error = "Couldn't create kcryptd queue"; |
1264 | goto bad; | 1668 | goto bad; |
@@ -1278,7 +1682,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, | |||
1278 | struct dm_crypt_io *io; | 1682 | struct dm_crypt_io *io; |
1279 | struct crypt_config *cc; | 1683 | struct crypt_config *cc; |
1280 | 1684 | ||
1281 | if (unlikely(bio_empty_barrier(bio))) { | 1685 | if (bio->bi_rw & REQ_FLUSH) { |
1282 | cc = ti->private; | 1686 | cc = ti->private; |
1283 | bio->bi_bdev = cc->dev->bdev; | 1687 | bio->bi_bdev = cc->dev->bdev; |
1284 | return DM_MAPIO_REMAPPED; | 1688 | return DM_MAPIO_REMAPPED; |
@@ -1286,9 +1690,10 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, | |||
1286 | 1690 | ||
1287 | io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector)); | 1691 | io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector)); |
1288 | 1692 | ||
1289 | if (bio_data_dir(io->base_bio) == READ) | 1693 | if (bio_data_dir(io->base_bio) == READ) { |
1290 | kcryptd_queue_io(io); | 1694 | if (kcryptd_io_read(io, GFP_NOWAIT)) |
1291 | else | 1695 | kcryptd_queue_io(io); |
1696 | } else | ||
1292 | kcryptd_queue_crypt(io); | 1697 | kcryptd_queue_crypt(io); |
1293 | 1698 | ||
1294 | return DM_MAPIO_SUBMITTED; | 1699 | return DM_MAPIO_SUBMITTED; |
@@ -1306,10 +1711,7 @@ static int crypt_status(struct dm_target *ti, status_type_t type, | |||
1306 | break; | 1711 | break; |
1307 | 1712 | ||
1308 | case STATUSTYPE_TABLE: | 1713 | case STATUSTYPE_TABLE: |
1309 | if (cc->cipher_mode) | 1714 | DMEMIT("%s ", cc->cipher_string); |
1310 | DMEMIT("%s-%s ", cc->cipher, cc->cipher_mode); | ||
1311 | else | ||
1312 | DMEMIT("%s ", cc->cipher); | ||
1313 | 1715 | ||
1314 | if (cc->key_size > 0) { | 1716 | if (cc->key_size > 0) { |
1315 | if ((maxlen - sz) < ((cc->key_size << 1) + 1)) | 1717 | if ((maxlen - sz) < ((cc->key_size << 1) + 1)) |
@@ -1421,7 +1823,7 @@ static int crypt_iterate_devices(struct dm_target *ti, | |||
1421 | 1823 | ||
1422 | static struct target_type crypt_target = { | 1824 | static struct target_type crypt_target = { |
1423 | .name = "crypt", | 1825 | .name = "crypt", |
1424 | .version = {1, 7, 0}, | 1826 | .version = {1, 10, 0}, |
1425 | .module = THIS_MODULE, | 1827 | .module = THIS_MODULE, |
1426 | .ctr = crypt_ctr, | 1828 | .ctr = crypt_ctr, |
1427 | .dtr = crypt_dtr, | 1829 | .dtr = crypt_dtr, |
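The constructor changes above extend the legacy cipher specification to cipher[:keycount]-chainmode-ivmode[:ivopts]: keycount must be a power of two and selects how many independent tfms (key parts) are allocated per CPU, and with the new lmk IV generator an optional extra trailing key, detected from the key length, is used as the IV seed. As an illustration only (device path, sector count and key are placeholders, not taken from this patch), a crypt table line using the extended syntax would look roughly like:

    0 2097152 crypt aes:64-cbc-lmk <64 concatenated hex subkeys> 0 /dev/sdb 0

The parameter order (cipher, key, iv_offset, device, offset) is unchanged from earlier dm-crypt versions; only the cipher field gains the optional :keycount and the lmk ivmode, which crypt_status now reports back verbatim from cc->cipher_string.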
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index baa11912cc94..f18375dcedd9 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c | |||
@@ -352,7 +352,7 @@ static int __init dm_delay_init(void) | |||
352 | { | 352 | { |
353 | int r = -ENOMEM; | 353 | int r = -ENOMEM; |
354 | 354 | ||
355 | kdelayd_wq = create_workqueue("kdelayd"); | 355 | kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); |
356 | if (!kdelayd_wq) { | 356 | if (!kdelayd_wq) { |
357 | DMERR("Couldn't start kdelayd"); | 357 | DMERR("Couldn't start kdelayd"); |
358 | goto bad_queue; | 358 | goto bad_queue; |
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c new file mode 100644 index 000000000000..ea790623c30b --- /dev/null +++ b/drivers/md/dm-flakey.c | |||
@@ -0,0 +1,212 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Sistina Software (UK) Limited. | ||
3 | * Copyright (C) 2004, 2010 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * This file is released under the GPL. | ||
6 | */ | ||
7 | |||
8 | #include <linux/device-mapper.h> | ||
9 | |||
10 | #include <linux/module.h> | ||
11 | #include <linux/init.h> | ||
12 | #include <linux/blkdev.h> | ||
13 | #include <linux/bio.h> | ||
14 | #include <linux/slab.h> | ||
15 | |||
16 | #define DM_MSG_PREFIX "flakey" | ||
17 | |||
18 | /* | ||
19 | * Flakey: Used for testing only, simulates intermittent, | ||
20 | * catastrophic device failure. | ||
21 | */ | ||
22 | struct flakey_c { | ||
23 | struct dm_dev *dev; | ||
24 | unsigned long start_time; | ||
25 | sector_t start; | ||
26 | unsigned up_interval; | ||
27 | unsigned down_interval; | ||
28 | }; | ||
29 | |||
30 | /* | ||
31 | * Construct a flakey mapping: <dev_path> <offset> <up interval> <down interval> | ||
32 | */ | ||
33 | static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) | ||
34 | { | ||
35 | struct flakey_c *fc; | ||
36 | unsigned long long tmp; | ||
37 | |||
38 | if (argc != 4) { | ||
39 | ti->error = "dm-flakey: Invalid argument count"; | ||
40 | return -EINVAL; | ||
41 | } | ||
42 | |||
43 | fc = kmalloc(sizeof(*fc), GFP_KERNEL); | ||
44 | if (!fc) { | ||
45 | ti->error = "dm-flakey: Cannot allocate linear context"; | ||
46 | return -ENOMEM; | ||
47 | } | ||
48 | fc->start_time = jiffies; | ||
49 | |||
50 | if (sscanf(argv[1], "%llu", &tmp) != 1) { | ||
51 | ti->error = "dm-flakey: Invalid device sector"; | ||
52 | goto bad; | ||
53 | } | ||
54 | fc->start = tmp; | ||
55 | |||
56 | if (sscanf(argv[2], "%u", &fc->up_interval) != 1) { | ||
57 | ti->error = "dm-flakey: Invalid up interval"; | ||
58 | goto bad; | ||
59 | } | ||
60 | |||
61 | if (sscanf(argv[3], "%u", &fc->down_interval) != 1) { | ||
62 | ti->error = "dm-flakey: Invalid down interval"; | ||
63 | goto bad; | ||
64 | } | ||
65 | |||
66 | if (!(fc->up_interval + fc->down_interval)) { | ||
67 | ti->error = "dm-flakey: Total (up + down) interval is zero"; | ||
68 | goto bad; | ||
69 | } | ||
70 | |||
71 | if (fc->up_interval + fc->down_interval < fc->up_interval) { | ||
72 | ti->error = "dm-flakey: Interval overflow"; | ||
73 | goto bad; | ||
74 | } | ||
75 | |||
76 | if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &fc->dev)) { | ||
77 | ti->error = "dm-flakey: Device lookup failed"; | ||
78 | goto bad; | ||
79 | } | ||
80 | |||
81 | ti->num_flush_requests = 1; | ||
82 | ti->private = fc; | ||
83 | return 0; | ||
84 | |||
85 | bad: | ||
86 | kfree(fc); | ||
87 | return -EINVAL; | ||
88 | } | ||
89 | |||
90 | static void flakey_dtr(struct dm_target *ti) | ||
91 | { | ||
92 | struct flakey_c *fc = ti->private; | ||
93 | |||
94 | dm_put_device(ti, fc->dev); | ||
95 | kfree(fc); | ||
96 | } | ||
97 | |||
98 | static sector_t flakey_map_sector(struct dm_target *ti, sector_t bi_sector) | ||
99 | { | ||
100 | struct flakey_c *fc = ti->private; | ||
101 | |||
102 | return fc->start + (bi_sector - ti->begin); | ||
103 | } | ||
104 | |||
105 | static void flakey_map_bio(struct dm_target *ti, struct bio *bio) | ||
106 | { | ||
107 | struct flakey_c *fc = ti->private; | ||
108 | |||
109 | bio->bi_bdev = fc->dev->bdev; | ||
110 | if (bio_sectors(bio)) | ||
111 | bio->bi_sector = flakey_map_sector(ti, bio->bi_sector); | ||
112 | } | ||
113 | |||
114 | static int flakey_map(struct dm_target *ti, struct bio *bio, | ||
115 | union map_info *map_context) | ||
116 | { | ||
117 | struct flakey_c *fc = ti->private; | ||
118 | unsigned elapsed; | ||
119 | |||
120 | /* Are we alive ? */ | ||
121 | elapsed = (jiffies - fc->start_time) / HZ; | ||
122 | if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) | ||
123 | return -EIO; | ||
124 | |||
125 | flakey_map_bio(ti, bio); | ||
126 | |||
127 | return DM_MAPIO_REMAPPED; | ||
128 | } | ||
129 | |||
130 | static int flakey_status(struct dm_target *ti, status_type_t type, | ||
131 | char *result, unsigned int maxlen) | ||
132 | { | ||
133 | struct flakey_c *fc = ti->private; | ||
134 | |||
135 | switch (type) { | ||
136 | case STATUSTYPE_INFO: | ||
137 | result[0] = '\0'; | ||
138 | break; | ||
139 | |||
140 | case STATUSTYPE_TABLE: | ||
141 | snprintf(result, maxlen, "%s %llu %u %u", fc->dev->name, | ||
142 | (unsigned long long)fc->start, fc->up_interval, | ||
143 | fc->down_interval); | ||
144 | break; | ||
145 | } | ||
146 | return 0; | ||
147 | } | ||
148 | |||
149 | static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg) | ||
150 | { | ||
151 | struct flakey_c *fc = ti->private; | ||
152 | |||
153 | return __blkdev_driver_ioctl(fc->dev->bdev, fc->dev->mode, cmd, arg); | ||
154 | } | ||
155 | |||
156 | static int flakey_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | ||
157 | struct bio_vec *biovec, int max_size) | ||
158 | { | ||
159 | struct flakey_c *fc = ti->private; | ||
160 | struct request_queue *q = bdev_get_queue(fc->dev->bdev); | ||
161 | |||
162 | if (!q->merge_bvec_fn) | ||
163 | return max_size; | ||
164 | |||
165 | bvm->bi_bdev = fc->dev->bdev; | ||
166 | bvm->bi_sector = flakey_map_sector(ti, bvm->bi_sector); | ||
167 | |||
168 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | ||
169 | } | ||
170 | |||
171 | static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) | ||
172 | { | ||
173 | struct flakey_c *fc = ti->private; | ||
174 | |||
175 | return fn(ti, fc->dev, fc->start, ti->len, data); | ||
176 | } | ||
177 | |||
178 | static struct target_type flakey_target = { | ||
179 | .name = "flakey", | ||
180 | .version = {1, 1, 0}, | ||
181 | .module = THIS_MODULE, | ||
182 | .ctr = flakey_ctr, | ||
183 | .dtr = flakey_dtr, | ||
184 | .map = flakey_map, | ||
185 | .status = flakey_status, | ||
186 | .ioctl = flakey_ioctl, | ||
187 | .merge = flakey_merge, | ||
188 | .iterate_devices = flakey_iterate_devices, | ||
189 | }; | ||
190 | |||
191 | static int __init dm_flakey_init(void) | ||
192 | { | ||
193 | int r = dm_register_target(&flakey_target); | ||
194 | |||
195 | if (r < 0) | ||
196 | DMERR("register failed %d", r); | ||
197 | |||
198 | return r; | ||
199 | } | ||
200 | |||
201 | static void __exit dm_flakey_exit(void) | ||
202 | { | ||
203 | dm_unregister_target(&flakey_target); | ||
204 | } | ||
205 | |||
206 | /* Module hooks */ | ||
207 | module_init(dm_flakey_init); | ||
208 | module_exit(dm_flakey_exit); | ||
209 | |||
210 | MODULE_DESCRIPTION(DM_NAME " flakey target"); | ||
211 | MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); | ||
212 | MODULE_LICENSE("GPL"); | ||
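For orientation, the flakey constructor takes <dev_path> <offset> <up interval> <down interval>, with both intervals in seconds (flakey_map divides elapsed jiffies by HZ). A hypothetical mapping that passes I/O through for 30 seconds and then returns -EIO for 5 seconds, repeating, could be created along these lines (device name and size are invented for the example):

    echo "0 409600 flakey /dev/sdc 0 30 5" | dmsetup create flaky0

Both intervals count from the moment the target is constructed (fc->start_time = jiffies), so with a non-zero up interval the device starts in the up state.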
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 0590c75b0ab6..2067288f61f9 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c | |||
@@ -19,6 +19,8 @@ | |||
19 | #define DM_MSG_PREFIX "io" | 19 | #define DM_MSG_PREFIX "io" |
20 | 20 | ||
21 | #define DM_IO_MAX_REGIONS BITS_PER_LONG | 21 | #define DM_IO_MAX_REGIONS BITS_PER_LONG |
22 | #define MIN_IOS 16 | ||
23 | #define MIN_BIOS 16 | ||
22 | 24 | ||
23 | struct dm_io_client { | 25 | struct dm_io_client { |
24 | mempool_t *pool; | 26 | mempool_t *pool; |
@@ -31,7 +33,6 @@ struct dm_io_client { | |||
31 | */ | 33 | */ |
32 | struct io { | 34 | struct io { |
33 | unsigned long error_bits; | 35 | unsigned long error_bits; |
34 | unsigned long eopnotsupp_bits; | ||
35 | atomic_t count; | 36 | atomic_t count; |
36 | struct task_struct *sleeper; | 37 | struct task_struct *sleeper; |
37 | struct dm_io_client *client; | 38 | struct dm_io_client *client; |
@@ -42,33 +43,21 @@ struct io { | |||
42 | static struct kmem_cache *_dm_io_cache; | 43 | static struct kmem_cache *_dm_io_cache; |
43 | 44 | ||
44 | /* | 45 | /* |
45 | * io contexts are only dynamically allocated for asynchronous | ||
46 | * io. Since async io is likely to be the majority of io we'll | ||
47 | * have the same number of io contexts as bios! (FIXME: must reduce this). | ||
48 | */ | ||
49 | |||
50 | static unsigned int pages_to_ios(unsigned int pages) | ||
51 | { | ||
52 | return 4 * pages; /* too many ? */ | ||
53 | } | ||
54 | |||
55 | /* | ||
56 | * Create a client with mempool and bioset. | 46 | * Create a client with mempool and bioset. |
57 | */ | 47 | */ |
58 | struct dm_io_client *dm_io_client_create(unsigned num_pages) | 48 | struct dm_io_client *dm_io_client_create(void) |
59 | { | 49 | { |
60 | unsigned ios = pages_to_ios(num_pages); | ||
61 | struct dm_io_client *client; | 50 | struct dm_io_client *client; |
62 | 51 | ||
63 | client = kmalloc(sizeof(*client), GFP_KERNEL); | 52 | client = kmalloc(sizeof(*client), GFP_KERNEL); |
64 | if (!client) | 53 | if (!client) |
65 | return ERR_PTR(-ENOMEM); | 54 | return ERR_PTR(-ENOMEM); |
66 | 55 | ||
67 | client->pool = mempool_create_slab_pool(ios, _dm_io_cache); | 56 | client->pool = mempool_create_slab_pool(MIN_IOS, _dm_io_cache); |
68 | if (!client->pool) | 57 | if (!client->pool) |
69 | goto bad; | 58 | goto bad; |
70 | 59 | ||
71 | client->bios = bioset_create(16, 0); | 60 | client->bios = bioset_create(MIN_BIOS, 0); |
72 | if (!client->bios) | 61 | if (!client->bios) |
73 | goto bad; | 62 | goto bad; |
74 | 63 | ||
@@ -82,13 +71,6 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages) | |||
82 | } | 71 | } |
83 | EXPORT_SYMBOL(dm_io_client_create); | 72 | EXPORT_SYMBOL(dm_io_client_create); |
84 | 73 | ||
85 | int dm_io_client_resize(unsigned num_pages, struct dm_io_client *client) | ||
86 | { | ||
87 | return mempool_resize(client->pool, pages_to_ios(num_pages), | ||
88 | GFP_KERNEL); | ||
89 | } | ||
90 | EXPORT_SYMBOL(dm_io_client_resize); | ||
91 | |||
92 | void dm_io_client_destroy(struct dm_io_client *client) | 74 | void dm_io_client_destroy(struct dm_io_client *client) |
93 | { | 75 | { |
94 | mempool_destroy(client->pool); | 76 | mempool_destroy(client->pool); |
@@ -130,11 +112,8 @@ static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io, | |||
130 | *---------------------------------------------------------------*/ | 112 | *---------------------------------------------------------------*/ |
131 | static void dec_count(struct io *io, unsigned int region, int error) | 113 | static void dec_count(struct io *io, unsigned int region, int error) |
132 | { | 114 | { |
133 | if (error) { | 115 | if (error) |
134 | set_bit(region, &io->error_bits); | 116 | set_bit(region, &io->error_bits); |
135 | if (error == -EOPNOTSUPP) | ||
136 | set_bit(region, &io->eopnotsupp_bits); | ||
137 | } | ||
138 | 117 | ||
139 | if (atomic_dec_and_test(&io->count)) { | 118 | if (atomic_dec_and_test(&io->count)) { |
140 | if (io->sleeper) | 119 | if (io->sleeper) |
@@ -310,8 +289,8 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, | |||
310 | sector_t remaining = where->count; | 289 | sector_t remaining = where->count; |
311 | 290 | ||
312 | /* | 291 | /* |
313 | * where->count may be zero if rw holds a write barrier and we | 292 | * where->count may be zero if rw holds a flush and we need to |
314 | * need to send a zero-sized barrier. | 293 | * send a zero-sized flush. |
315 | */ | 294 | */ |
316 | do { | 295 | do { |
317 | /* | 296 | /* |
@@ -356,7 +335,7 @@ static void dispatch_io(int rw, unsigned int num_regions, | |||
356 | BUG_ON(num_regions > DM_IO_MAX_REGIONS); | 335 | BUG_ON(num_regions > DM_IO_MAX_REGIONS); |
357 | 336 | ||
358 | if (sync) | 337 | if (sync) |
359 | rw |= REQ_SYNC | REQ_UNPLUG; | 338 | rw |= REQ_SYNC; |
360 | 339 | ||
361 | /* | 340 | /* |
362 | * For multiple regions we need to be careful to rewind | 341 | * For multiple regions we need to be careful to rewind |
@@ -364,7 +343,7 @@ static void dispatch_io(int rw, unsigned int num_regions, | |||
364 | */ | 343 | */ |
365 | for (i = 0; i < num_regions; i++) { | 344 | for (i = 0; i < num_regions; i++) { |
366 | *dp = old_pages; | 345 | *dp = old_pages; |
367 | if (where[i].count || (rw & REQ_HARDBARRIER)) | 346 | if (where[i].count || (rw & REQ_FLUSH)) |
368 | do_region(rw, i, where + i, dp, io); | 347 | do_region(rw, i, where + i, dp, io); |
369 | } | 348 | } |
370 | 349 | ||
@@ -393,9 +372,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, | |||
393 | return -EIO; | 372 | return -EIO; |
394 | } | 373 | } |
395 | 374 | ||
396 | retry: | ||
397 | io->error_bits = 0; | 375 | io->error_bits = 0; |
398 | io->eopnotsupp_bits = 0; | ||
399 | atomic_set(&io->count, 1); /* see dispatch_io() */ | 376 | atomic_set(&io->count, 1); /* see dispatch_io() */ |
400 | io->sleeper = current; | 377 | io->sleeper = current; |
401 | io->client = client; | 378 | io->client = client; |
@@ -412,11 +389,6 @@ retry: | |||
412 | } | 389 | } |
413 | set_current_state(TASK_RUNNING); | 390 | set_current_state(TASK_RUNNING); |
414 | 391 | ||
415 | if (io->eopnotsupp_bits && (rw & REQ_HARDBARRIER)) { | ||
416 | rw &= ~REQ_HARDBARRIER; | ||
417 | goto retry; | ||
418 | } | ||
419 | |||
420 | if (error_bits) | 392 | if (error_bits) |
421 | *error_bits = io->error_bits; | 393 | *error_bits = io->error_bits; |
422 | 394 | ||
@@ -437,7 +409,6 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, | |||
437 | 409 | ||
438 | io = mempool_alloc(client->pool, GFP_NOIO); | 410 | io = mempool_alloc(client->pool, GFP_NOIO); |
439 | io->error_bits = 0; | 411 | io->error_bits = 0; |
440 | io->eopnotsupp_bits = 0; | ||
441 | atomic_set(&io->count, 1); /* see dispatch_io() */ | 412 | atomic_set(&io->count, 1); /* see dispatch_io() */ |
442 | io->sleeper = NULL; | 413 | io->sleeper = NULL; |
443 | io->client = client; | 414 | io->client = client; |
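A minimal caller-side sketch of the reworked dm-io client interface (the surrounding target code is hypothetical; only the dm-io calls come from this patch): dm_io_client_create() no longer takes a page estimate and always reserves MIN_IOS entries, and dm_io_client_resize() is removed, so callers simply drop that call.

    struct dm_io_client *client;

    client = dm_io_client_create();        /* was dm_io_client_create(num_pages) */
    if (IS_ERR(client))
            return PTR_ERR(client);
    /* ... issue dm_io() requests against client ... */
    dm_io_client_destroy(client);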
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 3e39193e5036..4cacdad2270a 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c | |||
@@ -295,19 +295,55 @@ retry: | |||
295 | DMWARN("remove_all left %d open device(s)", dev_skipped); | 295 | DMWARN("remove_all left %d open device(s)", dev_skipped); |
296 | } | 296 | } |
297 | 297 | ||
298 | /* | ||
300 | * Set the uuid of a hash_cell whose uuid is not already set. | ||
300 | */ | ||
301 | static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid) | ||
302 | { | ||
303 | mutex_lock(&dm_hash_cells_mutex); | ||
304 | hc->uuid = new_uuid; | ||
305 | mutex_unlock(&dm_hash_cells_mutex); | ||
306 | |||
307 | list_add(&hc->uuid_list, _uuid_buckets + hash_str(new_uuid)); | ||
308 | } | ||
309 | |||
310 | /* | ||
311 | * Changes the name of a hash_cell and returns the old name for | ||
312 | * the caller to free. | ||
313 | */ | ||
314 | static char *__change_cell_name(struct hash_cell *hc, char *new_name) | ||
315 | { | ||
316 | char *old_name; | ||
317 | |||
318 | /* | ||
319 | * Rename and move the name cell. | ||
320 | */ | ||
321 | list_del(&hc->name_list); | ||
322 | old_name = hc->name; | ||
323 | |||
324 | mutex_lock(&dm_hash_cells_mutex); | ||
325 | hc->name = new_name; | ||
326 | mutex_unlock(&dm_hash_cells_mutex); | ||
327 | |||
328 | list_add(&hc->name_list, _name_buckets + hash_str(new_name)); | ||
329 | |||
330 | return old_name; | ||
331 | } | ||
332 | |||
298 | static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, | 333 | static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, |
299 | const char *new) | 334 | const char *new) |
300 | { | 335 | { |
301 | char *new_name, *old_name; | 336 | char *new_data, *old_name = NULL; |
302 | struct hash_cell *hc; | 337 | struct hash_cell *hc; |
303 | struct dm_table *table; | 338 | struct dm_table *table; |
304 | struct mapped_device *md; | 339 | struct mapped_device *md; |
340 | unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; | ||
305 | 341 | ||
306 | /* | 342 | /* |
307 | * duplicate new. | 343 | * duplicate new. |
308 | */ | 344 | */ |
309 | new_name = kstrdup(new, GFP_KERNEL); | 345 | new_data = kstrdup(new, GFP_KERNEL); |
310 | if (!new_name) | 346 | if (!new_data) |
311 | return ERR_PTR(-ENOMEM); | 347 | return ERR_PTR(-ENOMEM); |
312 | 348 | ||
313 | down_write(&_hash_lock); | 349 | down_write(&_hash_lock); |
@@ -315,13 +351,19 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, | |||
315 | /* | 351 | /* |
316 | * Is new free ? | 352 | * Is new free ? |
317 | */ | 353 | */ |
318 | hc = __get_name_cell(new); | 354 | if (change_uuid) |
355 | hc = __get_uuid_cell(new); | ||
356 | else | ||
357 | hc = __get_name_cell(new); | ||
358 | |||
319 | if (hc) { | 359 | if (hc) { |
320 | DMWARN("asked to rename to an already-existing name %s -> %s", | 360 | DMWARN("Unable to change %s on mapped device %s to one that " |
361 | "already exists: %s", | ||
362 | change_uuid ? "uuid" : "name", | ||
321 | param->name, new); | 363 | param->name, new); |
322 | dm_put(hc->md); | 364 | dm_put(hc->md); |
323 | up_write(&_hash_lock); | 365 | up_write(&_hash_lock); |
324 | kfree(new_name); | 366 | kfree(new_data); |
325 | return ERR_PTR(-EBUSY); | 367 | return ERR_PTR(-EBUSY); |
326 | } | 368 | } |
327 | 369 | ||
@@ -330,22 +372,30 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, | |||
330 | */ | 372 | */ |
331 | hc = __get_name_cell(param->name); | 373 | hc = __get_name_cell(param->name); |
332 | if (!hc) { | 374 | if (!hc) { |
333 | DMWARN("asked to rename a non-existent device %s -> %s", | 375 | DMWARN("Unable to rename non-existent device, %s to %s%s", |
334 | param->name, new); | 376 | param->name, change_uuid ? "uuid " : "", new); |
335 | up_write(&_hash_lock); | 377 | up_write(&_hash_lock); |
336 | kfree(new_name); | 378 | kfree(new_data); |
337 | return ERR_PTR(-ENXIO); | 379 | return ERR_PTR(-ENXIO); |
338 | } | 380 | } |
339 | 381 | ||
340 | /* | 382 | /* |
341 | * rename and move the name cell. | 383 | * Does this device already have a uuid? |
342 | */ | 384 | */ |
343 | list_del(&hc->name_list); | 385 | if (change_uuid && hc->uuid) { |
344 | old_name = hc->name; | 386 | DMWARN("Unable to change uuid of mapped device %s to %s " |
345 | mutex_lock(&dm_hash_cells_mutex); | 387 | "because uuid is already set to %s", |
346 | hc->name = new_name; | 388 | param->name, new, hc->uuid); |
347 | mutex_unlock(&dm_hash_cells_mutex); | 389 | dm_put(hc->md); |
348 | list_add(&hc->name_list, _name_buckets + hash_str(new_name)); | 390 | up_write(&_hash_lock); |
391 | kfree(new_data); | ||
392 | return ERR_PTR(-EINVAL); | ||
393 | } | ||
394 | |||
395 | if (change_uuid) | ||
396 | __set_cell_uuid(hc, new_data); | ||
397 | else | ||
398 | old_name = __change_cell_name(hc, new_data); | ||
349 | 399 | ||
350 | /* | 400 | /* |
351 | * Wake up any dm event waiters. | 401 | * Wake up any dm event waiters. |
@@ -729,7 +779,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) | |||
729 | hc = __find_device_hash_cell(param); | 779 | hc = __find_device_hash_cell(param); |
730 | 780 | ||
731 | if (!hc) { | 781 | if (!hc) { |
732 | DMWARN("device doesn't appear to be in the dev hash table."); | 782 | DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); |
733 | up_write(&_hash_lock); | 783 | up_write(&_hash_lock); |
734 | return -ENXIO; | 784 | return -ENXIO; |
735 | } | 785 | } |
@@ -741,7 +791,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) | |||
741 | */ | 791 | */ |
742 | r = dm_lock_for_deletion(md); | 792 | r = dm_lock_for_deletion(md); |
743 | if (r) { | 793 | if (r) { |
744 | DMWARN("unable to remove open device %s", hc->name); | 794 | DMDEBUG_LIMIT("unable to remove open device %s", hc->name); |
745 | up_write(&_hash_lock); | 795 | up_write(&_hash_lock); |
746 | dm_put(md); | 796 | dm_put(md); |
747 | return r; | 797 | return r; |
@@ -774,21 +824,24 @@ static int invalid_str(char *str, void *end) | |||
774 | static int dev_rename(struct dm_ioctl *param, size_t param_size) | 824 | static int dev_rename(struct dm_ioctl *param, size_t param_size) |
775 | { | 825 | { |
776 | int r; | 826 | int r; |
777 | char *new_name = (char *) param + param->data_start; | 827 | char *new_data = (char *) param + param->data_start; |
778 | struct mapped_device *md; | 828 | struct mapped_device *md; |
829 | unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; | ||
779 | 830 | ||
780 | if (new_name < param->data || | 831 | if (new_data < param->data || |
781 | invalid_str(new_name, (void *) param + param_size) || | 832 | invalid_str(new_data, (void *) param + param_size) || |
782 | strlen(new_name) > DM_NAME_LEN - 1) { | 833 | strlen(new_data) > (change_uuid ? DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) { |
783 | DMWARN("Invalid new logical volume name supplied."); | 834 | DMWARN("Invalid new mapped device name or uuid string supplied."); |
784 | return -EINVAL; | 835 | return -EINVAL; |
785 | } | 836 | } |
786 | 837 | ||
787 | r = check_name(new_name); | 838 | if (!change_uuid) { |
788 | if (r) | 839 | r = check_name(new_data); |
789 | return r; | 840 | if (r) |
841 | return r; | ||
842 | } | ||
790 | 843 | ||
791 | md = dm_hash_rename(param, new_name); | 844 | md = dm_hash_rename(param, new_data); |
792 | if (IS_ERR(md)) | 845 | if (IS_ERR(md)) |
793 | return PTR_ERR(md); | 846 | return PTR_ERR(md); |
794 | 847 | ||
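With DM_UUID_FLAG set, the same DM_DEV_RENAME path now assigns a uuid to a device that does not yet have one instead of renaming it. A rough userspace sketch, assuming the DM_UUID_FLAG/DM_DEV_RENAME definitions from the matching dm-ioctl.h update (not shown here); the buffer, device name and uuid are placeholders, and version/error handling is omitted:

    struct dm_ioctl *dmi = buf;                        /* header followed by data area */
    strncpy(dmi->name, "test-dev", sizeof(dmi->name)); /* existing device name */
    dmi->flags |= DM_UUID_FLAG;                        /* payload is a uuid, not a new name */
    strcpy((char *)dmi + dmi->data_start, "TEST-UUID-0001");
    ioctl(control_fd, DM_DEV_RENAME, dmi);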
@@ -885,7 +938,7 @@ static int do_resume(struct dm_ioctl *param) | |||
885 | 938 | ||
886 | hc = __find_device_hash_cell(param); | 939 | hc = __find_device_hash_cell(param); |
887 | if (!hc) { | 940 | if (!hc) { |
888 | DMWARN("device doesn't appear to be in the dev hash table."); | 941 | DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); |
889 | up_write(&_hash_lock); | 942 | up_write(&_hash_lock); |
890 | return -ENXIO; | 943 | return -ENXIO; |
891 | } | 944 | } |
@@ -1212,7 +1265,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size) | |||
1212 | 1265 | ||
1213 | hc = __find_device_hash_cell(param); | 1266 | hc = __find_device_hash_cell(param); |
1214 | if (!hc) { | 1267 | if (!hc) { |
1215 | DMWARN("device doesn't appear to be in the dev hash table."); | 1268 | DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); |
1216 | up_write(&_hash_lock); | 1269 | up_write(&_hash_lock); |
1217 | return -ENXIO; | 1270 | return -ENXIO; |
1218 | } | 1271 | } |
@@ -1448,14 +1501,10 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user) | |||
1448 | return r; | 1501 | return r; |
1449 | } | 1502 | } |
1450 | 1503 | ||
1451 | static void free_params(struct dm_ioctl *param) | ||
1452 | { | ||
1453 | vfree(param); | ||
1454 | } | ||
1455 | |||
1456 | static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param) | 1504 | static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param) |
1457 | { | 1505 | { |
1458 | struct dm_ioctl tmp, *dmi; | 1506 | struct dm_ioctl tmp, *dmi; |
1507 | int secure_data; | ||
1459 | 1508 | ||
1460 | if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data))) | 1509 | if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data))) |
1461 | return -EFAULT; | 1510 | return -EFAULT; |
@@ -1463,17 +1512,30 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param) | |||
1463 | if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data))) | 1512 | if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data))) |
1464 | return -EINVAL; | 1513 | return -EINVAL; |
1465 | 1514 | ||
1515 | secure_data = tmp.flags & DM_SECURE_DATA_FLAG; | ||
1516 | |||
1466 | dmi = vmalloc(tmp.data_size); | 1517 | dmi = vmalloc(tmp.data_size); |
1467 | if (!dmi) | 1518 | if (!dmi) { |
1519 | if (secure_data && clear_user(user, tmp.data_size)) | ||
1520 | return -EFAULT; | ||
1468 | return -ENOMEM; | 1521 | return -ENOMEM; |
1469 | |||
1470 | if (copy_from_user(dmi, user, tmp.data_size)) { | ||
1471 | vfree(dmi); | ||
1472 | return -EFAULT; | ||
1473 | } | 1522 | } |
1474 | 1523 | ||
1524 | if (copy_from_user(dmi, user, tmp.data_size)) | ||
1525 | goto bad; | ||
1526 | |||
1527 | /* Wipe the user buffer so we do not return it to userspace */ | ||
1528 | if (secure_data && clear_user(user, tmp.data_size)) | ||
1529 | goto bad; | ||
1530 | |||
1475 | *param = dmi; | 1531 | *param = dmi; |
1476 | return 0; | 1532 | return 0; |
1533 | |||
1534 | bad: | ||
1535 | if (secure_data) | ||
1536 | memset(dmi, 0, tmp.data_size); | ||
1537 | vfree(dmi); | ||
1538 | return -EFAULT; | ||
1477 | } | 1539 | } |
1478 | 1540 | ||
1479 | static int validate_params(uint cmd, struct dm_ioctl *param) | 1541 | static int validate_params(uint cmd, struct dm_ioctl *param) |
@@ -1481,6 +1543,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param) | |||
1481 | /* Always clear this flag */ | 1543 | /* Always clear this flag */ |
1482 | param->flags &= ~DM_BUFFER_FULL_FLAG; | 1544 | param->flags &= ~DM_BUFFER_FULL_FLAG; |
1483 | param->flags &= ~DM_UEVENT_GENERATED_FLAG; | 1545 | param->flags &= ~DM_UEVENT_GENERATED_FLAG; |
1546 | param->flags &= ~DM_SECURE_DATA_FLAG; | ||
1484 | 1547 | ||
1485 | /* Ignores parameters */ | 1548 | /* Ignores parameters */ |
1486 | if (cmd == DM_REMOVE_ALL_CMD || | 1549 | if (cmd == DM_REMOVE_ALL_CMD || |
@@ -1508,10 +1571,11 @@ static int validate_params(uint cmd, struct dm_ioctl *param) | |||
1508 | static int ctl_ioctl(uint command, struct dm_ioctl __user *user) | 1571 | static int ctl_ioctl(uint command, struct dm_ioctl __user *user) |
1509 | { | 1572 | { |
1510 | int r = 0; | 1573 | int r = 0; |
1574 | int wipe_buffer; | ||
1511 | unsigned int cmd; | 1575 | unsigned int cmd; |
1512 | struct dm_ioctl *uninitialized_var(param); | 1576 | struct dm_ioctl *uninitialized_var(param); |
1513 | ioctl_fn fn = NULL; | 1577 | ioctl_fn fn = NULL; |
1514 | size_t param_size; | 1578 | size_t input_param_size; |
1515 | 1579 | ||
1516 | /* only root can play with this */ | 1580 | /* only root can play with this */ |
1517 | if (!capable(CAP_SYS_ADMIN)) | 1581 | if (!capable(CAP_SYS_ADMIN)) |
@@ -1558,13 +1622,15 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) | |||
1558 | if (r) | 1622 | if (r) |
1559 | return r; | 1623 | return r; |
1560 | 1624 | ||
1625 | input_param_size = param->data_size; | ||
1626 | wipe_buffer = param->flags & DM_SECURE_DATA_FLAG; | ||
1627 | |||
1561 | r = validate_params(cmd, param); | 1628 | r = validate_params(cmd, param); |
1562 | if (r) | 1629 | if (r) |
1563 | goto out; | 1630 | goto out; |
1564 | 1631 | ||
1565 | param_size = param->data_size; | ||
1566 | param->data_size = sizeof(*param); | 1632 | param->data_size = sizeof(*param); |
1567 | r = fn(param, param_size); | 1633 | r = fn(param, input_param_size); |
1568 | 1634 | ||
1569 | /* | 1635 | /* |
1570 | * Copy the results back to userland. | 1636 | * Copy the results back to userland. |
@@ -1572,8 +1638,11 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) | |||
1572 | if (!r && copy_to_user(user, param, param->data_size)) | 1638 | if (!r && copy_to_user(user, param, param->data_size)) |
1573 | r = -EFAULT; | 1639 | r = -EFAULT; |
1574 | 1640 | ||
1575 | out: | 1641 | out: |
1576 | free_params(param); | 1642 | if (wipe_buffer) |
1643 | memset(param, 0, input_param_size); | ||
1644 | |||
1645 | vfree(param); | ||
1577 | return r; | 1646 | return r; |
1578 | } | 1647 | } |
1579 | 1648 | ||
@@ -1596,6 +1665,7 @@ static const struct file_operations _ctl_fops = { | |||
1596 | .unlocked_ioctl = dm_ctl_ioctl, | 1665 | .unlocked_ioctl = dm_ctl_ioctl, |
1597 | .compat_ioctl = dm_compat_ctl_ioctl, | 1666 | .compat_ioctl = dm_compat_ctl_ioctl, |
1598 | .owner = THIS_MODULE, | 1667 | .owner = THIS_MODULE, |
1668 | .llseek = noop_llseek, | ||
1599 | }; | 1669 | }; |
1600 | 1670 | ||
1601 | static struct miscdevice _dm_misc = { | 1671 | static struct miscdevice _dm_misc = { |
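Taken together, the DM_SECURE_DATA_FLAG handling above lets a caller request that no copy of the ioctl parameter block survives the call: copy_params() clears the user buffer as soon as it has been copied in (and on its failure paths), and ctl_ioctl() memsets the kernel copy before vfree(). This matters mainly for table loads carrying key material, such as crypt tables. Sketch only, assuming the flag definition from the accompanying dm-ioctl.h change:

    dmi->flags |= DM_SECURE_DATA_FLAG;    /* wipe user and kernel copies of the params */
    ioctl(control_fd, DM_TABLE_LOAD, dmi);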
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index d8587bac5682..819e37eaaeba 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c | |||
@@ -27,15 +27,19 @@ | |||
27 | 27 | ||
28 | #include "dm.h" | 28 | #include "dm.h" |
29 | 29 | ||
30 | #define SUB_JOB_SIZE 128 | ||
31 | #define SPLIT_COUNT 8 | ||
32 | #define MIN_JOBS 8 | ||
33 | #define RESERVE_PAGES (DIV_ROUND_UP(SUB_JOB_SIZE << SECTOR_SHIFT, PAGE_SIZE)) | ||
34 | |||
30 | /*----------------------------------------------------------------- | 35 | /*----------------------------------------------------------------- |
31 | * Each kcopyd client has its own little pool of preallocated | 36 | * Each kcopyd client has its own little pool of preallocated |
32 | * pages for kcopyd io. | 37 | * pages for kcopyd io. |
33 | *---------------------------------------------------------------*/ | 38 | *---------------------------------------------------------------*/ |
34 | struct dm_kcopyd_client { | 39 | struct dm_kcopyd_client { |
35 | spinlock_t lock; | ||
36 | struct page_list *pages; | 40 | struct page_list *pages; |
37 | unsigned int nr_pages; | 41 | unsigned nr_reserved_pages; |
38 | unsigned int nr_free_pages; | 42 | unsigned nr_free_pages; |
39 | 43 | ||
40 | struct dm_io_client *io_client; | 44 | struct dm_io_client *io_client; |
41 | 45 | ||
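A quick check on the new constants above (assuming 512-byte sectors and 4 KiB pages, the common configuration rather than anything guaranteed by the patch):

    /*
     * SUB_JOB_SIZE << SECTOR_SHIFT = 128 * 512 = 65536 bytes (64 KiB)
     * RESERVE_PAGES = DIV_ROUND_UP(65536, 4096) = 16 pages per client
     */

So each client keeps only a 16-page reserve; kcopyd_get_pages() below allocates fresh pages with __GFP_NOWARN | __GFP_NORETRY first and dips into this reserve only when that fails, replacing the old scheme of preallocating the entire page pool up front.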
@@ -67,15 +71,18 @@ static void wake(struct dm_kcopyd_client *kc) | |||
67 | queue_work(kc->kcopyd_wq, &kc->kcopyd_work); | 71 | queue_work(kc->kcopyd_wq, &kc->kcopyd_work); |
68 | } | 72 | } |
69 | 73 | ||
70 | static struct page_list *alloc_pl(void) | 74 | /* |
75 | * Obtain one page for the use of kcopyd. | ||
76 | */ | ||
77 | static struct page_list *alloc_pl(gfp_t gfp) | ||
71 | { | 78 | { |
72 | struct page_list *pl; | 79 | struct page_list *pl; |
73 | 80 | ||
74 | pl = kmalloc(sizeof(*pl), GFP_KERNEL); | 81 | pl = kmalloc(sizeof(*pl), gfp); |
75 | if (!pl) | 82 | if (!pl) |
76 | return NULL; | 83 | return NULL; |
77 | 84 | ||
78 | pl->page = alloc_page(GFP_KERNEL); | 85 | pl->page = alloc_page(gfp); |
79 | if (!pl->page) { | 86 | if (!pl->page) { |
80 | kfree(pl); | 87 | kfree(pl); |
81 | return NULL; | 88 | return NULL; |
@@ -90,41 +97,56 @@ static void free_pl(struct page_list *pl) | |||
90 | kfree(pl); | 97 | kfree(pl); |
91 | } | 98 | } |
92 | 99 | ||
93 | static int kcopyd_get_pages(struct dm_kcopyd_client *kc, | 100 | /* |
94 | unsigned int nr, struct page_list **pages) | 101 | * Add the provided pages to a client's free page list, releasing |
102 | * back to the system any beyond the reserved_pages limit. | ||
103 | */ | ||
104 | static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl) | ||
95 | { | 105 | { |
96 | struct page_list *pl; | 106 | struct page_list *next; |
97 | |||
98 | spin_lock(&kc->lock); | ||
99 | if (kc->nr_free_pages < nr) { | ||
100 | spin_unlock(&kc->lock); | ||
101 | return -ENOMEM; | ||
102 | } | ||
103 | |||
104 | kc->nr_free_pages -= nr; | ||
105 | for (*pages = pl = kc->pages; --nr; pl = pl->next) | ||
106 | ; | ||
107 | 107 | ||
108 | kc->pages = pl->next; | 108 | do { |
109 | pl->next = NULL; | 109 | next = pl->next; |
110 | 110 | ||
111 | spin_unlock(&kc->lock); | 111 | if (kc->nr_free_pages >= kc->nr_reserved_pages) |
112 | free_pl(pl); | ||
113 | else { | ||
114 | pl->next = kc->pages; | ||
115 | kc->pages = pl; | ||
116 | kc->nr_free_pages++; | ||
117 | } | ||
112 | 118 | ||
113 | return 0; | 119 | pl = next; |
120 | } while (pl); | ||
114 | } | 121 | } |
115 | 122 | ||
116 | static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl) | 123 | static int kcopyd_get_pages(struct dm_kcopyd_client *kc, |
124 | unsigned int nr, struct page_list **pages) | ||
117 | { | 125 | { |
118 | struct page_list *cursor; | 126 | struct page_list *pl; |
127 | |||
128 | *pages = NULL; | ||
129 | |||
130 | do { | ||
131 | pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY); | ||
132 | if (unlikely(!pl)) { | ||
133 | /* Use reserved pages */ | ||
134 | pl = kc->pages; | ||
135 | if (unlikely(!pl)) | ||
136 | goto out_of_memory; | ||
137 | kc->pages = pl->next; | ||
138 | kc->nr_free_pages--; | ||
139 | } | ||
140 | pl->next = *pages; | ||
141 | *pages = pl; | ||
142 | } while (--nr); | ||
119 | 143 | ||
120 | spin_lock(&kc->lock); | 144 | return 0; |
121 | for (cursor = pl; cursor->next; cursor = cursor->next) | ||
122 | kc->nr_free_pages++; | ||
123 | 145 | ||
124 | kc->nr_free_pages++; | 146 | out_of_memory: |
125 | cursor->next = kc->pages; | 147 | if (*pages) |
126 | kc->pages = pl; | 148 | kcopyd_put_pages(kc, *pages); |
127 | spin_unlock(&kc->lock); | 149 | return -ENOMEM; |
128 | } | 150 | } |
129 | 151 | ||
130 | /* | 152 | /* |
@@ -141,13 +163,16 @@ static void drop_pages(struct page_list *pl) | |||
141 | } | 163 | } |
142 | } | 164 | } |
143 | 165 | ||
144 | static int client_alloc_pages(struct dm_kcopyd_client *kc, unsigned int nr) | 166 | /* |
167 | * Allocate and reserve nr_pages for the use of a specific client. | ||
168 | */ | ||
169 | static int client_reserve_pages(struct dm_kcopyd_client *kc, unsigned nr_pages) | ||
145 | { | 170 | { |
146 | unsigned int i; | 171 | unsigned i; |
147 | struct page_list *pl = NULL, *next; | 172 | struct page_list *pl = NULL, *next; |
148 | 173 | ||
149 | for (i = 0; i < nr; i++) { | 174 | for (i = 0; i < nr_pages; i++) { |
150 | next = alloc_pl(); | 175 | next = alloc_pl(GFP_KERNEL); |
151 | if (!next) { | 176 | if (!next) { |
152 | if (pl) | 177 | if (pl) |
153 | drop_pages(pl); | 178 | drop_pages(pl); |
@@ -157,17 +182,18 @@ static int client_alloc_pages(struct dm_kcopyd_client *kc, unsigned int nr) | |||
157 | pl = next; | 182 | pl = next; |
158 | } | 183 | } |
159 | 184 | ||
185 | kc->nr_reserved_pages += nr_pages; | ||
160 | kcopyd_put_pages(kc, pl); | 186 | kcopyd_put_pages(kc, pl); |
161 | kc->nr_pages += nr; | 187 | |
162 | return 0; | 188 | return 0; |
163 | } | 189 | } |
164 | 190 | ||
165 | static void client_free_pages(struct dm_kcopyd_client *kc) | 191 | static void client_free_pages(struct dm_kcopyd_client *kc) |
166 | { | 192 | { |
167 | BUG_ON(kc->nr_free_pages != kc->nr_pages); | 193 | BUG_ON(kc->nr_free_pages != kc->nr_reserved_pages); |
168 | drop_pages(kc->pages); | 194 | drop_pages(kc->pages); |
169 | kc->pages = NULL; | 195 | kc->pages = NULL; |
170 | kc->nr_free_pages = kc->nr_pages = 0; | 196 | kc->nr_free_pages = kc->nr_reserved_pages = 0; |
171 | } | 197 | } |
172 | 198 | ||
173 | /*----------------------------------------------------------------- | 199 | /*----------------------------------------------------------------- |
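The net effect of the two hunks above is that kcopyd no longer carves out a large fixed page pool per client and fails a copy with -ENOMEM when it runs dry. kcopyd_get_pages() now tries an opportunistic allocation first (__GFP_NOWARN | __GFP_NORETRY, so it gives up quickly under memory pressure without log noise) and only then dips into the small per-client reserve, while kcopyd_put_pages() refills the reserve up to nr_reserved_pages and hands anything beyond that back to the system. A minimal user-space sketch of that reserve-with-fallback pattern (pool_get()/pool_put() and RESERVE are illustrative names, not kernel API):

#include <stdlib.h>

#define RESERVE 256

struct node { struct node *next; };

struct pool {
    struct node *free;      /* reserved, immediately reusable buffers */
    unsigned nr_free;
    unsigned nr_reserved;   /* upper bound kept on the free list */
};

/* Prefer a fresh allocation; dip into the reserve only if it fails. */
static struct node *pool_get(struct pool *p)
{
    struct node *n = malloc(sizeof(*n)); /* kernel: alloc_pl(__GFP_NOWARN | __GFP_NORETRY) */

    if (!n && p->free) {
        n = p->free;                     /* fall back to the reserve */
        p->free = n->next;
        p->nr_free--;
    }
    return n;                            /* NULL only if both paths fail */
}

/* Keep at most nr_reserved buffers on the list; release the excess. */
static void pool_put(struct pool *p, struct node *n)
{
    if (p->nr_free >= p->nr_reserved) {
        free(n);
    } else {
        n->next = p->free;
        p->free = n;
        p->nr_free++;
    }
}

The reserve is only a forward-progress guarantee for the memory-reclaim path; in the common case pages come straight from the allocator and go straight back.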
@@ -216,16 +242,17 @@ struct kcopyd_job { | |||
216 | struct mutex lock; | 242 | struct mutex lock; |
217 | atomic_t sub_jobs; | 243 | atomic_t sub_jobs; |
218 | sector_t progress; | 244 | sector_t progress; |
219 | }; | ||
220 | 245 | ||
221 | /* FIXME: this should scale with the number of pages */ | 246 | struct kcopyd_job *master_job; |
222 | #define MIN_JOBS 512 | 247 | }; |
223 | 248 | ||
224 | static struct kmem_cache *_job_cache; | 249 | static struct kmem_cache *_job_cache; |
225 | 250 | ||
226 | int __init dm_kcopyd_init(void) | 251 | int __init dm_kcopyd_init(void) |
227 | { | 252 | { |
228 | _job_cache = KMEM_CACHE(kcopyd_job, 0); | 253 | _job_cache = kmem_cache_create("kcopyd_job", |
254 | sizeof(struct kcopyd_job) * (SPLIT_COUNT + 1), | ||
255 | __alignof__(struct kcopyd_job), 0, NULL); | ||
229 | if (!_job_cache) | 256 | if (!_job_cache) |
230 | return -ENOMEM; | 257 | return -ENOMEM; |
231 | 258 | ||
@@ -299,7 +326,12 @@ static int run_complete_job(struct kcopyd_job *job) | |||
299 | 326 | ||
300 | if (job->pages) | 327 | if (job->pages) |
301 | kcopyd_put_pages(kc, job->pages); | 328 | kcopyd_put_pages(kc, job->pages); |
302 | mempool_free(job, kc->job_pool); | 329 | /* |
330 | * If this is the master job, the sub jobs have already | ||
331 | * completed so we can free everything. | ||
332 | */ | ||
333 | if (job->master_job == job) | ||
334 | mempool_free(job, kc->job_pool); | ||
303 | fn(read_err, write_err, context); | 335 | fn(read_err, write_err, context); |
304 | 336 | ||
305 | if (atomic_dec_and_test(&kc->nr_jobs)) | 337 | if (atomic_dec_and_test(&kc->nr_jobs)) |
@@ -345,7 +377,7 @@ static int run_io_job(struct kcopyd_job *job) | |||
345 | { | 377 | { |
346 | int r; | 378 | int r; |
347 | struct dm_io_request io_req = { | 379 | struct dm_io_request io_req = { |
348 | .bi_rw = job->rw | REQ_SYNC | REQ_UNPLUG, | 380 | .bi_rw = job->rw, |
349 | .mem.type = DM_IO_PAGE_LIST, | 381 | .mem.type = DM_IO_PAGE_LIST, |
350 | .mem.ptr.pl = job->pages, | 382 | .mem.ptr.pl = job->pages, |
351 | .mem.offset = job->offset, | 383 | .mem.offset = job->offset, |
@@ -428,6 +460,7 @@ static void do_work(struct work_struct *work) | |||
428 | { | 460 | { |
429 | struct dm_kcopyd_client *kc = container_of(work, | 461 | struct dm_kcopyd_client *kc = container_of(work, |
430 | struct dm_kcopyd_client, kcopyd_work); | 462 | struct dm_kcopyd_client, kcopyd_work); |
463 | struct blk_plug plug; | ||
431 | 464 | ||
432 | /* | 465 | /* |
433 | * The order that these are called is *very* important. | 466 | * The order that these are called is *very* important. |
@@ -436,9 +469,11 @@ static void do_work(struct work_struct *work) | |||
436 | * list. io jobs call wake when they complete and it all | 469 | * list. io jobs call wake when they complete and it all |
437 | * starts again. | 470 | * starts again. |
438 | */ | 471 | */ |
472 | blk_start_plug(&plug); | ||
439 | process_jobs(&kc->complete_jobs, kc, run_complete_job); | 473 | process_jobs(&kc->complete_jobs, kc, run_complete_job); |
440 | process_jobs(&kc->pages_jobs, kc, run_pages_job); | 474 | process_jobs(&kc->pages_jobs, kc, run_pages_job); |
441 | process_jobs(&kc->io_jobs, kc, run_io_job); | 475 | process_jobs(&kc->io_jobs, kc, run_io_job); |
476 | blk_finish_plug(&plug); | ||
442 | } | 477 | } |
443 | 478 | ||
444 | /* | 479 | /* |
@@ -457,14 +492,14 @@ static void dispatch_job(struct kcopyd_job *job) | |||
457 | wake(kc); | 492 | wake(kc); |
458 | } | 493 | } |
459 | 494 | ||
460 | #define SUB_JOB_SIZE 128 | ||
461 | static void segment_complete(int read_err, unsigned long write_err, | 495 | static void segment_complete(int read_err, unsigned long write_err, |
462 | void *context) | 496 | void *context) |
463 | { | 497 | { |
464 | /* FIXME: tidy this function */ | 498 | /* FIXME: tidy this function */ |
465 | sector_t progress = 0; | 499 | sector_t progress = 0; |
466 | sector_t count = 0; | 500 | sector_t count = 0; |
467 | struct kcopyd_job *job = (struct kcopyd_job *) context; | 501 | struct kcopyd_job *sub_job = (struct kcopyd_job *) context; |
502 | struct kcopyd_job *job = sub_job->master_job; | ||
468 | struct dm_kcopyd_client *kc = job->kc; | 503 | struct dm_kcopyd_client *kc = job->kc; |
469 | 504 | ||
470 | mutex_lock(&job->lock); | 505 | mutex_lock(&job->lock); |
@@ -495,8 +530,6 @@ static void segment_complete(int read_err, unsigned long write_err, | |||
495 | 530 | ||
496 | if (count) { | 531 | if (count) { |
497 | int i; | 532 | int i; |
498 | struct kcopyd_job *sub_job = mempool_alloc(kc->job_pool, | ||
499 | GFP_NOIO); | ||
500 | 533 | ||
501 | *sub_job = *job; | 534 | *sub_job = *job; |
502 | sub_job->source.sector += progress; | 535 | sub_job->source.sector += progress; |
@@ -508,7 +541,7 @@ static void segment_complete(int read_err, unsigned long write_err, | |||
508 | } | 541 | } |
509 | 542 | ||
510 | sub_job->fn = segment_complete; | 543 | sub_job->fn = segment_complete; |
511 | sub_job->context = job; | 544 | sub_job->context = sub_job; |
512 | dispatch_job(sub_job); | 545 | dispatch_job(sub_job); |
513 | 546 | ||
514 | } else if (atomic_dec_and_test(&job->sub_jobs)) { | 547 | } else if (atomic_dec_and_test(&job->sub_jobs)) { |
@@ -528,19 +561,19 @@ static void segment_complete(int read_err, unsigned long write_err, | |||
528 | } | 561 | } |
529 | 562 | ||
530 | /* | 563 | /* |
531 | * Create some little jobs that will do the move between | 564 | * Create some sub jobs to share the work between them. |
532 | * them. | ||
533 | */ | 565 | */ |
534 | #define SPLIT_COUNT 8 | 566 | static void split_job(struct kcopyd_job *master_job) |
535 | static void split_job(struct kcopyd_job *job) | ||
536 | { | 567 | { |
537 | int i; | 568 | int i; |
538 | 569 | ||
539 | atomic_inc(&job->kc->nr_jobs); | 570 | atomic_inc(&master_job->kc->nr_jobs); |
540 | 571 | ||
541 | atomic_set(&job->sub_jobs, SPLIT_COUNT); | 572 | atomic_set(&master_job->sub_jobs, SPLIT_COUNT); |
542 | for (i = 0; i < SPLIT_COUNT; i++) | 573 | for (i = 0; i < SPLIT_COUNT; i++) { |
543 | segment_complete(0, 0u, job); | 574 | master_job[i + 1].master_job = master_job; |
575 | segment_complete(0, 0u, &master_job[i + 1]); | ||
576 | } | ||
544 | } | 577 | } |
545 | 578 | ||
546 | int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, | 579 | int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, |
@@ -550,7 +583,8 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, | |||
550 | struct kcopyd_job *job; | 583 | struct kcopyd_job *job; |
551 | 584 | ||
552 | /* | 585 | /* |
553 | * Allocate a new job. | 586 | * Allocate an array of jobs consisting of one master job |
587 | * followed by SPLIT_COUNT sub jobs. | ||
554 | */ | 588 | */ |
555 | job = mempool_alloc(kc->job_pool, GFP_NOIO); | 589 | job = mempool_alloc(kc->job_pool, GFP_NOIO); |
556 | 590 | ||
@@ -574,10 +608,10 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, | |||
574 | 608 | ||
575 | job->fn = fn; | 609 | job->fn = fn; |
576 | job->context = context; | 610 | job->context = context; |
611 | job->master_job = job; | ||
577 | 612 | ||
578 | if (job->source.count < SUB_JOB_SIZE) | 613 | if (job->source.count <= SUB_JOB_SIZE) |
579 | dispatch_job(job); | 614 | dispatch_job(job); |
580 | |||
581 | else { | 615 | else { |
582 | mutex_init(&job->lock); | 616 | mutex_init(&job->lock); |
583 | job->progress = 0; | 617 | job->progress = 0; |
@@ -603,17 +637,15 @@ int kcopyd_cancel(struct kcopyd_job *job, int block) | |||
603 | /*----------------------------------------------------------------- | 637 | /*----------------------------------------------------------------- |
604 | * Client setup | 638 | * Client setup |
605 | *---------------------------------------------------------------*/ | 639 | *---------------------------------------------------------------*/ |
606 | int dm_kcopyd_client_create(unsigned int nr_pages, | 640 | struct dm_kcopyd_client *dm_kcopyd_client_create(void) |
607 | struct dm_kcopyd_client **result) | ||
608 | { | 641 | { |
609 | int r = -ENOMEM; | 642 | int r = -ENOMEM; |
610 | struct dm_kcopyd_client *kc; | 643 | struct dm_kcopyd_client *kc; |
611 | 644 | ||
612 | kc = kmalloc(sizeof(*kc), GFP_KERNEL); | 645 | kc = kmalloc(sizeof(*kc), GFP_KERNEL); |
613 | if (!kc) | 646 | if (!kc) |
614 | return -ENOMEM; | 647 | return ERR_PTR(-ENOMEM); |
615 | 648 | ||
616 | spin_lock_init(&kc->lock); | ||
617 | spin_lock_init(&kc->job_lock); | 649 | spin_lock_init(&kc->job_lock); |
618 | INIT_LIST_HEAD(&kc->complete_jobs); | 650 | INIT_LIST_HEAD(&kc->complete_jobs); |
619 | INIT_LIST_HEAD(&kc->io_jobs); | 651 | INIT_LIST_HEAD(&kc->io_jobs); |
@@ -624,17 +656,18 @@ int dm_kcopyd_client_create(unsigned int nr_pages, | |||
624 | goto bad_slab; | 656 | goto bad_slab; |
625 | 657 | ||
626 | INIT_WORK(&kc->kcopyd_work, do_work); | 658 | INIT_WORK(&kc->kcopyd_work, do_work); |
627 | kc->kcopyd_wq = create_singlethread_workqueue("kcopyd"); | 659 | kc->kcopyd_wq = alloc_workqueue("kcopyd", |
660 | WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); | ||
628 | if (!kc->kcopyd_wq) | 661 | if (!kc->kcopyd_wq) |
629 | goto bad_workqueue; | 662 | goto bad_workqueue; |
630 | 663 | ||
631 | kc->pages = NULL; | 664 | kc->pages = NULL; |
632 | kc->nr_pages = kc->nr_free_pages = 0; | 665 | kc->nr_reserved_pages = kc->nr_free_pages = 0; |
633 | r = client_alloc_pages(kc, nr_pages); | 666 | r = client_reserve_pages(kc, RESERVE_PAGES); |
634 | if (r) | 667 | if (r) |
635 | goto bad_client_pages; | 668 | goto bad_client_pages; |
636 | 669 | ||
637 | kc->io_client = dm_io_client_create(nr_pages); | 670 | kc->io_client = dm_io_client_create(); |
638 | if (IS_ERR(kc->io_client)) { | 671 | if (IS_ERR(kc->io_client)) { |
639 | r = PTR_ERR(kc->io_client); | 672 | r = PTR_ERR(kc->io_client); |
640 | goto bad_io_client; | 673 | goto bad_io_client; |
@@ -643,8 +676,7 @@ int dm_kcopyd_client_create(unsigned int nr_pages, | |||
643 | init_waitqueue_head(&kc->destroyq); | 676 | init_waitqueue_head(&kc->destroyq); |
644 | atomic_set(&kc->nr_jobs, 0); | 677 | atomic_set(&kc->nr_jobs, 0); |
645 | 678 | ||
646 | *result = kc; | 679 | return kc; |
647 | return 0; | ||
648 | 680 | ||
649 | bad_io_client: | 681 | bad_io_client: |
650 | client_free_pages(kc); | 682 | client_free_pages(kc); |
@@ -655,7 +687,7 @@ bad_workqueue: | |||
655 | bad_slab: | 687 | bad_slab: |
656 | kfree(kc); | 688 | kfree(kc); |
657 | 689 | ||
658 | return r; | 690 | return ERR_PTR(r); |
659 | } | 691 | } |
660 | EXPORT_SYMBOL(dm_kcopyd_client_create); | 692 | EXPORT_SYMBOL(dm_kcopyd_client_create); |
661 | 693 | ||
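Two related changes run through the rest of this file. First, the job slab now holds a master job followed by SPLIT_COUNT sub-jobs in a single object, so segment_complete() reuses the embedded sub-jobs instead of doing a nested mempool_alloc() on the completion path, and only the master job is returned to the mempool. Second, dm_kcopyd_client_create() drops the nr_pages argument (the page budget is now the internal RESERVE_PAGES) and returns the client pointer or an ERR_PTR() instead of an errno plus out-parameter. A sketch of how a caller adapts to the new constructor, using a hypothetical per-target context struct for illustration:

#include <linux/dm-kcopyd.h>
#include <linux/err.h>

/* Hypothetical target context, not part of the patch. */
struct foo_ctx {
        struct dm_kcopyd_client *kcopyd_client;
};

static int foo_ctx_init(struct foo_ctx *ctx)
{
        /*
         * Old interface: r = dm_kcopyd_client_create(nr_pages, &client);
         * New interface: errors come back ERR_PTR-encoded in the pointer.
         */
        ctx->kcopyd_client = dm_kcopyd_client_create();
        if (IS_ERR(ctx->kcopyd_client))
                return PTR_ERR(ctx->kcopyd_client);

        return 0;
}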
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 1ed0094f064b..aa2e0c374ab3 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c | |||
@@ -12,12 +12,22 @@ | |||
12 | 12 | ||
13 | #include "dm-log-userspace-transfer.h" | 13 | #include "dm-log-userspace-transfer.h" |
14 | 14 | ||
15 | #define DM_LOG_USERSPACE_VSN "1.1.0" | ||
16 | |||
15 | struct flush_entry { | 17 | struct flush_entry { |
16 | int type; | 18 | int type; |
17 | region_t region; | 19 | region_t region; |
18 | struct list_head list; | 20 | struct list_head list; |
19 | }; | 21 | }; |
20 | 22 | ||
23 | /* | ||
24 | * This limit on the number of mark and clear requests is, to a degree, | ||
25 | * arbitrary. However, there is some basis for the choice in the limits | ||
24 | * arbitrary limit on the number of mark and clear requests. However, | ||
26 | * imposed on the size of data payload by dm-log-userspace-transfer.c: | ||
27 | * dm_consult_userspace(). | ||
28 | */ | ||
29 | #define MAX_FLUSH_GROUP_COUNT 32 | ||
30 | |||
21 | struct log_c { | 31 | struct log_c { |
22 | struct dm_target *ti; | 32 | struct dm_target *ti; |
23 | uint32_t region_size; | 33 | uint32_t region_size; |
@@ -37,8 +47,15 @@ struct log_c { | |||
37 | */ | 47 | */ |
38 | uint64_t in_sync_hint; | 48 | uint64_t in_sync_hint; |
39 | 49 | ||
50 | /* | ||
51 | * Mark and clear requests are held until a flush is issued | ||
52 | * so that we can group, and thereby limit, the amount of | ||
53 | * network traffic between kernel and userspace. The 'flush_lock' | ||
54 | * is used to protect these lists. | ||
55 | */ | ||
40 | spinlock_t flush_lock; | 56 | spinlock_t flush_lock; |
41 | struct list_head flush_list; /* only for clear and mark requests */ | 57 | struct list_head mark_list; |
58 | struct list_head clear_list; | ||
42 | }; | 59 | }; |
43 | 60 | ||
44 | static mempool_t *flush_entry_pool; | 61 | static mempool_t *flush_entry_pool; |
@@ -169,7 +186,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, | |||
169 | 186 | ||
170 | strncpy(lc->uuid, argv[0], DM_UUID_LEN); | 187 | strncpy(lc->uuid, argv[0], DM_UUID_LEN); |
171 | spin_lock_init(&lc->flush_lock); | 188 | spin_lock_init(&lc->flush_lock); |
172 | INIT_LIST_HEAD(&lc->flush_list); | 189 | INIT_LIST_HEAD(&lc->mark_list); |
190 | INIT_LIST_HEAD(&lc->clear_list); | ||
173 | 191 | ||
174 | str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); | 192 | str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); |
175 | if (str_size < 0) { | 193 | if (str_size < 0) { |
@@ -181,8 +199,11 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, | |||
181 | r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, | 199 | r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, |
182 | ctr_str, str_size, NULL, NULL); | 200 | ctr_str, str_size, NULL, NULL); |
183 | 201 | ||
184 | if (r == -ESRCH) { | 202 | if (r < 0) { |
185 | DMERR("Userspace log server not found"); | 203 | if (r == -ESRCH) |
204 | DMERR("Userspace log server not found"); | ||
205 | else | ||
206 | DMERR("Userspace log server failed to create log"); | ||
186 | goto out; | 207 | goto out; |
187 | } | 208 | } |
188 | 209 | ||
@@ -214,10 +235,9 @@ out: | |||
214 | 235 | ||
215 | static void userspace_dtr(struct dm_dirty_log *log) | 236 | static void userspace_dtr(struct dm_dirty_log *log) |
216 | { | 237 | { |
217 | int r; | ||
218 | struct log_c *lc = log->context; | 238 | struct log_c *lc = log->context; |
219 | 239 | ||
220 | r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, | 240 | (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, |
221 | NULL, 0, | 241 | NULL, 0, |
222 | NULL, NULL); | 242 | NULL, NULL); |
223 | 243 | ||
@@ -338,6 +358,71 @@ static int userspace_in_sync(struct dm_dirty_log *log, region_t region, | |||
338 | return (r) ? 0 : (int)in_sync; | 358 | return (r) ? 0 : (int)in_sync; |
339 | } | 359 | } |
340 | 360 | ||
361 | static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list) | ||
362 | { | ||
363 | int r = 0; | ||
364 | struct flush_entry *fe; | ||
365 | |||
366 | list_for_each_entry(fe, flush_list, list) { | ||
367 | r = userspace_do_request(lc, lc->uuid, fe->type, | ||
368 | (char *)&fe->region, | ||
369 | sizeof(fe->region), | ||
370 | NULL, NULL); | ||
371 | if (r) | ||
372 | break; | ||
373 | } | ||
374 | |||
375 | return r; | ||
376 | } | ||
377 | |||
378 | static int flush_by_group(struct log_c *lc, struct list_head *flush_list) | ||
379 | { | ||
380 | int r = 0; | ||
381 | int count; | ||
382 | uint32_t type = 0; | ||
383 | struct flush_entry *fe, *tmp_fe; | ||
384 | LIST_HEAD(tmp_list); | ||
385 | uint64_t group[MAX_FLUSH_GROUP_COUNT]; | ||
386 | |||
387 | /* | ||
388 | * Group process the requests | ||
389 | */ | ||
390 | while (!list_empty(flush_list)) { | ||
391 | count = 0; | ||
392 | |||
393 | list_for_each_entry_safe(fe, tmp_fe, flush_list, list) { | ||
394 | group[count] = fe->region; | ||
395 | count++; | ||
396 | |||
397 | list_del(&fe->list); | ||
398 | list_add(&fe->list, &tmp_list); | ||
399 | |||
400 | type = fe->type; | ||
401 | if (count >= MAX_FLUSH_GROUP_COUNT) | ||
402 | break; | ||
403 | } | ||
404 | |||
405 | r = userspace_do_request(lc, lc->uuid, type, | ||
406 | (char *)(group), | ||
407 | count * sizeof(uint64_t), | ||
408 | NULL, NULL); | ||
409 | if (r) { | ||
410 | /* Group send failed. Attempt one-by-one. */ | ||
411 | list_splice_init(&tmp_list, flush_list); | ||
412 | r = flush_one_by_one(lc, flush_list); | ||
413 | break; | ||
414 | } | ||
415 | } | ||
416 | |||
417 | /* | ||
418 | * Must collect flush_entries that were successfully processed | ||
419 | * as a group so that they will be freed by the caller. | ||
420 | */ | ||
421 | list_splice_init(&tmp_list, flush_list); | ||
422 | |||
423 | return r; | ||
424 | } | ||
425 | |||
341 | /* | 426 | /* |
342 | * userspace_flush | 427 | * userspace_flush |
343 | * | 428 | * |
@@ -360,31 +445,25 @@ static int userspace_flush(struct dm_dirty_log *log) | |||
360 | int r = 0; | 445 | int r = 0; |
361 | unsigned long flags; | 446 | unsigned long flags; |
362 | struct log_c *lc = log->context; | 447 | struct log_c *lc = log->context; |
363 | LIST_HEAD(flush_list); | 448 | LIST_HEAD(mark_list); |
449 | LIST_HEAD(clear_list); | ||
364 | struct flush_entry *fe, *tmp_fe; | 450 | struct flush_entry *fe, *tmp_fe; |
365 | 451 | ||
366 | spin_lock_irqsave(&lc->flush_lock, flags); | 452 | spin_lock_irqsave(&lc->flush_lock, flags); |
367 | list_splice_init(&lc->flush_list, &flush_list); | 453 | list_splice_init(&lc->mark_list, &mark_list); |
454 | list_splice_init(&lc->clear_list, &clear_list); | ||
368 | spin_unlock_irqrestore(&lc->flush_lock, flags); | 455 | spin_unlock_irqrestore(&lc->flush_lock, flags); |
369 | 456 | ||
370 | if (list_empty(&flush_list)) | 457 | if (list_empty(&mark_list) && list_empty(&clear_list)) |
371 | return 0; | 458 | return 0; |
372 | 459 | ||
373 | /* | 460 | r = flush_by_group(lc, &mark_list); |
374 | * FIXME: Count up requests, group request types, | 461 | if (r) |
375 | * allocate memory to stick all requests in and | 462 | goto fail; |
376 | * send to server in one go. Failing the allocation, | ||
377 | * do it one by one. | ||
378 | */ | ||
379 | 463 | ||
380 | list_for_each_entry(fe, &flush_list, list) { | 464 | r = flush_by_group(lc, &clear_list); |
381 | r = userspace_do_request(lc, lc->uuid, fe->type, | 465 | if (r) |
382 | (char *)&fe->region, | 466 | goto fail; |
383 | sizeof(fe->region), | ||
384 | NULL, NULL); | ||
385 | if (r) | ||
386 | goto fail; | ||
387 | } | ||
388 | 467 | ||
389 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, | 468 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, |
390 | NULL, 0, NULL, NULL); | 469 | NULL, 0, NULL, NULL); |
@@ -395,7 +474,11 @@ fail: | |||
395 | * Calling code will receive an error and will know that | 474 | * Calling code will receive an error and will know that |
396 | * the log facility has failed. | 475 | * the log facility has failed. |
397 | */ | 476 | */ |
398 | list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) { | 477 | list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) { |
478 | list_del(&fe->list); | ||
479 | mempool_free(fe, flush_entry_pool); | ||
480 | } | ||
481 | list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) { | ||
399 | list_del(&fe->list); | 482 | list_del(&fe->list); |
400 | mempool_free(fe, flush_entry_pool); | 483 | mempool_free(fe, flush_entry_pool); |
401 | } | 484 | } |
@@ -425,7 +508,7 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region) | |||
425 | spin_lock_irqsave(&lc->flush_lock, flags); | 508 | spin_lock_irqsave(&lc->flush_lock, flags); |
426 | fe->type = DM_ULOG_MARK_REGION; | 509 | fe->type = DM_ULOG_MARK_REGION; |
427 | fe->region = region; | 510 | fe->region = region; |
428 | list_add(&fe->list, &lc->flush_list); | 511 | list_add(&fe->list, &lc->mark_list); |
429 | spin_unlock_irqrestore(&lc->flush_lock, flags); | 512 | spin_unlock_irqrestore(&lc->flush_lock, flags); |
430 | 513 | ||
431 | return; | 514 | return; |
@@ -462,7 +545,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region) | |||
462 | spin_lock_irqsave(&lc->flush_lock, flags); | 545 | spin_lock_irqsave(&lc->flush_lock, flags); |
463 | fe->type = DM_ULOG_CLEAR_REGION; | 546 | fe->type = DM_ULOG_CLEAR_REGION; |
464 | fe->region = region; | 547 | fe->region = region; |
465 | list_add(&fe->list, &lc->flush_list); | 548 | list_add(&fe->list, &lc->clear_list); |
466 | spin_unlock_irqrestore(&lc->flush_lock, flags); | 549 | spin_unlock_irqrestore(&lc->flush_lock, flags); |
467 | 550 | ||
468 | return; | 551 | return; |
@@ -684,7 +767,7 @@ static int __init userspace_dirty_log_init(void) | |||
684 | return r; | 767 | return r; |
685 | } | 768 | } |
686 | 769 | ||
687 | DMINFO("version 1.0.0 loaded"); | 770 | DMINFO("version " DM_LOG_USERSPACE_VSN " loaded"); |
688 | return 0; | 771 | return 0; |
689 | } | 772 | } |
690 | 773 | ||
@@ -694,7 +777,7 @@ static void __exit userspace_dirty_log_exit(void) | |||
694 | dm_ulog_tfr_exit(); | 777 | dm_ulog_tfr_exit(); |
695 | mempool_destroy(flush_entry_pool); | 778 | mempool_destroy(flush_entry_pool); |
696 | 779 | ||
697 | DMINFO("version 1.0.0 unloaded"); | 780 | DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded"); |
698 | return; | 781 | return; |
699 | } | 782 | } |
700 | 783 | ||
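The log-userspace changes above split the old flush_list into separate mark and clear lists so that a flush can batch requests of one type together. flush_by_group() packs up to MAX_FLUSH_GROUP_COUNT region numbers into a single dm_consult_userspace() payload and, if a grouped send fails, falls back to issuing the remaining entries one at a time. A user-space sketch of that batch-then-fallback shape (send_batch()/send_one() are hypothetical stand-ins for the userspace transfer call):

#include <stdint.h>
#include <stddef.h>

#define GROUP_MAX 32

int send_batch(const uint64_t *regions, size_t count); /* returns 0 on success */
int send_one(uint64_t region);

static int flush_regions(const uint64_t *regions, size_t nr)
{
    size_t i, j;

    for (i = 0; i < nr; i += GROUP_MAX) {
        size_t count = (nr - i < GROUP_MAX) ? nr - i : GROUP_MAX;

        if (send_batch(regions + i, count) == 0)
            continue;

        /* Group send failed: retry the remainder one region at a time. */
        for (j = i; j < nr; j++) {
            int r = send_one(regions[j]);
            if (r)
                return r;
        }
        return 0;
    }
    return 0;
}

Grouping matters because each mark or clear otherwise costs a netlink round trip to the userspace log server; the 32-entry cap keeps the payload within the size limits imposed by dm-log-userspace-transfer.c.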
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c index 075cbcf8a9f5..1f23e048f077 100644 --- a/drivers/md/dm-log-userspace-transfer.c +++ b/drivers/md/dm-log-userspace-transfer.c | |||
@@ -134,7 +134,7 @@ static void cn_ulog_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) | |||
134 | { | 134 | { |
135 | struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1); | 135 | struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1); |
136 | 136 | ||
137 | if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN)) | 137 | if (!cap_raised(current_cap(), CAP_SYS_ADMIN)) |
138 | return; | 138 | return; |
139 | 139 | ||
140 | spin_lock(&receiving_list_lock); | 140 | spin_lock(&receiving_list_lock); |
@@ -198,6 +198,7 @@ resend: | |||
198 | 198 | ||
199 | memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); | 199 | memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); |
200 | memcpy(tfr->uuid, uuid, DM_UUID_LEN); | 200 | memcpy(tfr->uuid, uuid, DM_UUID_LEN); |
201 | tfr->version = DM_ULOG_REQUEST_VERSION; | ||
201 | tfr->luid = luid; | 202 | tfr->luid = luid; |
202 | tfr->seq = dm_ulog_seq++; | 203 | tfr->seq = dm_ulog_seq++; |
203 | 204 | ||
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 5a08be0222db..948e3f4925bf 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c | |||
@@ -251,20 +251,20 @@ struct log_c { | |||
251 | */ | 251 | */ |
252 | static inline int log_test_bit(uint32_t *bs, unsigned bit) | 252 | static inline int log_test_bit(uint32_t *bs, unsigned bit) |
253 | { | 253 | { |
254 | return ext2_test_bit(bit, (unsigned long *) bs) ? 1 : 0; | 254 | return test_bit_le(bit, (unsigned long *) bs) ? 1 : 0; |
255 | } | 255 | } |
256 | 256 | ||
257 | static inline void log_set_bit(struct log_c *l, | 257 | static inline void log_set_bit(struct log_c *l, |
258 | uint32_t *bs, unsigned bit) | 258 | uint32_t *bs, unsigned bit) |
259 | { | 259 | { |
260 | ext2_set_bit(bit, (unsigned long *) bs); | 260 | __test_and_set_bit_le(bit, (unsigned long *) bs); |
261 | l->touched_cleaned = 1; | 261 | l->touched_cleaned = 1; |
262 | } | 262 | } |
263 | 263 | ||
264 | static inline void log_clear_bit(struct log_c *l, | 264 | static inline void log_clear_bit(struct log_c *l, |
265 | uint32_t *bs, unsigned bit) | 265 | uint32_t *bs, unsigned bit) |
266 | { | 266 | { |
267 | ext2_clear_bit(bit, (unsigned long *) bs); | 267 | __test_and_clear_bit_le(bit, (unsigned long *) bs); |
268 | l->touched_dirtied = 1; | 268 | l->touched_dirtied = 1; |
269 | } | 269 | } |
270 | 270 | ||
@@ -300,7 +300,7 @@ static int flush_header(struct log_c *lc) | |||
300 | .count = 0, | 300 | .count = 0, |
301 | }; | 301 | }; |
302 | 302 | ||
303 | lc->io_req.bi_rw = WRITE_BARRIER; | 303 | lc->io_req.bi_rw = WRITE_FLUSH; |
304 | 304 | ||
305 | return dm_io(&lc->io_req, 1, &null_location, NULL); | 305 | return dm_io(&lc->io_req, 1, &null_location, NULL); |
306 | } | 306 | } |
@@ -449,13 +449,12 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, | |||
449 | 449 | ||
450 | lc->io_req.mem.type = DM_IO_VMA; | 450 | lc->io_req.mem.type = DM_IO_VMA; |
451 | lc->io_req.notify.fn = NULL; | 451 | lc->io_req.notify.fn = NULL; |
452 | lc->io_req.client = dm_io_client_create(dm_div_up(buf_size, | 452 | lc->io_req.client = dm_io_client_create(); |
453 | PAGE_SIZE)); | ||
454 | if (IS_ERR(lc->io_req.client)) { | 453 | if (IS_ERR(lc->io_req.client)) { |
455 | r = PTR_ERR(lc->io_req.client); | 454 | r = PTR_ERR(lc->io_req.client); |
456 | DMWARN("couldn't allocate disk io client"); | 455 | DMWARN("couldn't allocate disk io client"); |
457 | kfree(lc); | 456 | kfree(lc); |
458 | return -ENOMEM; | 457 | return r; |
459 | } | 458 | } |
460 | 459 | ||
461 | lc->disk_header = vmalloc(buf_size); | 460 | lc->disk_header = vmalloc(buf_size); |
@@ -543,7 +542,7 @@ static int disk_ctr(struct dm_dirty_log *log, struct dm_target *ti, | |||
543 | return -EINVAL; | 542 | return -EINVAL; |
544 | } | 543 | } |
545 | 544 | ||
546 | r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &dev); | 545 | r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev); |
547 | if (r) | 546 | if (r) |
548 | return r; | 547 | return r; |
549 | 548 | ||
@@ -740,7 +739,7 @@ static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) | |||
740 | return 0; | 739 | return 0; |
741 | 740 | ||
742 | do { | 741 | do { |
743 | *region = ext2_find_next_zero_bit( | 742 | *region = find_next_zero_bit_le( |
744 | (unsigned long *) lc->sync_bits, | 743 | (unsigned long *) lc->sync_bits, |
745 | lc->region_count, | 744 | lc->region_count, |
746 | lc->sync_search); | 745 | lc->sync_search); |
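The bit-ops hunks in dm-log.c are a rename rather than a behaviour change: the old ext2_* helpers were the little-endian bitmap operations, and the generic *_le names (test_bit_le(), __test_and_set_bit_le(), find_next_zero_bit_le()) keep the on-disk sync and clean bitmaps laid out identically on big- and little-endian hosts. The remaining hunks switch the header flush from WRITE_BARRIER to WRITE_FLUSH and drop the now-unused page count from dm_io_client_create(). A user-space sketch of the addressing the *_le helpers guarantee, with illustrative names (bit n lives in byte n/8 at bit position n%8, regardless of host word size or endianness):

#include <stdint.h>

static int test_bit_le_sketch(unsigned nr, const uint8_t *bitmap)
{
    return (bitmap[nr / 8] >> (nr % 8)) & 1;
}

static void set_bit_le_sketch(unsigned nr, uint8_t *bitmap)
{
    bitmap[nr / 8] |= (uint8_t)(1u << (nr % 8));
}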
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 487ecda90ad4..aa4e570c2cb5 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c | |||
@@ -23,6 +23,8 @@ | |||
23 | 23 | ||
24 | #define DM_MSG_PREFIX "multipath" | 24 | #define DM_MSG_PREFIX "multipath" |
25 | #define MESG_STR(x) x, sizeof(x) | 25 | #define MESG_STR(x) x, sizeof(x) |
26 | #define DM_PG_INIT_DELAY_MSECS 2000 | ||
27 | #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1) | ||
26 | 28 | ||
27 | /* Path properties */ | 29 | /* Path properties */ |
28 | struct pgpath { | 30 | struct pgpath { |
@@ -33,8 +35,7 @@ struct pgpath { | |||
33 | unsigned fail_count; /* Cumulative failure count */ | 35 | unsigned fail_count; /* Cumulative failure count */ |
34 | 36 | ||
35 | struct dm_path path; | 37 | struct dm_path path; |
36 | struct work_struct deactivate_path; | 38 | struct delayed_work activate_path; |
37 | struct work_struct activate_path; | ||
38 | }; | 39 | }; |
39 | 40 | ||
40 | #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) | 41 | #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) |
@@ -65,11 +66,15 @@ struct multipath { | |||
65 | 66 | ||
66 | const char *hw_handler_name; | 67 | const char *hw_handler_name; |
67 | char *hw_handler_params; | 68 | char *hw_handler_params; |
69 | |||
68 | unsigned nr_priority_groups; | 70 | unsigned nr_priority_groups; |
69 | struct list_head priority_groups; | 71 | struct list_head priority_groups; |
72 | |||
73 | wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ | ||
74 | |||
70 | unsigned pg_init_required; /* pg_init needs calling? */ | 75 | unsigned pg_init_required; /* pg_init needs calling? */ |
71 | unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ | 76 | unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ |
72 | wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ | 77 | unsigned pg_init_delay_retry; /* Delay pg_init retry? */ |
73 | 78 | ||
74 | unsigned nr_valid_paths; /* Total number of usable paths */ | 79 | unsigned nr_valid_paths; /* Total number of usable paths */ |
75 | struct pgpath *current_pgpath; | 80 | struct pgpath *current_pgpath; |
@@ -82,6 +87,7 @@ struct multipath { | |||
82 | unsigned saved_queue_if_no_path;/* Saved state during suspension */ | 87 | unsigned saved_queue_if_no_path;/* Saved state during suspension */ |
83 | unsigned pg_init_retries; /* Number of times to retry pg_init */ | 88 | unsigned pg_init_retries; /* Number of times to retry pg_init */ |
84 | unsigned pg_init_count; /* Number of times pg_init called */ | 89 | unsigned pg_init_count; /* Number of times pg_init called */ |
90 | unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ | ||
85 | 91 | ||
86 | struct work_struct process_queued_ios; | 92 | struct work_struct process_queued_ios; |
87 | struct list_head queued_ios; | 93 | struct list_head queued_ios; |
@@ -116,7 +122,6 @@ static struct workqueue_struct *kmultipathd, *kmpath_handlerd; | |||
116 | static void process_queued_ios(struct work_struct *work); | 122 | static void process_queued_ios(struct work_struct *work); |
117 | static void trigger_event(struct work_struct *work); | 123 | static void trigger_event(struct work_struct *work); |
118 | static void activate_path(struct work_struct *work); | 124 | static void activate_path(struct work_struct *work); |
119 | static void deactivate_path(struct work_struct *work); | ||
120 | 125 | ||
121 | 126 | ||
122 | /*----------------------------------------------- | 127 | /*----------------------------------------------- |
@@ -129,8 +134,7 @@ static struct pgpath *alloc_pgpath(void) | |||
129 | 134 | ||
130 | if (pgpath) { | 135 | if (pgpath) { |
131 | pgpath->is_active = 1; | 136 | pgpath->is_active = 1; |
132 | INIT_WORK(&pgpath->deactivate_path, deactivate_path); | 137 | INIT_DELAYED_WORK(&pgpath->activate_path, activate_path); |
133 | INIT_WORK(&pgpath->activate_path, activate_path); | ||
134 | } | 138 | } |
135 | 139 | ||
136 | return pgpath; | 140 | return pgpath; |
@@ -141,14 +145,6 @@ static void free_pgpath(struct pgpath *pgpath) | |||
141 | kfree(pgpath); | 145 | kfree(pgpath); |
142 | } | 146 | } |
143 | 147 | ||
144 | static void deactivate_path(struct work_struct *work) | ||
145 | { | ||
146 | struct pgpath *pgpath = | ||
147 | container_of(work, struct pgpath, deactivate_path); | ||
148 | |||
149 | blk_abort_queue(pgpath->path.dev->bdev->bd_disk->queue); | ||
150 | } | ||
151 | |||
152 | static struct priority_group *alloc_priority_group(void) | 148 | static struct priority_group *alloc_priority_group(void) |
153 | { | 149 | { |
154 | struct priority_group *pg; | 150 | struct priority_group *pg; |
@@ -199,6 +195,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti) | |||
199 | INIT_LIST_HEAD(&m->queued_ios); | 195 | INIT_LIST_HEAD(&m->queued_ios); |
200 | spin_lock_init(&m->lock); | 196 | spin_lock_init(&m->lock); |
201 | m->queue_io = 1; | 197 | m->queue_io = 1; |
198 | m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; | ||
202 | INIT_WORK(&m->process_queued_ios, process_queued_ios); | 199 | INIT_WORK(&m->process_queued_ios, process_queued_ios); |
203 | INIT_WORK(&m->trigger_event, trigger_event); | 200 | INIT_WORK(&m->trigger_event, trigger_event); |
204 | init_waitqueue_head(&m->pg_init_wait); | 201 | init_waitqueue_head(&m->pg_init_wait); |
@@ -238,14 +235,19 @@ static void free_multipath(struct multipath *m) | |||
238 | static void __pg_init_all_paths(struct multipath *m) | 235 | static void __pg_init_all_paths(struct multipath *m) |
239 | { | 236 | { |
240 | struct pgpath *pgpath; | 237 | struct pgpath *pgpath; |
238 | unsigned long pg_init_delay = 0; | ||
241 | 239 | ||
242 | m->pg_init_count++; | 240 | m->pg_init_count++; |
243 | m->pg_init_required = 0; | 241 | m->pg_init_required = 0; |
242 | if (m->pg_init_delay_retry) | ||
243 | pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ? | ||
244 | m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS); | ||
244 | list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { | 245 | list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { |
245 | /* Skip failed paths */ | 246 | /* Skip failed paths */ |
246 | if (!pgpath->is_active) | 247 | if (!pgpath->is_active) |
247 | continue; | 248 | continue; |
248 | if (queue_work(kmpath_handlerd, &pgpath->activate_path)) | 249 | if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path, |
250 | pg_init_delay)) | ||
249 | m->pg_init_in_progress++; | 251 | m->pg_init_in_progress++; |
250 | } | 252 | } |
251 | } | 253 | } |
@@ -793,8 +795,9 @@ static int parse_features(struct arg_set *as, struct multipath *m) | |||
793 | const char *param_name; | 795 | const char *param_name; |
794 | 796 | ||
795 | static struct param _params[] = { | 797 | static struct param _params[] = { |
796 | {0, 3, "invalid number of feature args"}, | 798 | {0, 5, "invalid number of feature args"}, |
797 | {1, 50, "pg_init_retries must be between 1 and 50"}, | 799 | {1, 50, "pg_init_retries must be between 1 and 50"}, |
800 | {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, | ||
798 | }; | 801 | }; |
799 | 802 | ||
800 | r = read_param(_params, shift(as), &argc, &ti->error); | 803 | r = read_param(_params, shift(as), &argc, &ti->error); |
@@ -821,6 +824,14 @@ static int parse_features(struct arg_set *as, struct multipath *m) | |||
821 | continue; | 824 | continue; |
822 | } | 825 | } |
823 | 826 | ||
827 | if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) && | ||
828 | (argc >= 1)) { | ||
829 | r = read_param(_params + 2, shift(as), | ||
830 | &m->pg_init_delay_msecs, &ti->error); | ||
831 | argc--; | ||
832 | continue; | ||
833 | } | ||
834 | |||
824 | ti->error = "Unrecognised multipath feature request"; | 835 | ti->error = "Unrecognised multipath feature request"; |
825 | r = -EINVAL; | 836 | r = -EINVAL; |
826 | } while (argc && !r); | 837 | } while (argc && !r); |
@@ -833,8 +844,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, | |||
833 | { | 844 | { |
834 | /* target parameters */ | 845 | /* target parameters */ |
835 | static struct param _params[] = { | 846 | static struct param _params[] = { |
836 | {1, 1024, "invalid number of priority groups"}, | 847 | {0, 1024, "invalid number of priority groups"}, |
837 | {1, 1024, "invalid initial priority group number"}, | 848 | {0, 1024, "invalid initial priority group number"}, |
838 | }; | 849 | }; |
839 | 850 | ||
840 | int r; | 851 | int r; |
@@ -868,6 +879,13 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, | |||
868 | if (r) | 879 | if (r) |
869 | goto bad; | 880 | goto bad; |
870 | 881 | ||
882 | if ((!m->nr_priority_groups && next_pg_num) || | ||
883 | (m->nr_priority_groups && !next_pg_num)) { | ||
884 | ti->error = "invalid initial priority group"; | ||
885 | r = -EINVAL; | ||
886 | goto bad; | ||
887 | } | ||
888 | |||
871 | /* parse the priority groups */ | 889 | /* parse the priority groups */ |
872 | while (as.argc) { | 890 | while (as.argc) { |
873 | struct priority_group *pg; | 891 | struct priority_group *pg; |
@@ -931,7 +949,7 @@ static void flush_multipath_work(struct multipath *m) | |||
931 | flush_workqueue(kmpath_handlerd); | 949 | flush_workqueue(kmpath_handlerd); |
932 | multipath_wait_for_pg_init_completion(m); | 950 | multipath_wait_for_pg_init_completion(m); |
933 | flush_workqueue(kmultipathd); | 951 | flush_workqueue(kmultipathd); |
934 | flush_scheduled_work(); | 952 | flush_work_sync(&m->trigger_event); |
935 | } | 953 | } |
936 | 954 | ||
937 | static void multipath_dtr(struct dm_target *ti) | 955 | static void multipath_dtr(struct dm_target *ti) |
@@ -995,7 +1013,6 @@ static int fail_path(struct pgpath *pgpath) | |||
995 | pgpath->path.dev->name, m->nr_valid_paths); | 1013 | pgpath->path.dev->name, m->nr_valid_paths); |
996 | 1014 | ||
997 | schedule_work(&m->trigger_event); | 1015 | schedule_work(&m->trigger_event); |
998 | queue_work(kmultipathd, &pgpath->deactivate_path); | ||
999 | 1016 | ||
1000 | out: | 1017 | out: |
1001 | spin_unlock_irqrestore(&m->lock, flags); | 1018 | spin_unlock_irqrestore(&m->lock, flags); |
@@ -1034,7 +1051,7 @@ static int reinstate_path(struct pgpath *pgpath) | |||
1034 | m->current_pgpath = NULL; | 1051 | m->current_pgpath = NULL; |
1035 | queue_work(kmultipathd, &m->process_queued_ios); | 1052 | queue_work(kmultipathd, &m->process_queued_ios); |
1036 | } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { | 1053 | } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { |
1037 | if (queue_work(kmpath_handlerd, &pgpath->activate_path)) | 1054 | if (queue_work(kmpath_handlerd, &pgpath->activate_path.work)) |
1038 | m->pg_init_in_progress++; | 1055 | m->pg_init_in_progress++; |
1039 | } | 1056 | } |
1040 | 1057 | ||
@@ -1055,7 +1072,7 @@ out: | |||
1055 | static int action_dev(struct multipath *m, struct dm_dev *dev, | 1072 | static int action_dev(struct multipath *m, struct dm_dev *dev, |
1056 | action_fn action) | 1073 | action_fn action) |
1057 | { | 1074 | { |
1058 | int r = 0; | 1075 | int r = -EINVAL; |
1059 | struct pgpath *pgpath; | 1076 | struct pgpath *pgpath; |
1060 | struct priority_group *pg; | 1077 | struct priority_group *pg; |
1061 | 1078 | ||
@@ -1169,6 +1186,7 @@ static void pg_init_done(void *data, int errors) | |||
1169 | struct priority_group *pg = pgpath->pg; | 1186 | struct priority_group *pg = pgpath->pg; |
1170 | struct multipath *m = pg->m; | 1187 | struct multipath *m = pg->m; |
1171 | unsigned long flags; | 1188 | unsigned long flags; |
1189 | unsigned delay_retry = 0; | ||
1172 | 1190 | ||
1173 | /* device or driver problems */ | 1191 | /* device or driver problems */ |
1174 | switch (errors) { | 1192 | switch (errors) { |
@@ -1193,8 +1211,9 @@ static void pg_init_done(void *data, int errors) | |||
1193 | */ | 1211 | */ |
1194 | bypass_pg(m, pg, 1); | 1212 | bypass_pg(m, pg, 1); |
1195 | break; | 1213 | break; |
1196 | /* TODO: For SCSI_DH_RETRY we should wait a couple seconds */ | ||
1197 | case SCSI_DH_RETRY: | 1214 | case SCSI_DH_RETRY: |
1215 | /* Wait before retrying. */ | ||
1216 | delay_retry = 1; | ||
1198 | case SCSI_DH_IMM_RETRY: | 1217 | case SCSI_DH_IMM_RETRY: |
1199 | case SCSI_DH_RES_TEMP_UNAVAIL: | 1218 | case SCSI_DH_RES_TEMP_UNAVAIL: |
1200 | if (pg_init_limit_reached(m, pgpath)) | 1219 | if (pg_init_limit_reached(m, pgpath)) |
@@ -1227,6 +1246,7 @@ static void pg_init_done(void *data, int errors) | |||
1227 | if (!m->pg_init_required) | 1246 | if (!m->pg_init_required) |
1228 | m->queue_io = 0; | 1247 | m->queue_io = 0; |
1229 | 1248 | ||
1249 | m->pg_init_delay_retry = delay_retry; | ||
1230 | queue_work(kmultipathd, &m->process_queued_ios); | 1250 | queue_work(kmultipathd, &m->process_queued_ios); |
1231 | 1251 | ||
1232 | /* | 1252 | /* |
@@ -1241,7 +1261,7 @@ out: | |||
1241 | static void activate_path(struct work_struct *work) | 1261 | static void activate_path(struct work_struct *work) |
1242 | { | 1262 | { |
1243 | struct pgpath *pgpath = | 1263 | struct pgpath *pgpath = |
1244 | container_of(work, struct pgpath, activate_path); | 1264 | container_of(work, struct pgpath, activate_path.work); |
1245 | 1265 | ||
1246 | scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), | 1266 | scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), |
1247 | pg_init_done, pgpath); | 1267 | pg_init_done, pgpath); |
@@ -1270,24 +1290,22 @@ static int do_end_io(struct multipath *m, struct request *clone, | |||
1270 | if (!error && !clone->errors) | 1290 | if (!error && !clone->errors) |
1271 | return 0; /* I/O complete */ | 1291 | return 0; /* I/O complete */ |
1272 | 1292 | ||
1273 | if (error == -EOPNOTSUPP) | 1293 | if (error == -EOPNOTSUPP || error == -EREMOTEIO || error == -EILSEQ) |
1274 | return error; | ||
1275 | |||
1276 | if (clone->cmd_flags & REQ_DISCARD) | ||
1277 | /* | ||
1278 | * Pass all discard request failures up. | ||
1279 | * FIXME: only fail_path if the discard failed due to a | ||
1280 | * transport problem. This requires precise understanding | ||
1281 | * of the underlying failure (e.g. the SCSI sense). | ||
1282 | */ | ||
1283 | return error; | 1294 | return error; |
1284 | 1295 | ||
1285 | if (mpio->pgpath) | 1296 | if (mpio->pgpath) |
1286 | fail_path(mpio->pgpath); | 1297 | fail_path(mpio->pgpath); |
1287 | 1298 | ||
1288 | spin_lock_irqsave(&m->lock, flags); | 1299 | spin_lock_irqsave(&m->lock, flags); |
1289 | if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m)) | 1300 | if (!m->nr_valid_paths) { |
1290 | r = -EIO; | 1301 | if (!m->queue_if_no_path) { |
1302 | if (!__must_push_back(m)) | ||
1303 | r = -EIO; | ||
1304 | } else { | ||
1305 | if (error == -EBADE) | ||
1306 | r = error; | ||
1307 | } | ||
1308 | } | ||
1291 | spin_unlock_irqrestore(&m->lock, flags); | 1309 | spin_unlock_irqrestore(&m->lock, flags); |
1292 | 1310 | ||
1293 | return r; | 1311 | return r; |
@@ -1382,11 +1400,14 @@ static int multipath_status(struct dm_target *ti, status_type_t type, | |||
1382 | DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count); | 1400 | DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count); |
1383 | else { | 1401 | else { |
1384 | DMEMIT("%u ", m->queue_if_no_path + | 1402 | DMEMIT("%u ", m->queue_if_no_path + |
1385 | (m->pg_init_retries > 0) * 2); | 1403 | (m->pg_init_retries > 0) * 2 + |
1404 | (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2); | ||
1386 | if (m->queue_if_no_path) | 1405 | if (m->queue_if_no_path) |
1387 | DMEMIT("queue_if_no_path "); | 1406 | DMEMIT("queue_if_no_path "); |
1388 | if (m->pg_init_retries) | 1407 | if (m->pg_init_retries) |
1389 | DMEMIT("pg_init_retries %u ", m->pg_init_retries); | 1408 | DMEMIT("pg_init_retries %u ", m->pg_init_retries); |
1409 | if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) | ||
1410 | DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); | ||
1390 | } | 1411 | } |
1391 | 1412 | ||
1392 | if (!m->hw_handler_name || type == STATUSTYPE_INFO) | 1413 | if (!m->hw_handler_name || type == STATUSTYPE_INFO) |
@@ -1401,7 +1422,7 @@ static int multipath_status(struct dm_target *ti, status_type_t type, | |||
1401 | else if (m->current_pg) | 1422 | else if (m->current_pg) |
1402 | pg_num = m->current_pg->pg_num; | 1423 | pg_num = m->current_pg->pg_num; |
1403 | else | 1424 | else |
1404 | pg_num = 1; | 1425 | pg_num = (m->nr_priority_groups ? 1 : 0); |
1405 | 1426 | ||
1406 | DMEMIT("%u ", pg_num); | 1427 | DMEMIT("%u ", pg_num); |
1407 | 1428 | ||
@@ -1655,7 +1676,7 @@ out: | |||
1655 | *---------------------------------------------------------------*/ | 1676 | *---------------------------------------------------------------*/ |
1656 | static struct target_type multipath_target = { | 1677 | static struct target_type multipath_target = { |
1657 | .name = "multipath", | 1678 | .name = "multipath", |
1658 | .version = {1, 1, 1}, | 1679 | .version = {1, 3, 0}, |
1659 | .module = THIS_MODULE, | 1680 | .module = THIS_MODULE, |
1660 | .ctr = multipath_ctr, | 1681 | .ctr = multipath_ctr, |
1661 | .dtr = multipath_dtr, | 1682 | .dtr = multipath_dtr, |
@@ -1687,7 +1708,7 @@ static int __init dm_multipath_init(void) | |||
1687 | return -EINVAL; | 1708 | return -EINVAL; |
1688 | } | 1709 | } |
1689 | 1710 | ||
1690 | kmultipathd = create_workqueue("kmpathd"); | 1711 | kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0); |
1691 | if (!kmultipathd) { | 1712 | if (!kmultipathd) { |
1692 | DMERR("failed to create workqueue kmpathd"); | 1713 | DMERR("failed to create workqueue kmpathd"); |
1693 | dm_unregister_target(&multipath_target); | 1714 | dm_unregister_target(&multipath_target); |
@@ -1701,7 +1722,8 @@ static int __init dm_multipath_init(void) | |||
1701 | * old workqueue would also create a bottleneck in the | 1722 | * old workqueue would also create a bottleneck in the |
1702 | * path of the storage hardware device activation. | 1723 | * path of the storage hardware device activation. |
1703 | */ | 1724 | */ |
1704 | kmpath_handlerd = create_singlethread_workqueue("kmpath_handlerd"); | 1725 | kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd", |
1726 | WQ_MEM_RECLAIM); | ||
1705 | if (!kmpath_handlerd) { | 1727 | if (!kmpath_handlerd) { |
1706 | DMERR("failed to create workqueue kmpath_handlerd"); | 1728 | DMERR("failed to create workqueue kmpath_handlerd"); |
1707 | destroy_workqueue(kmultipathd); | 1729 | destroy_workqueue(kmultipathd); |
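Most of the multipath diff is plumbing for one feature: pg_init_done() can now ask for a delayed retry (the SCSI_DH_RETRY case), and __pg_init_all_paths() queues activate_path as delayed work using either the table-supplied pg_init_delay_msecs or a 2000 ms default. In table terms this adds an optional two-word feature pair, counted in the feature-argument total as the status hunk shows. A restatement of the delay selection as a stand-alone helper (the function name is illustrative; the constants are the ones defined at the top of dm-mpath.c):

#include <linux/jiffies.h>

#define DM_PG_INIT_DELAY_MSECS   2000
#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)

/* Returns 0 for an immediate retry, otherwise the delay in jiffies. */
static unsigned long pg_init_delay_jiffies(unsigned delay_retry,
                                           unsigned pg_init_delay_msecs)
{
        if (!delay_retry)
                return 0;

        return msecs_to_jiffies(pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
                                pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
}

The other notable behaviour changes in this file are passing -EREMOTEIO and -EILSEQ straight up instead of failing the path, honouring -EBADE while queue_if_no_path is set, and allowing a table with zero priority groups.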
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c new file mode 100644 index 000000000000..e5d8904fc8f6 --- /dev/null +++ b/drivers/md/dm-raid.c | |||
@@ -0,0 +1,689 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2010-2011 Neil Brown | ||
3 | * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * This file is released under the GPL. | ||
6 | */ | ||
7 | |||
8 | #include <linux/slab.h> | ||
9 | |||
10 | #include "md.h" | ||
11 | #include "raid5.h" | ||
12 | #include "dm.h" | ||
13 | #include "bitmap.h" | ||
14 | |||
15 | #define DM_MSG_PREFIX "raid" | ||
16 | |||
17 | /* | ||
18 | * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then | ||
19 | * make it so the flag doesn't set anything. | ||
20 | */ | ||
21 | #ifndef MD_SYNC_STATE_FORCED | ||
22 | #define MD_SYNC_STATE_FORCED 0 | ||
23 | #endif | ||
24 | |||
25 | struct raid_dev { | ||
26 | /* | ||
27 | * Two DM devices, one to hold metadata and one to hold the | ||
28 | * actual data/parity. The reason for this is to not confuse | ||
29 | * ti->len and give more flexibility in altering size and | ||
30 | * characteristics. | ||
31 | * | ||
32 | * While it is possible for this device to be associated | ||
33 | * with a different physical device than the data_dev, it | ||
34 | * is intended for it to be the same. | ||
35 | * |--------- Physical Device ---------| | ||
36 | * |- meta_dev -|------ data_dev ------| | ||
37 | */ | ||
38 | struct dm_dev *meta_dev; | ||
39 | struct dm_dev *data_dev; | ||
40 | struct mdk_rdev_s rdev; | ||
41 | }; | ||
42 | |||
43 | /* | ||
44 | * Flags for rs->print_flags field. | ||
45 | */ | ||
46 | #define DMPF_DAEMON_SLEEP 0x1 | ||
47 | #define DMPF_MAX_WRITE_BEHIND 0x2 | ||
48 | #define DMPF_SYNC 0x4 | ||
49 | #define DMPF_NOSYNC 0x8 | ||
50 | #define DMPF_STRIPE_CACHE 0x10 | ||
51 | #define DMPF_MIN_RECOVERY_RATE 0x20 | ||
52 | #define DMPF_MAX_RECOVERY_RATE 0x40 | ||
53 | |||
54 | struct raid_set { | ||
55 | struct dm_target *ti; | ||
56 | |||
57 | uint64_t print_flags; | ||
58 | |||
59 | struct mddev_s md; | ||
60 | struct raid_type *raid_type; | ||
61 | struct dm_target_callbacks callbacks; | ||
62 | |||
63 | struct raid_dev dev[0]; | ||
64 | }; | ||
65 | |||
66 | /* Supported raid types and properties. */ | ||
67 | static struct raid_type { | ||
68 | const char *name; /* RAID algorithm. */ | ||
69 | const char *descr; /* Descriptor text for logging. */ | ||
70 | const unsigned parity_devs; /* # of parity devices. */ | ||
71 | const unsigned minimal_devs; /* minimal # of devices in set. */ | ||
72 | const unsigned level; /* RAID level. */ | ||
73 | const unsigned algorithm; /* RAID algorithm. */ | ||
74 | } raid_types[] = { | ||
75 | {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, | ||
76 | {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, | ||
77 | {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, | ||
78 | {"raid5_ls", "RAID5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC}, | ||
79 | {"raid5_rs", "RAID5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC}, | ||
80 | {"raid6_zr", "RAID6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART}, | ||
81 | {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART}, | ||
82 | {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} | ||
83 | }; | ||
84 | |||
85 | static struct raid_type *get_raid_type(char *name) | ||
86 | { | ||
87 | int i; | ||
88 | |||
89 | for (i = 0; i < ARRAY_SIZE(raid_types); i++) | ||
90 | if (!strcmp(raid_types[i].name, name)) | ||
91 | return &raid_types[i]; | ||
92 | |||
93 | return NULL; | ||
94 | } | ||
95 | |||
96 | static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs) | ||
97 | { | ||
98 | unsigned i; | ||
99 | struct raid_set *rs; | ||
100 | sector_t sectors_per_dev; | ||
101 | |||
102 | if (raid_devs <= raid_type->parity_devs) { | ||
103 | ti->error = "Insufficient number of devices"; | ||
104 | return ERR_PTR(-EINVAL); | ||
105 | } | ||
106 | |||
107 | sectors_per_dev = ti->len; | ||
108 | if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { | ||
109 | ti->error = "Target length not divisible by number of data devices"; | ||
110 | return ERR_PTR(-EINVAL); | ||
111 | } | ||
112 | |||
113 | rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL); | ||
114 | if (!rs) { | ||
115 | ti->error = "Cannot allocate raid context"; | ||
116 | return ERR_PTR(-ENOMEM); | ||
117 | } | ||
118 | |||
119 | mddev_init(&rs->md); | ||
120 | |||
121 | rs->ti = ti; | ||
122 | rs->raid_type = raid_type; | ||
123 | rs->md.raid_disks = raid_devs; | ||
124 | rs->md.level = raid_type->level; | ||
125 | rs->md.new_level = rs->md.level; | ||
126 | rs->md.dev_sectors = sectors_per_dev; | ||
127 | rs->md.layout = raid_type->algorithm; | ||
128 | rs->md.new_layout = rs->md.layout; | ||
129 | rs->md.delta_disks = 0; | ||
130 | rs->md.recovery_cp = 0; | ||
131 | |||
132 | for (i = 0; i < raid_devs; i++) | ||
133 | md_rdev_init(&rs->dev[i].rdev); | ||
134 | |||
135 | /* | ||
136 | * Remaining items to be initialized by further RAID params: | ||
137 | * rs->md.persistent | ||
138 | * rs->md.external | ||
139 | * rs->md.chunk_sectors | ||
140 | * rs->md.new_chunk_sectors | ||
141 | */ | ||
142 | |||
143 | return rs; | ||
144 | } | ||
145 | |||
146 | static void context_free(struct raid_set *rs) | ||
147 | { | ||
148 | int i; | ||
149 | |||
150 | for (i = 0; i < rs->md.raid_disks; i++) | ||
151 | if (rs->dev[i].data_dev) | ||
152 | dm_put_device(rs->ti, rs->dev[i].data_dev); | ||
153 | |||
154 | kfree(rs); | ||
155 | } | ||
156 | |||
157 | /* | ||
158 | * For every device we have two words | ||
159 | * <meta_dev>: meta device name or '-' if missing | ||
160 | * <data_dev>: data device name or '-' if missing | ||
161 | * | ||
162 | * This code parses those words. | ||
163 | */ | ||
164 | static int dev_parms(struct raid_set *rs, char **argv) | ||
165 | { | ||
166 | int i; | ||
167 | int rebuild = 0; | ||
168 | int metadata_available = 0; | ||
169 | int ret = 0; | ||
170 | |||
171 | for (i = 0; i < rs->md.raid_disks; i++, argv += 2) { | ||
172 | rs->dev[i].rdev.raid_disk = i; | ||
173 | |||
174 | rs->dev[i].meta_dev = NULL; | ||
175 | rs->dev[i].data_dev = NULL; | ||
176 | |||
177 | /* | ||
178 | * There are no offsets, since there is a separate device | ||
179 | * for data and metadata. | ||
180 | */ | ||
181 | rs->dev[i].rdev.data_offset = 0; | ||
182 | rs->dev[i].rdev.mddev = &rs->md; | ||
183 | |||
184 | if (strcmp(argv[0], "-")) { | ||
185 | rs->ti->error = "Metadata devices not supported"; | ||
186 | return -EINVAL; | ||
187 | } | ||
188 | |||
189 | if (!strcmp(argv[1], "-")) { | ||
190 | if (!test_bit(In_sync, &rs->dev[i].rdev.flags) && | ||
191 | (!rs->dev[i].rdev.recovery_offset)) { | ||
192 | rs->ti->error = "Drive designated for rebuild not specified"; | ||
193 | return -EINVAL; | ||
194 | } | ||
195 | |||
196 | continue; | ||
197 | } | ||
198 | |||
199 | ret = dm_get_device(rs->ti, argv[1], | ||
200 | dm_table_get_mode(rs->ti->table), | ||
201 | &rs->dev[i].data_dev); | ||
202 | if (ret) { | ||
203 | rs->ti->error = "RAID device lookup failure"; | ||
204 | return ret; | ||
205 | } | ||
206 | |||
207 | rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev; | ||
208 | list_add(&rs->dev[i].rdev.same_set, &rs->md.disks); | ||
209 | if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) | ||
210 | rebuild++; | ||
211 | } | ||
212 | |||
213 | if (metadata_available) { | ||
214 | rs->md.external = 0; | ||
215 | rs->md.persistent = 1; | ||
216 | rs->md.major_version = 2; | ||
217 | } else if (rebuild && !rs->md.recovery_cp) { | ||
218 | /* | ||
219 | * Without metadata, we will not be able to tell if the array | ||
220 | * is in-sync or not - we must assume it is not. Therefore, | ||
221 | * it is impossible to rebuild a drive. | ||
222 | * | ||
223 | * Even if there is metadata, the on-disk information may | ||
224 | * indicate that the array is not in-sync and it will then | ||
225 | * fail at that time. | ||
226 | * | ||
227 | * User could specify 'nosync' option if desperate. | ||
228 | */ | ||
229 | DMERR("Unable to rebuild drive while array is not in-sync"); | ||
230 | rs->ti->error = "RAID device lookup failure"; | ||
231 | return -EINVAL; | ||
232 | } | ||
233 | |||
234 | return 0; | ||
235 | } | ||
236 | |||
237 | /* | ||
238 | * Possible arguments are... | ||
239 | * RAID456: | ||
240 | * <chunk_size> [optional_args] | ||
241 | * | ||
242 | * Optional args: | ||
243 | * [[no]sync] Force or prevent recovery of the entire array | ||
244 | * [rebuild <idx>] Rebuild the drive indicated by the index | ||
245 | * [daemon_sleep <ms>] Time between bitmap daemon work to clear bits | ||
246 | * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization | ||
247 | * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization | ||
248 | * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) | ||
249 | * [stripe_cache <sectors>] Stripe cache size for higher RAIDs | ||
250 | */ | ||
251 | static int parse_raid_params(struct raid_set *rs, char **argv, | ||
252 | unsigned num_raid_params) | ||
253 | { | ||
254 | unsigned i, rebuild_cnt = 0; | ||
255 | unsigned long value; | ||
256 | char *key; | ||
257 | |||
258 | /* | ||
259 | * First, parse the in-order required arguments | ||
260 | */ | ||
261 | if ((strict_strtoul(argv[0], 10, &value) < 0) || | ||
262 | !is_power_of_2(value) || (value < 8)) { | ||
263 | rs->ti->error = "Bad chunk size"; | ||
264 | return -EINVAL; | ||
265 | } | ||
266 | |||
267 | rs->md.new_chunk_sectors = rs->md.chunk_sectors = value; | ||
268 | argv++; | ||
269 | num_raid_params--; | ||
270 | |||
271 | /* | ||
272 | * Second, parse the unordered optional arguments | ||
273 | */ | ||
274 | for (i = 0; i < rs->md.raid_disks; i++) | ||
275 | set_bit(In_sync, &rs->dev[i].rdev.flags); | ||
276 | |||
277 | for (i = 0; i < num_raid_params; i++) { | ||
278 | if (!strcmp(argv[i], "nosync")) { | ||
279 | rs->md.recovery_cp = MaxSector; | ||
280 | rs->print_flags |= DMPF_NOSYNC; | ||
281 | rs->md.flags |= MD_SYNC_STATE_FORCED; | ||
282 | continue; | ||
283 | } | ||
284 | if (!strcmp(argv[i], "sync")) { | ||
285 | rs->md.recovery_cp = 0; | ||
286 | rs->print_flags |= DMPF_SYNC; | ||
287 | rs->md.flags |= MD_SYNC_STATE_FORCED; | ||
288 | continue; | ||
289 | } | ||
290 | |||
291 | /* The rest of the optional arguments come in key/value pairs */ | ||
292 | if ((i + 1) >= num_raid_params) { | ||
293 | rs->ti->error = "Wrong number of raid parameters given"; | ||
294 | return -EINVAL; | ||
295 | } | ||
296 | |||
297 | key = argv[i++]; | ||
298 | if (strict_strtoul(argv[i], 10, &value) < 0) { | ||
299 | rs->ti->error = "Bad numerical argument given in raid params"; | ||
300 | return -EINVAL; | ||
301 | } | ||
302 | |||
303 | if (!strcmp(key, "rebuild")) { | ||
304 | if (++rebuild_cnt > rs->raid_type->parity_devs) { | ||
305 | rs->ti->error = "Too many rebuild drives given"; | ||
306 | return -EINVAL; | ||
307 | } | ||
308 | if (value >= rs->md.raid_disks) { | ||
309 | rs->ti->error = "Invalid rebuild index given"; | ||
310 | return -EINVAL; | ||
311 | } | ||
312 | clear_bit(In_sync, &rs->dev[value].rdev.flags); | ||
313 | rs->dev[value].rdev.recovery_offset = 0; | ||
314 | } else if (!strcmp(key, "max_write_behind")) { | ||
315 | rs->print_flags |= DMPF_MAX_WRITE_BEHIND; | ||
316 | |||
317 | /* | ||
318 | * In device-mapper, we specify things in sectors, but | ||
319 | * MD records this value in kB | ||
320 | */ | ||
321 | value /= 2; | ||
322 | if (value > COUNTER_MAX) { | ||
323 | rs->ti->error = "Max write-behind limit out of range"; | ||
324 | return -EINVAL; | ||
325 | } | ||
326 | rs->md.bitmap_info.max_write_behind = value; | ||
327 | } else if (!strcmp(key, "daemon_sleep")) { | ||
328 | rs->print_flags |= DMPF_DAEMON_SLEEP; | ||
329 | if (!value || (value > MAX_SCHEDULE_TIMEOUT)) { | ||
330 | rs->ti->error = "daemon sleep period out of range"; | ||
331 | return -EINVAL; | ||
332 | } | ||
333 | rs->md.bitmap_info.daemon_sleep = value; | ||
334 | } else if (!strcmp(key, "stripe_cache")) { | ||
335 | rs->print_flags |= DMPF_STRIPE_CACHE; | ||
336 | |||
337 | /* | ||
338 | * In device-mapper, we specify things in sectors, but | ||
339 | * MD records this value in kB | ||
340 | */ | ||
341 | value /= 2; | ||
342 | |||
343 | if (rs->raid_type->level < 5) { | ||
344 | rs->ti->error = "Inappropriate argument: stripe_cache"; | ||
345 | return -EINVAL; | ||
346 | } | ||
347 | if (raid5_set_cache_size(&rs->md, (int)value)) { | ||
348 | rs->ti->error = "Bad stripe_cache size"; | ||
349 | return -EINVAL; | ||
350 | } | ||
351 | } else if (!strcmp(key, "min_recovery_rate")) { | ||
352 | rs->print_flags |= DMPF_MIN_RECOVERY_RATE; | ||
353 | if (value > INT_MAX) { | ||
354 | rs->ti->error = "min_recovery_rate out of range"; | ||
355 | return -EINVAL; | ||
356 | } | ||
357 | rs->md.sync_speed_min = (int)value; | ||
358 | } else if (!strcmp(key, "max_recovery_rate")) { | ||
359 | rs->print_flags |= DMPF_MAX_RECOVERY_RATE; | ||
360 | if (value > INT_MAX) { | ||
361 | rs->ti->error = "max_recovery_rate out of range"; | ||
362 | return -EINVAL; | ||
363 | } | ||
364 | rs->md.sync_speed_max = (int)value; | ||
365 | } else { | ||
366 | DMERR("Unable to parse RAID parameter: %s", key); | ||
367 | rs->ti->error = "Unable to parse RAID parameters"; | ||
368 | return -EINVAL; | ||
369 | } | ||
370 | } | ||
371 | |||
372 | /* Assume there are no metadata devices until the drives are parsed */ | ||
373 | rs->md.persistent = 0; | ||
374 | rs->md.external = 1; | ||
375 | |||
376 | return 0; | ||
377 | } | ||
378 | |||
379 | static void do_table_event(struct work_struct *ws) | ||
380 | { | ||
381 | struct raid_set *rs = container_of(ws, struct raid_set, md.event_work); | ||
382 | |||
383 | dm_table_event(rs->ti->table); | ||
384 | } | ||
385 | |||
386 | static int raid_is_congested(struct dm_target_callbacks *cb, int bits) | ||
387 | { | ||
388 | struct raid_set *rs = container_of(cb, struct raid_set, callbacks); | ||
389 | |||
390 | return md_raid5_congested(&rs->md, bits); | ||
391 | } | ||
392 | |||
393 | /* | ||
394 | * Construct a RAID4/5/6 mapping: | ||
395 | * Args: | ||
396 | * <raid_type> <#raid_params> <raid_params> \ | ||
397 | * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> } | ||
398 | * | ||
399 | * ** metadata devices are not supported yet, use '-' instead ** | ||
400 | * | ||
401 | * <raid_params> varies by <raid_type>. See 'parse_raid_params' for | ||
402 | * details on possible <raid_params>. | ||
403 | */ | ||
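A hedged example of how a complete constructor line might look in a device-mapper table, written here as a comment; the raid5_la type name, the target length, and the device paths are assumptions for illustration only:

    /*
     * 0 <len> raid raid5_la 2 128 nosync 3 - /dev/sdb1 - /dev/sdc1 - /dev/sdd1
     *
     * <raid_type>     = raid5_la
     * <#raid_params>  = 2        ("128 nosync": chunk size plus one option)
     * <#raid_devs>    = 3        followed by <meta_dev> <dev> pairs, with '-'
     *                            in the unsupported metadata slots
     */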
404 | static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) | ||
405 | { | ||
406 | int ret; | ||
407 | struct raid_type *rt; | ||
408 | unsigned long num_raid_params, num_raid_devs; | ||
409 | struct raid_set *rs = NULL; | ||
410 | |||
411 | /* Must have at least <raid_type> <#raid_params> */ | ||
412 | if (argc < 2) { | ||
413 | ti->error = "Too few arguments"; | ||
414 | return -EINVAL; | ||
415 | } | ||
416 | |||
417 | /* raid type */ | ||
418 | rt = get_raid_type(argv[0]); | ||
419 | if (!rt) { | ||
420 | ti->error = "Unrecognised raid_type"; | ||
421 | return -EINVAL; | ||
422 | } | ||
423 | argc--; | ||
424 | argv++; | ||
425 | |||
426 | /* number of RAID parameters */ | ||
427 | if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) { | ||
428 | ti->error = "Cannot understand number of RAID parameters"; | ||
429 | return -EINVAL; | ||
430 | } | ||
431 | argc--; | ||
432 | argv++; | ||
433 | |||
434 | /* Skip over RAID params for now and find out # of devices */ | ||
435 | if (num_raid_params + 1 > argc) { | ||
436 | ti->error = "Arguments do not agree with counts given"; | ||
437 | return -EINVAL; | ||
438 | } | ||
439 | |||
440 | if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) || | ||
441 | (num_raid_devs >= INT_MAX)) { | ||
442 | ti->error = "Cannot understand number of raid devices"; | ||
443 | return -EINVAL; | ||
444 | } | ||
445 | |||
446 | rs = context_alloc(ti, rt, (unsigned)num_raid_devs); | ||
447 | if (IS_ERR(rs)) | ||
448 | return PTR_ERR(rs); | ||
449 | |||
450 | ret = parse_raid_params(rs, argv, (unsigned)num_raid_params); | ||
451 | if (ret) | ||
452 | goto bad; | ||
453 | |||
454 | ret = -EINVAL; | ||
455 | |||
456 | argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */ | ||
457 | argv += num_raid_params + 1; | ||
458 | |||
459 | if (argc != (num_raid_devs * 2)) { | ||
460 | ti->error = "Supplied RAID devices does not match the count given"; | ||
461 | goto bad; | ||
462 | } | ||
463 | |||
464 | ret = dev_parms(rs, argv); | ||
465 | if (ret) | ||
466 | goto bad; | ||
467 | |||
468 | INIT_WORK(&rs->md.event_work, do_table_event); | ||
469 | ti->split_io = rs->md.chunk_sectors; | ||
470 | ti->private = rs; | ||
471 | |||
472 | mutex_lock(&rs->md.reconfig_mutex); | ||
473 | ret = md_run(&rs->md); | ||
474 | rs->md.in_sync = 0; /* Assume already marked dirty */ | ||
475 | mutex_unlock(&rs->md.reconfig_mutex); | ||
476 | |||
477 | if (ret) { | ||
478 | ti->error = "Failed to run raid array"; | ||
479 | goto bad; | ||
480 | } | ||
481 | |||
482 | rs->callbacks.congested_fn = raid_is_congested; | ||
483 | dm_table_add_target_callbacks(ti->table, &rs->callbacks); | ||
484 | |||
485 | return 0; | ||
486 | |||
487 | bad: | ||
488 | context_free(rs); | ||
489 | |||
490 | return ret; | ||
491 | } | ||
492 | |||
493 | static void raid_dtr(struct dm_target *ti) | ||
494 | { | ||
495 | struct raid_set *rs = ti->private; | ||
496 | |||
497 | list_del_init(&rs->callbacks.list); | ||
498 | md_stop(&rs->md); | ||
499 | context_free(rs); | ||
500 | } | ||
501 | |||
502 | static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) | ||
503 | { | ||
504 | struct raid_set *rs = ti->private; | ||
505 | mddev_t *mddev = &rs->md; | ||
506 | |||
507 | mddev->pers->make_request(mddev, bio); | ||
508 | |||
509 | return DM_MAPIO_SUBMITTED; | ||
510 | } | ||
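A brief note on the return value, since the contract is easy to miss: DM_MAPIO_SUBMITTED tells the core that the target has already dispatched the bio, which is exactly what handing it to MD's make_request does. A sketch of the contrast (not part of this patch):

    /*
     * A purely remapping target would instead do something like:
     *
     *     bio->bi_bdev = some_underlying_bdev;
     *     return DM_MAPIO_REMAPPED;    // dm core resubmits the bio
     *
     * raid_map() must not do that: MD already owns the bio after
     * make_request(), so returning DM_MAPIO_SUBMITTED keeps the core
     * from issuing it a second time.
     */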
511 | |||
512 | static int raid_status(struct dm_target *ti, status_type_t type, | ||
513 | char *result, unsigned maxlen) | ||
514 | { | ||
515 | struct raid_set *rs = ti->private; | ||
516 | unsigned raid_param_cnt = 1; /* at least 1 for chunksize */ | ||
517 | unsigned sz = 0; | ||
518 | int i; | ||
519 | sector_t sync; | ||
520 | |||
521 | switch (type) { | ||
522 | case STATUSTYPE_INFO: | ||
523 | DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks); | ||
524 | |||
525 | for (i = 0; i < rs->md.raid_disks; i++) { | ||
526 | if (test_bit(Faulty, &rs->dev[i].rdev.flags)) | ||
527 | DMEMIT("D"); | ||
528 | else if (test_bit(In_sync, &rs->dev[i].rdev.flags)) | ||
529 | DMEMIT("A"); | ||
530 | else | ||
531 | DMEMIT("a"); | ||
532 | } | ||
533 | |||
534 | if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery)) | ||
535 | sync = rs->md.curr_resync_completed; | ||
536 | else | ||
537 | sync = rs->md.recovery_cp; | ||
538 | |||
539 | if (sync > rs->md.resync_max_sectors) | ||
540 | sync = rs->md.resync_max_sectors; | ||
541 | |||
542 | DMEMIT(" %llu/%llu", | ||
543 | (unsigned long long) sync, | ||
544 | (unsigned long long) rs->md.resync_max_sectors); | ||
545 | |||
546 | break; | ||
547 | case STATUSTYPE_TABLE: | ||
548 | /* The string you would use to construct this array */ | ||
549 | for (i = 0; i < rs->md.raid_disks; i++) | ||
550 | if (rs->dev[i].data_dev && | ||
551 | !test_bit(In_sync, &rs->dev[i].rdev.flags)) | ||
552 | raid_param_cnt++; /* for rebuilds */ | ||
553 | |||
554 | raid_param_cnt += (hweight64(rs->print_flags) * 2); | ||
555 | if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)) | ||
556 | raid_param_cnt--; | ||
557 | |||
558 | DMEMIT("%s %u %u", rs->raid_type->name, | ||
559 | raid_param_cnt, rs->md.chunk_sectors); | ||
560 | |||
561 | if ((rs->print_flags & DMPF_SYNC) && | ||
562 | (rs->md.recovery_cp == MaxSector)) | ||
563 | DMEMIT(" sync"); | ||
564 | if (rs->print_flags & DMPF_NOSYNC) | ||
565 | DMEMIT(" nosync"); | ||
566 | |||
567 | for (i = 0; i < rs->md.raid_disks; i++) | ||
568 | if (rs->dev[i].data_dev && | ||
569 | !test_bit(In_sync, &rs->dev[i].rdev.flags)) | ||
570 | DMEMIT(" rebuild %u", i); | ||
571 | |||
572 | if (rs->print_flags & DMPF_DAEMON_SLEEP) | ||
573 | DMEMIT(" daemon_sleep %lu", | ||
574 | rs->md.bitmap_info.daemon_sleep); | ||
575 | |||
576 | if (rs->print_flags & DMPF_MIN_RECOVERY_RATE) | ||
577 | DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min); | ||
578 | |||
579 | if (rs->print_flags & DMPF_MAX_RECOVERY_RATE) | ||
580 | DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max); | ||
581 | |||
582 | if (rs->print_flags & DMPF_MAX_WRITE_BEHIND) | ||
583 | DMEMIT(" max_write_behind %lu", | ||
584 | rs->md.bitmap_info.max_write_behind); | ||
585 | |||
586 | if (rs->print_flags & DMPF_STRIPE_CACHE) { | ||
587 | raid5_conf_t *conf = rs->md.private; | ||
588 | |||
589 | /* convert from kiB to sectors */ | ||
590 | DMEMIT(" stripe_cache %d", | ||
591 | conf ? conf->max_nr_stripes * 2 : 0); | ||
592 | } | ||
593 | |||
594 | DMEMIT(" %d", rs->md.raid_disks); | ||
595 | for (i = 0; i < rs->md.raid_disks; i++) { | ||
596 | DMEMIT(" -"); /* metadata device */ | ||
597 | |||
598 | if (rs->dev[i].data_dev) | ||
599 | DMEMIT(" %s", rs->dev[i].data_dev->name); | ||
600 | else | ||
601 | DMEMIT(" -"); | ||
602 | } | ||
603 | } | ||
604 | |||
605 | return 0; | ||
606 | } | ||
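For readers new to dm status output, an illustrative rendering of what the two cases above would emit for a healthy, fully synced three-drive set; the type name, sizes, and device numbers are invented:

    /*
     * STATUSTYPE_INFO:   "raid5_la 3 AAA 524288/524288"
     *   type, #disks, one char per disk (A = in sync, a = resyncing,
     *   D = faulty), then resync position / resync_max_sectors.
     *
     * STATUSTYPE_TABLE:  "raid5_la 1 128 3 - 8:17 - 8:33 - 8:49"
     *   the constructor string: type, raid_param_cnt (1 = chunk size only),
     *   chunk sectors, #disks, then the metadata/data device pairs.
     */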
607 | |||
608 | static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) | ||
609 | { | ||
610 | struct raid_set *rs = ti->private; | ||
611 | unsigned i; | ||
612 | int ret = 0; | ||
613 | |||
614 | for (i = 0; !ret && i < rs->md.raid_disks; i++) | ||
615 | if (rs->dev[i].data_dev) | ||
616 | ret = fn(ti, | ||
617 | rs->dev[i].data_dev, | ||
618 | 0, /* No offset on data devs */ | ||
619 | rs->md.dev_sectors, | ||
620 | data); | ||
621 | |||
622 | return ret; | ||
623 | } | ||
624 | |||
625 | static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) | ||
626 | { | ||
627 | struct raid_set *rs = ti->private; | ||
628 | unsigned chunk_size = rs->md.chunk_sectors << 9; | ||
629 | raid5_conf_t *conf = rs->md.private; | ||
630 | |||
631 | blk_limits_io_min(limits, chunk_size); | ||
632 | blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded)); | ||
633 | } | ||
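The hint arithmetic is easy to sanity-check by hand; a sketch assuming a four-drive RAID5 with a 64 KiB chunk (128 sectors):

    /*
     * chunk_size = 128 << 9              = 65536 bytes  -> io_min
     * io_opt     = 65536 * (4 - 1)       = 196608 bytes (one full data stripe)
     *
     * With RAID6 (max_degraded == 2) a six-drive set and the same chunk
     * would advertise io_opt = 65536 * 4 = 262144 bytes.
     */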
634 | |||
635 | static void raid_presuspend(struct dm_target *ti) | ||
636 | { | ||
637 | struct raid_set *rs = ti->private; | ||
638 | |||
639 | md_stop_writes(&rs->md); | ||
640 | } | ||
641 | |||
642 | static void raid_postsuspend(struct dm_target *ti) | ||
643 | { | ||
644 | struct raid_set *rs = ti->private; | ||
645 | |||
646 | mddev_suspend(&rs->md); | ||
647 | } | ||
648 | |||
649 | static void raid_resume(struct dm_target *ti) | ||
650 | { | ||
651 | struct raid_set *rs = ti->private; | ||
652 | |||
653 | mddev_resume(&rs->md); | ||
654 | } | ||
655 | |||
656 | static struct target_type raid_target = { | ||
657 | .name = "raid", | ||
658 | .version = {1, 0, 0}, | ||
659 | .module = THIS_MODULE, | ||
660 | .ctr = raid_ctr, | ||
661 | .dtr = raid_dtr, | ||
662 | .map = raid_map, | ||
663 | .status = raid_status, | ||
664 | .iterate_devices = raid_iterate_devices, | ||
665 | .io_hints = raid_io_hints, | ||
666 | .presuspend = raid_presuspend, | ||
667 | .postsuspend = raid_postsuspend, | ||
668 | .resume = raid_resume, | ||
669 | }; | ||
670 | |||
671 | static int __init dm_raid_init(void) | ||
672 | { | ||
673 | return dm_register_target(&raid_target); | ||
674 | } | ||
675 | |||
676 | static void __exit dm_raid_exit(void) | ||
677 | { | ||
678 | dm_unregister_target(&raid_target); | ||
679 | } | ||
680 | |||
681 | module_init(dm_raid_init); | ||
682 | module_exit(dm_raid_exit); | ||
683 | |||
684 | MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); | ||
685 | MODULE_ALIAS("dm-raid4"); | ||
686 | MODULE_ALIAS("dm-raid5"); | ||
687 | MODULE_ALIAS("dm-raid6"); | ||
688 | MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>"); | ||
689 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 7c081bcbc3cf..9bfd057be686 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c | |||
@@ -22,8 +22,6 @@ | |||
22 | #define DM_MSG_PREFIX "raid1" | 22 | #define DM_MSG_PREFIX "raid1" |
23 | 23 | ||
24 | #define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. */ | 24 | #define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. */ |
25 | #define DM_IO_PAGES 64 | ||
26 | #define DM_KCOPYD_PAGES 64 | ||
27 | 25 | ||
28 | #define DM_RAID1_HANDLE_ERRORS 0x01 | 26 | #define DM_RAID1_HANDLE_ERRORS 0x01 |
29 | #define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS) | 27 | #define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS) |
@@ -259,9 +257,9 @@ static int mirror_flush(struct dm_target *ti) | |||
259 | struct dm_io_region io[ms->nr_mirrors]; | 257 | struct dm_io_region io[ms->nr_mirrors]; |
260 | struct mirror *m; | 258 | struct mirror *m; |
261 | struct dm_io_request io_req = { | 259 | struct dm_io_request io_req = { |
262 | .bi_rw = WRITE_BARRIER, | 260 | .bi_rw = WRITE_FLUSH, |
263 | .mem.type = DM_IO_KMEM, | 261 | .mem.type = DM_IO_KMEM, |
264 | .mem.ptr.bvec = NULL, | 262 | .mem.ptr.addr = NULL, |
265 | .client = ms->io_client, | 263 | .client = ms->io_client, |
266 | }; | 264 | }; |
267 | 265 | ||
@@ -629,7 +627,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio) | |||
629 | struct dm_io_region io[ms->nr_mirrors], *dest = io; | 627 | struct dm_io_region io[ms->nr_mirrors], *dest = io; |
630 | struct mirror *m; | 628 | struct mirror *m; |
631 | struct dm_io_request io_req = { | 629 | struct dm_io_request io_req = { |
632 | .bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER), | 630 | .bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA), |
633 | .mem.type = DM_IO_BVEC, | 631 | .mem.type = DM_IO_BVEC, |
634 | .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, | 632 | .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, |
635 | .notify.fn = write_callback, | 633 | .notify.fn = write_callback, |
@@ -637,6 +635,12 @@ static void do_write(struct mirror_set *ms, struct bio *bio) | |||
637 | .client = ms->io_client, | 635 | .client = ms->io_client, |
638 | }; | 636 | }; |
639 | 637 | ||
638 | if (bio->bi_rw & REQ_DISCARD) { | ||
639 | io_req.bi_rw |= REQ_DISCARD; | ||
640 | io_req.mem.type = DM_IO_KMEM; | ||
641 | io_req.mem.ptr.addr = NULL; | ||
642 | } | ||
643 | |||
640 | for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) | 644 | for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) |
641 | map_region(dest++, m, bio); | 645 | map_region(dest++, m, bio); |
642 | 646 | ||
@@ -670,7 +674,8 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) | |||
670 | bio_list_init(&requeue); | 674 | bio_list_init(&requeue); |
671 | 675 | ||
672 | while ((bio = bio_list_pop(writes))) { | 676 | while ((bio = bio_list_pop(writes))) { |
673 | if (unlikely(bio_empty_barrier(bio))) { | 677 | if ((bio->bi_rw & REQ_FLUSH) || |
678 | (bio->bi_rw & REQ_DISCARD)) { | ||
674 | bio_list_add(&sync, bio); | 679 | bio_list_add(&sync, bio); |
675 | continue; | 680 | continue; |
676 | } | 681 | } |
@@ -835,8 +840,6 @@ static void do_mirror(struct work_struct *work) | |||
835 | do_reads(ms, &reads); | 840 | do_reads(ms, &reads); |
836 | do_writes(ms, &writes); | 841 | do_writes(ms, &writes); |
837 | do_failures(ms, &failures); | 842 | do_failures(ms, &failures); |
838 | |||
839 | dm_table_unplug_all(ms->ti->table); | ||
840 | } | 843 | } |
841 | 844 | ||
842 | /*----------------------------------------------------------------- | 845 | /*----------------------------------------------------------------- |
@@ -882,7 +885,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, | |||
882 | return NULL; | 885 | return NULL; |
883 | } | 886 | } |
884 | 887 | ||
885 | ms->io_client = dm_io_client_create(DM_IO_PAGES); | 888 | ms->io_client = dm_io_client_create(); |
886 | if (IS_ERR(ms->io_client)) { | 889 | if (IS_ERR(ms->io_client)) { |
887 | ti->error = "Error creating dm_io client"; | 890 | ti->error = "Error creating dm_io client"; |
888 | mempool_destroy(ms->read_record_pool); | 891 | mempool_destroy(ms->read_record_pool); |
@@ -1076,8 +1079,10 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1076 | ti->private = ms; | 1079 | ti->private = ms; |
1077 | ti->split_io = dm_rh_get_region_size(ms->rh); | 1080 | ti->split_io = dm_rh_get_region_size(ms->rh); |
1078 | ti->num_flush_requests = 1; | 1081 | ti->num_flush_requests = 1; |
1082 | ti->num_discard_requests = 1; | ||
1079 | 1083 | ||
1080 | ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); | 1084 | ms->kmirrord_wq = alloc_workqueue("kmirrord", |
1085 | WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); | ||
1081 | if (!ms->kmirrord_wq) { | 1086 | if (!ms->kmirrord_wq) { |
1082 | DMERR("couldn't start kmirrord"); | 1087 | DMERR("couldn't start kmirrord"); |
1083 | r = -ENOMEM; | 1088 | r = -ENOMEM; |
@@ -1110,9 +1115,11 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1110 | goto err_destroy_wq; | 1115 | goto err_destroy_wq; |
1111 | } | 1116 | } |
1112 | 1117 | ||
1113 | r = dm_kcopyd_client_create(DM_KCOPYD_PAGES, &ms->kcopyd_client); | 1118 | ms->kcopyd_client = dm_kcopyd_client_create(); |
1114 | if (r) | 1119 | if (IS_ERR(ms->kcopyd_client)) { |
1120 | r = PTR_ERR(ms->kcopyd_client); | ||
1115 | goto err_destroy_wq; | 1121 | goto err_destroy_wq; |
1122 | } | ||
1116 | 1123 | ||
1117 | wakeup_mirrord(ms); | 1124 | wakeup_mirrord(ms); |
1118 | return 0; | 1125 | return 0; |
@@ -1130,7 +1137,7 @@ static void mirror_dtr(struct dm_target *ti) | |||
1130 | 1137 | ||
1131 | del_timer_sync(&ms->timer); | 1138 | del_timer_sync(&ms->timer); |
1132 | flush_workqueue(ms->kmirrord_wq); | 1139 | flush_workqueue(ms->kmirrord_wq); |
1133 | flush_scheduled_work(); | 1140 | flush_work_sync(&ms->trigger_event); |
1134 | dm_kcopyd_client_destroy(ms->kcopyd_client); | 1141 | dm_kcopyd_client_destroy(ms->kcopyd_client); |
1135 | destroy_workqueue(ms->kmirrord_wq); | 1142 | destroy_workqueue(ms->kmirrord_wq); |
1136 | free_context(ms, ti, ms->nr_mirrors); | 1143 | free_context(ms, ti, ms->nr_mirrors); |
@@ -1203,7 +1210,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, | |||
1203 | * We need to dec pending if this was a write. | 1210 | * We need to dec pending if this was a write. |
1204 | */ | 1211 | */ |
1205 | if (rw == WRITE) { | 1212 | if (rw == WRITE) { |
1206 | if (likely(!bio_empty_barrier(bio))) | 1213 | if (!(bio->bi_rw & REQ_FLUSH)) |
1207 | dm_rh_dec(ms->rh, map_context->ll); | 1214 | dm_rh_dec(ms->rh, map_context->ll); |
1208 | return error; | 1215 | return error; |
1209 | } | 1216 | } |
@@ -1406,7 +1413,7 @@ static int mirror_iterate_devices(struct dm_target *ti, | |||
1406 | 1413 | ||
1407 | static struct target_type mirror_target = { | 1414 | static struct target_type mirror_target = { |
1408 | .name = "mirror", | 1415 | .name = "mirror", |
1409 | .version = {1, 12, 0}, | 1416 | .version = {1, 12, 1}, |
1410 | .module = THIS_MODULE, | 1417 | .module = THIS_MODULE, |
1411 | .ctr = mirror_ctr, | 1418 | .ctr = mirror_ctr, |
1412 | .dtr = mirror_dtr, | 1419 | .dtr = mirror_dtr, |
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index bd5c58b28868..7771ed212182 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c | |||
@@ -81,9 +81,9 @@ struct dm_region_hash { | |||
81 | struct list_head failed_recovered_regions; | 81 | struct list_head failed_recovered_regions; |
82 | 82 | ||
83 | /* | 83 | /* |
84 | * If there was a barrier failure no regions can be marked clean. | 84 | * If there was a flush failure no regions can be marked clean. |
85 | */ | 85 | */ |
86 | int barrier_failure; | 86 | int flush_failure; |
87 | 87 | ||
88 | void *context; | 88 | void *context; |
89 | sector_t target_begin; | 89 | sector_t target_begin; |
@@ -217,7 +217,7 @@ struct dm_region_hash *dm_region_hash_create( | |||
217 | INIT_LIST_HEAD(&rh->quiesced_regions); | 217 | INIT_LIST_HEAD(&rh->quiesced_regions); |
218 | INIT_LIST_HEAD(&rh->recovered_regions); | 218 | INIT_LIST_HEAD(&rh->recovered_regions); |
219 | INIT_LIST_HEAD(&rh->failed_recovered_regions); | 219 | INIT_LIST_HEAD(&rh->failed_recovered_regions); |
220 | rh->barrier_failure = 0; | 220 | rh->flush_failure = 0; |
221 | 221 | ||
222 | rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, | 222 | rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, |
223 | sizeof(struct dm_region)); | 223 | sizeof(struct dm_region)); |
@@ -399,8 +399,8 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) | |||
399 | region_t region = dm_rh_bio_to_region(rh, bio); | 399 | region_t region = dm_rh_bio_to_region(rh, bio); |
400 | int recovering = 0; | 400 | int recovering = 0; |
401 | 401 | ||
402 | if (bio_empty_barrier(bio)) { | 402 | if (bio->bi_rw & REQ_FLUSH) { |
403 | rh->barrier_failure = 1; | 403 | rh->flush_failure = 1; |
404 | return; | 404 | return; |
405 | } | 405 | } |
406 | 406 | ||
@@ -419,7 +419,7 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) | |||
419 | /* | 419 | /* |
420 | * Possible cases: | 420 | * Possible cases: |
421 | * 1) DM_RH_DIRTY | 421 | * 1) DM_RH_DIRTY |
422 | * 2) DM_RH_NOSYNC: was dirty, other preceeding writes failed | 422 | * 2) DM_RH_NOSYNC: was dirty, other preceding writes failed |
423 | * 3) DM_RH_RECOVERING: flushing pending writes | 423 | * 3) DM_RH_RECOVERING: flushing pending writes |
424 | * Either case, the region should have not been connected to list. | 424 | * Either case, the region should have not been connected to list. |
425 | */ | 425 | */ |
@@ -524,7 +524,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) | |||
524 | struct bio *bio; | 524 | struct bio *bio; |
525 | 525 | ||
526 | for (bio = bios->head; bio; bio = bio->bi_next) { | 526 | for (bio = bios->head; bio; bio = bio->bi_next) { |
527 | if (bio_empty_barrier(bio)) | 527 | if (bio->bi_rw & REQ_FLUSH) |
528 | continue; | 528 | continue; |
529 | rh_inc(rh, dm_rh_bio_to_region(rh, bio)); | 529 | rh_inc(rh, dm_rh_bio_to_region(rh, bio)); |
530 | } | 530 | } |
@@ -555,9 +555,9 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region) | |||
555 | */ | 555 | */ |
556 | 556 | ||
557 | /* do nothing for DM_RH_NOSYNC */ | 557 | /* do nothing for DM_RH_NOSYNC */ |
558 | if (unlikely(rh->barrier_failure)) { | 558 | if (unlikely(rh->flush_failure)) { |
559 | /* | 559 | /* |
560 | * If a write barrier failed some time ago, we | 560 | * If a write flush failed some time ago, we |
561 | * don't know whether or not this write made it | 561 | * don't know whether or not this write made it |
562 | * to the disk, so we must resync the device. | 562 | * to the disk, so we must resync the device. |
563 | */ | 563 | */ |
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index cc2bdb83f9ad..135c2f1fdbfc 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c | |||
@@ -154,11 +154,6 @@ struct pstore { | |||
154 | struct workqueue_struct *metadata_wq; | 154 | struct workqueue_struct *metadata_wq; |
155 | }; | 155 | }; |
156 | 156 | ||
157 | static unsigned sectors_to_pages(unsigned sectors) | ||
158 | { | ||
159 | return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9); | ||
160 | } | ||
161 | |||
162 | static int alloc_area(struct pstore *ps) | 157 | static int alloc_area(struct pstore *ps) |
163 | { | 158 | { |
164 | int r = -ENOMEM; | 159 | int r = -ENOMEM; |
@@ -254,9 +249,9 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, | |||
254 | * Issue the synchronous I/O from a different thread | 249 | * Issue the synchronous I/O from a different thread |
255 | * to avoid generic_make_request recursion. | 250 | * to avoid generic_make_request recursion. |
256 | */ | 251 | */ |
257 | INIT_WORK_ON_STACK(&req.work, do_metadata); | 252 | INIT_WORK_ONSTACK(&req.work, do_metadata); |
258 | queue_work(ps->metadata_wq, &req.work); | 253 | queue_work(ps->metadata_wq, &req.work); |
259 | flush_workqueue(ps->metadata_wq); | 254 | flush_work(&req.work); |
260 | 255 | ||
261 | return req.result; | 256 | return req.result; |
262 | } | 257 | } |
@@ -318,8 +313,7 @@ static int read_header(struct pstore *ps, int *new_snapshot) | |||
318 | chunk_size_supplied = 0; | 313 | chunk_size_supplied = 0; |
319 | } | 314 | } |
320 | 315 | ||
321 | ps->io_client = dm_io_client_create(sectors_to_pages(ps->store-> | 316 | ps->io_client = dm_io_client_create(); |
322 | chunk_size)); | ||
323 | if (IS_ERR(ps->io_client)) | 317 | if (IS_ERR(ps->io_client)) |
324 | return PTR_ERR(ps->io_client); | 318 | return PTR_ERR(ps->io_client); |
325 | 319 | ||
@@ -368,11 +362,6 @@ static int read_header(struct pstore *ps, int *new_snapshot) | |||
368 | return r; | 362 | return r; |
369 | } | 363 | } |
370 | 364 | ||
371 | r = dm_io_client_resize(sectors_to_pages(ps->store->chunk_size), | ||
372 | ps->io_client); | ||
373 | if (r) | ||
374 | return r; | ||
375 | |||
376 | r = alloc_area(ps); | 365 | r = alloc_area(ps); |
377 | return r; | 366 | return r; |
378 | 367 | ||
@@ -687,7 +676,7 @@ static void persistent_commit_exception(struct dm_exception_store *store, | |||
687 | /* | 676 | /* |
688 | * Commit exceptions to disk. | 677 | * Commit exceptions to disk. |
689 | */ | 678 | */ |
690 | if (ps->valid && area_io(ps, WRITE_BARRIER)) | 679 | if (ps->valid && area_io(ps, WRITE_FLUSH_FUA)) |
691 | ps->valid = 0; | 680 | ps->valid = 0; |
692 | 681 | ||
693 | /* | 682 | /* |
@@ -818,7 +807,7 @@ static int persistent_ctr(struct dm_exception_store *store, | |||
818 | atomic_set(&ps->pending_count, 0); | 807 | atomic_set(&ps->pending_count, 0); |
819 | ps->callbacks = NULL; | 808 | ps->callbacks = NULL; |
820 | 809 | ||
821 | ps->metadata_wq = create_singlethread_workqueue("ksnaphd"); | 810 | ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0); |
822 | if (!ps->metadata_wq) { | 811 | if (!ps->metadata_wq) { |
823 | kfree(ps); | 812 | kfree(ps); |
824 | DMERR("couldn't start header metadata update thread"); | 813 | DMERR("couldn't start header metadata update thread"); |
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 5974d3094d97..9ecff5f3023a 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c | |||
@@ -19,7 +19,6 @@ | |||
19 | #include <linux/vmalloc.h> | 19 | #include <linux/vmalloc.h> |
20 | #include <linux/log2.h> | 20 | #include <linux/log2.h> |
21 | #include <linux/dm-kcopyd.h> | 21 | #include <linux/dm-kcopyd.h> |
22 | #include <linux/workqueue.h> | ||
23 | 22 | ||
24 | #include "dm-exception-store.h" | 23 | #include "dm-exception-store.h" |
25 | 24 | ||
@@ -41,11 +40,6 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge"; | |||
41 | #define SNAPSHOT_COPY_PRIORITY 2 | 40 | #define SNAPSHOT_COPY_PRIORITY 2 |
42 | 41 | ||
43 | /* | 42 | /* |
44 | * Reserve 1MB for each snapshot initially (with minimum of 1 page). | ||
45 | */ | ||
46 | #define SNAPSHOT_PAGES (((1UL << 20) >> PAGE_SHIFT) ? : 1) | ||
47 | |||
48 | /* | ||
49 | * The size of the mempool used to track chunks in use. | 43 | * The size of the mempool used to track chunks in use. |
50 | */ | 44 | */ |
51 | #define MIN_IOS 256 | 45 | #define MIN_IOS 256 |
@@ -80,9 +74,6 @@ struct dm_snapshot { | |||
80 | /* Origin writes don't trigger exceptions until this is set */ | 74 | /* Origin writes don't trigger exceptions until this is set */ |
81 | int active; | 75 | int active; |
82 | 76 | ||
83 | /* Whether or not owning mapped_device is suspended */ | ||
84 | int suspended; | ||
85 | |||
86 | atomic_t pending_exceptions_count; | 77 | atomic_t pending_exceptions_count; |
87 | 78 | ||
88 | mempool_t *pending_pool; | 79 | mempool_t *pending_pool; |
@@ -106,10 +97,6 @@ struct dm_snapshot { | |||
106 | 97 | ||
107 | struct dm_kcopyd_client *kcopyd_client; | 98 | struct dm_kcopyd_client *kcopyd_client; |
108 | 99 | ||
109 | /* Queue of snapshot writes for ksnapd to flush */ | ||
110 | struct bio_list queued_bios; | ||
111 | struct work_struct queued_bios_work; | ||
112 | |||
113 | /* Wait for events based on state_bits */ | 100 | /* Wait for events based on state_bits */ |
114 | unsigned long state_bits; | 101 | unsigned long state_bits; |
115 | 102 | ||
@@ -160,9 +147,6 @@ struct dm_dev *dm_snap_cow(struct dm_snapshot *s) | |||
160 | } | 147 | } |
161 | EXPORT_SYMBOL(dm_snap_cow); | 148 | EXPORT_SYMBOL(dm_snap_cow); |
162 | 149 | ||
163 | static struct workqueue_struct *ksnapd; | ||
164 | static void flush_queued_bios(struct work_struct *work); | ||
165 | |||
166 | static sector_t chunk_to_sector(struct dm_exception_store *store, | 150 | static sector_t chunk_to_sector(struct dm_exception_store *store, |
167 | chunk_t chunk) | 151 | chunk_t chunk) |
168 | { | 152 | { |
@@ -706,8 +690,6 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) | |||
706 | return 0; | 690 | return 0; |
707 | } | 691 | } |
708 | 692 | ||
709 | #define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r))) | ||
710 | |||
711 | /* | 693 | /* |
712 | * Return a minimum chunk size of all snapshots that have the specified origin. | 694 | * Return a minimum chunk size of all snapshots that have the specified origin. |
713 | * Return zero if the origin has no snapshots. | 695 | * Return zero if the origin has no snapshots. |
@@ -1093,7 +1075,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1093 | argv++; | 1075 | argv++; |
1094 | argc--; | 1076 | argc--; |
1095 | 1077 | ||
1096 | r = dm_get_device(ti, cow_path, FMODE_READ | FMODE_WRITE, &s->cow); | 1078 | r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow); |
1097 | if (r) { | 1079 | if (r) { |
1098 | ti->error = "Cannot get COW device"; | 1080 | ti->error = "Cannot get COW device"; |
1099 | goto bad_cow; | 1081 | goto bad_cow; |
@@ -1112,7 +1094,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1112 | s->ti = ti; | 1094 | s->ti = ti; |
1113 | s->valid = 1; | 1095 | s->valid = 1; |
1114 | s->active = 0; | 1096 | s->active = 0; |
1115 | s->suspended = 0; | ||
1116 | atomic_set(&s->pending_exceptions_count, 0); | 1097 | atomic_set(&s->pending_exceptions_count, 0); |
1117 | init_rwsem(&s->lock); | 1098 | init_rwsem(&s->lock); |
1118 | INIT_LIST_HEAD(&s->list); | 1099 | INIT_LIST_HEAD(&s->list); |
@@ -1130,8 +1111,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1130 | goto bad_hash_tables; | 1111 | goto bad_hash_tables; |
1131 | } | 1112 | } |
1132 | 1113 | ||
1133 | r = dm_kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client); | 1114 | s->kcopyd_client = dm_kcopyd_client_create(); |
1134 | if (r) { | 1115 | if (IS_ERR(s->kcopyd_client)) { |
1116 | r = PTR_ERR(s->kcopyd_client); | ||
1135 | ti->error = "Could not create kcopyd client"; | 1117 | ti->error = "Could not create kcopyd client"; |
1136 | goto bad_kcopyd; | 1118 | goto bad_kcopyd; |
1137 | } | 1119 | } |
@@ -1155,9 +1137,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1155 | 1137 | ||
1156 | spin_lock_init(&s->tracked_chunk_lock); | 1138 | spin_lock_init(&s->tracked_chunk_lock); |
1157 | 1139 | ||
1158 | bio_list_init(&s->queued_bios); | ||
1159 | INIT_WORK(&s->queued_bios_work, flush_queued_bios); | ||
1160 | |||
1161 | ti->private = s; | 1140 | ti->private = s; |
1162 | ti->num_flush_requests = num_flush_requests; | 1141 | ti->num_flush_requests = num_flush_requests; |
1163 | 1142 | ||
@@ -1281,8 +1260,6 @@ static void snapshot_dtr(struct dm_target *ti) | |||
1281 | struct dm_snapshot *s = ti->private; | 1260 | struct dm_snapshot *s = ti->private; |
1282 | struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; | 1261 | struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; |
1283 | 1262 | ||
1284 | flush_workqueue(ksnapd); | ||
1285 | |||
1286 | down_read(&_origins_lock); | 1263 | down_read(&_origins_lock); |
1287 | /* Check whether exception handover must be cancelled */ | 1264 | /* Check whether exception handover must be cancelled */ |
1288 | (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); | 1265 | (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); |
@@ -1344,20 +1321,6 @@ static void flush_bios(struct bio *bio) | |||
1344 | } | 1321 | } |
1345 | } | 1322 | } |
1346 | 1323 | ||
1347 | static void flush_queued_bios(struct work_struct *work) | ||
1348 | { | ||
1349 | struct dm_snapshot *s = | ||
1350 | container_of(work, struct dm_snapshot, queued_bios_work); | ||
1351 | struct bio *queued_bios; | ||
1352 | unsigned long flags; | ||
1353 | |||
1354 | spin_lock_irqsave(&s->pe_lock, flags); | ||
1355 | queued_bios = bio_list_get(&s->queued_bios); | ||
1356 | spin_unlock_irqrestore(&s->pe_lock, flags); | ||
1357 | |||
1358 | flush_bios(queued_bios); | ||
1359 | } | ||
1360 | |||
1361 | static int do_origin(struct dm_dev *origin, struct bio *bio); | 1324 | static int do_origin(struct dm_dev *origin, struct bio *bio); |
1362 | 1325 | ||
1363 | /* | 1326 | /* |
@@ -1587,7 +1550,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, | |||
1587 | chunk_t chunk; | 1550 | chunk_t chunk; |
1588 | struct dm_snap_pending_exception *pe = NULL; | 1551 | struct dm_snap_pending_exception *pe = NULL; |
1589 | 1552 | ||
1590 | if (unlikely(bio_empty_barrier(bio))) { | 1553 | if (bio->bi_rw & REQ_FLUSH) { |
1591 | bio->bi_bdev = s->cow->bdev; | 1554 | bio->bi_bdev = s->cow->bdev; |
1592 | return DM_MAPIO_REMAPPED; | 1555 | return DM_MAPIO_REMAPPED; |
1593 | } | 1556 | } |
@@ -1691,7 +1654,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio, | |||
1691 | int r = DM_MAPIO_REMAPPED; | 1654 | int r = DM_MAPIO_REMAPPED; |
1692 | chunk_t chunk; | 1655 | chunk_t chunk; |
1693 | 1656 | ||
1694 | if (unlikely(bio_empty_barrier(bio))) { | 1657 | if (bio->bi_rw & REQ_FLUSH) { |
1695 | if (!map_context->target_request_nr) | 1658 | if (!map_context->target_request_nr) |
1696 | bio->bi_bdev = s->origin->bdev; | 1659 | bio->bi_bdev = s->origin->bdev; |
1697 | else | 1660 | else |
@@ -1762,15 +1725,6 @@ static void snapshot_merge_presuspend(struct dm_target *ti) | |||
1762 | stop_merge(s); | 1725 | stop_merge(s); |
1763 | } | 1726 | } |
1764 | 1727 | ||
1765 | static void snapshot_postsuspend(struct dm_target *ti) | ||
1766 | { | ||
1767 | struct dm_snapshot *s = ti->private; | ||
1768 | |||
1769 | down_write(&s->lock); | ||
1770 | s->suspended = 1; | ||
1771 | up_write(&s->lock); | ||
1772 | } | ||
1773 | |||
1774 | static int snapshot_preresume(struct dm_target *ti) | 1728 | static int snapshot_preresume(struct dm_target *ti) |
1775 | { | 1729 | { |
1776 | int r = 0; | 1730 | int r = 0; |
@@ -1785,7 +1739,7 @@ static int snapshot_preresume(struct dm_target *ti) | |||
1785 | DMERR("Unable to resume snapshot source until " | 1739 | DMERR("Unable to resume snapshot source until " |
1786 | "handover completes."); | 1740 | "handover completes."); |
1787 | r = -EINVAL; | 1741 | r = -EINVAL; |
1788 | } else if (!snap_src->suspended) { | 1742 | } else if (!dm_suspended(snap_src->ti)) { |
1789 | DMERR("Unable to perform snapshot handover until " | 1743 | DMERR("Unable to perform snapshot handover until " |
1790 | "source is suspended."); | 1744 | "source is suspended."); |
1791 | r = -EINVAL; | 1745 | r = -EINVAL; |
@@ -1818,7 +1772,6 @@ static void snapshot_resume(struct dm_target *ti) | |||
1818 | 1772 | ||
1819 | down_write(&s->lock); | 1773 | down_write(&s->lock); |
1820 | s->active = 1; | 1774 | s->active = 1; |
1821 | s->suspended = 0; | ||
1822 | up_write(&s->lock); | 1775 | up_write(&s->lock); |
1823 | } | 1776 | } |
1824 | 1777 | ||
@@ -2135,7 +2088,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio, | |||
2135 | struct dm_dev *dev = ti->private; | 2088 | struct dm_dev *dev = ti->private; |
2136 | bio->bi_bdev = dev->bdev; | 2089 | bio->bi_bdev = dev->bdev; |
2137 | 2090 | ||
2138 | if (unlikely(bio_empty_barrier(bio))) | 2091 | if (bio->bi_rw & REQ_FLUSH) |
2139 | return DM_MAPIO_REMAPPED; | 2092 | return DM_MAPIO_REMAPPED; |
2140 | 2093 | ||
2141 | /* Only tell snapshots if this is a write */ | 2094 | /* Only tell snapshots if this is a write */ |
@@ -2196,7 +2149,7 @@ static int origin_iterate_devices(struct dm_target *ti, | |||
2196 | 2149 | ||
2197 | static struct target_type origin_target = { | 2150 | static struct target_type origin_target = { |
2198 | .name = "snapshot-origin", | 2151 | .name = "snapshot-origin", |
2199 | .version = {1, 7, 0}, | 2152 | .version = {1, 7, 1}, |
2200 | .module = THIS_MODULE, | 2153 | .module = THIS_MODULE, |
2201 | .ctr = origin_ctr, | 2154 | .ctr = origin_ctr, |
2202 | .dtr = origin_dtr, | 2155 | .dtr = origin_dtr, |
@@ -2209,13 +2162,12 @@ static struct target_type origin_target = { | |||
2209 | 2162 | ||
2210 | static struct target_type snapshot_target = { | 2163 | static struct target_type snapshot_target = { |
2211 | .name = "snapshot", | 2164 | .name = "snapshot", |
2212 | .version = {1, 9, 0}, | 2165 | .version = {1, 10, 0}, |
2213 | .module = THIS_MODULE, | 2166 | .module = THIS_MODULE, |
2214 | .ctr = snapshot_ctr, | 2167 | .ctr = snapshot_ctr, |
2215 | .dtr = snapshot_dtr, | 2168 | .dtr = snapshot_dtr, |
2216 | .map = snapshot_map, | 2169 | .map = snapshot_map, |
2217 | .end_io = snapshot_end_io, | 2170 | .end_io = snapshot_end_io, |
2218 | .postsuspend = snapshot_postsuspend, | ||
2219 | .preresume = snapshot_preresume, | 2171 | .preresume = snapshot_preresume, |
2220 | .resume = snapshot_resume, | 2172 | .resume = snapshot_resume, |
2221 | .status = snapshot_status, | 2173 | .status = snapshot_status, |
@@ -2224,14 +2176,13 @@ static struct target_type snapshot_target = { | |||
2224 | 2176 | ||
2225 | static struct target_type merge_target = { | 2177 | static struct target_type merge_target = { |
2226 | .name = dm_snapshot_merge_target_name, | 2178 | .name = dm_snapshot_merge_target_name, |
2227 | .version = {1, 0, 0}, | 2179 | .version = {1, 1, 0}, |
2228 | .module = THIS_MODULE, | 2180 | .module = THIS_MODULE, |
2229 | .ctr = snapshot_ctr, | 2181 | .ctr = snapshot_ctr, |
2230 | .dtr = snapshot_dtr, | 2182 | .dtr = snapshot_dtr, |
2231 | .map = snapshot_merge_map, | 2183 | .map = snapshot_merge_map, |
2232 | .end_io = snapshot_end_io, | 2184 | .end_io = snapshot_end_io, |
2233 | .presuspend = snapshot_merge_presuspend, | 2185 | .presuspend = snapshot_merge_presuspend, |
2234 | .postsuspend = snapshot_postsuspend, | ||
2235 | .preresume = snapshot_preresume, | 2186 | .preresume = snapshot_preresume, |
2236 | .resume = snapshot_merge_resume, | 2187 | .resume = snapshot_merge_resume, |
2237 | .status = snapshot_status, | 2188 | .status = snapshot_status, |
@@ -2293,17 +2244,8 @@ static int __init dm_snapshot_init(void) | |||
2293 | goto bad_tracked_chunk_cache; | 2244 | goto bad_tracked_chunk_cache; |
2294 | } | 2245 | } |
2295 | 2246 | ||
2296 | ksnapd = create_singlethread_workqueue("ksnapd"); | ||
2297 | if (!ksnapd) { | ||
2298 | DMERR("Failed to create ksnapd workqueue."); | ||
2299 | r = -ENOMEM; | ||
2300 | goto bad_pending_pool; | ||
2301 | } | ||
2302 | |||
2303 | return 0; | 2247 | return 0; |
2304 | 2248 | ||
2305 | bad_pending_pool: | ||
2306 | kmem_cache_destroy(tracked_chunk_cache); | ||
2307 | bad_tracked_chunk_cache: | 2249 | bad_tracked_chunk_cache: |
2308 | kmem_cache_destroy(pending_cache); | 2250 | kmem_cache_destroy(pending_cache); |
2309 | bad_pending_cache: | 2251 | bad_pending_cache: |
@@ -2324,8 +2266,6 @@ bad_register_snapshot_target: | |||
2324 | 2266 | ||
2325 | static void __exit dm_snapshot_exit(void) | 2267 | static void __exit dm_snapshot_exit(void) |
2326 | { | 2268 | { |
2327 | destroy_workqueue(ksnapd); | ||
2328 | |||
2329 | dm_unregister_target(&snapshot_target); | 2269 | dm_unregister_target(&snapshot_target); |
2330 | dm_unregister_target(&origin_target); | 2270 | dm_unregister_target(&origin_target); |
2331 | dm_unregister_target(&merge_target); | 2271 | dm_unregister_target(&merge_target); |
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index c297f6da91ea..3d80cf0c152d 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c | |||
@@ -39,23 +39,20 @@ struct stripe_c { | |||
39 | struct dm_target *ti; | 39 | struct dm_target *ti; |
40 | 40 | ||
41 | /* Work struct used for triggering events*/ | 41 | /* Work struct used for triggering events*/ |
42 | struct work_struct kstriped_ws; | 42 | struct work_struct trigger_event; |
43 | 43 | ||
44 | struct stripe stripe[0]; | 44 | struct stripe stripe[0]; |
45 | }; | 45 | }; |
46 | 46 | ||
47 | static struct workqueue_struct *kstriped; | ||
48 | |||
49 | /* | 47 | /* |
50 | * An event is triggered whenever a drive | 48 | * An event is triggered whenever a drive |
51 | * drops out of a stripe volume. | 49 | * drops out of a stripe volume. |
52 | */ | 50 | */ |
53 | static void trigger_event(struct work_struct *work) | 51 | static void trigger_event(struct work_struct *work) |
54 | { | 52 | { |
55 | struct stripe_c *sc = container_of(work, struct stripe_c, kstriped_ws); | 53 | struct stripe_c *sc = container_of(work, struct stripe_c, |
56 | 54 | trigger_event); | |
57 | dm_table_event(sc->ti->table); | 55 | dm_table_event(sc->ti->table); |
58 | |||
59 | } | 56 | } |
60 | 57 | ||
61 | static inline struct stripe_c *alloc_context(unsigned int stripes) | 58 | static inline struct stripe_c *alloc_context(unsigned int stripes) |
@@ -160,7 +157,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
160 | return -ENOMEM; | 157 | return -ENOMEM; |
161 | } | 158 | } |
162 | 159 | ||
163 | INIT_WORK(&sc->kstriped_ws, trigger_event); | 160 | INIT_WORK(&sc->trigger_event, trigger_event); |
164 | 161 | ||
165 | /* Set pointer to dm target; used in trigger_event */ | 162 | /* Set pointer to dm target; used in trigger_event */ |
166 | sc->ti = ti; | 163 | sc->ti = ti; |
@@ -211,7 +208,7 @@ static void stripe_dtr(struct dm_target *ti) | |||
211 | for (i = 0; i < sc->stripes; i++) | 208 | for (i = 0; i < sc->stripes; i++) |
212 | dm_put_device(ti, sc->stripe[i].dev); | 209 | dm_put_device(ti, sc->stripe[i].dev); |
213 | 210 | ||
214 | flush_workqueue(kstriped); | 211 | flush_work_sync(&sc->trigger_event); |
215 | kfree(sc); | 212 | kfree(sc); |
216 | } | 213 | } |
217 | 214 | ||
@@ -271,7 +268,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio, | |||
271 | uint32_t stripe; | 268 | uint32_t stripe; |
272 | unsigned target_request_nr; | 269 | unsigned target_request_nr; |
273 | 270 | ||
274 | if (unlikely(bio_empty_barrier(bio))) { | 271 | if (bio->bi_rw & REQ_FLUSH) { |
275 | target_request_nr = map_context->target_request_nr; | 272 | target_request_nr = map_context->target_request_nr; |
276 | BUG_ON(target_request_nr >= sc->stripes); | 273 | BUG_ON(target_request_nr >= sc->stripes); |
277 | bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev; | 274 | bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev; |
@@ -367,7 +364,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, | |||
367 | atomic_inc(&(sc->stripe[i].error_count)); | 364 | atomic_inc(&(sc->stripe[i].error_count)); |
368 | if (atomic_read(&(sc->stripe[i].error_count)) < | 365 | if (atomic_read(&(sc->stripe[i].error_count)) < |
369 | DM_IO_ERROR_THRESHOLD) | 366 | DM_IO_ERROR_THRESHOLD) |
370 | queue_work(kstriped, &sc->kstriped_ws); | 367 | schedule_work(&sc->trigger_event); |
371 | } | 368 | } |
372 | 369 | ||
373 | return error; | 370 | return error; |
@@ -399,9 +396,29 @@ static void stripe_io_hints(struct dm_target *ti, | |||
399 | blk_limits_io_opt(limits, chunk_size * sc->stripes); | 396 | blk_limits_io_opt(limits, chunk_size * sc->stripes); |
400 | } | 397 | } |
401 | 398 | ||
399 | static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | ||
400 | struct bio_vec *biovec, int max_size) | ||
401 | { | ||
402 | struct stripe_c *sc = ti->private; | ||
403 | sector_t bvm_sector = bvm->bi_sector; | ||
404 | uint32_t stripe; | ||
405 | struct request_queue *q; | ||
406 | |||
407 | stripe_map_sector(sc, bvm_sector, &stripe, &bvm_sector); | ||
408 | |||
409 | q = bdev_get_queue(sc->stripe[stripe].dev->bdev); | ||
410 | if (!q->merge_bvec_fn) | ||
411 | return max_size; | ||
412 | |||
413 | bvm->bi_bdev = sc->stripe[stripe].dev->bdev; | ||
414 | bvm->bi_sector = sc->stripe[stripe].physical_start + bvm_sector; | ||
415 | |||
416 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | ||
417 | } | ||
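stripe_map_sector() itself is defined earlier in dm-stripe.c and is not part of this hunk; assuming a power-of-2 chunk size and stripe count, its effect is roughly the following (a sketch, not the exact kernel code):

    /*
     * chunk  = sector >> chunk_shift;               // which chunk of the volume
     * stripe = chunk & (nr_stripes - 1);            // which backing device
     * offset = ((chunk / nr_stripes) << chunk_shift)
     *          + (sector & chunk_mask);             // sector on that device
     *
     * stripe_merge() then forwards the merge query to that device's queue so
     * a lower-level merge_bvec_fn can bound the bio at its own limits.
     */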
418 | |||
402 | static struct target_type stripe_target = { | 419 | static struct target_type stripe_target = { |
403 | .name = "striped", | 420 | .name = "striped", |
404 | .version = {1, 3, 0}, | 421 | .version = {1, 4, 0}, |
405 | .module = THIS_MODULE, | 422 | .module = THIS_MODULE, |
406 | .ctr = stripe_ctr, | 423 | .ctr = stripe_ctr, |
407 | .dtr = stripe_dtr, | 424 | .dtr = stripe_dtr, |
@@ -410,6 +427,7 @@ static struct target_type stripe_target = { | |||
410 | .status = stripe_status, | 427 | .status = stripe_status, |
411 | .iterate_devices = stripe_iterate_devices, | 428 | .iterate_devices = stripe_iterate_devices, |
412 | .io_hints = stripe_io_hints, | 429 | .io_hints = stripe_io_hints, |
430 | .merge = stripe_merge, | ||
413 | }; | 431 | }; |
414 | 432 | ||
415 | int __init dm_stripe_init(void) | 433 | int __init dm_stripe_init(void) |
@@ -422,20 +440,10 @@ int __init dm_stripe_init(void) | |||
422 | return r; | 440 | return r; |
423 | } | 441 | } |
424 | 442 | ||
425 | kstriped = create_singlethread_workqueue("kstriped"); | ||
426 | if (!kstriped) { | ||
427 | DMERR("failed to create workqueue kstriped"); | ||
428 | dm_unregister_target(&stripe_target); | ||
429 | return -ENOMEM; | ||
430 | } | ||
431 | |||
432 | return r; | 443 | return r; |
433 | } | 444 | } |
434 | 445 | ||
435 | void dm_stripe_exit(void) | 446 | void dm_stripe_exit(void) |
436 | { | 447 | { |
437 | dm_unregister_target(&stripe_target); | 448 | dm_unregister_target(&stripe_target); |
438 | destroy_workqueue(kstriped); | ||
439 | |||
440 | return; | ||
441 | } | 449 | } |
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index f9fc07d7a4b9..451c3bb176d2 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
@@ -55,6 +55,7 @@ struct dm_table { | |||
55 | struct dm_target *targets; | 55 | struct dm_target *targets; |
56 | 56 | ||
57 | unsigned discards_supported:1; | 57 | unsigned discards_supported:1; |
58 | unsigned integrity_supported:1; | ||
58 | 59 | ||
59 | /* | 60 | /* |
60 | * Indicates the rw permissions for the new logical | 61 | * Indicates the rw permissions for the new logical |
@@ -71,6 +72,8 @@ struct dm_table { | |||
71 | void *event_context; | 72 | void *event_context; |
72 | 73 | ||
73 | struct dm_md_mempools *mempools; | 74 | struct dm_md_mempools *mempools; |
75 | |||
76 | struct list_head target_callbacks; | ||
74 | }; | 77 | }; |
75 | 78 | ||
76 | /* | 79 | /* |
@@ -204,6 +207,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode, | |||
204 | return -ENOMEM; | 207 | return -ENOMEM; |
205 | 208 | ||
206 | INIT_LIST_HEAD(&t->devices); | 209 | INIT_LIST_HEAD(&t->devices); |
210 | INIT_LIST_HEAD(&t->target_callbacks); | ||
207 | atomic_set(&t->holders, 0); | 211 | atomic_set(&t->holders, 0); |
208 | t->discards_supported = 1; | 212 | t->discards_supported = 1; |
209 | 213 | ||
@@ -325,15 +329,18 @@ static int open_dev(struct dm_dev_internal *d, dev_t dev, | |||
325 | 329 | ||
326 | BUG_ON(d->dm_dev.bdev); | 330 | BUG_ON(d->dm_dev.bdev); |
327 | 331 | ||
328 | bdev = open_by_devnum(dev, d->dm_dev.mode); | 332 | bdev = blkdev_get_by_dev(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr); |
329 | if (IS_ERR(bdev)) | 333 | if (IS_ERR(bdev)) |
330 | return PTR_ERR(bdev); | 334 | return PTR_ERR(bdev); |
331 | r = bd_claim_by_disk(bdev, _claim_ptr, dm_disk(md)); | 335 | |
332 | if (r) | 336 | r = bd_link_disk_holder(bdev, dm_disk(md)); |
333 | blkdev_put(bdev, d->dm_dev.mode); | 337 | if (r) { |
334 | else | 338 | blkdev_put(bdev, d->dm_dev.mode | FMODE_EXCL); |
335 | d->dm_dev.bdev = bdev; | 339 | return r; |
336 | return r; | 340 | } |
341 | |||
342 | d->dm_dev.bdev = bdev; | ||
343 | return 0; | ||
337 | } | 344 | } |
338 | 345 | ||
339 | /* | 346 | /* |
@@ -344,8 +351,8 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) | |||
344 | if (!d->dm_dev.bdev) | 351 | if (!d->dm_dev.bdev) |
345 | return; | 352 | return; |
346 | 353 | ||
347 | bd_release_from_disk(d->dm_dev.bdev, dm_disk(md)); | 354 | bd_unlink_disk_holder(d->dm_dev.bdev, dm_disk(md)); |
348 | blkdev_put(d->dm_dev.bdev, d->dm_dev.mode); | 355 | blkdev_put(d->dm_dev.bdev, d->dm_dev.mode | FMODE_EXCL); |
349 | d->dm_dev.bdev = NULL; | 356 | d->dm_dev.bdev = NULL; |
350 | } | 357 | } |
351 | 358 | ||
@@ -355,6 +362,7 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) | |||
355 | static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, | 362 | static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, |
356 | sector_t start, sector_t len, void *data) | 363 | sector_t start, sector_t len, void *data) |
357 | { | 364 | { |
365 | struct request_queue *q; | ||
358 | struct queue_limits *limits = data; | 366 | struct queue_limits *limits = data; |
359 | struct block_device *bdev = dev->bdev; | 367 | struct block_device *bdev = dev->bdev; |
360 | sector_t dev_size = | 368 | sector_t dev_size = |
@@ -363,6 +371,22 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, | |||
363 | limits->logical_block_size >> SECTOR_SHIFT; | 371 | limits->logical_block_size >> SECTOR_SHIFT; |
364 | char b[BDEVNAME_SIZE]; | 372 | char b[BDEVNAME_SIZE]; |
365 | 373 | ||
374 | /* | ||
375 | * Some devices exist without request functions, | ||
376 | * such as loop devices not yet bound to backing files. | ||
377 | * Forbid the use of such devices. | ||
378 | */ | ||
379 | q = bdev_get_queue(bdev); | ||
380 | if (!q || !q->make_request_fn) { | ||
381 | DMWARN("%s: %s is not yet initialised: " | ||
382 | "start=%llu, len=%llu, dev_size=%llu", | ||
383 | dm_device_name(ti->table->md), bdevname(bdev, b), | ||
384 | (unsigned long long)start, | ||
385 | (unsigned long long)len, | ||
386 | (unsigned long long)dev_size); | ||
387 | return 1; | ||
388 | } | ||
389 | |||
366 | if (!dev_size) | 390 | if (!dev_size) |
367 | return 0; | 391 | return 0; |
368 | 392 | ||
@@ -486,11 +510,6 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, | |||
486 | return 0; | 510 | return 0; |
487 | } | 511 | } |
488 | 512 | ||
489 | /* | ||
490 | * Returns the minimum that is _not_ zero, unless both are zero. | ||
491 | */ | ||
492 | #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | ||
493 | |||
494 | int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, | 513 | int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, |
495 | sector_t start, sector_t len, void *data) | 514 | sector_t start, sector_t len, void *data) |
496 | { | 515 | { |
@@ -522,9 +541,8 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, | |||
522 | */ | 541 | */ |
523 | 542 | ||
524 | if (q->merge_bvec_fn && !ti->type->merge) | 543 | if (q->merge_bvec_fn && !ti->type->merge) |
525 | limits->max_sectors = | 544 | blk_limits_max_hw_sectors(limits, |
526 | min_not_zero(limits->max_sectors, | 545 | (unsigned int) (PAGE_SIZE >> 9)); |
527 | (unsigned int) (PAGE_SIZE >> 9)); | ||
528 | return 0; | 546 | return 0; |
529 | } | 547 | } |
530 | EXPORT_SYMBOL_GPL(dm_set_device_limits); | 548 | EXPORT_SYMBOL_GPL(dm_set_device_limits); |
@@ -859,7 +877,7 @@ int dm_table_alloc_md_mempools(struct dm_table *t) | |||
859 | return -EINVAL; | 877 | return -EINVAL; |
860 | } | 878 | } |
861 | 879 | ||
862 | t->mempools = dm_alloc_md_mempools(type); | 880 | t->mempools = dm_alloc_md_mempools(type, t->integrity_supported); |
863 | if (!t->mempools) | 881 | if (!t->mempools) |
864 | return -ENOMEM; | 882 | return -ENOMEM; |
865 | 883 | ||
@@ -926,18 +944,80 @@ static int dm_table_build_index(struct dm_table *t) | |||
926 | } | 944 | } |
927 | 945 | ||
928 | /* | 946 | /* |
947 | * Get a disk whose integrity profile reflects the table's profile. | ||
948 | * If %match_all is true, all devices' profiles must match. | ||
949 | * If %match_all is false, all devices must at least have an | ||
950 | * allocated integrity profile; but uninitialized is ok. | ||
951 | * Returns NULL if integrity support was inconsistent or unavailable. | ||
952 | */ | ||
953 | static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t, | ||
954 | bool match_all) | ||
955 | { | ||
956 | struct list_head *devices = dm_table_get_devices(t); | ||
957 | struct dm_dev_internal *dd = NULL; | ||
958 | struct gendisk *prev_disk = NULL, *template_disk = NULL; | ||
959 | |||
960 | list_for_each_entry(dd, devices, list) { | ||
961 | template_disk = dd->dm_dev.bdev->bd_disk; | ||
962 | if (!blk_get_integrity(template_disk)) | ||
963 | goto no_integrity; | ||
964 | if (!match_all && !blk_integrity_is_initialized(template_disk)) | ||
965 | continue; /* skip uninitialized profiles */ | ||
966 | else if (prev_disk && | ||
967 | blk_integrity_compare(prev_disk, template_disk) < 0) | ||
968 | goto no_integrity; | ||
969 | prev_disk = template_disk; | ||
970 | } | ||
971 | |||
972 | return template_disk; | ||
973 | |||
974 | no_integrity: | ||
975 | if (prev_disk) | ||
976 | DMWARN("%s: integrity not set: %s and %s profile mismatch", | ||
977 | dm_device_name(t->md), | ||
978 | prev_disk->disk_name, | ||
979 | template_disk->disk_name); | ||
980 | return NULL; | ||
981 | } | ||
982 | |||
983 | /* | ||
929 | * Register the mapped device for blk_integrity support if | 984 | * Register the mapped device for blk_integrity support if |
930 | * the underlying devices support it. | 985 | * the underlying devices have an integrity profile. But all devices |
986 | * may not have matching profiles (checking all devices isn't reliable | ||
987 | * during table load because this table may use other DM device(s) which | ||
988 | * must be resumed before they will have an initialized integrity profile). | ||
989 | * Stacked DM devices force a 2 stage integrity profile validation: | ||
990 | * 1 - during load, validate all initialized integrity profiles match | ||
991 | * 2 - during resume, validate all integrity profiles match | ||
931 | */ | 992 | */ |
932 | static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md) | 993 | static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md) |
933 | { | 994 | { |
934 | struct list_head *devices = dm_table_get_devices(t); | 995 | struct gendisk *template_disk = NULL; |
935 | struct dm_dev_internal *dd; | 996 | |
997 | template_disk = dm_table_get_integrity_disk(t, false); | ||
998 | if (!template_disk) | ||
999 | return 0; | ||
936 | 1000 | ||
937 | list_for_each_entry(dd, devices, list) | 1001 | if (!blk_integrity_is_initialized(dm_disk(md))) { |
938 | if (bdev_get_integrity(dd->dm_dev.bdev)) | 1002 | t->integrity_supported = 1; |
939 | return blk_integrity_register(dm_disk(md), NULL); | 1003 | return blk_integrity_register(dm_disk(md), NULL); |
1004 | } | ||
940 | 1005 | ||
1006 | /* | ||
1007 | * If DM device already has an initialized integrity | ||
1008 | * profile the new profile should not conflict. | ||
1009 | */ | ||
1010 | if (blk_integrity_is_initialized(template_disk) && | ||
1011 | blk_integrity_compare(dm_disk(md), template_disk) < 0) { | ||
1012 | DMWARN("%s: conflict with existing integrity profile: " | ||
1013 | "%s profile mismatch", | ||
1014 | dm_device_name(t->md), | ||
1015 | template_disk->disk_name); | ||
1016 | return 1; | ||
1017 | } | ||
1018 | |||
1019 | /* Preserve existing initialized integrity profile */ | ||
1020 | t->integrity_supported = 1; | ||
941 | return 0; | 1021 | return 0; |
942 | } | 1022 | } |
943 | 1023 | ||
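Note on the two-stage validation described in this hunk: the load-time and resume-time checks reduce to the same helper called with a different match_all argument. A condensed sketch of the two entry points (not literal patch code; it assumes the dm_table fields shown here and omits the conflict check against a pre-existing profile):

	/* stage 1: table load -- uninitialized profiles are tolerated */
	static int integrity_check_on_load(struct dm_table *t, struct mapped_device *md)
	{
		struct gendisk *disk = dm_table_get_integrity_disk(t, false);

		if (!disk)
			return 0;	/* no consistent integrity support */
		if (!blk_integrity_is_initialized(dm_disk(md))) {
			t->integrity_supported = 1;
			return blk_integrity_register(dm_disk(md), NULL);
		}
		return 0;		/* existing profile kept; conflict check omitted here */
	}

	/* stage 2: resume -- every device profile must now match */
	static void integrity_check_on_resume(struct dm_table *t)
	{
		struct gendisk *disk = dm_table_get_integrity_disk(t, true);

		if (disk)
			blk_integrity_register(dm_disk(t->md),
					       blk_get_integrity(disk));
	}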
@@ -1091,41 +1171,27 @@ combine_limits: | |||
1091 | 1171 | ||
1092 | /* | 1172 | /* |
1093 | * Set the integrity profile for this device if all devices used have | 1173 | * Set the integrity profile for this device if all devices used have |
1094 | * matching profiles. | 1174 | * matching profiles. We're quite deep in the resume path but still |
1175 | * don't know if all devices (particularly DM devices this device | ||
1176 | * may be stacked on) have matching profiles. Even if the profiles | ||
1177 | * don't match we have no way to fail (to resume) at this point. | ||
1095 | */ | 1178 | */ |
1096 | static void dm_table_set_integrity(struct dm_table *t) | 1179 | static void dm_table_set_integrity(struct dm_table *t) |
1097 | { | 1180 | { |
1098 | struct list_head *devices = dm_table_get_devices(t); | 1181 | struct gendisk *template_disk = NULL; |
1099 | struct dm_dev_internal *prev = NULL, *dd = NULL; | ||
1100 | 1182 | ||
1101 | if (!blk_get_integrity(dm_disk(t->md))) | 1183 | if (!blk_get_integrity(dm_disk(t->md))) |
1102 | return; | 1184 | return; |
1103 | 1185 | ||
1104 | list_for_each_entry(dd, devices, list) { | 1186 | template_disk = dm_table_get_integrity_disk(t, true); |
1105 | if (prev && | 1187 | if (!template_disk && |
1106 | blk_integrity_compare(prev->dm_dev.bdev->bd_disk, | 1188 | blk_integrity_is_initialized(dm_disk(t->md))) { |
1107 | dd->dm_dev.bdev->bd_disk) < 0) { | 1189 | DMWARN("%s: device no longer has a valid integrity profile", |
1108 | DMWARN("%s: integrity not set: %s and %s mismatch", | 1190 | dm_device_name(t->md)); |
1109 | dm_device_name(t->md), | 1191 | return; |
1110 | prev->dm_dev.bdev->bd_disk->disk_name, | ||
1111 | dd->dm_dev.bdev->bd_disk->disk_name); | ||
1112 | goto no_integrity; | ||
1113 | } | ||
1114 | prev = dd; | ||
1115 | } | 1192 | } |
1116 | |||
1117 | if (!prev || !bdev_get_integrity(prev->dm_dev.bdev)) | ||
1118 | goto no_integrity; | ||
1119 | |||
1120 | blk_integrity_register(dm_disk(t->md), | 1193 | blk_integrity_register(dm_disk(t->md), |
1121 | bdev_get_integrity(prev->dm_dev.bdev)); | 1194 | blk_get_integrity(template_disk)); |
1122 | |||
1123 | return; | ||
1124 | |||
1125 | no_integrity: | ||
1126 | blk_integrity_register(dm_disk(t->md), NULL); | ||
1127 | |||
1128 | return; | ||
1129 | } | 1195 | } |
1130 | 1196 | ||
1131 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, | 1197 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, |
@@ -1136,11 +1202,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, | |||
1136 | */ | 1202 | */ |
1137 | q->limits = *limits; | 1203 | q->limits = *limits; |
1138 | 1204 | ||
1139 | if (limits->no_cluster) | ||
1140 | queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); | ||
1141 | else | ||
1142 | queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); | ||
1143 | |||
1144 | if (!dm_table_supports_discards(t)) | 1205 | if (!dm_table_supports_discards(t)) |
1145 | queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); | 1206 | queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); |
1146 | else | 1207 | else |
@@ -1234,10 +1295,17 @@ int dm_table_resume_targets(struct dm_table *t) | |||
1234 | return 0; | 1295 | return 0; |
1235 | } | 1296 | } |
1236 | 1297 | ||
1298 | void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb) | ||
1299 | { | ||
1300 | list_add(&cb->list, &t->target_callbacks); | ||
1301 | } | ||
1302 | EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks); | ||
1303 | |||
1237 | int dm_table_any_congested(struct dm_table *t, int bdi_bits) | 1304 | int dm_table_any_congested(struct dm_table *t, int bdi_bits) |
1238 | { | 1305 | { |
1239 | struct dm_dev_internal *dd; | 1306 | struct dm_dev_internal *dd; |
1240 | struct list_head *devices = dm_table_get_devices(t); | 1307 | struct list_head *devices = dm_table_get_devices(t); |
1308 | struct dm_target_callbacks *cb; | ||
1241 | int r = 0; | 1309 | int r = 0; |
1242 | 1310 | ||
1243 | list_for_each_entry(dd, devices, list) { | 1311 | list_for_each_entry(dd, devices, list) { |
@@ -1252,6 +1320,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) | |||
1252 | bdevname(dd->dm_dev.bdev, b)); | 1320 | bdevname(dd->dm_dev.bdev, b)); |
1253 | } | 1321 | } |
1254 | 1322 | ||
1323 | list_for_each_entry(cb, &t->target_callbacks, list) | ||
1324 | if (cb->congested_fn) | ||
1325 | r |= cb->congested_fn(cb, bdi_bits); | ||
1326 | |||
1255 | return r; | 1327 | return r; |
1256 | } | 1328 | } |
1257 | 1329 | ||
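dm_table_add_target_callbacks() is exported so a stacked target can have its own congestion state consulted by the loop added to dm_table_any_congested() above. A hypothetical target might register a callback as below; my_target, my_target_congested and internal_queue_busy are illustrative only, reaching the table through ti->table is assumed, and destructor cleanup is omitted:

	struct my_target {
		struct dm_target_callbacks callbacks;
		bool internal_queue_busy;
		/* ... other target-private state ... */
	};

	static int my_target_congested(struct dm_target_callbacks *cb, int bdi_bits)
	{
		struct my_target *mt = container_of(cb, struct my_target, callbacks);

		/* report congestion while the target's internal queue is backed up */
		return mt->internal_queue_busy ? bdi_bits : 0;
	}

	static int my_target_ctr(struct dm_target *ti, unsigned argc, char **argv)
	{
		struct my_target *mt = kzalloc(sizeof(*mt), GFP_KERNEL);

		if (!mt)
			return -ENOMEM;
		mt->callbacks.congested_fn = my_target_congested;
		dm_table_add_target_callbacks(ti->table, &mt->callbacks);
		ti->private = mt;
		return 0;
	}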
@@ -1269,24 +1341,6 @@ int dm_table_any_busy_target(struct dm_table *t) | |||
1269 | return 0; | 1341 | return 0; |
1270 | } | 1342 | } |
1271 | 1343 | ||
1272 | void dm_table_unplug_all(struct dm_table *t) | ||
1273 | { | ||
1274 | struct dm_dev_internal *dd; | ||
1275 | struct list_head *devices = dm_table_get_devices(t); | ||
1276 | |||
1277 | list_for_each_entry(dd, devices, list) { | ||
1278 | struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); | ||
1279 | char b[BDEVNAME_SIZE]; | ||
1280 | |||
1281 | if (likely(q)) | ||
1282 | blk_unplug(q); | ||
1283 | else | ||
1284 | DMWARN_LIMIT("%s: Cannot unplug nonexistent device %s", | ||
1285 | dm_device_name(t->md), | ||
1286 | bdevname(dd->dm_dev.bdev, b)); | ||
1287 | } | ||
1288 | } | ||
1289 | |||
1290 | struct mapped_device *dm_table_get_md(struct dm_table *t) | 1344 | struct mapped_device *dm_table_get_md(struct dm_table *t) |
1291 | { | 1345 | { |
1292 | return t->md; | 1346 | return t->md; |
@@ -1309,7 +1363,8 @@ bool dm_table_supports_discards(struct dm_table *t) | |||
1309 | return 0; | 1363 | return 0; |
1310 | 1364 | ||
1311 | /* | 1365 | /* |
1312 | * Ensure that at least one underlying device supports discards. | 1366 | * Unless any target used by the table set discards_supported, |
1367 | * require at least one underlying device to support discards. | ||
1313 | * t->devices includes internal dm devices such as mirror logs | 1368 | * t->devices includes internal dm devices such as mirror logs |
1314 | * so we need to use iterate_devices here, which targets | 1369 | * so we need to use iterate_devices here, which targets |
1315 | * supporting discard must provide. | 1370 | * supporting discard must provide. |
@@ -1317,6 +1372,9 @@ bool dm_table_supports_discards(struct dm_table *t) | |||
1317 | while (i < dm_table_get_num_targets(t)) { | 1372 | while (i < dm_table_get_num_targets(t)) { |
1318 | ti = dm_table_get_target(t, i++); | 1373 | ti = dm_table_get_target(t, i++); |
1319 | 1374 | ||
1375 | if (ti->discards_supported) | ||
1376 | return 1; | ||
1377 | |||
1320 | if (ti->type->iterate_devices && | 1378 | if (ti->type->iterate_devices && |
1321 | ti->type->iterate_devices(ti, device_discard_capable, NULL)) | 1379 | ti->type->iterate_devices(ti, device_discard_capable, NULL)) |
1322 | return 1; | 1380 | return 1; |
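With the early return added above, discards_supported acts as a per-target override rather than a property derived from the underlying stack: a target that implements discards internally can advertise them even when no underlying device does. A minimal, hypothetical constructor fragment:

	static int my_discard_ctr(struct dm_target *ti, unsigned argc, char **argv)
	{
		/* ... parse arguments, dm_get_device(), etc. ... */

		ti->num_flush_requests = 1;
		ti->discards_supported = 1;	/* table reports discard support
						 * regardless of underlying devices */
		return 0;
	}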
@@ -1334,4 +1392,3 @@ EXPORT_SYMBOL(dm_table_get_mode); | |||
1334 | EXPORT_SYMBOL(dm_table_get_md); | 1392 | EXPORT_SYMBOL(dm_table_get_md); |
1335 | EXPORT_SYMBOL(dm_table_put); | 1393 | EXPORT_SYMBOL(dm_table_put); |
1336 | EXPORT_SYMBOL(dm_table_get); | 1394 | EXPORT_SYMBOL(dm_table_get); |
1337 | EXPORT_SYMBOL(dm_table_unplug_all); | ||
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ac384b2a6a33..0cf68b478878 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
@@ -15,7 +15,6 @@ | |||
15 | #include <linux/blkpg.h> | 15 | #include <linux/blkpg.h> |
16 | #include <linux/bio.h> | 16 | #include <linux/bio.h> |
17 | #include <linux/buffer_head.h> | 17 | #include <linux/buffer_head.h> |
18 | #include <linux/smp_lock.h> | ||
19 | #include <linux/mempool.h> | 18 | #include <linux/mempool.h> |
20 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
21 | #include <linux/idr.h> | 20 | #include <linux/idr.h> |
@@ -110,7 +109,6 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); | |||
110 | #define DMF_FREEING 3 | 109 | #define DMF_FREEING 3 |
111 | #define DMF_DELETING 4 | 110 | #define DMF_DELETING 4 |
112 | #define DMF_NOFLUSH_SUSPENDING 5 | 111 | #define DMF_NOFLUSH_SUSPENDING 5 |
113 | #define DMF_QUEUE_IO_TO_THREAD 6 | ||
114 | 112 | ||
115 | /* | 113 | /* |
116 | * Work processed by per-device workqueue. | 114 | * Work processed by per-device workqueue. |
@@ -144,24 +142,9 @@ struct mapped_device { | |||
144 | spinlock_t deferred_lock; | 142 | spinlock_t deferred_lock; |
145 | 143 | ||
146 | /* | 144 | /* |
147 | * An error from the barrier request currently being processed. | 145 | * Processing queue (flush) |
148 | */ | ||
149 | int barrier_error; | ||
150 | |||
151 | /* | ||
152 | * Protect barrier_error from concurrent endio processing | ||
153 | * in request-based dm. | ||
154 | */ | ||
155 | spinlock_t barrier_error_lock; | ||
156 | |||
157 | /* | ||
158 | * Processing queue (flush/barriers) | ||
159 | */ | 146 | */ |
160 | struct workqueue_struct *wq; | 147 | struct workqueue_struct *wq; |
161 | struct work_struct barrier_work; | ||
162 | |||
163 | /* A pointer to the currently processing pre/post flush request */ | ||
164 | struct request *flush_request; | ||
165 | 148 | ||
166 | /* | 149 | /* |
167 | * The current mapping. | 150 | * The current mapping. |
@@ -200,8 +183,8 @@ struct mapped_device { | |||
200 | /* sysfs handle */ | 183 | /* sysfs handle */ |
201 | struct kobject kobj; | 184 | struct kobject kobj; |
202 | 185 | ||
203 | /* zero-length barrier that will be cloned and submitted to targets */ | 186 | /* zero-length flush that will be cloned and submitted to targets */ |
204 | struct bio barrier_bio; | 187 | struct bio flush_bio; |
205 | }; | 188 | }; |
206 | 189 | ||
207 | /* | 190 | /* |
@@ -344,7 +327,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) | |||
344 | { | 327 | { |
345 | struct mapped_device *md; | 328 | struct mapped_device *md; |
346 | 329 | ||
347 | lock_kernel(); | ||
348 | spin_lock(&_minor_lock); | 330 | spin_lock(&_minor_lock); |
349 | 331 | ||
350 | md = bdev->bd_disk->private_data; | 332 | md = bdev->bd_disk->private_data; |
@@ -362,7 +344,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) | |||
362 | 344 | ||
363 | out: | 345 | out: |
364 | spin_unlock(&_minor_lock); | 346 | spin_unlock(&_minor_lock); |
365 | unlock_kernel(); | ||
366 | 347 | ||
367 | return md ? 0 : -ENXIO; | 348 | return md ? 0 : -ENXIO; |
368 | } | 349 | } |
@@ -371,10 +352,12 @@ static int dm_blk_close(struct gendisk *disk, fmode_t mode) | |||
371 | { | 352 | { |
372 | struct mapped_device *md = disk->private_data; | 353 | struct mapped_device *md = disk->private_data; |
373 | 354 | ||
374 | lock_kernel(); | 355 | spin_lock(&_minor_lock); |
356 | |||
375 | atomic_dec(&md->open_count); | 357 | atomic_dec(&md->open_count); |
376 | dm_put(md); | 358 | dm_put(md); |
377 | unlock_kernel(); | 359 | |
360 | spin_unlock(&_minor_lock); | ||
378 | 361 | ||
379 | return 0; | 362 | return 0; |
380 | } | 363 | } |
@@ -494,7 +477,8 @@ static void start_io_acct(struct dm_io *io) | |||
494 | cpu = part_stat_lock(); | 477 | cpu = part_stat_lock(); |
495 | part_round_stats(cpu, &dm_disk(md)->part0); | 478 | part_round_stats(cpu, &dm_disk(md)->part0); |
496 | part_stat_unlock(); | 479 | part_stat_unlock(); |
497 | dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]); | 480 | atomic_set(&dm_disk(md)->part0.in_flight[rw], |
481 | atomic_inc_return(&md->pending[rw])); | ||
498 | } | 482 | } |
499 | 483 | ||
500 | static void end_io_acct(struct dm_io *io) | 484 | static void end_io_acct(struct dm_io *io) |
@@ -512,10 +496,10 @@ static void end_io_acct(struct dm_io *io) | |||
512 | 496 | ||
513 | /* | 497 | /* |
514 | * After this is decremented the bio must not be touched if it is | 498 | * After this is decremented the bio must not be touched if it is |
515 | * a barrier. | 499 | * a flush. |
516 | */ | 500 | */ |
517 | dm_disk(md)->part0.in_flight[rw] = pending = | 501 | pending = atomic_dec_return(&md->pending[rw]); |
518 | atomic_dec_return(&md->pending[rw]); | 502 | atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); |
519 | pending += atomic_read(&md->pending[rw^0x1]); | 503 | pending += atomic_read(&md->pending[rw^0x1]); |
520 | 504 | ||
521 | /* nudge anyone waiting on suspend queue */ | 505 | /* nudge anyone waiting on suspend queue */ |
@@ -528,16 +512,12 @@ static void end_io_acct(struct dm_io *io) | |||
528 | */ | 512 | */ |
529 | static void queue_io(struct mapped_device *md, struct bio *bio) | 513 | static void queue_io(struct mapped_device *md, struct bio *bio) |
530 | { | 514 | { |
531 | down_write(&md->io_lock); | 515 | unsigned long flags; |
532 | 516 | ||
533 | spin_lock_irq(&md->deferred_lock); | 517 | spin_lock_irqsave(&md->deferred_lock, flags); |
534 | bio_list_add(&md->deferred, bio); | 518 | bio_list_add(&md->deferred, bio); |
535 | spin_unlock_irq(&md->deferred_lock); | 519 | spin_unlock_irqrestore(&md->deferred_lock, flags); |
536 | 520 | queue_work(md->wq, &md->work); | |
537 | if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) | ||
538 | queue_work(md->wq, &md->work); | ||
539 | |||
540 | up_write(&md->io_lock); | ||
541 | } | 521 | } |
542 | 522 | ||
543 | /* | 523 | /* |
@@ -625,11 +605,9 @@ static void dec_pending(struct dm_io *io, int error) | |||
625 | * Target requested pushing back the I/O. | 605 | * Target requested pushing back the I/O. |
626 | */ | 606 | */ |
627 | spin_lock_irqsave(&md->deferred_lock, flags); | 607 | spin_lock_irqsave(&md->deferred_lock, flags); |
628 | if (__noflush_suspending(md)) { | 608 | if (__noflush_suspending(md)) |
629 | if (!(io->bio->bi_rw & REQ_HARDBARRIER)) | 609 | bio_list_add_head(&md->deferred, io->bio); |
630 | bio_list_add_head(&md->deferred, | 610 | else |
631 | io->bio); | ||
632 | } else | ||
633 | /* noflush suspend was interrupted. */ | 611 | /* noflush suspend was interrupted. */ |
634 | io->error = -EIO; | 612 | io->error = -EIO; |
635 | spin_unlock_irqrestore(&md->deferred_lock, flags); | 613 | spin_unlock_irqrestore(&md->deferred_lock, flags); |
@@ -637,32 +615,23 @@ static void dec_pending(struct dm_io *io, int error) | |||
637 | 615 | ||
638 | io_error = io->error; | 616 | io_error = io->error; |
639 | bio = io->bio; | 617 | bio = io->bio; |
618 | end_io_acct(io); | ||
619 | free_io(md, io); | ||
620 | |||
621 | if (io_error == DM_ENDIO_REQUEUE) | ||
622 | return; | ||
640 | 623 | ||
641 | if (bio->bi_rw & REQ_HARDBARRIER) { | 624 | if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) { |
642 | /* | 625 | /* |
643 | * There can be just one barrier request so we use | 626 | * Preflush done for flush with data, reissue |
644 | * a per-device variable for error reporting. | 627 | * without REQ_FLUSH. |
645 | * Note that you can't touch the bio after end_io_acct | ||
646 | * | ||
647 | * We ignore -EOPNOTSUPP for empty flush reported by | ||
648 | * underlying devices. We assume that if the device | ||
649 | * doesn't support empty barriers, it doesn't need | ||
650 | * cache flushing commands. | ||
651 | */ | 628 | */ |
652 | if (!md->barrier_error && | 629 | bio->bi_rw &= ~REQ_FLUSH; |
653 | !(bio_empty_barrier(bio) && io_error == -EOPNOTSUPP)) | 630 | queue_io(md, bio); |
654 | md->barrier_error = io_error; | ||
655 | end_io_acct(io); | ||
656 | free_io(md, io); | ||
657 | } else { | 631 | } else { |
658 | end_io_acct(io); | 632 | /* done with normal IO or empty flush */ |
659 | free_io(md, io); | 633 | trace_block_bio_complete(md->queue, bio, io_error); |
660 | 634 | bio_endio(bio, io_error); | |
661 | if (io_error != DM_ENDIO_REQUEUE) { | ||
662 | trace_block_bio_complete(md->queue, bio); | ||
663 | |||
664 | bio_endio(bio, io_error); | ||
665 | } | ||
666 | } | 635 | } |
667 | } | 636 | } |
668 | } | 637 | } |
@@ -755,23 +724,6 @@ static void end_clone_bio(struct bio *clone, int error) | |||
755 | blk_update_request(tio->orig, 0, nr_bytes); | 724 | blk_update_request(tio->orig, 0, nr_bytes); |
756 | } | 725 | } |
757 | 726 | ||
758 | static void store_barrier_error(struct mapped_device *md, int error) | ||
759 | { | ||
760 | unsigned long flags; | ||
761 | |||
762 | spin_lock_irqsave(&md->barrier_error_lock, flags); | ||
763 | /* | ||
764 | * Basically, the first error is taken, but: | ||
765 | * -EOPNOTSUPP supersedes any I/O error. | ||
766 | * Requeue request supersedes any I/O error but -EOPNOTSUPP. | ||
767 | */ | ||
768 | if (!md->barrier_error || error == -EOPNOTSUPP || | ||
769 | (md->barrier_error != -EOPNOTSUPP && | ||
770 | error == DM_ENDIO_REQUEUE)) | ||
771 | md->barrier_error = error; | ||
772 | spin_unlock_irqrestore(&md->barrier_error_lock, flags); | ||
773 | } | ||
774 | |||
775 | /* | 727 | /* |
776 | * Don't touch any member of the md after calling this function because | 728 | * Don't touch any member of the md after calling this function because |
777 | * the md may be freed in dm_put() at the end of this function. | 729 | * the md may be freed in dm_put() at the end of this function. |
@@ -809,13 +761,11 @@ static void free_rq_clone(struct request *clone) | |||
809 | static void dm_end_request(struct request *clone, int error) | 761 | static void dm_end_request(struct request *clone, int error) |
810 | { | 762 | { |
811 | int rw = rq_data_dir(clone); | 763 | int rw = rq_data_dir(clone); |
812 | int run_queue = 1; | ||
813 | bool is_barrier = clone->cmd_flags & REQ_HARDBARRIER; | ||
814 | struct dm_rq_target_io *tio = clone->end_io_data; | 764 | struct dm_rq_target_io *tio = clone->end_io_data; |
815 | struct mapped_device *md = tio->md; | 765 | struct mapped_device *md = tio->md; |
816 | struct request *rq = tio->orig; | 766 | struct request *rq = tio->orig; |
817 | 767 | ||
818 | if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !is_barrier) { | 768 | if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { |
819 | rq->errors = clone->errors; | 769 | rq->errors = clone->errors; |
820 | rq->resid_len = clone->resid_len; | 770 | rq->resid_len = clone->resid_len; |
821 | 771 | ||
@@ -829,15 +779,8 @@ static void dm_end_request(struct request *clone, int error) | |||
829 | } | 779 | } |
830 | 780 | ||
831 | free_rq_clone(clone); | 781 | free_rq_clone(clone); |
832 | 782 | blk_end_request_all(rq, error); | |
833 | if (unlikely(is_barrier)) { | 783 | rq_completed(md, rw, true); |
834 | if (unlikely(error)) | ||
835 | store_barrier_error(md, error); | ||
836 | run_queue = 0; | ||
837 | } else | ||
838 | blk_end_request_all(rq, error); | ||
839 | |||
840 | rq_completed(md, rw, run_queue); | ||
841 | } | 784 | } |
842 | 785 | ||
843 | static void dm_unprep_request(struct request *rq) | 786 | static void dm_unprep_request(struct request *rq) |
@@ -862,21 +805,9 @@ void dm_requeue_unmapped_request(struct request *clone) | |||
862 | struct request_queue *q = rq->q; | 805 | struct request_queue *q = rq->q; |
863 | unsigned long flags; | 806 | unsigned long flags; |
864 | 807 | ||
865 | if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) { | ||
866 | /* | ||
867 | * Barrier clones share an original request. | ||
868 | * Leave it to dm_end_request(), which handles this special | ||
869 | * case. | ||
870 | */ | ||
871 | dm_end_request(clone, DM_ENDIO_REQUEUE); | ||
872 | return; | ||
873 | } | ||
874 | |||
875 | dm_unprep_request(rq); | 808 | dm_unprep_request(rq); |
876 | 809 | ||
877 | spin_lock_irqsave(q->queue_lock, flags); | 810 | spin_lock_irqsave(q->queue_lock, flags); |
878 | if (elv_queue_empty(q)) | ||
879 | blk_plug_device(q); | ||
880 | blk_requeue_request(q, rq); | 811 | blk_requeue_request(q, rq); |
881 | spin_unlock_irqrestore(q->queue_lock, flags); | 812 | spin_unlock_irqrestore(q->queue_lock, flags); |
882 | 813 | ||
@@ -961,19 +892,6 @@ static void dm_complete_request(struct request *clone, int error) | |||
961 | struct dm_rq_target_io *tio = clone->end_io_data; | 892 | struct dm_rq_target_io *tio = clone->end_io_data; |
962 | struct request *rq = tio->orig; | 893 | struct request *rq = tio->orig; |
963 | 894 | ||
964 | if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) { | ||
965 | /* | ||
966 | * Barrier clones share an original request. So can't use | ||
967 | * softirq_done with the original. | ||
968 | * Pass the clone to dm_done() directly in this special case. | ||
969 | * It is safe (even if clone->q->queue_lock is held here) | ||
970 | * because there is no I/O dispatching during the completion | ||
971 | * of barrier clone. | ||
972 | */ | ||
973 | dm_done(clone, error, true); | ||
974 | return; | ||
975 | } | ||
976 | |||
977 | tio->error = error; | 895 | tio->error = error; |
978 | rq->completion_data = clone; | 896 | rq->completion_data = clone; |
979 | blk_complete_request(rq); | 897 | blk_complete_request(rq); |
@@ -990,17 +908,6 @@ void dm_kill_unmapped_request(struct request *clone, int error) | |||
990 | struct dm_rq_target_io *tio = clone->end_io_data; | 908 | struct dm_rq_target_io *tio = clone->end_io_data; |
991 | struct request *rq = tio->orig; | 909 | struct request *rq = tio->orig; |
992 | 910 | ||
993 | if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) { | ||
994 | /* | ||
995 | * Barrier clones share an original request. | ||
996 | * Leave it to dm_end_request(), which handles this special | ||
997 | * case. | ||
998 | */ | ||
999 | BUG_ON(error > 0); | ||
1000 | dm_end_request(clone, error); | ||
1001 | return; | ||
1002 | } | ||
1003 | |||
1004 | rq->cmd_flags |= REQ_FAILED; | 911 | rq->cmd_flags |= REQ_FAILED; |
1005 | dm_complete_request(clone, error); | 912 | dm_complete_request(clone, error); |
1006 | } | 913 | } |
@@ -1081,8 +988,8 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, | |||
1081 | if (r == DM_MAPIO_REMAPPED) { | 988 | if (r == DM_MAPIO_REMAPPED) { |
1082 | /* the bio has been remapped so dispatch it */ | 989 | /* the bio has been remapped so dispatch it */ |
1083 | 990 | ||
1084 | trace_block_remap(bdev_get_queue(clone->bi_bdev), clone, | 991 | trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, |
1085 | tio->io->bio->bi_bdev->bd_dev, sector); | 992 | tio->io->bio->bi_bdev->bd_dev, sector); |
1086 | 993 | ||
1087 | generic_make_request(clone); | 994 | generic_make_request(clone); |
1088 | } else if (r < 0 || r == DM_MAPIO_REQUEUE) { | 995 | } else if (r < 0 || r == DM_MAPIO_REQUEUE) { |
@@ -1119,7 +1026,7 @@ static void dm_bio_destructor(struct bio *bio) | |||
1119 | } | 1026 | } |
1120 | 1027 | ||
1121 | /* | 1028 | /* |
1122 | * Creates a little bio that is just does part of a bvec. | 1029 | * Creates a little bio that just does part of a bvec. |
1123 | */ | 1030 | */ |
1124 | static struct bio *split_bvec(struct bio *bio, sector_t sector, | 1031 | static struct bio *split_bvec(struct bio *bio, sector_t sector, |
1125 | unsigned short idx, unsigned int offset, | 1032 | unsigned short idx, unsigned int offset, |
@@ -1134,7 +1041,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector, | |||
1134 | 1041 | ||
1135 | clone->bi_sector = sector; | 1042 | clone->bi_sector = sector; |
1136 | clone->bi_bdev = bio->bi_bdev; | 1043 | clone->bi_bdev = bio->bi_bdev; |
1137 | clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER; | 1044 | clone->bi_rw = bio->bi_rw; |
1138 | clone->bi_vcnt = 1; | 1045 | clone->bi_vcnt = 1; |
1139 | clone->bi_size = to_bytes(len); | 1046 | clone->bi_size = to_bytes(len); |
1140 | clone->bi_io_vec->bv_offset = offset; | 1047 | clone->bi_io_vec->bv_offset = offset; |
@@ -1161,7 +1068,6 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector, | |||
1161 | 1068 | ||
1162 | clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); | 1069 | clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); |
1163 | __bio_clone(clone, bio); | 1070 | __bio_clone(clone, bio); |
1164 | clone->bi_rw &= ~REQ_HARDBARRIER; | ||
1165 | clone->bi_destructor = dm_bio_destructor; | 1071 | clone->bi_destructor = dm_bio_destructor; |
1166 | clone->bi_sector = sector; | 1072 | clone->bi_sector = sector; |
1167 | clone->bi_idx = idx; | 1073 | clone->bi_idx = idx; |
@@ -1225,16 +1131,15 @@ static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, | |||
1225 | __issue_target_request(ci, ti, request_nr, len); | 1131 | __issue_target_request(ci, ti, request_nr, len); |
1226 | } | 1132 | } |
1227 | 1133 | ||
1228 | static int __clone_and_map_empty_barrier(struct clone_info *ci) | 1134 | static int __clone_and_map_empty_flush(struct clone_info *ci) |
1229 | { | 1135 | { |
1230 | unsigned target_nr = 0; | 1136 | unsigned target_nr = 0; |
1231 | struct dm_target *ti; | 1137 | struct dm_target *ti; |
1232 | 1138 | ||
1139 | BUG_ON(bio_has_data(ci->bio)); | ||
1233 | while ((ti = dm_table_get_target(ci->map, target_nr++))) | 1140 | while ((ti = dm_table_get_target(ci->map, target_nr++))) |
1234 | __issue_target_requests(ci, ti, ti->num_flush_requests, 0); | 1141 | __issue_target_requests(ci, ti, ti->num_flush_requests, 0); |
1235 | 1142 | ||
1236 | ci->sector_count = 0; | ||
1237 | |||
1238 | return 0; | 1143 | return 0; |
1239 | } | 1144 | } |
1240 | 1145 | ||
@@ -1289,9 +1194,6 @@ static int __clone_and_map(struct clone_info *ci) | |||
1289 | sector_t len = 0, max; | 1194 | sector_t len = 0, max; |
1290 | struct dm_target_io *tio; | 1195 | struct dm_target_io *tio; |
1291 | 1196 | ||
1292 | if (unlikely(bio_empty_barrier(bio))) | ||
1293 | return __clone_and_map_empty_barrier(ci); | ||
1294 | |||
1295 | if (unlikely(bio->bi_rw & REQ_DISCARD)) | 1197 | if (unlikely(bio->bi_rw & REQ_DISCARD)) |
1296 | return __clone_and_map_discard(ci); | 1198 | return __clone_and_map_discard(ci); |
1297 | 1199 | ||
@@ -1383,16 +1285,11 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) | |||
1383 | 1285 | ||
1384 | ci.map = dm_get_live_table(md); | 1286 | ci.map = dm_get_live_table(md); |
1385 | if (unlikely(!ci.map)) { | 1287 | if (unlikely(!ci.map)) { |
1386 | if (!(bio->bi_rw & REQ_HARDBARRIER)) | 1288 | bio_io_error(bio); |
1387 | bio_io_error(bio); | ||
1388 | else | ||
1389 | if (!md->barrier_error) | ||
1390 | md->barrier_error = -EIO; | ||
1391 | return; | 1289 | return; |
1392 | } | 1290 | } |
1393 | 1291 | ||
1394 | ci.md = md; | 1292 | ci.md = md; |
1395 | ci.bio = bio; | ||
1396 | ci.io = alloc_io(md); | 1293 | ci.io = alloc_io(md); |
1397 | ci.io->error = 0; | 1294 | ci.io->error = 0; |
1398 | atomic_set(&ci.io->io_count, 1); | 1295 | atomic_set(&ci.io->io_count, 1); |
@@ -1400,14 +1297,20 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) | |||
1400 | ci.io->md = md; | 1297 | ci.io->md = md; |
1401 | spin_lock_init(&ci.io->endio_lock); | 1298 | spin_lock_init(&ci.io->endio_lock); |
1402 | ci.sector = bio->bi_sector; | 1299 | ci.sector = bio->bi_sector; |
1403 | ci.sector_count = bio_sectors(bio); | ||
1404 | if (unlikely(bio_empty_barrier(bio))) | ||
1405 | ci.sector_count = 1; | ||
1406 | ci.idx = bio->bi_idx; | 1300 | ci.idx = bio->bi_idx; |
1407 | 1301 | ||
1408 | start_io_acct(ci.io); | 1302 | start_io_acct(ci.io); |
1409 | while (ci.sector_count && !error) | 1303 | if (bio->bi_rw & REQ_FLUSH) { |
1410 | error = __clone_and_map(&ci); | 1304 | ci.bio = &ci.md->flush_bio; |
1305 | ci.sector_count = 0; | ||
1306 | error = __clone_and_map_empty_flush(&ci); | ||
1307 | /* dec_pending submits any data associated with flush */ | ||
1308 | } else { | ||
1309 | ci.bio = bio; | ||
1310 | ci.sector_count = bio_sectors(bio); | ||
1311 | while (ci.sector_count && !error) | ||
1312 | error = __clone_and_map(&ci); | ||
1313 | } | ||
1411 | 1314 | ||
1412 | /* drop the extra reference count */ | 1315 | /* drop the extra reference count */ |
1413 | dec_pending(ci.io, error); | 1316 | dec_pending(ci.io, error); |
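Combined with the dec_pending() change earlier in this file, the flush handling for bio-based dm now works in two passes instead of draining the whole queue around a barrier. A rough summary of the path, condensed from the code in this patch:

	/*
	 * __split_and_process_bio(md, bio):
	 *   if (bio->bi_rw & REQ_FLUSH)
	 *       clone md->flush_bio to every target via
	 *       __clone_and_map_empty_flush() and set sector_count = 0
	 *   else
	 *       split and map the data as before
	 *
	 * dec_pending(io, error), once all clones complete:
	 *   if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size)
	 *       preflush finished; clear REQ_FLUSH and queue_io(md, bio)
	 *       so the data portion is processed as a normal bio
	 *   else
	 *       bio_endio(bio, io_error)   /- plain I/O or empty flush -/
	 */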
@@ -1491,22 +1394,14 @@ static int _dm_request(struct request_queue *q, struct bio *bio) | |||
1491 | part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); | 1394 | part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); |
1492 | part_stat_unlock(); | 1395 | part_stat_unlock(); |
1493 | 1396 | ||
1494 | /* | 1397 | /* if we're suspended, we have to queue this io for later */ |
1495 | * If we're suspended or the thread is processing barriers | 1398 | if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { |
1496 | * we have to queue this io for later. | ||
1497 | */ | ||
1498 | if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) || | ||
1499 | unlikely(bio->bi_rw & REQ_HARDBARRIER)) { | ||
1500 | up_read(&md->io_lock); | 1399 | up_read(&md->io_lock); |
1501 | 1400 | ||
1502 | if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && | 1401 | if (bio_rw(bio) != READA) |
1503 | bio_rw(bio) == READA) { | 1402 | queue_io(md, bio); |
1403 | else | ||
1504 | bio_io_error(bio); | 1404 | bio_io_error(bio); |
1505 | return 0; | ||
1506 | } | ||
1507 | |||
1508 | queue_io(md, bio); | ||
1509 | |||
1510 | return 0; | 1405 | return 0; |
1511 | } | 1406 | } |
1512 | 1407 | ||
@@ -1537,14 +1432,6 @@ static int dm_request(struct request_queue *q, struct bio *bio) | |||
1537 | return _dm_request(q, bio); | 1432 | return _dm_request(q, bio); |
1538 | } | 1433 | } |
1539 | 1434 | ||
1540 | static bool dm_rq_is_flush_request(struct request *rq) | ||
1541 | { | ||
1542 | if (rq->cmd_flags & REQ_FLUSH) | ||
1543 | return true; | ||
1544 | else | ||
1545 | return false; | ||
1546 | } | ||
1547 | |||
1548 | void dm_dispatch_request(struct request *rq) | 1435 | void dm_dispatch_request(struct request *rq) |
1549 | { | 1436 | { |
1550 | int r; | 1437 | int r; |
@@ -1592,22 +1479,15 @@ static int setup_clone(struct request *clone, struct request *rq, | |||
1592 | { | 1479 | { |
1593 | int r; | 1480 | int r; |
1594 | 1481 | ||
1595 | if (dm_rq_is_flush_request(rq)) { | 1482 | r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, |
1596 | blk_rq_init(NULL, clone); | 1483 | dm_rq_bio_constructor, tio); |
1597 | clone->cmd_type = REQ_TYPE_FS; | 1484 | if (r) |
1598 | clone->cmd_flags |= (REQ_HARDBARRIER | WRITE); | 1485 | return r; |
1599 | } else { | ||
1600 | r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, | ||
1601 | dm_rq_bio_constructor, tio); | ||
1602 | if (r) | ||
1603 | return r; | ||
1604 | |||
1605 | clone->cmd = rq->cmd; | ||
1606 | clone->cmd_len = rq->cmd_len; | ||
1607 | clone->sense = rq->sense; | ||
1608 | clone->buffer = rq->buffer; | ||
1609 | } | ||
1610 | 1486 | ||
1487 | clone->cmd = rq->cmd; | ||
1488 | clone->cmd_len = rq->cmd_len; | ||
1489 | clone->sense = rq->sense; | ||
1490 | clone->buffer = rq->buffer; | ||
1611 | clone->end_io = end_clone_request; | 1491 | clone->end_io = end_clone_request; |
1612 | clone->end_io_data = tio; | 1492 | clone->end_io_data = tio; |
1613 | 1493 | ||
@@ -1648,9 +1528,6 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) | |||
1648 | struct mapped_device *md = q->queuedata; | 1528 | struct mapped_device *md = q->queuedata; |
1649 | struct request *clone; | 1529 | struct request *clone; |
1650 | 1530 | ||
1651 | if (unlikely(dm_rq_is_flush_request(rq))) | ||
1652 | return BLKPREP_OK; | ||
1653 | |||
1654 | if (unlikely(rq->special)) { | 1531 | if (unlikely(rq->special)) { |
1655 | DMWARN("Already has something in rq->special."); | 1532 | DMWARN("Already has something in rq->special."); |
1656 | return BLKPREP_KILL; | 1533 | return BLKPREP_KILL; |
@@ -1727,6 +1604,7 @@ static void dm_request_fn(struct request_queue *q) | |||
1727 | struct dm_table *map = dm_get_live_table(md); | 1604 | struct dm_table *map = dm_get_live_table(md); |
1728 | struct dm_target *ti; | 1605 | struct dm_target *ti; |
1729 | struct request *rq, *clone; | 1606 | struct request *rq, *clone; |
1607 | sector_t pos; | ||
1730 | 1608 | ||
1731 | /* | 1609 | /* |
1732 | * For suspend, check blk_queue_stopped() and increment | 1610 | * For suspend, check blk_queue_stopped() and increment |
@@ -1734,22 +1612,21 @@ static void dm_request_fn(struct request_queue *q) | |||
1734 | * number of in-flight I/Os after the queue is stopped in | 1612 | * number of in-flight I/Os after the queue is stopped in |
1735 | * dm_suspend(). | 1613 | * dm_suspend(). |
1736 | */ | 1614 | */ |
1737 | while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { | 1615 | while (!blk_queue_stopped(q)) { |
1738 | rq = blk_peek_request(q); | 1616 | rq = blk_peek_request(q); |
1739 | if (!rq) | 1617 | if (!rq) |
1740 | goto plug_and_out; | 1618 | goto delay_and_out; |
1741 | 1619 | ||
1742 | if (unlikely(dm_rq_is_flush_request(rq))) { | 1620 | /* always use block 0 to find the target for flushes for now */ |
1743 | BUG_ON(md->flush_request); | 1621 | pos = 0; |
1744 | md->flush_request = rq; | 1622 | if (!(rq->cmd_flags & REQ_FLUSH)) |
1745 | blk_start_request(rq); | 1623 | pos = blk_rq_pos(rq); |
1746 | queue_work(md->wq, &md->barrier_work); | 1624 | |
1747 | goto out; | 1625 | ti = dm_table_find_target(map, pos); |
1748 | } | 1626 | BUG_ON(!dm_target_is_valid(ti)); |
1749 | 1627 | ||
1750 | ti = dm_table_find_target(map, blk_rq_pos(rq)); | ||
1751 | if (ti->type->busy && ti->type->busy(ti)) | 1628 | if (ti->type->busy && ti->type->busy(ti)) |
1752 | goto plug_and_out; | 1629 | goto delay_and_out; |
1753 | 1630 | ||
1754 | blk_start_request(rq); | 1631 | blk_start_request(rq); |
1755 | clone = rq->special; | 1632 | clone = rq->special; |
@@ -1759,19 +1636,18 @@ static void dm_request_fn(struct request_queue *q) | |||
1759 | if (map_request(ti, clone, md)) | 1636 | if (map_request(ti, clone, md)) |
1760 | goto requeued; | 1637 | goto requeued; |
1761 | 1638 | ||
1762 | spin_lock_irq(q->queue_lock); | 1639 | BUG_ON(!irqs_disabled()); |
1640 | spin_lock(q->queue_lock); | ||
1763 | } | 1641 | } |
1764 | 1642 | ||
1765 | goto out; | 1643 | goto out; |
1766 | 1644 | ||
1767 | requeued: | 1645 | requeued: |
1768 | spin_lock_irq(q->queue_lock); | 1646 | BUG_ON(!irqs_disabled()); |
1769 | 1647 | spin_lock(q->queue_lock); | |
1770 | plug_and_out: | ||
1771 | if (!elv_queue_empty(q)) | ||
1772 | /* Some requests still remain, retry later */ | ||
1773 | blk_plug_device(q); | ||
1774 | 1648 | ||
1649 | delay_and_out: | ||
1650 | blk_delay_queue(q, HZ / 10); | ||
1775 | out: | 1651 | out: |
1776 | dm_table_put(map); | 1652 | dm_table_put(map); |
1777 | 1653 | ||
@@ -1800,20 +1676,6 @@ static int dm_lld_busy(struct request_queue *q) | |||
1800 | return r; | 1676 | return r; |
1801 | } | 1677 | } |
1802 | 1678 | ||
1803 | static void dm_unplug_all(struct request_queue *q) | ||
1804 | { | ||
1805 | struct mapped_device *md = q->queuedata; | ||
1806 | struct dm_table *map = dm_get_live_table(md); | ||
1807 | |||
1808 | if (map) { | ||
1809 | if (dm_request_based(md)) | ||
1810 | generic_unplug_device(q); | ||
1811 | |||
1812 | dm_table_unplug_all(map); | ||
1813 | dm_table_put(map); | ||
1814 | } | ||
1815 | } | ||
1816 | |||
1817 | static int dm_any_congested(void *congested_data, int bdi_bits) | 1679 | static int dm_any_congested(void *congested_data, int bdi_bits) |
1818 | { | 1680 | { |
1819 | int r = bdi_bits; | 1681 | int r = bdi_bits; |
@@ -1918,7 +1780,6 @@ out: | |||
1918 | static const struct block_device_operations dm_blk_dops; | 1780 | static const struct block_device_operations dm_blk_dops; |
1919 | 1781 | ||
1920 | static void dm_wq_work(struct work_struct *work); | 1782 | static void dm_wq_work(struct work_struct *work); |
1921 | static void dm_rq_barrier_work(struct work_struct *work); | ||
1922 | 1783 | ||
1923 | static void dm_init_md_queue(struct mapped_device *md) | 1784 | static void dm_init_md_queue(struct mapped_device *md) |
1924 | { | 1785 | { |
@@ -1938,8 +1799,8 @@ static void dm_init_md_queue(struct mapped_device *md) | |||
1938 | md->queue->backing_dev_info.congested_data = md; | 1799 | md->queue->backing_dev_info.congested_data = md; |
1939 | blk_queue_make_request(md->queue, dm_request); | 1800 | blk_queue_make_request(md->queue, dm_request); |
1940 | blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); | 1801 | blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); |
1941 | md->queue->unplug_fn = dm_unplug_all; | ||
1942 | blk_queue_merge_bvec(md->queue, dm_merge_bvec); | 1802 | blk_queue_merge_bvec(md->queue, dm_merge_bvec); |
1803 | blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA); | ||
1943 | } | 1804 | } |
1944 | 1805 | ||
1945 | /* | 1806 | /* |
@@ -1972,7 +1833,6 @@ static struct mapped_device *alloc_dev(int minor) | |||
1972 | mutex_init(&md->suspend_lock); | 1833 | mutex_init(&md->suspend_lock); |
1973 | mutex_init(&md->type_lock); | 1834 | mutex_init(&md->type_lock); |
1974 | spin_lock_init(&md->deferred_lock); | 1835 | spin_lock_init(&md->deferred_lock); |
1975 | spin_lock_init(&md->barrier_error_lock); | ||
1976 | rwlock_init(&md->map_lock); | 1836 | rwlock_init(&md->map_lock); |
1977 | atomic_set(&md->holders, 1); | 1837 | atomic_set(&md->holders, 1); |
1978 | atomic_set(&md->open_count, 0); | 1838 | atomic_set(&md->open_count, 0); |
@@ -1995,7 +1855,6 @@ static struct mapped_device *alloc_dev(int minor) | |||
1995 | atomic_set(&md->pending[1], 0); | 1855 | atomic_set(&md->pending[1], 0); |
1996 | init_waitqueue_head(&md->wait); | 1856 | init_waitqueue_head(&md->wait); |
1997 | INIT_WORK(&md->work, dm_wq_work); | 1857 | INIT_WORK(&md->work, dm_wq_work); |
1998 | INIT_WORK(&md->barrier_work, dm_rq_barrier_work); | ||
1999 | init_waitqueue_head(&md->eventq); | 1858 | init_waitqueue_head(&md->eventq); |
2000 | 1859 | ||
2001 | md->disk->major = _major; | 1860 | md->disk->major = _major; |
@@ -2007,7 +1866,8 @@ static struct mapped_device *alloc_dev(int minor) | |||
2007 | add_disk(md->disk); | 1866 | add_disk(md->disk); |
2008 | format_dev_t(md->name, MKDEV(_major, minor)); | 1867 | format_dev_t(md->name, MKDEV(_major, minor)); |
2009 | 1868 | ||
2010 | md->wq = create_singlethread_workqueue("kdmflush"); | 1869 | md->wq = alloc_workqueue("kdmflush", |
1870 | WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); | ||
2011 | if (!md->wq) | 1871 | if (!md->wq) |
2012 | goto bad_thread; | 1872 | goto bad_thread; |
2013 | 1873 | ||
@@ -2015,6 +1875,10 @@ static struct mapped_device *alloc_dev(int minor) | |||
2015 | if (!md->bdev) | 1875 | if (!md->bdev) |
2016 | goto bad_bdev; | 1876 | goto bad_bdev; |
2017 | 1877 | ||
1878 | bio_init(&md->flush_bio); | ||
1879 | md->flush_bio.bi_bdev = md->bdev; | ||
1880 | md->flush_bio.bi_rw = WRITE_FLUSH; | ||
1881 | |||
2018 | /* Populate the mapping, nobody knows we exist yet */ | 1882 | /* Populate the mapping, nobody knows we exist yet */ |
2019 | spin_lock(&_minor_lock); | 1883 | spin_lock(&_minor_lock); |
2020 | old_md = idr_replace(&_minor_idr, md, minor); | 1884 | old_md = idr_replace(&_minor_idr, md, minor); |
@@ -2111,13 +1975,14 @@ static void event_callback(void *context) | |||
2111 | wake_up(&md->eventq); | 1975 | wake_up(&md->eventq); |
2112 | } | 1976 | } |
2113 | 1977 | ||
1978 | /* | ||
1979 | * Protected by md->suspend_lock obtained by dm_swap_table(). | ||
1980 | */ | ||
2114 | static void __set_size(struct mapped_device *md, sector_t size) | 1981 | static void __set_size(struct mapped_device *md, sector_t size) |
2115 | { | 1982 | { |
2116 | set_capacity(md->disk, size); | 1983 | set_capacity(md->disk, size); |
2117 | 1984 | ||
2118 | mutex_lock(&md->bdev->bd_inode->i_mutex); | ||
2119 | i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); | 1985 | i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); |
2120 | mutex_unlock(&md->bdev->bd_inode->i_mutex); | ||
2121 | } | 1986 | } |
2122 | 1987 | ||
2123 | /* | 1988 | /* |
@@ -2245,7 +2110,6 @@ static int dm_init_request_based_queue(struct mapped_device *md) | |||
2245 | blk_queue_softirq_done(md->queue, dm_softirq_done); | 2110 | blk_queue_softirq_done(md->queue, dm_softirq_done); |
2246 | blk_queue_prep_rq(md->queue, dm_prep_fn); | 2111 | blk_queue_prep_rq(md->queue, dm_prep_fn); |
2247 | blk_queue_lld_busy(md->queue, dm_lld_busy); | 2112 | blk_queue_lld_busy(md->queue, dm_lld_busy); |
2248 | blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH); | ||
2249 | 2113 | ||
2250 | elv_register_queue(md->queue); | 2114 | elv_register_queue(md->queue); |
2251 | 2115 | ||
@@ -2380,8 +2244,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
2380 | int r = 0; | 2244 | int r = 0; |
2381 | DECLARE_WAITQUEUE(wait, current); | 2245 | DECLARE_WAITQUEUE(wait, current); |
2382 | 2246 | ||
2383 | dm_unplug_all(md->queue); | ||
2384 | |||
2385 | add_wait_queue(&md->wait, &wait); | 2247 | add_wait_queue(&md->wait, &wait); |
2386 | 2248 | ||
2387 | while (1) { | 2249 | while (1) { |
@@ -2406,43 +2268,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
2406 | return r; | 2268 | return r; |
2407 | } | 2269 | } |
2408 | 2270 | ||
2409 | static void dm_flush(struct mapped_device *md) | ||
2410 | { | ||
2411 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); | ||
2412 | |||
2413 | bio_init(&md->barrier_bio); | ||
2414 | md->barrier_bio.bi_bdev = md->bdev; | ||
2415 | md->barrier_bio.bi_rw = WRITE_BARRIER; | ||
2416 | __split_and_process_bio(md, &md->barrier_bio); | ||
2417 | |||
2418 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); | ||
2419 | } | ||
2420 | |||
2421 | static void process_barrier(struct mapped_device *md, struct bio *bio) | ||
2422 | { | ||
2423 | md->barrier_error = 0; | ||
2424 | |||
2425 | dm_flush(md); | ||
2426 | |||
2427 | if (!bio_empty_barrier(bio)) { | ||
2428 | __split_and_process_bio(md, bio); | ||
2429 | /* | ||
2430 | * If the request isn't supported, don't waste time with | ||
2431 | * the second flush. | ||
2432 | */ | ||
2433 | if (md->barrier_error != -EOPNOTSUPP) | ||
2434 | dm_flush(md); | ||
2435 | } | ||
2436 | |||
2437 | if (md->barrier_error != DM_ENDIO_REQUEUE) | ||
2438 | bio_endio(bio, md->barrier_error); | ||
2439 | else { | ||
2440 | spin_lock_irq(&md->deferred_lock); | ||
2441 | bio_list_add_head(&md->deferred, bio); | ||
2442 | spin_unlock_irq(&md->deferred_lock); | ||
2443 | } | ||
2444 | } | ||
2445 | |||
2446 | /* | 2271 | /* |
2447 | * Process the deferred bios | 2272 | * Process the deferred bios |
2448 | */ | 2273 | */ |
@@ -2452,33 +2277,27 @@ static void dm_wq_work(struct work_struct *work) | |||
2452 | work); | 2277 | work); |
2453 | struct bio *c; | 2278 | struct bio *c; |
2454 | 2279 | ||
2455 | down_write(&md->io_lock); | 2280 | down_read(&md->io_lock); |
2456 | 2281 | ||
2457 | while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { | 2282 | while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { |
2458 | spin_lock_irq(&md->deferred_lock); | 2283 | spin_lock_irq(&md->deferred_lock); |
2459 | c = bio_list_pop(&md->deferred); | 2284 | c = bio_list_pop(&md->deferred); |
2460 | spin_unlock_irq(&md->deferred_lock); | 2285 | spin_unlock_irq(&md->deferred_lock); |
2461 | 2286 | ||
2462 | if (!c) { | 2287 | if (!c) |
2463 | clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); | ||
2464 | break; | 2288 | break; |
2465 | } | ||
2466 | 2289 | ||
2467 | up_write(&md->io_lock); | 2290 | up_read(&md->io_lock); |
2468 | 2291 | ||
2469 | if (dm_request_based(md)) | 2292 | if (dm_request_based(md)) |
2470 | generic_make_request(c); | 2293 | generic_make_request(c); |
2471 | else { | 2294 | else |
2472 | if (c->bi_rw & REQ_HARDBARRIER) | 2295 | __split_and_process_bio(md, c); |
2473 | process_barrier(md, c); | ||
2474 | else | ||
2475 | __split_and_process_bio(md, c); | ||
2476 | } | ||
2477 | 2296 | ||
2478 | down_write(&md->io_lock); | 2297 | down_read(&md->io_lock); |
2479 | } | 2298 | } |
2480 | 2299 | ||
2481 | up_write(&md->io_lock); | 2300 | up_read(&md->io_lock); |
2482 | } | 2301 | } |
2483 | 2302 | ||
2484 | static void dm_queue_flush(struct mapped_device *md) | 2303 | static void dm_queue_flush(struct mapped_device *md) |
@@ -2488,73 +2307,6 @@ static void dm_queue_flush(struct mapped_device *md) | |||
2488 | queue_work(md->wq, &md->work); | 2307 | queue_work(md->wq, &md->work); |
2489 | } | 2308 | } |
2490 | 2309 | ||
2491 | static void dm_rq_set_target_request_nr(struct request *clone, unsigned request_nr) | ||
2492 | { | ||
2493 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
2494 | |||
2495 | tio->info.target_request_nr = request_nr; | ||
2496 | } | ||
2497 | |||
2498 | /* Issue barrier requests to targets and wait for their completion. */ | ||
2499 | static int dm_rq_barrier(struct mapped_device *md) | ||
2500 | { | ||
2501 | int i, j; | ||
2502 | struct dm_table *map = dm_get_live_table(md); | ||
2503 | unsigned num_targets = dm_table_get_num_targets(map); | ||
2504 | struct dm_target *ti; | ||
2505 | struct request *clone; | ||
2506 | |||
2507 | md->barrier_error = 0; | ||
2508 | |||
2509 | for (i = 0; i < num_targets; i++) { | ||
2510 | ti = dm_table_get_target(map, i); | ||
2511 | for (j = 0; j < ti->num_flush_requests; j++) { | ||
2512 | clone = clone_rq(md->flush_request, md, GFP_NOIO); | ||
2513 | dm_rq_set_target_request_nr(clone, j); | ||
2514 | atomic_inc(&md->pending[rq_data_dir(clone)]); | ||
2515 | map_request(ti, clone, md); | ||
2516 | } | ||
2517 | } | ||
2518 | |||
2519 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); | ||
2520 | dm_table_put(map); | ||
2521 | |||
2522 | return md->barrier_error; | ||
2523 | } | ||
2524 | |||
2525 | static void dm_rq_barrier_work(struct work_struct *work) | ||
2526 | { | ||
2527 | int error; | ||
2528 | struct mapped_device *md = container_of(work, struct mapped_device, | ||
2529 | barrier_work); | ||
2530 | struct request_queue *q = md->queue; | ||
2531 | struct request *rq; | ||
2532 | unsigned long flags; | ||
2533 | |||
2534 | /* | ||
2535 | * Hold the md reference here and leave it at the last part so that | ||
2536 | * the md can't be deleted by device opener when the barrier request | ||
2537 | * completes. | ||
2538 | */ | ||
2539 | dm_get(md); | ||
2540 | |||
2541 | error = dm_rq_barrier(md); | ||
2542 | |||
2543 | rq = md->flush_request; | ||
2544 | md->flush_request = NULL; | ||
2545 | |||
2546 | if (error == DM_ENDIO_REQUEUE) { | ||
2547 | spin_lock_irqsave(q->queue_lock, flags); | ||
2548 | blk_requeue_request(q, rq); | ||
2549 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2550 | } else | ||
2551 | blk_end_request_all(rq, error); | ||
2552 | |||
2553 | blk_run_queue(q); | ||
2554 | |||
2555 | dm_put(md); | ||
2556 | } | ||
2557 | |||
2558 | /* | 2310 | /* |
2559 | * Swap in a new table, returning the old one for the caller to destroy. | 2311 | * Swap in a new table, returning the old one for the caller to destroy. |
2560 | */ | 2312 | */ |
@@ -2677,23 +2429,17 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
2677 | * | 2429 | * |
2678 | * To get all processes out of __split_and_process_bio in dm_request, | 2430 | * To get all processes out of __split_and_process_bio in dm_request, |
2679 | * we take the write lock. To prevent any process from reentering | 2431 | * we take the write lock. To prevent any process from reentering |
2680 | * __split_and_process_bio from dm_request, we set | 2432 | * __split_and_process_bio from dm_request and quiesce the thread |
2681 | * DMF_QUEUE_IO_TO_THREAD. | 2433 | * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call |
2682 | * | 2434 | * flush_workqueue(md->wq). |
2683 | * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND | ||
2684 | * and call flush_workqueue(md->wq). flush_workqueue will wait until | ||
2685 | * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any | ||
2686 | * further calls to __split_and_process_bio from dm_wq_work. | ||
2687 | */ | 2435 | */ |
2688 | down_write(&md->io_lock); | 2436 | down_write(&md->io_lock); |
2689 | set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); | 2437 | set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); |
2690 | set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); | ||
2691 | up_write(&md->io_lock); | 2438 | up_write(&md->io_lock); |
2692 | 2439 | ||
2693 | /* | 2440 | /* |
2694 | * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which | 2441 | * Stop md->queue before flushing md->wq in case request-based |
2695 | * can be kicked until md->queue is stopped. So stop md->queue before | 2442 | * dm defers requests to md->wq from md->queue. |
2696 | * flushing md->wq. | ||
2697 | */ | 2443 | */ |
2698 | if (dm_request_based(md)) | 2444 | if (dm_request_based(md)) |
2699 | stop_queue(md->queue); | 2445 | stop_queue(md->queue); |
@@ -2772,7 +2518,6 @@ int dm_resume(struct mapped_device *md) | |||
2772 | 2518 | ||
2773 | clear_bit(DMF_SUSPENDED, &md->flags); | 2519 | clear_bit(DMF_SUSPENDED, &md->flags); |
2774 | 2520 | ||
2775 | dm_table_unplug_all(map); | ||
2776 | r = 0; | 2521 | r = 0; |
2777 | out: | 2522 | out: |
2778 | dm_table_put(map); | 2523 | dm_table_put(map); |
@@ -2876,9 +2621,10 @@ int dm_noflush_suspending(struct dm_target *ti) | |||
2876 | } | 2621 | } |
2877 | EXPORT_SYMBOL_GPL(dm_noflush_suspending); | 2622 | EXPORT_SYMBOL_GPL(dm_noflush_suspending); |
2878 | 2623 | ||
2879 | struct dm_md_mempools *dm_alloc_md_mempools(unsigned type) | 2624 | struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity) |
2880 | { | 2625 | { |
2881 | struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL); | 2626 | struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL); |
2627 | unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS; | ||
2882 | 2628 | ||
2883 | if (!pools) | 2629 | if (!pools) |
2884 | return NULL; | 2630 | return NULL; |
@@ -2895,13 +2641,18 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type) | |||
2895 | if (!pools->tio_pool) | 2641 | if (!pools->tio_pool) |
2896 | goto free_io_pool_and_out; | 2642 | goto free_io_pool_and_out; |
2897 | 2643 | ||
2898 | pools->bs = (type == DM_TYPE_BIO_BASED) ? | 2644 | pools->bs = bioset_create(pool_size, 0); |
2899 | bioset_create(16, 0) : bioset_create(MIN_IOS, 0); | ||
2900 | if (!pools->bs) | 2645 | if (!pools->bs) |
2901 | goto free_tio_pool_and_out; | 2646 | goto free_tio_pool_and_out; |
2902 | 2647 | ||
2648 | if (integrity && bioset_integrity_create(pools->bs, pool_size)) | ||
2649 | goto free_bioset_and_out; | ||
2650 | |||
2903 | return pools; | 2651 | return pools; |
2904 | 2652 | ||
2653 | free_bioset_and_out: | ||
2654 | bioset_free(pools->bs); | ||
2655 | |||
2905 | free_tio_pool_and_out: | 2656 | free_tio_pool_and_out: |
2906 | mempool_destroy(pools->tio_pool); | 2657 | mempool_destroy(pools->tio_pool); |
2907 | 2658 | ||
diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 0c2dd5f4af76..1aaf16746da8 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h | |||
@@ -149,7 +149,7 @@ void dm_kcopyd_exit(void); | |||
149 | /* | 149 | /* |
150 | * Mempool operations | 150 | * Mempool operations |
151 | */ | 151 | */ |
152 | struct dm_md_mempools *dm_alloc_md_mempools(unsigned type); | 152 | struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity); |
153 | void dm_free_md_mempools(struct dm_md_mempools *pools); | 153 | void dm_free_md_mempools(struct dm_md_mempools *pools); |
154 | 154 | ||
155 | #endif | 155 | #endif |
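The new 'integrity' argument sizes an integrity payload pool alongside the bioset (see the bioset_integrity_create() call in the dm.c hunk above). The caller is not part of this excerpt, but presumably the table-load path now passes the flag set by dm_table_prealloc_integrity(), roughly:

	/* sketch of the expected caller, once the table type is known */
	t->mempools = dm_alloc_md_mempools(dm_table_get_type(t),
					   t->integrity_supported);
	if (!t->mempools)
		return -ENOMEM;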
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 1a8987884614..23078dabb6df 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c | |||
@@ -30,7 +30,7 @@ | |||
30 | * | 30 | * |
31 | * Different modes can be active at a time, but only | 31 | * Different modes can be active at a time, but only |
32 | * one can be set at array creation. Others can be added later. | 32 | * one can be set at array creation. Others can be added later. |
33 | * A mode can be one-shot or recurrent with the recurrance being | 33 | * A mode can be one-shot or recurrent with the recurrence being |
34 | * once in every N requests. | 34 | * once in every N requests. |
35 | * The bottom 5 bits of the "layout" indicate the mode. The | 35 | * The bottom 5 bits of the "layout" indicate the mode. The |
36 | * remainder indicate a period, or 0 for one-shot. | 36 | * remainder indicate a period, or 0 for one-shot. |
@@ -210,7 +210,7 @@ static int make_request(mddev_t *mddev, struct bio *bio) | |||
210 | } | 210 | } |
211 | } | 211 | } |
212 | if (failit) { | 212 | if (failit) { |
213 | struct bio *b = bio_clone(bio, GFP_NOIO); | 213 | struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev); |
214 | b->bi_bdev = conf->rdev->bdev; | 214 | b->bi_bdev = conf->rdev->bdev; |
215 | b->bi_private = bio; | 215 | b->bi_private = bio; |
216 | b->bi_end_io = faulty_fail; | 216 | b->bi_end_io = faulty_fail; |
diff --git a/drivers/md/linear.c b/drivers/md/linear.c index ba19060bcf3f..abfb59a61ede 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c | |||
@@ -87,22 +87,6 @@ static int linear_mergeable_bvec(struct request_queue *q, | |||
87 | return maxsectors << 9; | 87 | return maxsectors << 9; |
88 | } | 88 | } |
89 | 89 | ||
90 | static void linear_unplug(struct request_queue *q) | ||
91 | { | ||
92 | mddev_t *mddev = q->queuedata; | ||
93 | linear_conf_t *conf; | ||
94 | int i; | ||
95 | |||
96 | rcu_read_lock(); | ||
97 | conf = rcu_dereference(mddev->private); | ||
98 | |||
99 | for (i=0; i < mddev->raid_disks; i++) { | ||
100 | struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev); | ||
101 | blk_unplug(r_queue); | ||
102 | } | ||
103 | rcu_read_unlock(); | ||
104 | } | ||
105 | |||
106 | static int linear_congested(void *data, int bits) | 90 | static int linear_congested(void *data, int bits) |
107 | { | 91 | { |
108 | mddev_t *mddev = data; | 92 | mddev_t *mddev = data; |
@@ -216,7 +200,6 @@ static int linear_run (mddev_t *mddev) | |||
216 | 200 | ||
217 | if (md_check_no_bitmap(mddev)) | 201 | if (md_check_no_bitmap(mddev)) |
218 | return -EINVAL; | 202 | return -EINVAL; |
219 | mddev->queue->queue_lock = &mddev->queue->__queue_lock; | ||
220 | conf = linear_conf(mddev, mddev->raid_disks); | 203 | conf = linear_conf(mddev, mddev->raid_disks); |
221 | 204 | ||
222 | if (!conf) | 205 | if (!conf) |
@@ -225,11 +208,9 @@ static int linear_run (mddev_t *mddev) | |||
225 | md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); | 208 | md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); |
226 | 209 | ||
227 | blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); | 210 | blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); |
228 | mddev->queue->unplug_fn = linear_unplug; | ||
229 | mddev->queue->backing_dev_info.congested_fn = linear_congested; | 211 | mddev->queue->backing_dev_info.congested_fn = linear_congested; |
230 | mddev->queue->backing_dev_info.congested_data = mddev; | 212 | mddev->queue->backing_dev_info.congested_data = mddev; |
231 | md_integrity_register(mddev); | 213 | return md_integrity_register(mddev); |
232 | return 0; | ||
233 | } | 214 | } |
234 | 215 | ||
235 | static void free_conf(struct rcu_head *head) | 216 | static void free_conf(struct rcu_head *head) |
@@ -294,8 +275,8 @@ static int linear_make_request (mddev_t *mddev, struct bio *bio) | |||
294 | dev_info_t *tmp_dev; | 275 | dev_info_t *tmp_dev; |
295 | sector_t start_sector; | 276 | sector_t start_sector; |
296 | 277 | ||
297 | if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { | 278 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { |
298 | md_barrier_request(mddev, bio); | 279 | md_flush_request(mddev, bio); |
299 | return 0; | 280 | return 0; |
300 | } | 281 | } |
301 | 282 | ||
diff --git a/drivers/md/md.c b/drivers/md/md.c index f20d13e717d5..91e31e260b4a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -36,7 +36,7 @@ | |||
36 | #include <linux/blkdev.h> | 36 | #include <linux/blkdev.h> |
37 | #include <linux/sysctl.h> | 37 | #include <linux/sysctl.h> |
38 | #include <linux/seq_file.h> | 38 | #include <linux/seq_file.h> |
39 | #include <linux/smp_lock.h> | 39 | #include <linux/mutex.h> |
40 | #include <linux/buffer_head.h> /* for invalidate_bdev */ | 40 | #include <linux/buffer_head.h> /* for invalidate_bdev */ |
41 | #include <linux/poll.h> | 41 | #include <linux/poll.h> |
42 | #include <linux/ctype.h> | 42 | #include <linux/ctype.h> |
@@ -57,7 +57,6 @@ | |||
57 | #define DEBUG 0 | 57 | #define DEBUG 0 |
58 | #define dprintk(x...) ((void)(DEBUG && printk(x))) | 58 | #define dprintk(x...) ((void)(DEBUG && printk(x))) |
59 | 59 | ||
60 | |||
61 | #ifndef MODULE | 60 | #ifndef MODULE |
62 | static void autostart_arrays(int part); | 61 | static void autostart_arrays(int part); |
63 | #endif | 62 | #endif |
@@ -68,6 +67,8 @@ static DEFINE_SPINLOCK(pers_lock); | |||
68 | static void md_print_devices(void); | 67 | static void md_print_devices(void); |
69 | 68 | ||
70 | static DECLARE_WAIT_QUEUE_HEAD(resync_wait); | 69 | static DECLARE_WAIT_QUEUE_HEAD(resync_wait); |
70 | static struct workqueue_struct *md_wq; | ||
71 | static struct workqueue_struct *md_misc_wq; | ||
71 | 72 | ||
72 | #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } | 73 | #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } |
73 | 74 | ||
@@ -148,6 +149,72 @@ static const struct block_device_operations md_fops; | |||
148 | 149 | ||
149 | static int start_readonly; | 150 | static int start_readonly; |
150 | 151 | ||
152 | /* bio_clone_mddev | ||
153 | * like bio_clone, but with a local bio set | ||
154 | */ | ||
155 | |||
156 | static void mddev_bio_destructor(struct bio *bio) | ||
157 | { | ||
158 | mddev_t *mddev, **mddevp; | ||
159 | |||
160 | mddevp = (void*)bio; | ||
161 | mddev = mddevp[-1]; | ||
162 | |||
163 | bio_free(bio, mddev->bio_set); | ||
164 | } | ||
165 | |||
166 | struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, | ||
167 | mddev_t *mddev) | ||
168 | { | ||
169 | struct bio *b; | ||
170 | mddev_t **mddevp; | ||
171 | |||
172 | if (!mddev || !mddev->bio_set) | ||
173 | return bio_alloc(gfp_mask, nr_iovecs); | ||
174 | |||
175 | b = bio_alloc_bioset(gfp_mask, nr_iovecs, | ||
176 | mddev->bio_set); | ||
177 | if (!b) | ||
178 | return NULL; | ||
179 | mddevp = (void*)b; | ||
180 | mddevp[-1] = mddev; | ||
181 | b->bi_destructor = mddev_bio_destructor; | ||
182 | return b; | ||
183 | } | ||
184 | EXPORT_SYMBOL_GPL(bio_alloc_mddev); | ||
185 | |||
186 | struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, | ||
187 | mddev_t *mddev) | ||
188 | { | ||
189 | struct bio *b; | ||
190 | mddev_t **mddevp; | ||
191 | |||
192 | if (!mddev || !mddev->bio_set) | ||
193 | return bio_clone(bio, gfp_mask); | ||
194 | |||
195 | b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, | ||
196 | mddev->bio_set); | ||
197 | if (!b) | ||
198 | return NULL; | ||
199 | mddevp = (void*)b; | ||
200 | mddevp[-1] = mddev; | ||
201 | b->bi_destructor = mddev_bio_destructor; | ||
202 | __bio_clone(b, bio); | ||
203 | if (bio_integrity(bio)) { | ||
204 | int ret; | ||
205 | |||
206 | ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set); | ||
207 | |||
208 | if (ret < 0) { | ||
209 | bio_put(b); | ||
210 | return NULL; | ||
211 | } | ||
212 | } | ||
213 | |||
214 | return b; | ||
215 | } | ||
216 | EXPORT_SYMBOL_GPL(bio_clone_mddev); | ||
217 | |||
151 | /* | 218 | /* |
152 | * We have a system wide 'event count' that is incremented | 219 | * We have a system wide 'event count' that is incremented |
153 | * on any 'interesting' event, and readers of /proc/mdstat | 220 | * on any 'interesting' event, and readers of /proc/mdstat |
@@ -220,18 +287,21 @@ static int md_make_request(struct request_queue *q, struct bio *bio) | |||
220 | mddev_t *mddev = q->queuedata; | 287 | mddev_t *mddev = q->queuedata; |
221 | int rv; | 288 | int rv; |
222 | int cpu; | 289 | int cpu; |
290 | unsigned int sectors; | ||
223 | 291 | ||
224 | if (mddev == NULL || mddev->pers == NULL) { | 292 | if (mddev == NULL || mddev->pers == NULL |
293 | || !mddev->ready) { | ||
225 | bio_io_error(bio); | 294 | bio_io_error(bio); |
226 | return 0; | 295 | return 0; |
227 | } | 296 | } |
297 | smp_rmb(); /* Ensure implications of 'active' are visible */ | ||
228 | rcu_read_lock(); | 298 | rcu_read_lock(); |
229 | if (mddev->suspended || mddev->barrier) { | 299 | if (mddev->suspended) { |
230 | DEFINE_WAIT(__wait); | 300 | DEFINE_WAIT(__wait); |
231 | for (;;) { | 301 | for (;;) { |
232 | prepare_to_wait(&mddev->sb_wait, &__wait, | 302 | prepare_to_wait(&mddev->sb_wait, &__wait, |
233 | TASK_UNINTERRUPTIBLE); | 303 | TASK_UNINTERRUPTIBLE); |
234 | if (!mddev->suspended && !mddev->barrier) | 304 | if (!mddev->suspended) |
235 | break; | 305 | break; |
236 | rcu_read_unlock(); | 306 | rcu_read_unlock(); |
237 | schedule(); | 307 | schedule(); |
@@ -242,12 +312,16 @@ static int md_make_request(struct request_queue *q, struct bio *bio) | |||
242 | atomic_inc(&mddev->active_io); | 312 | atomic_inc(&mddev->active_io); |
243 | rcu_read_unlock(); | 313 | rcu_read_unlock(); |
244 | 314 | ||
315 | /* | ||
316 | * save the sectors now since our bio can | ||
317 | * go away inside make_request | ||
318 | */ | ||
319 | sectors = bio_sectors(bio); | ||
245 | rv = mddev->pers->make_request(mddev, bio); | 320 | rv = mddev->pers->make_request(mddev, bio); |
246 | 321 | ||
247 | cpu = part_stat_lock(); | 322 | cpu = part_stat_lock(); |
248 | part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); | 323 | part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); |
249 | part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], | 324 | part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); |
250 | bio_sectors(bio)); | ||
251 | part_stat_unlock(); | 325 | part_stat_unlock(); |
252 | 326 | ||
253 | if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) | 327 | if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) |
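A hedged fragment restating the ordering the new comment describes: once the bio has been handed to the personality it may complete and be freed, so the I/O accounting must use the value captured beforehand.

        unsigned int sectors = bio_sectors(bio);        /* bio is still ours here */

        mddev->pers->make_request(mddev, bio);          /* bio may complete and be freed */

        /* account with the saved value; re-reading bio_sectors(bio) at this
         * point could be a use-after-free */
        part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);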
@@ -277,48 +351,45 @@ void mddev_resume(mddev_t *mddev) | |||
277 | mddev->suspended = 0; | 351 | mddev->suspended = 0; |
278 | wake_up(&mddev->sb_wait); | 352 | wake_up(&mddev->sb_wait); |
279 | mddev->pers->quiesce(mddev, 0); | 353 | mddev->pers->quiesce(mddev, 0); |
354 | |||
355 | md_wakeup_thread(mddev->thread); | ||
356 | md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ | ||
280 | } | 357 | } |
281 | EXPORT_SYMBOL_GPL(mddev_resume); | 358 | EXPORT_SYMBOL_GPL(mddev_resume); |
282 | 359 | ||
283 | int mddev_congested(mddev_t *mddev, int bits) | 360 | int mddev_congested(mddev_t *mddev, int bits) |
284 | { | 361 | { |
285 | if (mddev->barrier) | ||
286 | return 1; | ||
287 | return mddev->suspended; | 362 | return mddev->suspended; |
288 | } | 363 | } |
289 | EXPORT_SYMBOL(mddev_congested); | 364 | EXPORT_SYMBOL(mddev_congested); |
290 | 365 | ||
291 | /* | 366 | /* |
292 | * Generic barrier handling for md | 367 | * Generic flush handling for md |
293 | */ | 368 | */ |
294 | 369 | ||
295 | #define POST_REQUEST_BARRIER ((void*)1) | 370 | static void md_end_flush(struct bio *bio, int err) |
296 | |||
297 | static void md_end_barrier(struct bio *bio, int err) | ||
298 | { | 371 | { |
299 | mdk_rdev_t *rdev = bio->bi_private; | 372 | mdk_rdev_t *rdev = bio->bi_private; |
300 | mddev_t *mddev = rdev->mddev; | 373 | mddev_t *mddev = rdev->mddev; |
301 | if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER) | ||
302 | set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags); | ||
303 | 374 | ||
304 | rdev_dec_pending(rdev, mddev); | 375 | rdev_dec_pending(rdev, mddev); |
305 | 376 | ||
306 | if (atomic_dec_and_test(&mddev->flush_pending)) { | 377 | if (atomic_dec_and_test(&mddev->flush_pending)) { |
307 | if (mddev->barrier == POST_REQUEST_BARRIER) { | 378 | /* The pre-request flush has finished */ |
308 | /* This was a post-request barrier */ | 379 | queue_work(md_wq, &mddev->flush_work); |
309 | mddev->barrier = NULL; | ||
310 | wake_up(&mddev->sb_wait); | ||
311 | } else | ||
312 | /* The pre-request barrier has finished */ | ||
313 | schedule_work(&mddev->barrier_work); | ||
314 | } | 380 | } |
315 | bio_put(bio); | 381 | bio_put(bio); |
316 | } | 382 | } |
317 | 383 | ||
318 | static void submit_barriers(mddev_t *mddev) | 384 | static void md_submit_flush_data(struct work_struct *ws); |
385 | |||
386 | static void submit_flushes(struct work_struct *ws) | ||
319 | { | 387 | { |
388 | mddev_t *mddev = container_of(ws, mddev_t, flush_work); | ||
320 | mdk_rdev_t *rdev; | 389 | mdk_rdev_t *rdev; |
321 | 390 | ||
391 | INIT_WORK(&mddev->flush_work, md_submit_flush_data); | ||
392 | atomic_set(&mddev->flush_pending, 1); | ||
322 | rcu_read_lock(); | 393 | rcu_read_lock(); |
323 | list_for_each_entry_rcu(rdev, &mddev->disks, same_set) | 394 | list_for_each_entry_rcu(rdev, &mddev->disks, same_set) |
324 | if (rdev->raid_disk >= 0 && | 395 | if (rdev->raid_disk >= 0 && |
@@ -331,106 +402,107 @@ static void submit_barriers(mddev_t *mddev) | |||
331 | atomic_inc(&rdev->nr_pending); | 402 | atomic_inc(&rdev->nr_pending); |
332 | atomic_inc(&rdev->nr_pending); | 403 | atomic_inc(&rdev->nr_pending); |
333 | rcu_read_unlock(); | 404 | rcu_read_unlock(); |
334 | bi = bio_alloc(GFP_KERNEL, 0); | 405 | bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev); |
335 | bi->bi_end_io = md_end_barrier; | 406 | bi->bi_end_io = md_end_flush; |
336 | bi->bi_private = rdev; | 407 | bi->bi_private = rdev; |
337 | bi->bi_bdev = rdev->bdev; | 408 | bi->bi_bdev = rdev->bdev; |
338 | atomic_inc(&mddev->flush_pending); | 409 | atomic_inc(&mddev->flush_pending); |
339 | submit_bio(WRITE_BARRIER, bi); | 410 | submit_bio(WRITE_FLUSH, bi); |
340 | rcu_read_lock(); | 411 | rcu_read_lock(); |
341 | rdev_dec_pending(rdev, mddev); | 412 | rdev_dec_pending(rdev, mddev); |
342 | } | 413 | } |
343 | rcu_read_unlock(); | 414 | rcu_read_unlock(); |
415 | if (atomic_dec_and_test(&mddev->flush_pending)) | ||
416 | queue_work(md_wq, &mddev->flush_work); | ||
344 | } | 417 | } |
345 | 418 | ||
346 | static void md_submit_barrier(struct work_struct *ws) | 419 | static void md_submit_flush_data(struct work_struct *ws) |
347 | { | 420 | { |
348 | mddev_t *mddev = container_of(ws, mddev_t, barrier_work); | 421 | mddev_t *mddev = container_of(ws, mddev_t, flush_work); |
349 | struct bio *bio = mddev->barrier; | 422 | struct bio *bio = mddev->flush_bio; |
350 | |||
351 | atomic_set(&mddev->flush_pending, 1); | ||
352 | 423 | ||
353 | if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) | 424 | if (bio->bi_size == 0) |
354 | bio_endio(bio, -EOPNOTSUPP); | ||
355 | else if (bio->bi_size == 0) | ||
356 | /* an empty barrier - all done */ | 425 | /* an empty barrier - all done */ |
357 | bio_endio(bio, 0); | 426 | bio_endio(bio, 0); |
358 | else { | 427 | else { |
359 | bio->bi_rw &= ~REQ_HARDBARRIER; | 428 | bio->bi_rw &= ~REQ_FLUSH; |
360 | if (mddev->pers->make_request(mddev, bio)) | 429 | if (mddev->pers->make_request(mddev, bio)) |
361 | generic_make_request(bio); | 430 | generic_make_request(bio); |
362 | mddev->barrier = POST_REQUEST_BARRIER; | ||
363 | submit_barriers(mddev); | ||
364 | } | ||
365 | if (atomic_dec_and_test(&mddev->flush_pending)) { | ||
366 | mddev->barrier = NULL; | ||
367 | wake_up(&mddev->sb_wait); | ||
368 | } | 431 | } |
432 | |||
433 | mddev->flush_bio = NULL; | ||
434 | wake_up(&mddev->sb_wait); | ||
369 | } | 435 | } |
370 | 436 | ||
371 | void md_barrier_request(mddev_t *mddev, struct bio *bio) | 437 | void md_flush_request(mddev_t *mddev, struct bio *bio) |
372 | { | 438 | { |
373 | spin_lock_irq(&mddev->write_lock); | 439 | spin_lock_irq(&mddev->write_lock); |
374 | wait_event_lock_irq(mddev->sb_wait, | 440 | wait_event_lock_irq(mddev->sb_wait, |
375 | !mddev->barrier, | 441 | !mddev->flush_bio, |
376 | mddev->write_lock, /*nothing*/); | 442 | mddev->write_lock, /*nothing*/); |
377 | mddev->barrier = bio; | 443 | mddev->flush_bio = bio; |
378 | spin_unlock_irq(&mddev->write_lock); | 444 | spin_unlock_irq(&mddev->write_lock); |
379 | 445 | ||
380 | atomic_set(&mddev->flush_pending, 1); | 446 | INIT_WORK(&mddev->flush_work, submit_flushes); |
381 | INIT_WORK(&mddev->barrier_work, md_submit_barrier); | 447 | queue_work(md_wq, &mddev->flush_work); |
382 | |||
383 | submit_barriers(mddev); | ||
384 | |||
385 | if (atomic_dec_and_test(&mddev->flush_pending)) | ||
386 | schedule_work(&mddev->barrier_work); | ||
387 | } | 448 | } |
388 | EXPORT_SYMBOL(md_barrier_request); | 449 | EXPORT_SYMBOL(md_flush_request); |
389 | 450 | ||
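A hypothetical personality hook showing the intended caller side of md_flush_request(); it mirrors the linear_make_request() change earlier in this diff. example_make_request is an illustrative name, not part of the patch.

static int example_make_request(mddev_t *mddev, struct bio *bio)
{
        if (unlikely(bio->bi_rw & REQ_FLUSH)) {
                /* md core sends an empty WRITE_FLUSH to every active rdev,
                 * then re-submits this bio with REQ_FLUSH cleared */
                md_flush_request(mddev, bio);
                return 0;
        }

        /* ... normal mapping of the bio onto member devices ... */
        return 0;
}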
390 | /* Support for plugging. | 451 | /* Support for plugging. |
391 | * This mirrors the plugging support in request_queue, but does not | 452 | * This mirrors the plugging support in request_queue, but does not |
392 | * require having a whole queue | 453 | * require having a whole queue or request structures. |
454 | * We allocate an md_plug_cb for each md device and each thread it gets | ||
455 | plugged on. This links to the private plug_handle structure in the | ||
456 | * personality data where we keep a count of the number of outstanding | ||
457 | * plugs so other code can see if a plug is active. | ||
393 | */ | 458 | */ |
394 | static void plugger_work(struct work_struct *work) | 459 | struct md_plug_cb { |
395 | { | 460 | struct blk_plug_cb cb; |
396 | struct plug_handle *plug = | 461 | mddev_t *mddev; |
397 | container_of(work, struct plug_handle, unplug_work); | 462 | }; |
398 | plug->unplug_fn(plug); | ||
399 | } | ||
400 | static void plugger_timeout(unsigned long data) | ||
401 | { | ||
402 | struct plug_handle *plug = (void *)data; | ||
403 | kblockd_schedule_work(NULL, &plug->unplug_work); | ||
404 | } | ||
405 | void plugger_init(struct plug_handle *plug, | ||
406 | void (*unplug_fn)(struct plug_handle *)) | ||
407 | { | ||
408 | plug->unplug_flag = 0; | ||
409 | plug->unplug_fn = unplug_fn; | ||
410 | init_timer(&plug->unplug_timer); | ||
411 | plug->unplug_timer.function = plugger_timeout; | ||
412 | plug->unplug_timer.data = (unsigned long)plug; | ||
413 | INIT_WORK(&plug->unplug_work, plugger_work); | ||
414 | } | ||
415 | EXPORT_SYMBOL_GPL(plugger_init); | ||
416 | 463 | ||
417 | void plugger_set_plug(struct plug_handle *plug) | 464 | static void plugger_unplug(struct blk_plug_cb *cb) |
418 | { | 465 | { |
419 | if (!test_and_set_bit(PLUGGED_FLAG, &plug->unplug_flag)) | 466 | struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb); |
420 | mod_timer(&plug->unplug_timer, jiffies + msecs_to_jiffies(3)+1); | 467 | if (atomic_dec_and_test(&mdcb->mddev->plug_cnt)) |
468 | md_wakeup_thread(mdcb->mddev->thread); | ||
469 | kfree(mdcb); | ||
421 | } | 470 | } |
422 | EXPORT_SYMBOL_GPL(plugger_set_plug); | ||
423 | 471 | ||
424 | int plugger_remove_plug(struct plug_handle *plug) | 472 | /* Check that an unplug wakeup will come shortly. |
473 | * If not, wakeup the md thread immediately | ||
474 | */ | ||
475 | int mddev_check_plugged(mddev_t *mddev) | ||
425 | { | 476 | { |
426 | if (test_and_clear_bit(PLUGGED_FLAG, &plug->unplug_flag)) { | 477 | struct blk_plug *plug = current->plug; |
427 | del_timer(&plug->unplug_timer); | 478 | struct md_plug_cb *mdcb; |
428 | return 1; | 479 | |
429 | } else | 480 | if (!plug) |
430 | return 0; | 481 | return 0; |
431 | } | ||
432 | EXPORT_SYMBOL_GPL(plugger_remove_plug); | ||
433 | 482 | ||
483 | list_for_each_entry(mdcb, &plug->cb_list, cb.list) { | ||
484 | if (mdcb->cb.callback == plugger_unplug && | ||
485 | mdcb->mddev == mddev) { | ||
486 | /* Already on the list, move to top */ | ||
487 | if (mdcb != list_first_entry(&plug->cb_list, | ||
488 | struct md_plug_cb, | ||
489 | cb.list)) | ||
490 | list_move(&mdcb->cb.list, &plug->cb_list); | ||
491 | return 1; | ||
492 | } | ||
493 | } | ||
494 | /* Not currently on the callback list */ | ||
495 | mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC); | ||
496 | if (!mdcb) | ||
497 | return 0; | ||
498 | |||
499 | mdcb->mddev = mddev; | ||
500 | mdcb->cb.callback = plugger_unplug; | ||
501 | atomic_inc(&mddev->plug_cnt); | ||
502 | list_add(&mdcb->cb.list, &plug->cb_list); | ||
503 | return 1; | ||
504 | } | ||
505 | EXPORT_SYMBOL_GPL(mddev_check_plugged); | ||
434 | 506 | ||
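A sketch of how a personality's write path is expected to use the new helper (example_queue_write is hypothetical; the raid personalities follow roughly this pattern): if the submitting task holds a blk_plug, defer the wakeup to plugger_unplug(), otherwise kick the md thread at once.

static void example_queue_write(mddev_t *mddev, struct bio *bio)
{
        int plugged = mddev_check_plugged(mddev);

        /* ... add the bio to the personality's pending list ... */

        if (!plugged)
                md_wakeup_thread(mddev->thread);
        /* otherwise plugger_unplug() wakes the thread once plug_cnt drops to 0 */
}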
435 | static inline mddev_t *mddev_get(mddev_t *mddev) | 507 | static inline mddev_t *mddev_get(mddev_t *mddev) |
436 | { | 508 | { |
@@ -442,6 +514,8 @@ static void mddev_delayed_delete(struct work_struct *ws); | |||
442 | 514 | ||
443 | static void mddev_put(mddev_t *mddev) | 515 | static void mddev_put(mddev_t *mddev) |
444 | { | 516 | { |
517 | struct bio_set *bs = NULL; | ||
518 | |||
445 | if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) | 519 | if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) |
446 | return; | 520 | return; |
447 | if (!mddev->raid_disks && list_empty(&mddev->disks) && | 521 | if (!mddev->raid_disks && list_empty(&mddev->disks) && |
@@ -449,19 +523,22 @@ static void mddev_put(mddev_t *mddev) | |||
449 | /* Array is not configured at all, and not held active, | 523 | /* Array is not configured at all, and not held active, |
450 | * so destroy it */ | 524 | * so destroy it */ |
451 | list_del(&mddev->all_mddevs); | 525 | list_del(&mddev->all_mddevs); |
526 | bs = mddev->bio_set; | ||
527 | mddev->bio_set = NULL; | ||
452 | if (mddev->gendisk) { | 528 | if (mddev->gendisk) { |
453 | /* we did a probe so need to clean up. | 529 | /* We did a probe so need to clean up. Call |
454 | * Call schedule_work inside the spinlock | 530 | * queue_work inside the spinlock so that |
455 | * so that flush_scheduled_work() after | 531 | * flush_workqueue() after mddev_find will |
456 | * mddev_find will succeed in waiting for the | 532 | * succeed in waiting for the work to be done. |
457 | * work to be done. | ||
458 | */ | 533 | */ |
459 | INIT_WORK(&mddev->del_work, mddev_delayed_delete); | 534 | INIT_WORK(&mddev->del_work, mddev_delayed_delete); |
460 | schedule_work(&mddev->del_work); | 535 | queue_work(md_misc_wq, &mddev->del_work); |
461 | } else | 536 | } else |
462 | kfree(mddev); | 537 | kfree(mddev); |
463 | } | 538 | } |
464 | spin_unlock(&all_mddevs_lock); | 539 | spin_unlock(&all_mddevs_lock); |
540 | if (bs) | ||
541 | bioset_free(bs); | ||
465 | } | 542 | } |
466 | 543 | ||
467 | void mddev_init(mddev_t *mddev) | 544 | void mddev_init(mddev_t *mddev) |
@@ -475,6 +552,7 @@ void mddev_init(mddev_t *mddev) | |||
475 | atomic_set(&mddev->active, 1); | 552 | atomic_set(&mddev->active, 1); |
476 | atomic_set(&mddev->openers, 0); | 553 | atomic_set(&mddev->openers, 0); |
477 | atomic_set(&mddev->active_io, 0); | 554 | atomic_set(&mddev->active_io, 0); |
555 | atomic_set(&mddev->plug_cnt, 0); | ||
478 | spin_lock_init(&mddev->write_lock); | 556 | spin_lock_init(&mddev->write_lock); |
479 | atomic_set(&mddev->flush_pending, 0); | 557 | atomic_set(&mddev->flush_pending, 0); |
480 | init_waitqueue_head(&mddev->sb_wait); | 558 | init_waitqueue_head(&mddev->sb_wait); |
@@ -490,6 +568,9 @@ static mddev_t * mddev_find(dev_t unit) | |||
490 | { | 568 | { |
491 | mddev_t *mddev, *new = NULL; | 569 | mddev_t *mddev, *new = NULL; |
492 | 570 | ||
571 | if (unit && MAJOR(unit) != MD_MAJOR) | ||
572 | unit &= ~((1<<MdpMinorShift)-1); | ||
573 | |||
493 | retry: | 574 | retry: |
494 | spin_lock(&all_mddevs_lock); | 575 | spin_lock(&all_mddevs_lock); |
495 | 576 | ||
@@ -647,9 +728,9 @@ static struct mdk_personality *find_pers(int level, char *clevel) | |||
647 | } | 728 | } |
648 | 729 | ||
649 | /* return the offset of the super block in 512byte sectors */ | 730 | /* return the offset of the super block in 512byte sectors */ |
650 | static inline sector_t calc_dev_sboffset(struct block_device *bdev) | 731 | static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev) |
651 | { | 732 | { |
652 | sector_t num_sectors = bdev->bd_inode->i_size / 512; | 733 | sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512; |
653 | return MD_NEW_SIZE_SECTORS(num_sectors); | 734 | return MD_NEW_SIZE_SECTORS(num_sectors); |
654 | } | 735 | } |
655 | 736 | ||
@@ -696,31 +777,6 @@ static void super_written(struct bio *bio, int error) | |||
696 | bio_put(bio); | 777 | bio_put(bio); |
697 | } | 778 | } |
698 | 779 | ||
699 | static void super_written_barrier(struct bio *bio, int error) | ||
700 | { | ||
701 | struct bio *bio2 = bio->bi_private; | ||
702 | mdk_rdev_t *rdev = bio2->bi_private; | ||
703 | mddev_t *mddev = rdev->mddev; | ||
704 | |||
705 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && | ||
706 | error == -EOPNOTSUPP) { | ||
707 | unsigned long flags; | ||
708 | /* barriers don't appear to be supported :-( */ | ||
709 | set_bit(BarriersNotsupp, &rdev->flags); | ||
710 | mddev->barriers_work = 0; | ||
711 | spin_lock_irqsave(&mddev->write_lock, flags); | ||
712 | bio2->bi_next = mddev->biolist; | ||
713 | mddev->biolist = bio2; | ||
714 | spin_unlock_irqrestore(&mddev->write_lock, flags); | ||
715 | wake_up(&mddev->sb_wait); | ||
716 | bio_put(bio); | ||
717 | } else { | ||
718 | bio_put(bio2); | ||
719 | bio->bi_private = rdev; | ||
720 | super_written(bio, error); | ||
721 | } | ||
722 | } | ||
723 | |||
724 | void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, | 780 | void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, |
725 | sector_t sector, int size, struct page *page) | 781 | sector_t sector, int size, struct page *page) |
726 | { | 782 | { |
@@ -729,51 +785,27 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, | |||
729 | * and decrement it on completion, waking up sb_wait | 785 | * and decrement it on completion, waking up sb_wait |
730 | * if zero is reached. | 786 | * if zero is reached. |
731 | * If an error occurred, call md_error | 787 | * If an error occurred, call md_error |
732 | * | ||
733 | * As we might need to resubmit the request if REQ_HARDBARRIER | ||
734 | * causes ENOTSUPP, we allocate a spare bio... | ||
735 | */ | 788 | */ |
736 | struct bio *bio = bio_alloc(GFP_NOIO, 1); | 789 | struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); |
737 | int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG; | ||
738 | 790 | ||
739 | bio->bi_bdev = rdev->bdev; | 791 | bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; |
740 | bio->bi_sector = sector; | 792 | bio->bi_sector = sector; |
741 | bio_add_page(bio, page, size, 0); | 793 | bio_add_page(bio, page, size, 0); |
742 | bio->bi_private = rdev; | 794 | bio->bi_private = rdev; |
743 | bio->bi_end_io = super_written; | 795 | bio->bi_end_io = super_written; |
744 | bio->bi_rw = rw; | ||
745 | 796 | ||
746 | atomic_inc(&mddev->pending_writes); | 797 | atomic_inc(&mddev->pending_writes); |
747 | if (!test_bit(BarriersNotsupp, &rdev->flags)) { | 798 | submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA, bio); |
748 | struct bio *rbio; | ||
749 | rw |= REQ_HARDBARRIER; | ||
750 | rbio = bio_clone(bio, GFP_NOIO); | ||
751 | rbio->bi_private = bio; | ||
752 | rbio->bi_end_io = super_written_barrier; | ||
753 | submit_bio(rw, rbio); | ||
754 | } else | ||
755 | submit_bio(rw, bio); | ||
756 | } | 799 | } |
757 | 800 | ||
758 | void md_super_wait(mddev_t *mddev) | 801 | void md_super_wait(mddev_t *mddev) |
759 | { | 802 | { |
760 | /* wait for all superblock writes that were scheduled to complete. | 803 | /* wait for all superblock writes that were scheduled to complete */ |
761 | * if any had to be retried (due to BARRIER problems), retry them | ||
762 | */ | ||
763 | DEFINE_WAIT(wq); | 804 | DEFINE_WAIT(wq); |
764 | for(;;) { | 805 | for(;;) { |
765 | prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); | 806 | prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); |
766 | if (atomic_read(&mddev->pending_writes)==0) | 807 | if (atomic_read(&mddev->pending_writes)==0) |
767 | break; | 808 | break; |
768 | while (mddev->biolist) { | ||
769 | struct bio *bio; | ||
770 | spin_lock_irq(&mddev->write_lock); | ||
771 | bio = mddev->biolist; | ||
772 | mddev->biolist = bio->bi_next ; | ||
773 | bio->bi_next = NULL; | ||
774 | spin_unlock_irq(&mddev->write_lock); | ||
775 | submit_bio(bio->bi_rw, bio); | ||
776 | } | ||
777 | schedule(); | 809 | schedule(); |
778 | } | 810 | } |
779 | finish_wait(&mddev->sb_wait, &wq); | 811 | finish_wait(&mddev->sb_wait, &wq); |
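A rough sketch of the caller pattern these simplified helpers serve (md_update_sb() does essentially this; error handling and bookkeeping are elided): each member gets a single FLUSH+FUA superblock write and md_super_wait() blocks until pending_writes drains, with no barrier-retry path left to handle.

        list_for_each_entry(rdev, &mddev->disks, same_set) {
                if (rdev->sb_loaded != 1)
                        continue;               /* skipped or not prepared */
                md_super_write(mddev, rdev, rdev->sb_start,
                               rdev->sb_size, rdev->sb_page);
        }
        md_super_wait(mddev);   /* returns once pending_writes reaches zero */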
@@ -784,17 +816,21 @@ static void bi_complete(struct bio *bio, int error) | |||
784 | complete((struct completion*)bio->bi_private); | 816 | complete((struct completion*)bio->bi_private); |
785 | } | 817 | } |
786 | 818 | ||
787 | int sync_page_io(struct block_device *bdev, sector_t sector, int size, | 819 | int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, |
788 | struct page *page, int rw) | 820 | struct page *page, int rw, bool metadata_op) |
789 | { | 821 | { |
790 | struct bio *bio = bio_alloc(GFP_NOIO, 1); | 822 | struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); |
791 | struct completion event; | 823 | struct completion event; |
792 | int ret; | 824 | int ret; |
793 | 825 | ||
794 | rw |= REQ_SYNC | REQ_UNPLUG; | 826 | rw |= REQ_SYNC; |
795 | 827 | ||
796 | bio->bi_bdev = bdev; | 828 | bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? |
797 | bio->bi_sector = sector; | 829 | rdev->meta_bdev : rdev->bdev; |
830 | if (metadata_op) | ||
831 | bio->bi_sector = sector + rdev->sb_start; | ||
832 | else | ||
833 | bio->bi_sector = sector + rdev->data_offset; | ||
798 | bio_add_page(bio, page, size, 0); | 834 | bio_add_page(bio, page, size, 0); |
799 | init_completion(&event); | 835 | init_completion(&event); |
800 | bio->bi_private = &event; | 836 | bio->bi_private = &event; |
@@ -819,7 +855,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size) | |||
819 | return 0; | 855 | return 0; |
820 | 856 | ||
821 | 857 | ||
822 | if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ)) | 858 | if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true)) |
823 | goto fail; | 859 | goto fail; |
824 | rdev->sb_loaded = 1; | 860 | rdev->sb_loaded = 1; |
825 | return 0; | 861 | return 0; |
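Illustrative calls against the new sync_page_io() signature: the helper now applies the offset itself, using sb_start (and meta_bdev, if set) when metadata_op is true and data_offset otherwise. The sect/page variables and the fail label are assumed to exist in the caller.

        /* metadata: read the superblock page at sector 0 relative to sb_start */
        if (!sync_page_io(rdev, 0, MD_SB_BYTES, rdev->sb_page, READ, true))
                goto fail;

        /* data: read one page at 'sect' relative to data_offset */
        if (!sync_page_io(rdev, sect, PAGE_SIZE, page, READ, false))
                goto fail;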
@@ -981,7 +1017,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
981 | * | 1017 | * |
982 | * It also happens to be a multiple of 4Kb. | 1018 | * It also happens to be a multiple of 4Kb. |
983 | */ | 1019 | */ |
984 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); | 1020 | rdev->sb_start = calc_dev_sboffset(rdev); |
985 | 1021 | ||
986 | ret = read_disk_sb(rdev, MD_SB_BYTES); | 1022 | ret = read_disk_sb(rdev, MD_SB_BYTES); |
987 | if (ret) return ret; | 1023 | if (ret) return ret; |
@@ -1070,7 +1106,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1070 | clear_bit(Faulty, &rdev->flags); | 1106 | clear_bit(Faulty, &rdev->flags); |
1071 | clear_bit(In_sync, &rdev->flags); | 1107 | clear_bit(In_sync, &rdev->flags); |
1072 | clear_bit(WriteMostly, &rdev->flags); | 1108 | clear_bit(WriteMostly, &rdev->flags); |
1073 | clear_bit(BarriersNotsupp, &rdev->flags); | ||
1074 | 1109 | ||
1075 | if (mddev->raid_disks == 0) { | 1110 | if (mddev->raid_disks == 0) { |
1076 | mddev->major_version = 0; | 1111 | mddev->major_version = 0; |
@@ -1323,13 +1358,13 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) | |||
1323 | return 0; /* component must fit device */ | 1358 | return 0; /* component must fit device */ |
1324 | if (rdev->mddev->bitmap_info.offset) | 1359 | if (rdev->mddev->bitmap_info.offset) |
1325 | return 0; /* can't move bitmap */ | 1360 | return 0; /* can't move bitmap */ |
1326 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); | 1361 | rdev->sb_start = calc_dev_sboffset(rdev); |
1327 | if (!num_sectors || num_sectors > rdev->sb_start) | 1362 | if (!num_sectors || num_sectors > rdev->sb_start) |
1328 | num_sectors = rdev->sb_start; | 1363 | num_sectors = rdev->sb_start; |
1329 | md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, | 1364 | md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, |
1330 | rdev->sb_page); | 1365 | rdev->sb_page); |
1331 | md_super_wait(rdev->mddev); | 1366 | md_super_wait(rdev->mddev); |
1332 | return num_sectors / 2; /* kB for sysfs */ | 1367 | return num_sectors; |
1333 | } | 1368 | } |
1334 | 1369 | ||
1335 | 1370 | ||
@@ -1378,7 +1413,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
1378 | */ | 1413 | */ |
1379 | switch(minor_version) { | 1414 | switch(minor_version) { |
1380 | case 0: | 1415 | case 0: |
1381 | sb_start = rdev->bdev->bd_inode->i_size >> 9; | 1416 | sb_start = i_size_read(rdev->bdev->bd_inode) >> 9; |
1382 | sb_start -= 8*2; | 1417 | sb_start -= 8*2; |
1383 | sb_start &= ~(sector_t)(4*2-1); | 1418 | sb_start &= ~(sector_t)(4*2-1); |
1384 | break; | 1419 | break; |
@@ -1464,7 +1499,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
1464 | ret = 0; | 1499 | ret = 0; |
1465 | } | 1500 | } |
1466 | if (minor_version) | 1501 | if (minor_version) |
1467 | rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) - | 1502 | rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - |
1468 | le64_to_cpu(sb->data_offset); | 1503 | le64_to_cpu(sb->data_offset); |
1469 | else | 1504 | else |
1470 | rdev->sectors = rdev->sb_start; | 1505 | rdev->sectors = rdev->sb_start; |
@@ -1485,7 +1520,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1485 | clear_bit(Faulty, &rdev->flags); | 1520 | clear_bit(Faulty, &rdev->flags); |
1486 | clear_bit(In_sync, &rdev->flags); | 1521 | clear_bit(In_sync, &rdev->flags); |
1487 | clear_bit(WriteMostly, &rdev->flags); | 1522 | clear_bit(WriteMostly, &rdev->flags); |
1488 | clear_bit(BarriersNotsupp, &rdev->flags); | ||
1489 | 1523 | ||
1490 | if (mddev->raid_disks == 0) { | 1524 | if (mddev->raid_disks == 0) { |
1491 | mddev->major_version = 1; | 1525 | mddev->major_version = 1; |
@@ -1673,7 +1707,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) | |||
1673 | return 0; /* component must fit device */ | 1707 | return 0; /* component must fit device */ |
1674 | if (rdev->sb_start < rdev->data_offset) { | 1708 | if (rdev->sb_start < rdev->data_offset) { |
1675 | /* minor versions 1 and 2; superblock before data */ | 1709 | /* minor versions 1 and 2; superblock before data */ |
1676 | max_sectors = rdev->bdev->bd_inode->i_size >> 9; | 1710 | max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; |
1677 | max_sectors -= rdev->data_offset; | 1711 | max_sectors -= rdev->data_offset; |
1678 | if (!num_sectors || num_sectors > max_sectors) | 1712 | if (!num_sectors || num_sectors > max_sectors) |
1679 | num_sectors = max_sectors; | 1713 | num_sectors = max_sectors; |
@@ -1683,7 +1717,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) | |||
1683 | } else { | 1717 | } else { |
1684 | /* minor version 0; superblock after data */ | 1718 | /* minor version 0; superblock after data */ |
1685 | sector_t sb_start; | 1719 | sector_t sb_start; |
1686 | sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2; | 1720 | sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2; |
1687 | sb_start &= ~(sector_t)(4*2 - 1); | 1721 | sb_start &= ~(sector_t)(4*2 - 1); |
1688 | max_sectors = rdev->sectors + sb_start - rdev->sb_start; | 1722 | max_sectors = rdev->sectors + sb_start - rdev->sb_start; |
1689 | if (!num_sectors || num_sectors > max_sectors) | 1723 | if (!num_sectors || num_sectors > max_sectors) |
@@ -1697,7 +1731,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) | |||
1697 | md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, | 1731 | md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, |
1698 | rdev->sb_page); | 1732 | rdev->sb_page); |
1699 | md_super_wait(rdev->mddev); | 1733 | md_super_wait(rdev->mddev); |
1700 | return num_sectors / 2; /* kB for sysfs */ | 1734 | return num_sectors; |
1701 | } | 1735 | } |
1702 | 1736 | ||
1703 | static struct super_type super_types[] = { | 1737 | static struct super_type super_types[] = { |
@@ -1719,6 +1753,18 @@ static struct super_type super_types[] = { | |||
1719 | }, | 1753 | }, |
1720 | }; | 1754 | }; |
1721 | 1755 | ||
1756 | static void sync_super(mddev_t *mddev, mdk_rdev_t *rdev) | ||
1757 | { | ||
1758 | if (mddev->sync_super) { | ||
1759 | mddev->sync_super(mddev, rdev); | ||
1760 | return; | ||
1761 | } | ||
1762 | |||
1763 | BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); | ||
1764 | |||
1765 | super_types[mddev->major_version].sync_super(mddev, rdev); | ||
1766 | } | ||
1767 | |||
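A hypothetical user of the new hook (my_sync_super/my_setup are illustrative names): an external wrapper can take over superblock updates by setting mddev->sync_super before the array is run; if the hook is left NULL, sync_sbs() falls back to super_types[] exactly as before.

static void my_sync_super(mddev_t *mddev, mdk_rdev_t *rdev)
{
        /* serialise the wrapper's own metadata format into rdev->sb_page */
}

static void my_setup(mddev_t *mddev)
{
        /* installed before md_run(); sync_sbs() then calls this instead of
         * super_types[mddev->major_version].sync_super() */
        mddev->sync_super = my_sync_super;
}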
1722 | static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) | 1768 | static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) |
1723 | { | 1769 | { |
1724 | mdk_rdev_t *rdev, *rdev2; | 1770 | mdk_rdev_t *rdev, *rdev2; |
@@ -1750,20 +1796,14 @@ int md_integrity_register(mddev_t *mddev) | |||
1750 | 1796 | ||
1751 | if (list_empty(&mddev->disks)) | 1797 | if (list_empty(&mddev->disks)) |
1752 | return 0; /* nothing to do */ | 1798 | return 0; /* nothing to do */ |
1753 | if (blk_get_integrity(mddev->gendisk)) | 1799 | if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) |
1754 | return 0; /* already registered */ | 1800 | return 0; /* shouldn't register, or already is */ |
1755 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 1801 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
1756 | /* skip spares and non-functional disks */ | 1802 | /* skip spares and non-functional disks */ |
1757 | if (test_bit(Faulty, &rdev->flags)) | 1803 | if (test_bit(Faulty, &rdev->flags)) |
1758 | continue; | 1804 | continue; |
1759 | if (rdev->raid_disk < 0) | 1805 | if (rdev->raid_disk < 0) |
1760 | continue; | 1806 | continue; |
1761 | /* | ||
1762 | * If at least one rdev is not integrity capable, we can not | ||
1763 | * enable data integrity for the md device. | ||
1764 | */ | ||
1765 | if (!bdev_get_integrity(rdev->bdev)) | ||
1766 | return -EINVAL; | ||
1767 | if (!reference) { | 1807 | if (!reference) { |
1768 | /* Use the first rdev as the reference */ | 1808 | /* Use the first rdev as the reference */ |
1769 | reference = rdev; | 1809 | reference = rdev; |
@@ -1774,6 +1814,8 @@ int md_integrity_register(mddev_t *mddev) | |||
1774 | rdev->bdev->bd_disk) < 0) | 1814 | rdev->bdev->bd_disk) < 0) |
1775 | return -EINVAL; | 1815 | return -EINVAL; |
1776 | } | 1816 | } |
1817 | if (!reference || !bdev_get_integrity(reference->bdev)) | ||
1818 | return 0; | ||
1777 | /* | 1819 | /* |
1778 | * All component devices are integrity capable and have matching | 1820 | * All component devices are integrity capable and have matching |
1779 | * profiles, register the common profile for the md device. | 1821 | * profiles, register the common profile for the md device. |
@@ -1784,8 +1826,12 @@ int md_integrity_register(mddev_t *mddev) | |||
1784 | mdname(mddev)); | 1826 | mdname(mddev)); |
1785 | return -EINVAL; | 1827 | return -EINVAL; |
1786 | } | 1828 | } |
1787 | printk(KERN_NOTICE "md: data integrity on %s enabled\n", | 1829 | printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev)); |
1788 | mdname(mddev)); | 1830 | if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { |
1831 | printk(KERN_ERR "md: failed to create integrity pool for %s\n", | ||
1832 | mdname(mddev)); | ||
1833 | return -EINVAL; | ||
1834 | } | ||
1789 | return 0; | 1835 | return 0; |
1790 | } | 1836 | } |
1791 | EXPORT_SYMBOL(md_integrity_register); | 1837 | EXPORT_SYMBOL(md_integrity_register); |
@@ -1873,7 +1919,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) | |||
1873 | rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); | 1919 | rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); |
1874 | 1920 | ||
1875 | list_add_rcu(&rdev->same_set, &mddev->disks); | 1921 | list_add_rcu(&rdev->same_set, &mddev->disks); |
1876 | bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); | 1922 | bd_link_disk_holder(rdev->bdev, mddev->gendisk); |
1877 | 1923 | ||
1878 | /* May as well allow recovery to be retried once */ | 1924 | /* May as well allow recovery to be retried once */ |
1879 | mddev->recovery_disabled = 0; | 1925 | mddev->recovery_disabled = 0; |
@@ -1900,7 +1946,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) | |||
1900 | MD_BUG(); | 1946 | MD_BUG(); |
1901 | return; | 1947 | return; |
1902 | } | 1948 | } |
1903 | bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); | 1949 | bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); |
1904 | list_del_rcu(&rdev->same_set); | 1950 | list_del_rcu(&rdev->same_set); |
1905 | printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); | 1951 | printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); |
1906 | rdev->mddev = NULL; | 1952 | rdev->mddev = NULL; |
@@ -1914,7 +1960,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) | |||
1914 | synchronize_rcu(); | 1960 | synchronize_rcu(); |
1915 | INIT_WORK(&rdev->del_work, md_delayed_delete); | 1961 | INIT_WORK(&rdev->del_work, md_delayed_delete); |
1916 | kobject_get(&rdev->kobj); | 1962 | kobject_get(&rdev->kobj); |
1917 | schedule_work(&rdev->del_work); | 1963 | queue_work(md_misc_wq, &rdev->del_work); |
1918 | } | 1964 | } |
1919 | 1965 | ||
1920 | /* | 1966 | /* |
@@ -1928,21 +1974,13 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) | |||
1928 | struct block_device *bdev; | 1974 | struct block_device *bdev; |
1929 | char b[BDEVNAME_SIZE]; | 1975 | char b[BDEVNAME_SIZE]; |
1930 | 1976 | ||
1931 | bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); | 1977 | bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, |
1978 | shared ? (mdk_rdev_t *)lock_rdev : rdev); | ||
1932 | if (IS_ERR(bdev)) { | 1979 | if (IS_ERR(bdev)) { |
1933 | printk(KERN_ERR "md: could not open %s.\n", | 1980 | printk(KERN_ERR "md: could not open %s.\n", |
1934 | __bdevname(dev, b)); | 1981 | __bdevname(dev, b)); |
1935 | return PTR_ERR(bdev); | 1982 | return PTR_ERR(bdev); |
1936 | } | 1983 | } |
1937 | err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev); | ||
1938 | if (err) { | ||
1939 | printk(KERN_ERR "md: could not bd_claim %s.\n", | ||
1940 | bdevname(bdev, b)); | ||
1941 | blkdev_put(bdev, FMODE_READ|FMODE_WRITE); | ||
1942 | return err; | ||
1943 | } | ||
1944 | if (!shared) | ||
1945 | set_bit(AllReserved, &rdev->flags); | ||
1946 | rdev->bdev = bdev; | 1984 | rdev->bdev = bdev; |
1947 | return err; | 1985 | return err; |
1948 | } | 1986 | } |
@@ -1953,8 +1991,7 @@ static void unlock_rdev(mdk_rdev_t *rdev) | |||
1953 | rdev->bdev = NULL; | 1991 | rdev->bdev = NULL; |
1954 | if (!bdev) | 1992 | if (!bdev) |
1955 | MD_BUG(); | 1993 | MD_BUG(); |
1956 | bd_release(bdev); | 1994 | blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); |
1957 | blkdev_put(bdev, FMODE_READ|FMODE_WRITE); | ||
1958 | } | 1995 | } |
1959 | 1996 | ||
1960 | void md_autodetect_dev(dev_t dev); | 1997 | void md_autodetect_dev(dev_t dev); |
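An illustrative pairing under the exclusive-open API used above (assuming the 2.6.38-era block layer): the holder passed to blkdev_get_by_dev() establishes the claim that bd_claim() used to, and passing FMODE_EXCL back to blkdev_put() releases it.

        struct block_device *bdev;

        bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, rdev);
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);
        /* ... use the device ... */
        blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);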
@@ -2146,8 +2183,7 @@ static void sync_sbs(mddev_t * mddev, int nospares) | |||
2146 | /* Don't update this superblock */ | 2183 | /* Don't update this superblock */ |
2147 | rdev->sb_loaded = 2; | 2184 | rdev->sb_loaded = 2; |
2148 | } else { | 2185 | } else { |
2149 | super_types[mddev->major_version]. | 2186 | sync_super(mddev, rdev); |
2150 | sync_super(mddev, rdev); | ||
2151 | rdev->sb_loaded = 1; | 2187 | rdev->sb_loaded = 1; |
2152 | } | 2188 | } |
2153 | } | 2189 | } |
@@ -2172,6 +2208,8 @@ repeat: | |||
2172 | if (!mddev->persistent) { | 2208 | if (!mddev->persistent) { |
2173 | clear_bit(MD_CHANGE_CLEAN, &mddev->flags); | 2209 | clear_bit(MD_CHANGE_CLEAN, &mddev->flags); |
2174 | clear_bit(MD_CHANGE_DEVS, &mddev->flags); | 2210 | clear_bit(MD_CHANGE_DEVS, &mddev->flags); |
2211 | if (!mddev->external) | ||
2212 | clear_bit(MD_CHANGE_PENDING, &mddev->flags); | ||
2175 | wake_up(&mddev->sb_wait); | 2213 | wake_up(&mddev->sb_wait); |
2176 | return; | 2214 | return; |
2177 | } | 2215 | } |
@@ -2438,7 +2476,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2438 | if (rdev->raid_disk == -1) | 2476 | if (rdev->raid_disk == -1) |
2439 | return -EEXIST; | 2477 | return -EEXIST; |
2440 | /* personality does all needed checks */ | 2478 | /* personality does all needed checks */ |
2441 | if (rdev->mddev->pers->hot_add_disk == NULL) | 2479 | if (rdev->mddev->pers->hot_remove_disk == NULL) |
2442 | return -EINVAL; | 2480 | return -EINVAL; |
2443 | err = rdev->mddev->pers-> | 2481 | err = rdev->mddev->pers-> |
2444 | hot_remove_disk(rdev->mddev, rdev->raid_disk); | 2482 | hot_remove_disk(rdev->mddev, rdev->raid_disk); |
@@ -2458,6 +2496,9 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2458 | if (rdev->raid_disk != -1) | 2496 | if (rdev->raid_disk != -1) |
2459 | return -EBUSY; | 2497 | return -EBUSY; |
2460 | 2498 | ||
2499 | if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) | ||
2500 | return -EBUSY; | ||
2501 | |||
2461 | if (rdev->mddev->pers->hot_add_disk == NULL) | 2502 | if (rdev->mddev->pers->hot_add_disk == NULL) |
2462 | return -EINVAL; | 2503 | return -EINVAL; |
2463 | 2504 | ||
@@ -2465,6 +2506,10 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2465 | if (rdev2->raid_disk == slot) | 2506 | if (rdev2->raid_disk == slot) |
2466 | return -EEXIST; | 2507 | return -EEXIST; |
2467 | 2508 | ||
2509 | if (slot >= rdev->mddev->raid_disks && | ||
2510 | slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) | ||
2511 | return -ENOSPC; | ||
2512 | |||
2468 | rdev->raid_disk = slot; | 2513 | rdev->raid_disk = slot; |
2469 | if (test_bit(In_sync, &rdev->flags)) | 2514 | if (test_bit(In_sync, &rdev->flags)) |
2470 | rdev->saved_raid_disk = slot; | 2515 | rdev->saved_raid_disk = slot; |
@@ -2482,7 +2527,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2482 | /* failure here is OK */; | 2527 | /* failure here is OK */; |
2483 | /* don't wakeup anyone, leave that to userspace. */ | 2528 | /* don't wakeup anyone, leave that to userspace. */ |
2484 | } else { | 2529 | } else { |
2485 | if (slot >= rdev->mddev->raid_disks) | 2530 | if (slot >= rdev->mddev->raid_disks && |
2531 | slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) | ||
2486 | return -ENOSPC; | 2532 | return -ENOSPC; |
2487 | rdev->raid_disk = slot; | 2533 | rdev->raid_disk = slot; |
2488 | /* assume it is working */ | 2534 | /* assume it is working */ |
@@ -2575,7 +2621,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2575 | if (!sectors) | 2621 | if (!sectors) |
2576 | return -EBUSY; | 2622 | return -EBUSY; |
2577 | } else if (!sectors) | 2623 | } else if (!sectors) |
2578 | sectors = (rdev->bdev->bd_inode->i_size >> 9) - | 2624 | sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - |
2579 | rdev->data_offset; | 2625 | rdev->data_offset; |
2580 | } | 2626 | } |
2581 | if (sectors < my_mddev->dev_sectors) | 2627 | if (sectors < my_mddev->dev_sectors) |
@@ -2598,12 +2644,11 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2598 | 2644 | ||
2599 | mddev_lock(mddev); | 2645 | mddev_lock(mddev); |
2600 | list_for_each_entry(rdev2, &mddev->disks, same_set) | 2646 | list_for_each_entry(rdev2, &mddev->disks, same_set) |
2601 | if (test_bit(AllReserved, &rdev2->flags) || | 2647 | if (rdev->bdev == rdev2->bdev && |
2602 | (rdev->bdev == rdev2->bdev && | 2648 | rdev != rdev2 && |
2603 | rdev != rdev2 && | 2649 | overlaps(rdev->data_offset, rdev->sectors, |
2604 | overlaps(rdev->data_offset, rdev->sectors, | 2650 | rdev2->data_offset, |
2605 | rdev2->data_offset, | 2651 | rdev2->sectors)) { |
2606 | rdev2->sectors))) { | ||
2607 | overlap = 1; | 2652 | overlap = 1; |
2608 | break; | 2653 | break; |
2609 | } | 2654 | } |
@@ -2788,7 +2833,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi | |||
2788 | 2833 | ||
2789 | kobject_init(&rdev->kobj, &rdev_ktype); | 2834 | kobject_init(&rdev->kobj, &rdev_ktype); |
2790 | 2835 | ||
2791 | size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; | 2836 | size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; |
2792 | if (!size) { | 2837 | if (!size) { |
2793 | printk(KERN_WARNING | 2838 | printk(KERN_WARNING |
2794 | "md: %s has zero or unknown size, marking faulty!\n", | 2839 | "md: %s has zero or unknown size, marking faulty!\n", |
@@ -3107,7 +3152,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
3107 | char nm[20]; | 3152 | char nm[20]; |
3108 | if (rdev->raid_disk < 0) | 3153 | if (rdev->raid_disk < 0) |
3109 | continue; | 3154 | continue; |
3110 | if (rdev->new_raid_disk > mddev->raid_disks) | 3155 | if (rdev->new_raid_disk >= mddev->raid_disks) |
3111 | rdev->new_raid_disk = -1; | 3156 | rdev->new_raid_disk = -1; |
3112 | if (rdev->new_raid_disk == rdev->raid_disk) | 3157 | if (rdev->new_raid_disk == rdev->raid_disk) |
3113 | continue; | 3158 | continue; |
@@ -3139,6 +3184,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
3139 | mddev->layout = mddev->new_layout; | 3184 | mddev->layout = mddev->new_layout; |
3140 | mddev->chunk_sectors = mddev->new_chunk_sectors; | 3185 | mddev->chunk_sectors = mddev->new_chunk_sectors; |
3141 | mddev->delta_disks = 0; | 3186 | mddev->delta_disks = 0; |
3187 | mddev->degraded = 0; | ||
3142 | if (mddev->pers->sync_request == NULL) { | 3188 | if (mddev->pers->sync_request == NULL) { |
3143 | /* this is now an array without redundancy, so | 3189 | /* this is now an array without redundancy, so |
3144 | * it must always be in_sync | 3190 | * it must always be in_sync |
@@ -3292,7 +3338,7 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len) | |||
3292 | char *e; | 3338 | char *e; |
3293 | unsigned long long n = simple_strtoull(buf, &e, 10); | 3339 | unsigned long long n = simple_strtoull(buf, &e, 10); |
3294 | 3340 | ||
3295 | if (mddev->pers) | 3341 | if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) |
3296 | return -EBUSY; | 3342 | return -EBUSY; |
3297 | if (cmd_match(buf, "none")) | 3343 | if (cmd_match(buf, "none")) |
3298 | n = MaxSector; | 3344 | n = MaxSector; |
@@ -3736,6 +3782,8 @@ action_show(mddev_t *mddev, char *page) | |||
3736 | return sprintf(page, "%s\n", type); | 3782 | return sprintf(page, "%s\n", type); |
3737 | } | 3783 | } |
3738 | 3784 | ||
3785 | static void reap_sync_thread(mddev_t *mddev); | ||
3786 | |||
3739 | static ssize_t | 3787 | static ssize_t |
3740 | action_store(mddev_t *mddev, const char *page, size_t len) | 3788 | action_store(mddev_t *mddev, const char *page, size_t len) |
3741 | { | 3789 | { |
@@ -3750,9 +3798,7 @@ action_store(mddev_t *mddev, const char *page, size_t len) | |||
3750 | if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { | 3798 | if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { |
3751 | if (mddev->sync_thread) { | 3799 | if (mddev->sync_thread) { |
3752 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 3800 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
3753 | md_unregister_thread(mddev->sync_thread); | 3801 | reap_sync_thread(mddev); |
3754 | mddev->sync_thread = NULL; | ||
3755 | mddev->recovery = 0; | ||
3756 | } | 3802 | } |
3757 | } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || | 3803 | } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || |
3758 | test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) | 3804 | test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) |
@@ -3904,7 +3950,7 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); | |||
3904 | static ssize_t | 3950 | static ssize_t |
3905 | sync_completed_show(mddev_t *mddev, char *page) | 3951 | sync_completed_show(mddev_t *mddev, char *page) |
3906 | { | 3952 | { |
3907 | unsigned long max_sectors, resync; | 3953 | unsigned long long max_sectors, resync; |
3908 | 3954 | ||
3909 | if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | 3955 | if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
3910 | return sprintf(page, "none\n"); | 3956 | return sprintf(page, "none\n"); |
@@ -3915,7 +3961,7 @@ sync_completed_show(mddev_t *mddev, char *page) | |||
3915 | max_sectors = mddev->dev_sectors; | 3961 | max_sectors = mddev->dev_sectors; |
3916 | 3962 | ||
3917 | resync = mddev->curr_resync_completed; | 3963 | resync = mddev->curr_resync_completed; |
3918 | return sprintf(page, "%lu / %lu\n", resync, max_sectors); | 3964 | return sprintf(page, "%llu / %llu\n", resync, max_sectors); |
3919 | } | 3965 | } |
3920 | 3966 | ||
3921 | static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); | 3967 | static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); |
@@ -4002,19 +4048,24 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) | |||
4002 | { | 4048 | { |
4003 | char *e; | 4049 | char *e; |
4004 | unsigned long long new = simple_strtoull(buf, &e, 10); | 4050 | unsigned long long new = simple_strtoull(buf, &e, 10); |
4051 | unsigned long long old = mddev->suspend_lo; | ||
4005 | 4052 | ||
4006 | if (mddev->pers == NULL || | 4053 | if (mddev->pers == NULL || |
4007 | mddev->pers->quiesce == NULL) | 4054 | mddev->pers->quiesce == NULL) |
4008 | return -EINVAL; | 4055 | return -EINVAL; |
4009 | if (buf == e || (*e && *e != '\n')) | 4056 | if (buf == e || (*e && *e != '\n')) |
4010 | return -EINVAL; | 4057 | return -EINVAL; |
4011 | if (new >= mddev->suspend_hi || | 4058 | |
4012 | (new > mddev->suspend_lo && new < mddev->suspend_hi)) { | 4059 | mddev->suspend_lo = new; |
4013 | mddev->suspend_lo = new; | 4060 | if (new >= old) |
4061 | /* Shrinking suspended region */ | ||
4014 | mddev->pers->quiesce(mddev, 2); | 4062 | mddev->pers->quiesce(mddev, 2); |
4015 | return len; | 4063 | else { |
4016 | } else | 4064 | /* Expanding suspended region - need to wait */ |
4017 | return -EINVAL; | 4065 | mddev->pers->quiesce(mddev, 1); |
4066 | mddev->pers->quiesce(mddev, 0); | ||
4067 | } | ||
4068 | return len; | ||
4018 | } | 4069 | } |
4019 | static struct md_sysfs_entry md_suspend_lo = | 4070 | static struct md_sysfs_entry md_suspend_lo = |
4020 | __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); | 4071 | __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); |
@@ -4031,20 +4082,24 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) | |||
4031 | { | 4082 | { |
4032 | char *e; | 4083 | char *e; |
4033 | unsigned long long new = simple_strtoull(buf, &e, 10); | 4084 | unsigned long long new = simple_strtoull(buf, &e, 10); |
4085 | unsigned long long old = mddev->suspend_hi; | ||
4034 | 4086 | ||
4035 | if (mddev->pers == NULL || | 4087 | if (mddev->pers == NULL || |
4036 | mddev->pers->quiesce == NULL) | 4088 | mddev->pers->quiesce == NULL) |
4037 | return -EINVAL; | 4089 | return -EINVAL; |
4038 | if (buf == e || (*e && *e != '\n')) | 4090 | if (buf == e || (*e && *e != '\n')) |
4039 | return -EINVAL; | 4091 | return -EINVAL; |
4040 | if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || | 4092 | |
4041 | (new > mddev->suspend_lo && new > mddev->suspend_hi)) { | 4093 | mddev->suspend_hi = new; |
4042 | mddev->suspend_hi = new; | 4094 | if (new <= old) |
4095 | /* Shrinking suspended region */ | ||
4096 | mddev->pers->quiesce(mddev, 2); | ||
4097 | else { | ||
4098 | /* Expanding suspended region - need to wait */ | ||
4043 | mddev->pers->quiesce(mddev, 1); | 4099 | mddev->pers->quiesce(mddev, 1); |
4044 | mddev->pers->quiesce(mddev, 0); | 4100 | mddev->pers->quiesce(mddev, 0); |
4045 | return len; | 4101 | } |
4046 | } else | 4102 | return len; |
4047 | return -EINVAL; | ||
4048 | } | 4103 | } |
4049 | static struct md_sysfs_entry md_suspend_hi = | 4104 | static struct md_sysfs_entry md_suspend_hi = |
4050 | __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); | 4105 | __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); |
@@ -4112,10 +4167,10 @@ array_size_store(mddev_t *mddev, const char *buf, size_t len) | |||
4112 | } | 4167 | } |
4113 | 4168 | ||
4114 | mddev->array_sectors = sectors; | 4169 | mddev->array_sectors = sectors; |
4115 | set_capacity(mddev->gendisk, mddev->array_sectors); | 4170 | if (mddev->pers) { |
4116 | if (mddev->pers) | 4171 | set_capacity(mddev->gendisk, mddev->array_sectors); |
4117 | revalidate_disk(mddev->gendisk); | 4172 | revalidate_disk(mddev->gendisk); |
4118 | 4173 | } | |
4119 | return len; | 4174 | return len; |
4120 | } | 4175 | } |
4121 | 4176 | ||
@@ -4256,10 +4311,10 @@ static int md_alloc(dev_t dev, char *name) | |||
4256 | shift = partitioned ? MdpMinorShift : 0; | 4311 | shift = partitioned ? MdpMinorShift : 0; |
4257 | unit = MINOR(mddev->unit) >> shift; | 4312 | unit = MINOR(mddev->unit) >> shift; |
4258 | 4313 | ||
4259 | /* wait for any previous instance if this device | 4314 | /* wait for any previous instance of this device to be |
4260 | * to be completed removed (mddev_delayed_delete). | 4315 | * completely removed (mddev_delayed_delete). |
4261 | */ | 4316 | */ |
4262 | flush_scheduled_work(); | 4317 | flush_workqueue(md_misc_wq); |
4263 | 4318 | ||
4264 | mutex_lock(&disks_mutex); | 4319 | mutex_lock(&disks_mutex); |
4265 | error = -EEXIST; | 4320 | error = -EEXIST; |
@@ -4287,9 +4342,6 @@ static int md_alloc(dev_t dev, char *name) | |||
4287 | goto abort; | 4342 | goto abort; |
4288 | mddev->queue->queuedata = mddev; | 4343 | mddev->queue->queuedata = mddev; |
4289 | 4344 | ||
4290 | /* Can be unlocked because the queue is new: no concurrency */ | ||
4291 | queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue); | ||
4292 | |||
4293 | blk_queue_make_request(mddev->queue, md_make_request); | 4345 | blk_queue_make_request(mddev->queue, md_make_request); |
4294 | 4346 | ||
4295 | disk = alloc_disk(1 << shift); | 4347 | disk = alloc_disk(1 << shift); |
@@ -4309,13 +4361,19 @@ static int md_alloc(dev_t dev, char *name) | |||
4309 | disk->fops = &md_fops; | 4361 | disk->fops = &md_fops; |
4310 | disk->private_data = mddev; | 4362 | disk->private_data = mddev; |
4311 | disk->queue = mddev->queue; | 4363 | disk->queue = mddev->queue; |
4364 | blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA); | ||
4312 | /* Allow extended partitions. This makes the | 4365 | /* Allow extended partitions. This makes the |
4313 | * 'mdp' device redundant, but we can't really | 4366 | * 'mdp' device redundant, but we can't really |
4314 | * remove it now. | 4367 | * remove it now. |
4315 | */ | 4368 | */ |
4316 | disk->flags |= GENHD_FL_EXT_DEVT; | 4369 | disk->flags |= GENHD_FL_EXT_DEVT; |
4317 | add_disk(disk); | ||
4318 | mddev->gendisk = disk; | 4370 | mddev->gendisk = disk; |
4371 | /* As soon as we call add_disk(), another thread could get | ||
4372 | * through to md_open, so make sure it doesn't get too far | ||
4373 | */ | ||
4374 | mutex_lock(&mddev->open_mutex); | ||
4375 | add_disk(disk); | ||
4376 | |||
4319 | error = kobject_init_and_add(&mddev->kobj, &md_ktype, | 4377 | error = kobject_init_and_add(&mddev->kobj, &md_ktype, |
4320 | &disk_to_dev(disk)->kobj, "%s", "md"); | 4378 | &disk_to_dev(disk)->kobj, "%s", "md"); |
4321 | if (error) { | 4379 | if (error) { |
@@ -4329,6 +4387,7 @@ static int md_alloc(dev_t dev, char *name) | |||
4329 | if (mddev->kobj.sd && | 4387 | if (mddev->kobj.sd && |
4330 | sysfs_create_group(&mddev->kobj, &md_bitmap_group)) | 4388 | sysfs_create_group(&mddev->kobj, &md_bitmap_group)) |
4331 | printk(KERN_DEBUG "pointless warning\n"); | 4389 | printk(KERN_DEBUG "pointless warning\n"); |
4390 | mutex_unlock(&mddev->open_mutex); | ||
4332 | abort: | 4391 | abort: |
4333 | mutex_unlock(&disks_mutex); | 4392 | mutex_unlock(&disks_mutex); |
4334 | if (!error && mddev->kobj.sd) { | 4393 | if (!error && mddev->kobj.sd) { |
@@ -4423,7 +4482,9 @@ int md_run(mddev_t *mddev) | |||
4423 | * We don't want the data to overlap the metadata, | 4482 | * We don't want the data to overlap the metadata, |
4424 | * Internal Bitmap issues have been handled elsewhere. | 4483 | * Internal Bitmap issues have been handled elsewhere. |
4425 | */ | 4484 | */ |
4426 | if (rdev->data_offset < rdev->sb_start) { | 4485 | if (rdev->meta_bdev) { |
4486 | /* Nothing to check */; | ||
4487 | } else if (rdev->data_offset < rdev->sb_start) { | ||
4427 | if (mddev->dev_sectors && | 4488 | if (mddev->dev_sectors && |
4428 | rdev->data_offset + mddev->dev_sectors | 4489 | rdev->data_offset + mddev->dev_sectors |
4429 | > rdev->sb_start) { | 4490 | > rdev->sb_start) { |
@@ -4442,6 +4503,9 @@ int md_run(mddev_t *mddev) | |||
4442 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 4503 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
4443 | } | 4504 | } |
4444 | 4505 | ||
4506 | if (mddev->bio_set == NULL) | ||
4507 | mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev)); | ||
4508 | |||
4445 | spin_lock(&pers_lock); | 4509 | spin_lock(&pers_lock); |
4446 | pers = find_pers(mddev->level, mddev->clevel); | 4510 | pers = find_pers(mddev->level, mddev->clevel); |
4447 | if (!pers || !try_module_get(pers->owner)) { | 4511 | if (!pers || !try_module_get(pers->owner)) { |
@@ -4504,7 +4568,6 @@ int md_run(mddev_t *mddev) | |||
4504 | /* may be over-ridden by personality */ | 4568 | /* may be over-ridden by personality */ |
4505 | mddev->resync_max_sectors = mddev->dev_sectors; | 4569 | mddev->resync_max_sectors = mddev->dev_sectors; |
4506 | 4570 | ||
4507 | mddev->barriers_work = 1; | ||
4508 | mddev->ok_start_degraded = start_dirty_degraded; | 4571 | mddev->ok_start_degraded = start_dirty_degraded; |
4509 | 4572 | ||
4510 | if (start_readonly && mddev->ro == 0) | 4573 | if (start_readonly && mddev->ro == 0) |
@@ -4555,7 +4618,8 @@ int md_run(mddev_t *mddev) | |||
4555 | mddev->safemode_timer.data = (unsigned long) mddev; | 4618 | mddev->safemode_timer.data = (unsigned long) mddev; |
4556 | mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ | 4619 | mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ |
4557 | mddev->in_sync = 1; | 4620 | mddev->in_sync = 1; |
4558 | 4621 | smp_wmb(); | |
4622 | mddev->ready = 1; | ||
4559 | list_for_each_entry(rdev, &mddev->disks, same_set) | 4623 | list_for_each_entry(rdev, &mddev->disks, same_set) |
4560 | if (rdev->raid_disk >= 0) { | 4624 | if (rdev->raid_disk >= 0) { |
4561 | char nm[20]; | 4625 | char nm[20]; |
@@ -4569,9 +4633,6 @@ int md_run(mddev_t *mddev) | |||
4569 | if (mddev->flags) | 4633 | if (mddev->flags) |
4570 | md_update_sb(mddev, 0); | 4634 | md_update_sb(mddev, 0); |
4571 | 4635 | ||
4572 | md_wakeup_thread(mddev->thread); | ||
4573 | md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ | ||
4574 | |||
4575 | md_new_event(mddev); | 4636 | md_new_event(mddev); |
4576 | sysfs_notify_dirent_safe(mddev->sysfs_state); | 4637 | sysfs_notify_dirent_safe(mddev->sysfs_state); |
4577 | sysfs_notify_dirent_safe(mddev->sysfs_action); | 4638 | sysfs_notify_dirent_safe(mddev->sysfs_action); |
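The smp_wmb() before setting mddev->ready in the hunk above is a publish barrier: every field a reader may inspect is made visible before the flag itself. A minimal userspace sketch of the same publish pattern using C11 release/acquire atomics; the struct and field names are invented for illustration and are not md's:

```c
/* Sketch only: models the smp_wmb()/mddev->ready publication in md_run()
 * with C11 release/acquire atomics.  Names are illustrative, not md API. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct array_state {
	int in_sync;
	int safemode_delay;
	atomic_int ready;	/* readers may only touch the rest once ready == 1 */
};

static struct array_state st;

static void *reader(void *arg)
{
	(void)arg;
	/* acquire pairs with the writer's release, like the implied read
	 * barrier after observing mddev->ready */
	while (!atomic_load_explicit(&st.ready, memory_order_acquire))
		;
	printf("in_sync=%d delay=%d\n", st.in_sync, st.safemode_delay);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, reader, NULL);

	st.in_sync = 1;
	st.safemode_delay = 200;
	/* release acts like smp_wmb() plus the plain store of mddev->ready = 1 */
	atomic_store_explicit(&st.ready, 1, memory_order_release);

	pthread_join(t, NULL);
	return 0;
}
```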
@@ -4592,8 +4653,13 @@ static int do_md_run(mddev_t *mddev) | |||
4592 | bitmap_destroy(mddev); | 4653 | bitmap_destroy(mddev); |
4593 | goto out; | 4654 | goto out; |
4594 | } | 4655 | } |
4656 | |||
4657 | md_wakeup_thread(mddev->thread); | ||
4658 | md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ | ||
4659 | |||
4595 | set_capacity(mddev->gendisk, mddev->array_sectors); | 4660 | set_capacity(mddev->gendisk, mddev->array_sectors); |
4596 | revalidate_disk(mddev->gendisk); | 4661 | revalidate_disk(mddev->gendisk); |
4662 | mddev->changed = 1; | ||
4597 | kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); | 4663 | kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); |
4598 | out: | 4664 | out: |
4599 | return err; | 4665 | return err; |
@@ -4682,24 +4748,22 @@ static void md_clean(mddev_t *mddev) | |||
4682 | mddev->sync_speed_min = mddev->sync_speed_max = 0; | 4748 | mddev->sync_speed_min = mddev->sync_speed_max = 0; |
4683 | mddev->recovery = 0; | 4749 | mddev->recovery = 0; |
4684 | mddev->in_sync = 0; | 4750 | mddev->in_sync = 0; |
4751 | mddev->changed = 0; | ||
4685 | mddev->degraded = 0; | 4752 | mddev->degraded = 0; |
4686 | mddev->barriers_work = 0; | ||
4687 | mddev->safemode = 0; | 4753 | mddev->safemode = 0; |
4688 | mddev->bitmap_info.offset = 0; | 4754 | mddev->bitmap_info.offset = 0; |
4689 | mddev->bitmap_info.default_offset = 0; | 4755 | mddev->bitmap_info.default_offset = 0; |
4690 | mddev->bitmap_info.chunksize = 0; | 4756 | mddev->bitmap_info.chunksize = 0; |
4691 | mddev->bitmap_info.daemon_sleep = 0; | 4757 | mddev->bitmap_info.daemon_sleep = 0; |
4692 | mddev->bitmap_info.max_write_behind = 0; | 4758 | mddev->bitmap_info.max_write_behind = 0; |
4693 | mddev->plug = NULL; | ||
4694 | } | 4759 | } |
4695 | 4760 | ||
4696 | void md_stop_writes(mddev_t *mddev) | 4761 | static void __md_stop_writes(mddev_t *mddev) |
4697 | { | 4762 | { |
4698 | if (mddev->sync_thread) { | 4763 | if (mddev->sync_thread) { |
4699 | set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | 4764 | set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); |
4700 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 4765 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
4701 | md_unregister_thread(mddev->sync_thread); | 4766 | reap_sync_thread(mddev); |
4702 | mddev->sync_thread = NULL; | ||
4703 | } | 4767 | } |
4704 | 4768 | ||
4705 | del_timer_sync(&mddev->safemode_timer); | 4769 | del_timer_sync(&mddev->safemode_timer); |
@@ -4713,10 +4777,18 @@ void md_stop_writes(mddev_t *mddev) | |||
4713 | md_update_sb(mddev, 1); | 4777 | md_update_sb(mddev, 1); |
4714 | } | 4778 | } |
4715 | } | 4779 | } |
4780 | |||
4781 | void md_stop_writes(mddev_t *mddev) | ||
4782 | { | ||
4783 | mddev_lock(mddev); | ||
4784 | __md_stop_writes(mddev); | ||
4785 | mddev_unlock(mddev); | ||
4786 | } | ||
4716 | EXPORT_SYMBOL_GPL(md_stop_writes); | 4787 | EXPORT_SYMBOL_GPL(md_stop_writes); |
4717 | 4788 | ||
4718 | void md_stop(mddev_t *mddev) | 4789 | void md_stop(mddev_t *mddev) |
4719 | { | 4790 | { |
4791 | mddev->ready = 0; | ||
4720 | mddev->pers->stop(mddev); | 4792 | mddev->pers->stop(mddev); |
4721 | if (mddev->pers->sync_request && mddev->to_remove == NULL) | 4793 | if (mddev->pers->sync_request && mddev->to_remove == NULL) |
4722 | mddev->to_remove = &md_redundancy_group; | 4794 | mddev->to_remove = &md_redundancy_group; |
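The split into __md_stop_writes() and a locking md_stop_writes() wrapper in the hunk above is the usual locked/unlocked pairing: callers that already hold the reconfig mutex (md_set_readonly(), do_md_stop()) use the bare helper, external callers get the wrapper. A rough userspace model of that shape, with invented names and a pthread mutex standing in for mddev_lock():

```c
/* Sketch of the __md_stop_writes()/md_stop_writes() split; types and names
 * are invented for illustration. */
#include <pthread.h>
#include <stdbool.h>

struct array {
	pthread_mutex_t lock;
	bool sync_running;
	bool in_sync;
};

/* caller must already hold a->lock */
static void stop_writes_locked(struct array *a)
{
	if (a->sync_running)
		a->sync_running = false;	/* stands in for reap_sync_thread() */
	a->in_sync = true;			/* stands in for the final md_update_sb() */
}

/* public entry point: internal callers that already hold the lock
 * call stop_writes_locked() directly instead */
void stop_writes(struct array *a)
{
	pthread_mutex_lock(&a->lock);
	stop_writes_locked(a);
	pthread_mutex_unlock(&a->lock);
}
```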
@@ -4736,7 +4808,7 @@ static int md_set_readonly(mddev_t *mddev, int is_open) | |||
4736 | goto out; | 4808 | goto out; |
4737 | } | 4809 | } |
4738 | if (mddev->pers) { | 4810 | if (mddev->pers) { |
4739 | md_stop_writes(mddev); | 4811 | __md_stop_writes(mddev); |
4740 | 4812 | ||
4741 | err = -ENXIO; | 4813 | err = -ENXIO; |
4742 | if (mddev->ro==1) | 4814 | if (mddev->ro==1) |
@@ -4773,10 +4845,9 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
4773 | if (mddev->ro) | 4845 | if (mddev->ro) |
4774 | set_disk_ro(disk, 0); | 4846 | set_disk_ro(disk, 0); |
4775 | 4847 | ||
4776 | md_stop_writes(mddev); | 4848 | __md_stop_writes(mddev); |
4777 | md_stop(mddev); | 4849 | md_stop(mddev); |
4778 | mddev->queue->merge_bvec_fn = NULL; | 4850 | mddev->queue->merge_bvec_fn = NULL; |
4779 | mddev->queue->unplug_fn = NULL; | ||
4780 | mddev->queue->backing_dev_info.congested_fn = NULL; | 4851 | mddev->queue->backing_dev_info.congested_fn = NULL; |
4781 | 4852 | ||
4782 | /* tell userspace to handle 'inactive' */ | 4853 | /* tell userspace to handle 'inactive' */ |
@@ -4791,6 +4862,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
4791 | 4862 | ||
4792 | set_capacity(disk, 0); | 4863 | set_capacity(disk, 0); |
4793 | mutex_unlock(&mddev->open_mutex); | 4864 | mutex_unlock(&mddev->open_mutex); |
4865 | mddev->changed = 1; | ||
4794 | revalidate_disk(disk); | 4866 | revalidate_disk(disk); |
4795 | 4867 | ||
4796 | if (mddev->ro) | 4868 | if (mddev->ro) |
@@ -5148,17 +5220,31 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
5148 | PTR_ERR(rdev)); | 5220 | PTR_ERR(rdev)); |
5149 | return PTR_ERR(rdev); | 5221 | return PTR_ERR(rdev); |
5150 | } | 5222 | } |
5151 | /* set save_raid_disk if appropriate */ | 5223 | /* set saved_raid_disk if appropriate */ |
5152 | if (!mddev->persistent) { | 5224 | if (!mddev->persistent) { |
5153 | if (info->state & (1<<MD_DISK_SYNC) && | 5225 | if (info->state & (1<<MD_DISK_SYNC) && |
5154 | info->raid_disk < mddev->raid_disks) | 5226 | info->raid_disk < mddev->raid_disks) { |
5155 | rdev->raid_disk = info->raid_disk; | 5227 | rdev->raid_disk = info->raid_disk; |
5156 | else | 5228 | set_bit(In_sync, &rdev->flags); |
5229 | } else | ||
5157 | rdev->raid_disk = -1; | 5230 | rdev->raid_disk = -1; |
5158 | } else | 5231 | } else |
5159 | super_types[mddev->major_version]. | 5232 | super_types[mddev->major_version]. |
5160 | validate_super(mddev, rdev); | 5233 | validate_super(mddev, rdev); |
5161 | rdev->saved_raid_disk = rdev->raid_disk; | 5234 | if ((info->state & (1<<MD_DISK_SYNC)) && |
5235 | (!test_bit(In_sync, &rdev->flags) || | ||
5236 | rdev->raid_disk != info->raid_disk)) { | ||
5237 | /* This was a hot-add request, but events don't | ||
5238 | * match, so reject it. | ||
5239 | */ | ||
5240 | export_rdev(rdev); | ||
5241 | return -EINVAL; | ||
5242 | } | ||
5243 | |||
5244 | if (test_bit(In_sync, &rdev->flags)) | ||
5245 | rdev->saved_raid_disk = rdev->raid_disk; | ||
5246 | else | ||
5247 | rdev->saved_raid_disk = -1; | ||
5162 | 5248 | ||
5163 | clear_bit(In_sync, &rdev->flags); /* just to be sure */ | 5249 | clear_bit(In_sync, &rdev->flags); /* just to be sure */ |
5164 | if (info->state & (1<<MD_DISK_WRITEMOSTLY)) | 5250 | if (info->state & (1<<MD_DISK_WRITEMOSTLY)) |
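The new add_new_disk() logic in the hunk above rejects a hot-add that claims MD_DISK_SYNC when the validated superblock disagrees, and keeps saved_raid_disk as a recovery hint only for a genuinely in-sync device. A simplified sketch of that decision; the structs are stand-ins, not the real mdu_disk_info_t or mdk_rdev_t layouts:

```c
/* Rough model of the hot-add check: a request that claims the disk is
 * in-sync is rejected unless the validated superblock agrees on both the
 * sync state and the slot.  Fields are simplified stand-ins. */
#include <errno.h>
#include <stdbool.h>

struct disk_info  { bool want_sync; int raid_disk; };
struct rdev_state { bool in_sync;  int raid_disk; int saved_raid_disk; };

int check_hot_add(const struct disk_info *info, struct rdev_state *rdev)
{
	if (info->want_sync &&
	    (!rdev->in_sync || rdev->raid_disk != info->raid_disk))
		return -EINVAL;			/* events don't match: reject */

	/* only a genuinely in-sync device keeps its old slot as a hint for
	 * recovery; otherwise start from scratch */
	rdev->saved_raid_disk = rdev->in_sync ? rdev->raid_disk : -1;
	return 0;
}
```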
@@ -5188,6 +5274,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
5188 | if (mddev->degraded) | 5274 | if (mddev->degraded) |
5189 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | 5275 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); |
5190 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 5276 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
5277 | if (!err) | ||
5278 | md_new_event(mddev); | ||
5191 | md_wakeup_thread(mddev->thread); | 5279 | md_wakeup_thread(mddev->thread); |
5192 | return err; | 5280 | return err; |
5193 | } | 5281 | } |
@@ -5225,9 +5313,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
5225 | 5313 | ||
5226 | if (!mddev->persistent) { | 5314 | if (!mddev->persistent) { |
5227 | printk(KERN_INFO "md: nonpersistent superblock ...\n"); | 5315 | printk(KERN_INFO "md: nonpersistent superblock ...\n"); |
5228 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; | 5316 | rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; |
5229 | } else | 5317 | } else |
5230 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); | 5318 | rdev->sb_start = calc_dev_sboffset(rdev); |
5231 | rdev->sectors = rdev->sb_start; | 5319 | rdev->sectors = rdev->sb_start; |
5232 | 5320 | ||
5233 | err = bind_rdev_to_array(rdev, mddev); | 5321 | err = bind_rdev_to_array(rdev, mddev); |
@@ -5294,9 +5382,9 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) | |||
5294 | } | 5382 | } |
5295 | 5383 | ||
5296 | if (mddev->persistent) | 5384 | if (mddev->persistent) |
5297 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); | 5385 | rdev->sb_start = calc_dev_sboffset(rdev); |
5298 | else | 5386 | else |
5299 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; | 5387 | rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; |
5300 | 5388 | ||
5301 | rdev->sectors = rdev->sb_start; | 5389 | rdev->sectors = rdev->sb_start; |
5302 | 5390 | ||
@@ -5507,7 +5595,6 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) | |||
5507 | * sb_start or, if that is <data_offset, it must fit before the size | 5595 | * sb_start or, if that is <data_offset, it must fit before the size |
5508 | * of each device. If num_sectors is zero, we find the largest size | 5596 | * of each device. If num_sectors is zero, we find the largest size |
5509 | * that fits. | 5597 | * that fits. |
5510 | |||
5511 | */ | 5598 | */ |
5512 | if (mddev->sync_thread) | 5599 | if (mddev->sync_thread) |
5513 | return -EBUSY; | 5600 | return -EBUSY; |
@@ -5544,6 +5631,8 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks) | |||
5544 | mddev->delta_disks = raid_disks - mddev->raid_disks; | 5631 | mddev->delta_disks = raid_disks - mddev->raid_disks; |
5545 | 5632 | ||
5546 | rv = mddev->pers->check_reshape(mddev); | 5633 | rv = mddev->pers->check_reshape(mddev); |
5634 | if (rv < 0) | ||
5635 | mddev->delta_disks = 0; | ||
5547 | return rv; | 5636 | return rv; |
5548 | } | 5637 | } |
5549 | 5638 | ||
@@ -5951,16 +6040,14 @@ static int md_open(struct block_device *bdev, fmode_t mode) | |||
5951 | mddev_t *mddev = mddev_find(bdev->bd_dev); | 6040 | mddev_t *mddev = mddev_find(bdev->bd_dev); |
5952 | int err; | 6041 | int err; |
5953 | 6042 | ||
5954 | lock_kernel(); | ||
5955 | if (mddev->gendisk != bdev->bd_disk) { | 6043 | if (mddev->gendisk != bdev->bd_disk) { |
5956 | /* we are racing with mddev_put which is discarding this | 6044 | /* we are racing with mddev_put which is discarding this |
5957 | * bd_disk. | 6045 | * bd_disk. |
5958 | */ | 6046 | */ |
5959 | mddev_put(mddev); | 6047 | mddev_put(mddev); |
5960 | /* Wait until bdev->bd_disk is definitely gone */ | 6048 | /* Wait until bdev->bd_disk is definitely gone */ |
5961 | flush_scheduled_work(); | 6049 | flush_workqueue(md_misc_wq); |
5962 | /* Then retry the open from the top */ | 6050 | /* Then retry the open from the top */ |
5963 | unlock_kernel(); | ||
5964 | return -ERESTARTSYS; | 6051 | return -ERESTARTSYS; |
5965 | } | 6052 | } |
5966 | BUG_ON(mddev != bdev->bd_disk->private_data); | 6053 | BUG_ON(mddev != bdev->bd_disk->private_data); |
@@ -5972,9 +6059,8 @@ static int md_open(struct block_device *bdev, fmode_t mode) | |||
5972 | atomic_inc(&mddev->openers); | 6059 | atomic_inc(&mddev->openers); |
5973 | mutex_unlock(&mddev->open_mutex); | 6060 | mutex_unlock(&mddev->open_mutex); |
5974 | 6061 | ||
5975 | check_disk_size_change(mddev->gendisk, bdev); | 6062 | check_disk_change(bdev); |
5976 | out: | 6063 | out: |
5977 | unlock_kernel(); | ||
5978 | return err; | 6064 | return err; |
5979 | } | 6065 | } |
5980 | 6066 | ||
@@ -5983,13 +6069,26 @@ static int md_release(struct gendisk *disk, fmode_t mode) | |||
5983 | mddev_t *mddev = disk->private_data; | 6069 | mddev_t *mddev = disk->private_data; |
5984 | 6070 | ||
5985 | BUG_ON(!mddev); | 6071 | BUG_ON(!mddev); |
5986 | lock_kernel(); | ||
5987 | atomic_dec(&mddev->openers); | 6072 | atomic_dec(&mddev->openers); |
5988 | mddev_put(mddev); | 6073 | mddev_put(mddev); |
5989 | unlock_kernel(); | ||
5990 | 6074 | ||
5991 | return 0; | 6075 | return 0; |
5992 | } | 6076 | } |
6077 | |||
6078 | static int md_media_changed(struct gendisk *disk) | ||
6079 | { | ||
6080 | mddev_t *mddev = disk->private_data; | ||
6081 | |||
6082 | return mddev->changed; | ||
6083 | } | ||
6084 | |||
6085 | static int md_revalidate(struct gendisk *disk) | ||
6086 | { | ||
6087 | mddev_t *mddev = disk->private_data; | ||
6088 | |||
6089 | mddev->changed = 0; | ||
6090 | return 0; | ||
6091 | } | ||
5993 | static const struct block_device_operations md_fops = | 6092 | static const struct block_device_operations md_fops = |
5994 | { | 6093 | { |
5995 | .owner = THIS_MODULE, | 6094 | .owner = THIS_MODULE, |
@@ -6000,6 +6099,8 @@ static const struct block_device_operations md_fops = | |||
6000 | .compat_ioctl = md_compat_ioctl, | 6099 | .compat_ioctl = md_compat_ioctl, |
6001 | #endif | 6100 | #endif |
6002 | .getgeo = md_getgeo, | 6101 | .getgeo = md_getgeo, |
6102 | .media_changed = md_media_changed, | ||
6103 | .revalidate_disk= md_revalidate, | ||
6003 | }; | 6104 | }; |
6004 | 6105 | ||
6005 | static int md_thread(void * arg) | 6106 | static int md_thread(void * arg) |
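The md_media_changed()/md_revalidate() hooks wired into md_fops above give the block layer a way to notice that the array size changed: the run/stop/resize paths set mddev->changed, media_changed() reports it, and revalidate clears it again. A plain-C model of that handshake; the names are illustrative only, not the kernel interface:

```c
/* Sketch of the ->media_changed/->revalidate_disk handshake added to
 * md_fops, modelled in userspace. */
#include <stdbool.h>

struct array {
	bool changed;		/* set by run/stop/resize paths */
	long long sectors;
};

void array_resize(struct array *a, long long sectors)
{
	a->sectors = sectors;
	a->changed = true;	/* mirrors mddev->changed = 1 after set_capacity() */
}

int media_changed(struct array *a)
{
	return a->changed;	/* tells the caller partitions may need rereading */
}

int revalidate(struct array *a)
{
	a->changed = false;	/* acknowledged; the next open sees a stable size */
	return 0;
}
```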
@@ -6036,8 +6137,8 @@ static int md_thread(void * arg) | |||
6036 | thread->timeout); | 6137 | thread->timeout); |
6037 | 6138 | ||
6038 | clear_bit(THREAD_WAKEUP, &thread->flags); | 6139 | clear_bit(THREAD_WAKEUP, &thread->flags); |
6039 | 6140 | if (!kthread_should_stop()) | |
6040 | thread->run(thread->mddev); | 6141 | thread->run(thread->mddev); |
6041 | } | 6142 | } |
6042 | 6143 | ||
6043 | return 0; | 6144 | return 0; |
@@ -6118,7 +6219,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
6118 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 6219 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
6119 | md_wakeup_thread(mddev->thread); | 6220 | md_wakeup_thread(mddev->thread); |
6120 | if (mddev->event_work.func) | 6221 | if (mddev->event_work.func) |
6121 | schedule_work(&mddev->event_work); | 6222 | queue_work(md_misc_wq, &mddev->event_work); |
6122 | md_new_event_inintr(mddev); | 6223 | md_new_event_inintr(mddev); |
6123 | } | 6224 | } |
6124 | 6225 | ||
@@ -6209,7 +6310,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) | |||
6209 | * rt is a sector_t, so could be 32bit or 64bit. | 6310 | * rt is a sector_t, so could be 32bit or 64bit. |
6210 | * So we divide before multiply in case it is 32bit and close | 6311 | * So we divide before multiply in case it is 32bit and close |
6211 | * to the limit. | 6312 | * to the limit. |
6212 | * We scale the divisor (db) by 32 to avoid loosing precision | 6313 | * We scale the divisor (db) by 32 to avoid losing precision |
6213 | * near the end of resync when the number of remaining sectors | 6314 | * near the end of resync when the number of remaining sectors |
6214 | * is close to 'db'. | 6315 | * is close to 'db'. |
6215 | * We then divide rt by 32 after multiplying by db to compensate. | 6316 | * We then divide rt by 32 after multiplying by db to compensate. |
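The comment above explains why status_resync() divides before multiplying and scales the divisor by 32. A small standalone illustration of that arithmetic with made-up numbers; this shows the idea, not the exact kernel expression:

```c
/* Divide-before-multiply with a x32-scaled divisor: keeps the intermediate
 * value small on a 32-bit sector_t while not losing precision when the
 * remaining count gets close to the per-interval throughput. */
#include <stdint.h>
#include <stdio.h>

static uint32_t eta_seconds(uint32_t remaining, uint32_t db, uint32_t dt)
{
	uint32_t rt = remaining;

	rt /= db / 32 + 1;	/* divide first so rt stays small ... */
	rt *= dt;		/* ... then multiply by the elapsed time */
	return rt >> 5;		/* undo the x32 scaling of the divisor */
}

int main(void)
{
	/* 1,000,000 sectors left, 2,000 sectors synced over the last 3 seconds */
	printf("eta ~ %u min\n", eta_seconds(1000000, 2000, 3) / 60);
	return 0;
}
```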
@@ -6631,14 +6732,6 @@ int md_allow_write(mddev_t *mddev) | |||
6631 | } | 6732 | } |
6632 | EXPORT_SYMBOL_GPL(md_allow_write); | 6733 | EXPORT_SYMBOL_GPL(md_allow_write); |
6633 | 6734 | ||
6634 | void md_unplug(mddev_t *mddev) | ||
6635 | { | ||
6636 | if (mddev->queue) | ||
6637 | blk_unplug(mddev->queue); | ||
6638 | if (mddev->plug) | ||
6639 | mddev->plug->unplug_fn(mddev->plug); | ||
6640 | } | ||
6641 | |||
6642 | #define SYNC_MARKS 10 | 6735 | #define SYNC_MARKS 10 |
6643 | #define SYNC_MARK_STEP (3*HZ) | 6736 | #define SYNC_MARK_STEP (3*HZ) |
6644 | void md_do_sync(mddev_t *mddev) | 6737 | void md_do_sync(mddev_t *mddev) |
@@ -6790,8 +6883,8 @@ void md_do_sync(mddev_t *mddev) | |||
6790 | * Tune reconstruction: | 6883 | * Tune reconstruction: |
6791 | */ | 6884 | */ |
6792 | window = 32*(PAGE_SIZE/512); | 6885 | window = 32*(PAGE_SIZE/512); |
6793 | printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", | 6886 | printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n", |
6794 | window/2,(unsigned long long) max_sectors/2); | 6887 | window/2, (unsigned long long)max_sectors/2); |
6795 | 6888 | ||
6796 | atomic_set(&mddev->recovery_active, 0); | 6889 | atomic_set(&mddev->recovery_active, 0); |
6797 | last_check = 0; | 6890 | last_check = 0; |
@@ -6802,7 +6895,7 @@ void md_do_sync(mddev_t *mddev) | |||
6802 | desc, mdname(mddev)); | 6895 | desc, mdname(mddev)); |
6803 | mddev->curr_resync = j; | 6896 | mddev->curr_resync = j; |
6804 | } | 6897 | } |
6805 | mddev->curr_resync_completed = mddev->curr_resync; | 6898 | mddev->curr_resync_completed = j; |
6806 | 6899 | ||
6807 | while (j < max_sectors) { | 6900 | while (j < max_sectors) { |
6808 | sector_t sectors; | 6901 | sector_t sectors; |
@@ -6817,11 +6910,9 @@ void md_do_sync(mddev_t *mddev) | |||
6817 | >= mddev->resync_max - mddev->curr_resync_completed | 6910 | >= mddev->resync_max - mddev->curr_resync_completed |
6818 | )) { | 6911 | )) { |
6819 | /* time to update curr_resync_completed */ | 6912 | /* time to update curr_resync_completed */ |
6820 | md_unplug(mddev); | ||
6821 | wait_event(mddev->recovery_wait, | 6913 | wait_event(mddev->recovery_wait, |
6822 | atomic_read(&mddev->recovery_active) == 0); | 6914 | atomic_read(&mddev->recovery_active) == 0); |
6823 | mddev->curr_resync_completed = | 6915 | mddev->curr_resync_completed = j; |
6824 | mddev->curr_resync; | ||
6825 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); | 6916 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); |
6826 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 6917 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
6827 | } | 6918 | } |
@@ -6894,7 +6985,6 @@ void md_do_sync(mddev_t *mddev) | |||
6894 | * about not overloading the IO subsystem. (things like an | 6985 | * about not overloading the IO subsystem. (things like an |
6895 | * e2fsck being done on the RAID array should execute fast) | 6986 | * e2fsck being done on the RAID array should execute fast) |
6896 | */ | 6987 | */ |
6897 | md_unplug(mddev); | ||
6898 | cond_resched(); | 6988 | cond_resched(); |
6899 | 6989 | ||
6900 | currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 | 6990 | currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 |
@@ -6913,8 +7003,6 @@ void md_do_sync(mddev_t *mddev) | |||
6913 | * this also signals 'finished resyncing' to md_stop | 7003 | * this also signals 'finished resyncing' to md_stop |
6914 | */ | 7004 | */ |
6915 | out: | 7005 | out: |
6916 | md_unplug(mddev); | ||
6917 | |||
6918 | wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); | 7006 | wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); |
6919 | 7007 | ||
6920 | /* tell personality that we are finished */ | 7008 | /* tell personality that we are finished */ |
@@ -6957,9 +7045,6 @@ void md_do_sync(mddev_t *mddev) | |||
6957 | } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) | 7045 | } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) |
6958 | mddev->resync_min = mddev->curr_resync_completed; | 7046 | mddev->resync_min = mddev->curr_resync_completed; |
6959 | mddev->curr_resync = 0; | 7047 | mddev->curr_resync = 0; |
6960 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||
6961 | mddev->curr_resync_completed = 0; | ||
6962 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | ||
6963 | wake_up(&resync_wait); | 7048 | wake_up(&resync_wait); |
6964 | set_bit(MD_RECOVERY_DONE, &mddev->recovery); | 7049 | set_bit(MD_RECOVERY_DONE, &mddev->recovery); |
6965 | md_wakeup_thread(mddev->thread); | 7050 | md_wakeup_thread(mddev->thread); |
@@ -6977,7 +7062,6 @@ void md_do_sync(mddev_t *mddev) | |||
6977 | } | 7062 | } |
6978 | EXPORT_SYMBOL_GPL(md_do_sync); | 7063 | EXPORT_SYMBOL_GPL(md_do_sync); |
6979 | 7064 | ||
6980 | |||
6981 | static int remove_and_add_spares(mddev_t *mddev) | 7065 | static int remove_and_add_spares(mddev_t *mddev) |
6982 | { | 7066 | { |
6983 | mdk_rdev_t *rdev; | 7067 | mdk_rdev_t *rdev; |
@@ -7000,10 +7084,11 @@ static int remove_and_add_spares(mddev_t *mddev) | |||
7000 | } | 7084 | } |
7001 | } | 7085 | } |
7002 | 7086 | ||
7003 | if (mddev->degraded && ! mddev->ro && !mddev->recovery_disabled) { | 7087 | if (mddev->degraded && !mddev->recovery_disabled) { |
7004 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 7088 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
7005 | if (rdev->raid_disk >= 0 && | 7089 | if (rdev->raid_disk >= 0 && |
7006 | !test_bit(In_sync, &rdev->flags) && | 7090 | !test_bit(In_sync, &rdev->flags) && |
7091 | !test_bit(Faulty, &rdev->flags) && | ||
7007 | !test_bit(Blocked, &rdev->flags)) | 7092 | !test_bit(Blocked, &rdev->flags)) |
7008 | spares++; | 7093 | spares++; |
7009 | if (rdev->raid_disk < 0 | 7094 | if (rdev->raid_disk < 0 |
@@ -7026,6 +7111,45 @@ static int remove_and_add_spares(mddev_t *mddev) | |||
7026 | } | 7111 | } |
7027 | return spares; | 7112 | return spares; |
7028 | } | 7113 | } |
7114 | |||
7115 | static void reap_sync_thread(mddev_t *mddev) | ||
7116 | { | ||
7117 | mdk_rdev_t *rdev; | ||
7118 | |||
7119 | /* resync has finished, collect result */ | ||
7120 | md_unregister_thread(mddev->sync_thread); | ||
7121 | mddev->sync_thread = NULL; | ||
7122 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && | ||
7123 | !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { | ||
7124 | /* success...*/ | ||
7125 | /* activate any spares */ | ||
7126 | if (mddev->pers->spare_active(mddev)) | ||
7127 | sysfs_notify(&mddev->kobj, NULL, | ||
7128 | "degraded"); | ||
7129 | } | ||
7130 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && | ||
7131 | mddev->pers->finish_reshape) | ||
7132 | mddev->pers->finish_reshape(mddev); | ||
7133 | md_update_sb(mddev, 1); | ||
7134 | |||
7135 | /* if array is no-longer degraded, then any saved_raid_disk | ||
7136 | * information must be scrapped | ||
7137 | */ | ||
7138 | if (!mddev->degraded) | ||
7139 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
7140 | rdev->saved_raid_disk = -1; | ||
7141 | |||
7142 | clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | ||
7143 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | ||
7144 | clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | ||
7145 | clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); | ||
7146 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); | ||
7147 | /* flag recovery needed just to double check */ | ||
7148 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
7149 | sysfs_notify_dirent_safe(mddev->sysfs_action); | ||
7150 | md_new_event(mddev); | ||
7151 | } | ||
7152 | |||
7029 | /* | 7153 | /* |
7030 | * This routine is regularly called by all per-raid-array threads to | 7154 | * This routine is regularly called by all per-raid-array threads to |
7031 | * deal with generic issues like resync and super-block update. | 7155 | * deal with generic issues like resync and super-block update. |
@@ -7050,8 +7174,8 @@ static int remove_and_add_spares(mddev_t *mddev) | |||
7050 | */ | 7174 | */ |
7051 | void md_check_recovery(mddev_t *mddev) | 7175 | void md_check_recovery(mddev_t *mddev) |
7052 | { | 7176 | { |
7053 | mdk_rdev_t *rdev; | 7177 | if (mddev->suspended) |
7054 | 7178 | return; | |
7055 | 7179 | ||
7056 | if (mddev->bitmap) | 7180 | if (mddev->bitmap) |
7057 | bitmap_daemon_work(mddev); | 7181 | bitmap_daemon_work(mddev); |
@@ -7087,7 +7211,20 @@ void md_check_recovery(mddev_t *mddev) | |||
7087 | /* Only thing we do on a ro array is remove | 7211 | /* Only thing we do on a ro array is remove |
7088 | * failed devices. | 7212 | * failed devices. |
7089 | */ | 7213 | */ |
7090 | remove_and_add_spares(mddev); | 7214 | mdk_rdev_t *rdev; |
7215 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
7216 | if (rdev->raid_disk >= 0 && | ||
7217 | !test_bit(Blocked, &rdev->flags) && | ||
7218 | test_bit(Faulty, &rdev->flags) && | ||
7219 | atomic_read(&rdev->nr_pending)==0) { | ||
7220 | if (mddev->pers->hot_remove_disk( | ||
7221 | mddev, rdev->raid_disk)==0) { | ||
7222 | char nm[20]; | ||
7223 | sprintf(nm,"rd%d", rdev->raid_disk); | ||
7224 | sysfs_remove_link(&mddev->kobj, nm); | ||
7225 | rdev->raid_disk = -1; | ||
7226 | } | ||
7227 | } | ||
7091 | clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 7228 | clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
7092 | goto unlock; | 7229 | goto unlock; |
7093 | } | 7230 | } |
@@ -7120,34 +7257,7 @@ void md_check_recovery(mddev_t *mddev) | |||
7120 | goto unlock; | 7257 | goto unlock; |
7121 | } | 7258 | } |
7122 | if (mddev->sync_thread) { | 7259 | if (mddev->sync_thread) { |
7123 | /* resync has finished, collect result */ | 7260 | reap_sync_thread(mddev); |
7124 | md_unregister_thread(mddev->sync_thread); | ||
7125 | mddev->sync_thread = NULL; | ||
7126 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && | ||
7127 | !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { | ||
7128 | /* success...*/ | ||
7129 | /* activate any spares */ | ||
7130 | if (mddev->pers->spare_active(mddev)) | ||
7131 | sysfs_notify(&mddev->kobj, NULL, | ||
7132 | "degraded"); | ||
7133 | } | ||
7134 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && | ||
7135 | mddev->pers->finish_reshape) | ||
7136 | mddev->pers->finish_reshape(mddev); | ||
7137 | md_update_sb(mddev, 1); | ||
7138 | |||
7139 | /* if array is no-longer degraded, then any saved_raid_disk | ||
7140 | * information must be scrapped | ||
7141 | */ | ||
7142 | if (!mddev->degraded) | ||
7143 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
7144 | rdev->saved_raid_disk = -1; | ||
7145 | |||
7146 | mddev->recovery = 0; | ||
7147 | /* flag recovery needed just to double check */ | ||
7148 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
7149 | sysfs_notify_dirent_safe(mddev->sysfs_action); | ||
7150 | md_new_event(mddev); | ||
7151 | goto unlock; | 7261 | goto unlock; |
7152 | } | 7262 | } |
7153 | /* Set RUNNING before clearing NEEDED to avoid | 7263 | /* Set RUNNING before clearing NEEDED to avoid |
@@ -7205,7 +7315,11 @@ void md_check_recovery(mddev_t *mddev) | |||
7205 | " thread...\n", | 7315 | " thread...\n", |
7206 | mdname(mddev)); | 7316 | mdname(mddev)); |
7207 | /* leave the spares where they are, it shouldn't hurt */ | 7317 | /* leave the spares where they are, it shouldn't hurt */ |
7208 | mddev->recovery = 0; | 7318 | clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); |
7319 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | ||
7320 | clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | ||
7321 | clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); | ||
7322 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); | ||
7209 | } else | 7323 | } else |
7210 | md_wakeup_thread(mddev->sync_thread); | 7324 | md_wakeup_thread(mddev->sync_thread); |
7211 | sysfs_notify_dirent_safe(mddev->sysfs_action); | 7325 | sysfs_notify_dirent_safe(mddev->sysfs_action); |
@@ -7278,12 +7392,23 @@ static void md_geninit(void) | |||
7278 | 7392 | ||
7279 | static int __init md_init(void) | 7393 | static int __init md_init(void) |
7280 | { | 7394 | { |
7281 | if (register_blkdev(MD_MAJOR, "md")) | 7395 | int ret = -ENOMEM; |
7282 | return -1; | 7396 | |
7283 | if ((mdp_major=register_blkdev(0, "mdp"))<=0) { | 7397 | md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); |
7284 | unregister_blkdev(MD_MAJOR, "md"); | 7398 | if (!md_wq) |
7285 | return -1; | 7399 | goto err_wq; |
7286 | } | 7400 | |
7401 | md_misc_wq = alloc_workqueue("md_misc", 0, 0); | ||
7402 | if (!md_misc_wq) | ||
7403 | goto err_misc_wq; | ||
7404 | |||
7405 | if ((ret = register_blkdev(MD_MAJOR, "md")) < 0) | ||
7406 | goto err_md; | ||
7407 | |||
7408 | if ((ret = register_blkdev(0, "mdp")) < 0) | ||
7409 | goto err_mdp; | ||
7410 | mdp_major = ret; | ||
7411 | |||
7287 | blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, | 7412 | blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, |
7288 | md_probe, NULL, NULL); | 7413 | md_probe, NULL, NULL); |
7289 | blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, | 7414 | blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, |
@@ -7294,8 +7419,16 @@ static int __init md_init(void) | |||
7294 | 7419 | ||
7295 | md_geninit(); | 7420 | md_geninit(); |
7296 | return 0; | 7421 | return 0; |
7297 | } | ||
7298 | 7422 | ||
7423 | err_mdp: | ||
7424 | unregister_blkdev(MD_MAJOR, "md"); | ||
7425 | err_md: | ||
7426 | destroy_workqueue(md_misc_wq); | ||
7427 | err_misc_wq: | ||
7428 | destroy_workqueue(md_wq); | ||
7429 | err_wq: | ||
7430 | return ret; | ||
7431 | } | ||
7299 | 7432 | ||
7300 | #ifndef MODULE | 7433 | #ifndef MODULE |
7301 | 7434 | ||
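The reworked md_init() above unwinds with cascading goto labels so each resource is released in reverse order of acquisition. A generic sketch of that error-path shape, with placeholder helpers rather than the real workqueue and blkdev calls:

```c
/* Generic shape of a goto-based init error path: undo only what succeeded,
 * in reverse order.  The init_xxx/cleanup_xxx helpers are placeholders. */
#include <stdbool.h>

static bool init_wq(void)      { return true; }
static bool init_misc_wq(void) { return true; }
static bool init_blkdev(void)  { return true; }
static void cleanup_wq(void)      { }
static void cleanup_misc_wq(void) { }

int subsystem_init(void)
{
	int ret = -1;

	if (!init_wq())
		goto err_wq;
	if (!init_misc_wq())
		goto err_misc_wq;
	if (!init_blkdev())
		goto err_blkdev;

	return 0;		/* everything registered */

err_blkdev:
	cleanup_misc_wq();	/* undo only what succeeded, in reverse order */
err_misc_wq:
	cleanup_wq();
err_wq:
	return ret;
}
```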
@@ -7382,6 +7515,8 @@ static __exit void md_exit(void) | |||
7382 | export_array(mddev); | 7515 | export_array(mddev); |
7383 | mddev->hold_active = 0; | 7516 | mddev->hold_active = 0; |
7384 | } | 7517 | } |
7518 | destroy_workqueue(md_misc_wq); | ||
7519 | destroy_workqueue(md_wq); | ||
7385 | } | 7520 | } |
7386 | 7521 | ||
7387 | subsys_initcall(md_init); | 7522 | subsys_initcall(md_init); |
diff --git a/drivers/md/md.h b/drivers/md/md.h index 3931299788dc..1c26c7a08ae6 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
@@ -29,26 +29,6 @@ | |||
29 | typedef struct mddev_s mddev_t; | 29 | typedef struct mddev_s mddev_t; |
30 | typedef struct mdk_rdev_s mdk_rdev_t; | 30 | typedef struct mdk_rdev_s mdk_rdev_t; |
31 | 31 | ||
32 | /* generic plugging support - like that provided with request_queue, | ||
33 | * but does not require a request_queue | ||
34 | */ | ||
35 | struct plug_handle { | ||
36 | void (*unplug_fn)(struct plug_handle *); | ||
37 | struct timer_list unplug_timer; | ||
38 | struct work_struct unplug_work; | ||
39 | unsigned long unplug_flag; | ||
40 | }; | ||
41 | #define PLUGGED_FLAG 1 | ||
42 | void plugger_init(struct plug_handle *plug, | ||
43 | void (*unplug_fn)(struct plug_handle *)); | ||
44 | void plugger_set_plug(struct plug_handle *plug); | ||
45 | int plugger_remove_plug(struct plug_handle *plug); | ||
46 | static inline void plugger_flush(struct plug_handle *plug) | ||
47 | { | ||
48 | del_timer_sync(&plug->unplug_timer); | ||
49 | cancel_work_sync(&plug->unplug_work); | ||
50 | } | ||
51 | |||
52 | /* | 32 | /* |
53 | * MD's 'extended' device | 33 | * MD's 'extended' device |
54 | */ | 34 | */ |
@@ -60,6 +40,12 @@ struct mdk_rdev_s | |||
60 | mddev_t *mddev; /* RAID array if running */ | 40 | mddev_t *mddev; /* RAID array if running */ |
61 | int last_events; /* IO event timestamp */ | 41 | int last_events; /* IO event timestamp */ |
62 | 42 | ||
43 | /* | ||
44 | * If meta_bdev is non-NULL, it means that a separate device is | ||
45 | * being used to store the metadata (superblock/bitmap) which | ||
46 | * would otherwise be contained on the same device as the data (bdev). | ||
47 | */ | ||
48 | struct block_device *meta_bdev; | ||
63 | struct block_device *bdev; /* block device handle */ | 49 | struct block_device *bdev; /* block device handle */ |
64 | 50 | ||
65 | struct page *sb_page; | 51 | struct page *sb_page; |
@@ -87,11 +73,8 @@ struct mdk_rdev_s | |||
87 | #define Faulty 1 /* device is known to have a fault */ | 73 | #define Faulty 1 /* device is known to have a fault */ |
88 | #define In_sync 2 /* device is in_sync with rest of array */ | 74 | #define In_sync 2 /* device is in_sync with rest of array */ |
89 | #define WriteMostly 4 /* Avoid reading if at all possible */ | 75 | #define WriteMostly 4 /* Avoid reading if at all possible */ |
90 | #define BarriersNotsupp 5 /* REQ_HARDBARRIER is not supported */ | ||
91 | #define AllReserved 6 /* If whole device is reserved for | ||
92 | * one array */ | ||
93 | #define AutoDetected 7 /* added by auto-detect */ | 76 | #define AutoDetected 7 /* added by auto-detect */ |
94 | #define Blocked 8 /* An error occured on an externally | 77 | #define Blocked 8 /* An error occurred on an externally |
95 | * managed array, don't allow writes | 78 | * managed array, don't allow writes |
96 | * until it is cleared */ | 79 | * until it is cleared */ |
97 | wait_queue_head_t blocked_wait; | 80 | wait_queue_head_t blocked_wait; |
@@ -141,6 +124,7 @@ struct mddev_s | |||
141 | #define MD_CHANGE_DEVS 0 /* Some device status has changed */ | 124 | #define MD_CHANGE_DEVS 0 /* Some device status has changed */ |
142 | #define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ | 125 | #define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ |
143 | #define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */ | 126 | #define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */ |
127 | #define MD_ARRAY_FIRST_USE 3 /* First use of array, needs initialization */ | ||
144 | 128 | ||
145 | int suspended; | 129 | int suspended; |
146 | atomic_t active_io; | 130 | atomic_t active_io; |
@@ -149,7 +133,8 @@ struct mddev_s | |||
149 | * are happening, so run/ | 133 | * are happening, so run/ |
150 | * takeover/stop are not safe | 134 | * takeover/stop are not safe |
151 | */ | 135 | */ |
152 | 136 | int ready; /* See when safe to pass | |
137 | * IO requests down */ | ||
153 | struct gendisk *gendisk; | 138 | struct gendisk *gendisk; |
154 | 139 | ||
155 | struct kobject kobj; | 140 | struct kobject kobj; |
@@ -195,6 +180,9 @@ struct mddev_s | |||
195 | int delta_disks, new_level, new_layout; | 180 | int delta_disks, new_level, new_layout; |
196 | int new_chunk_sectors; | 181 | int new_chunk_sectors; |
197 | 182 | ||
183 | atomic_t plug_cnt; /* If device is expecting | ||
184 | * more bios soon. | ||
185 | */ | ||
198 | struct mdk_thread_s *thread; /* management thread */ | 186 | struct mdk_thread_s *thread; /* management thread */ |
199 | struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ | 187 | struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ |
200 | sector_t curr_resync; /* last block scheduled */ | 188 | sector_t curr_resync; /* last block scheduled */ |
@@ -270,16 +258,11 @@ struct mddev_s | |||
270 | atomic_t active; /* general refcount */ | 258 | atomic_t active; /* general refcount */ |
271 | atomic_t openers; /* number of active opens */ | 259 | atomic_t openers; /* number of active opens */ |
272 | 260 | ||
261 | int changed; /* True if we might need to | ||
262 | * reread partition info */ | ||
273 | int degraded; /* whether md should consider | 263 | int degraded; /* whether md should consider |
274 | * adding a spare | 264 | * adding a spare |
275 | */ | 265 | */ |
276 | int barriers_work; /* initialised to true, cleared as soon | ||
277 | * as a barrier request to slave | ||
278 | * fails. Only supported | ||
279 | */ | ||
280 | struct bio *biolist; /* bios that need to be retried | ||
281 | * because REQ_HARDBARRIER is not supported | ||
282 | */ | ||
283 | 266 | ||
284 | atomic_t recovery_active; /* blocks scheduled, but not written */ | 267 | atomic_t recovery_active; /* blocks scheduled, but not written */ |
285 | wait_queue_head_t recovery_wait; | 268 | wait_queue_head_t recovery_wait; |
@@ -337,19 +320,18 @@ struct mddev_s | |||
337 | struct list_head all_mddevs; | 320 | struct list_head all_mddevs; |
338 | 321 | ||
339 | struct attribute_group *to_remove; | 322 | struct attribute_group *to_remove; |
340 | struct plug_handle *plug; /* if used by personality */ | 323 | |
341 | 324 | struct bio_set *bio_set; | |
342 | /* Generic barrier handling. | 325 | |
343 | * If there is a pending barrier request, all other | 326 | /* Generic flush handling. |
344 | * writes are blocked while the devices are flushed. | 327 | * The last to finish preflush schedules a worker to submit |
345 | * The last to finish a flush schedules a worker to | 328 | * the rest of the request (without the REQ_FLUSH flag). |
346 | * submit the barrier request (without the barrier flag), | ||
347 | * then submit more flush requests. | ||
348 | */ | 329 | */ |
349 | struct bio *barrier; | 330 | struct bio *flush_bio; |
350 | atomic_t flush_pending; | 331 | atomic_t flush_pending; |
351 | struct work_struct barrier_work; | 332 | struct work_struct flush_work; |
352 | struct work_struct event_work; /* used by dm to report failure event */ | 333 | struct work_struct event_work; /* used by dm to report failure event */ |
334 | void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); | ||
353 | }; | 335 | }; |
354 | 336 | ||
355 | 337 | ||
@@ -502,12 +484,12 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok); | |||
502 | extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); | 484 | extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); |
503 | 485 | ||
504 | extern int mddev_congested(mddev_t *mddev, int bits); | 486 | extern int mddev_congested(mddev_t *mddev, int bits); |
505 | extern void md_barrier_request(mddev_t *mddev, struct bio *bio); | 487 | extern void md_flush_request(mddev_t *mddev, struct bio *bio); |
506 | extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, | 488 | extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, |
507 | sector_t sector, int size, struct page *page); | 489 | sector_t sector, int size, struct page *page); |
508 | extern void md_super_wait(mddev_t *mddev); | 490 | extern void md_super_wait(mddev_t *mddev); |
509 | extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, | 491 | extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, |
510 | struct page *page, int rw); | 492 | struct page *page, int rw, bool metadata_op); |
511 | extern void md_do_sync(mddev_t *mddev); | 493 | extern void md_do_sync(mddev_t *mddev); |
512 | extern void md_new_event(mddev_t *mddev); | 494 | extern void md_new_event(mddev_t *mddev); |
513 | extern int md_allow_write(mddev_t *mddev); | 495 | extern int md_allow_write(mddev_t *mddev); |
@@ -518,7 +500,6 @@ extern int md_integrity_register(mddev_t *mddev); | |||
518 | extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); | 500 | extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); |
519 | extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); | 501 | extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); |
520 | extern void restore_bitmap_write_access(struct file *file); | 502 | extern void restore_bitmap_write_access(struct file *file); |
521 | extern void md_unplug(mddev_t *mddev); | ||
522 | 503 | ||
523 | extern void mddev_init(mddev_t *mddev); | 504 | extern void mddev_init(mddev_t *mddev); |
524 | extern int md_run(mddev_t *mddev); | 505 | extern int md_run(mddev_t *mddev); |
@@ -528,4 +509,9 @@ extern void md_rdev_init(mdk_rdev_t *rdev); | |||
528 | 509 | ||
529 | extern void mddev_suspend(mddev_t *mddev); | 510 | extern void mddev_suspend(mddev_t *mddev); |
530 | extern void mddev_resume(mddev_t *mddev); | 511 | extern void mddev_resume(mddev_t *mddev); |
512 | extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, | ||
513 | mddev_t *mddev); | ||
514 | extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, | ||
515 | mddev_t *mddev); | ||
516 | extern int mddev_check_plugged(mddev_t *mddev); | ||
531 | #endif /* _MD_MD_H */ | 517 | #endif /* _MD_MD_H */ |
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 0307d217e7a4..3535c23af288 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c | |||
@@ -106,44 +106,14 @@ static void multipath_end_request(struct bio *bio, int error) | |||
106 | rdev_dec_pending(rdev, conf->mddev); | 106 | rdev_dec_pending(rdev, conf->mddev); |
107 | } | 107 | } |
108 | 108 | ||
109 | static void unplug_slaves(mddev_t *mddev) | ||
110 | { | ||
111 | multipath_conf_t *conf = mddev->private; | ||
112 | int i; | ||
113 | |||
114 | rcu_read_lock(); | ||
115 | for (i=0; i<mddev->raid_disks; i++) { | ||
116 | mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev); | ||
117 | if (rdev && !test_bit(Faulty, &rdev->flags) | ||
118 | && atomic_read(&rdev->nr_pending)) { | ||
119 | struct request_queue *r_queue = bdev_get_queue(rdev->bdev); | ||
120 | |||
121 | atomic_inc(&rdev->nr_pending); | ||
122 | rcu_read_unlock(); | ||
123 | |||
124 | blk_unplug(r_queue); | ||
125 | |||
126 | rdev_dec_pending(rdev, mddev); | ||
127 | rcu_read_lock(); | ||
128 | } | ||
129 | } | ||
130 | rcu_read_unlock(); | ||
131 | } | ||
132 | |||
133 | static void multipath_unplug(struct request_queue *q) | ||
134 | { | ||
135 | unplug_slaves(q->queuedata); | ||
136 | } | ||
137 | |||
138 | |||
139 | static int multipath_make_request(mddev_t *mddev, struct bio * bio) | 109 | static int multipath_make_request(mddev_t *mddev, struct bio * bio) |
140 | { | 110 | { |
141 | multipath_conf_t *conf = mddev->private; | 111 | multipath_conf_t *conf = mddev->private; |
142 | struct multipath_bh * mp_bh; | 112 | struct multipath_bh * mp_bh; |
143 | struct multipath_info *multipath; | 113 | struct multipath_info *multipath; |
144 | 114 | ||
145 | if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { | 115 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { |
146 | md_barrier_request(mddev, bio); | 116 | md_flush_request(mddev, bio); |
147 | return 0; | 117 | return 0; |
148 | } | 118 | } |
149 | 119 | ||
@@ -176,7 +146,7 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev) | |||
176 | int i; | 146 | int i; |
177 | 147 | ||
178 | seq_printf (seq, " [%d/%d] [", conf->raid_disks, | 148 | seq_printf (seq, " [%d/%d] [", conf->raid_disks, |
179 | conf->working_disks); | 149 | conf->raid_disks - mddev->degraded); |
180 | for (i = 0; i < conf->raid_disks; i++) | 150 | for (i = 0; i < conf->raid_disks; i++) |
181 | seq_printf (seq, "%s", | 151 | seq_printf (seq, "%s", |
182 | conf->multipaths[i].rdev && | 152 | conf->multipaths[i].rdev && |
@@ -216,35 +186,36 @@ static int multipath_congested(void *data, int bits) | |||
216 | static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) | 186 | static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) |
217 | { | 187 | { |
218 | multipath_conf_t *conf = mddev->private; | 188 | multipath_conf_t *conf = mddev->private; |
189 | char b[BDEVNAME_SIZE]; | ||
219 | 190 | ||
220 | if (conf->working_disks <= 1) { | 191 | if (conf->raid_disks - mddev->degraded <= 1) { |
221 | /* | 192 | /* |
222 | * Uh oh, we can do nothing if this is our last path, but | 193 | * Uh oh, we can do nothing if this is our last path, but |
223 | * first check if this is a queued request for a device | 194 | * first check if this is a queued request for a device |
224 | * which has just failed. | 195 | * which has just failed. |
225 | */ | 196 | */ |
226 | printk(KERN_ALERT | 197 | printk(KERN_ALERT |
227 | "multipath: only one IO path left and IO error.\n"); | 198 | "multipath: only one IO path left and IO error.\n"); |
228 | /* leave it active... it's all we have */ | 199 | /* leave it active... it's all we have */ |
229 | } else { | 200 | return; |
230 | /* | ||
231 | * Mark disk as unusable | ||
232 | */ | ||
233 | if (!test_bit(Faulty, &rdev->flags)) { | ||
234 | char b[BDEVNAME_SIZE]; | ||
235 | clear_bit(In_sync, &rdev->flags); | ||
236 | set_bit(Faulty, &rdev->flags); | ||
237 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
238 | conf->working_disks--; | ||
239 | mddev->degraded++; | ||
240 | printk(KERN_ALERT "multipath: IO failure on %s," | ||
241 | " disabling IO path.\n" | ||
242 | "multipath: Operation continuing" | ||
243 | " on %d IO paths.\n", | ||
244 | bdevname (rdev->bdev,b), | ||
245 | conf->working_disks); | ||
246 | } | ||
247 | } | 201 | } |
202 | /* | ||
203 | * Mark disk as unusable | ||
204 | */ | ||
205 | if (test_and_clear_bit(In_sync, &rdev->flags)) { | ||
206 | unsigned long flags; | ||
207 | spin_lock_irqsave(&conf->device_lock, flags); | ||
208 | mddev->degraded++; | ||
209 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
210 | } | ||
211 | set_bit(Faulty, &rdev->flags); | ||
212 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
213 | printk(KERN_ALERT "multipath: IO failure on %s," | ||
214 | " disabling IO path.\n" | ||
215 | "multipath: Operation continuing" | ||
216 | " on %d IO paths.\n", | ||
217 | bdevname(rdev->bdev, b), | ||
218 | conf->raid_disks - mddev->degraded); | ||
248 | } | 219 | } |
249 | 220 | ||
250 | static void print_multipath_conf (multipath_conf_t *conf) | 221 | static void print_multipath_conf (multipath_conf_t *conf) |
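The reworked multipath_error() above clears In_sync exactly once and bumps mddev->degraded under the device lock, so the working-path count can always be derived as raid_disks - degraded instead of keeping a separate working_disks counter. A simplified userspace sketch of that accounting; the types and the locking are stand-ins:

```c
/* Sketch of the degraded-counter accounting in the reworked
 * multipath_error(); one lock stands in for both test_and_clear_bit()
 * and the device spinlock. */
#include <pthread.h>
#include <stdbool.h>

struct path  { bool in_sync; bool faulty; };
struct mpath {
	pthread_mutex_t lock;
	int raid_disks;
	int degraded;
	struct path *paths;
};

void path_error(struct mpath *c, struct path *p)
{
	if (c->raid_disks - c->degraded <= 1)
		return;			/* last path: keep it active */

	pthread_mutex_lock(&c->lock);
	if (p->in_sync) {		/* models test_and_clear_bit(In_sync) */
		p->in_sync = false;
		c->degraded++;		/* working paths = raid_disks - degraded */
	}
	pthread_mutex_unlock(&c->lock);
	p->faulty = true;
}
```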
@@ -257,7 +228,7 @@ static void print_multipath_conf (multipath_conf_t *conf) | |||
257 | printk("(conf==NULL)\n"); | 228 | printk("(conf==NULL)\n"); |
258 | return; | 229 | return; |
259 | } | 230 | } |
260 | printk(" --- wd:%d rd:%d\n", conf->working_disks, | 231 | printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, |
261 | conf->raid_disks); | 232 | conf->raid_disks); |
262 | 233 | ||
263 | for (i = 0; i < conf->raid_disks; i++) { | 234 | for (i = 0; i < conf->raid_disks; i++) { |
@@ -304,10 +275,11 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
304 | PAGE_CACHE_SIZE - 1); | 275 | PAGE_CACHE_SIZE - 1); |
305 | } | 276 | } |
306 | 277 | ||
307 | conf->working_disks++; | 278 | spin_lock_irq(&conf->device_lock); |
308 | mddev->degraded--; | 279 | mddev->degraded--; |
309 | rdev->raid_disk = path; | 280 | rdev->raid_disk = path; |
310 | set_bit(In_sync, &rdev->flags); | 281 | set_bit(In_sync, &rdev->flags); |
282 | spin_unlock_irq(&conf->device_lock); | ||
311 | rcu_assign_pointer(p->rdev, rdev); | 283 | rcu_assign_pointer(p->rdev, rdev); |
312 | err = 0; | 284 | err = 0; |
313 | md_integrity_add_rdev(rdev, mddev); | 285 | md_integrity_add_rdev(rdev, mddev); |
@@ -345,7 +317,7 @@ static int multipath_remove_disk(mddev_t *mddev, int number) | |||
345 | p->rdev = rdev; | 317 | p->rdev = rdev; |
346 | goto abort; | 318 | goto abort; |
347 | } | 319 | } |
348 | md_integrity_register(mddev); | 320 | err = md_integrity_register(mddev); |
349 | } | 321 | } |
350 | abort: | 322 | abort: |
351 | 323 | ||
@@ -421,6 +393,7 @@ static int multipath_run (mddev_t *mddev) | |||
421 | int disk_idx; | 393 | int disk_idx; |
422 | struct multipath_info *disk; | 394 | struct multipath_info *disk; |
423 | mdk_rdev_t *rdev; | 395 | mdk_rdev_t *rdev; |
396 | int working_disks; | ||
424 | 397 | ||
425 | if (md_check_no_bitmap(mddev)) | 398 | if (md_check_no_bitmap(mddev)) |
426 | return -EINVAL; | 399 | return -EINVAL; |
@@ -435,7 +408,6 @@ static int multipath_run (mddev_t *mddev) | |||
435 | * bookkeeping area. [whatever we allocate in multipath_run(), | 408 | * bookkeeping area. [whatever we allocate in multipath_run(), |
436 | * should be freed in multipath_stop()] | 409 | * should be freed in multipath_stop()] |
437 | */ | 410 | */ |
438 | mddev->queue->queue_lock = &mddev->queue->__queue_lock; | ||
439 | 411 | ||
440 | conf = kzalloc(sizeof(multipath_conf_t), GFP_KERNEL); | 412 | conf = kzalloc(sizeof(multipath_conf_t), GFP_KERNEL); |
441 | mddev->private = conf; | 413 | mddev->private = conf; |
@@ -455,7 +427,7 @@ static int multipath_run (mddev_t *mddev) | |||
455 | goto out_free_conf; | 427 | goto out_free_conf; |
456 | } | 428 | } |
457 | 429 | ||
458 | conf->working_disks = 0; | 430 | working_disks = 0; |
459 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 431 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
460 | disk_idx = rdev->raid_disk; | 432 | disk_idx = rdev->raid_disk; |
461 | if (disk_idx < 0 || | 433 | if (disk_idx < 0 || |
@@ -477,7 +449,7 @@ static int multipath_run (mddev_t *mddev) | |||
477 | } | 449 | } |
478 | 450 | ||
479 | if (!test_bit(Faulty, &rdev->flags)) | 451 | if (!test_bit(Faulty, &rdev->flags)) |
480 | conf->working_disks++; | 452 | working_disks++; |
481 | } | 453 | } |
482 | 454 | ||
483 | conf->raid_disks = mddev->raid_disks; | 455 | conf->raid_disks = mddev->raid_disks; |
@@ -485,12 +457,12 @@ static int multipath_run (mddev_t *mddev) | |||
485 | spin_lock_init(&conf->device_lock); | 457 | spin_lock_init(&conf->device_lock); |
486 | INIT_LIST_HEAD(&conf->retry_list); | 458 | INIT_LIST_HEAD(&conf->retry_list); |
487 | 459 | ||
488 | if (!conf->working_disks) { | 460 | if (!working_disks) { |
489 | printk(KERN_ERR "multipath: no operational IO paths for %s\n", | 461 | printk(KERN_ERR "multipath: no operational IO paths for %s\n", |
490 | mdname(mddev)); | 462 | mdname(mddev)); |
491 | goto out_free_conf; | 463 | goto out_free_conf; |
492 | } | 464 | } |
493 | mddev->degraded = conf->raid_disks - conf->working_disks; | 465 | mddev->degraded = conf->raid_disks - working_disks; |
494 | 466 | ||
495 | conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, | 467 | conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, |
496 | sizeof(struct multipath_bh)); | 468 | sizeof(struct multipath_bh)); |
@@ -512,16 +484,19 @@ static int multipath_run (mddev_t *mddev) | |||
512 | 484 | ||
513 | printk(KERN_INFO | 485 | printk(KERN_INFO |
514 | "multipath: array %s active with %d out of %d IO paths\n", | 486 | "multipath: array %s active with %d out of %d IO paths\n", |
515 | mdname(mddev), conf->working_disks, mddev->raid_disks); | 487 | mdname(mddev), conf->raid_disks - mddev->degraded, |
488 | mddev->raid_disks); | ||
516 | /* | 489 | /* |
517 | * Ok, everything is just fine now | 490 | * Ok, everything is just fine now |
518 | */ | 491 | */ |
519 | md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); | 492 | md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); |
520 | 493 | ||
521 | mddev->queue->unplug_fn = multipath_unplug; | ||
522 | mddev->queue->backing_dev_info.congested_fn = multipath_congested; | 494 | mddev->queue->backing_dev_info.congested_fn = multipath_congested; |
523 | mddev->queue->backing_dev_info.congested_data = mddev; | 495 | mddev->queue->backing_dev_info.congested_data = mddev; |
524 | md_integrity_register(mddev); | 496 | |
497 | if (md_integrity_register(mddev)) | ||
498 | goto out_free_conf; | ||
499 | |||
525 | return 0; | 500 | return 0; |
526 | 501 | ||
527 | out_free_conf: | 502 | out_free_conf: |
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h index d1c2a8d78395..3c5a45eb5f8a 100644 --- a/drivers/md/multipath.h +++ b/drivers/md/multipath.h | |||
@@ -9,7 +9,6 @@ struct multipath_private_data { | |||
9 | mddev_t *mddev; | 9 | mddev_t *mddev; |
10 | struct multipath_info *multipaths; | 10 | struct multipath_info *multipaths; |
11 | int raid_disks; | 11 | int raid_disks; |
12 | int working_disks; | ||
13 | spinlock_t device_lock; | 12 | spinlock_t device_lock; |
14 | struct list_head retry_list; | 13 | struct list_head retry_list; |
15 | 14 | ||
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 6f7af46d623c..e86bf3682e1e 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
@@ -25,21 +25,6 @@ | |||
25 | #include "raid0.h" | 25 | #include "raid0.h" |
26 | #include "raid5.h" | 26 | #include "raid5.h" |
27 | 27 | ||
28 | static void raid0_unplug(struct request_queue *q) | ||
29 | { | ||
30 | mddev_t *mddev = q->queuedata; | ||
31 | raid0_conf_t *conf = mddev->private; | ||
32 | mdk_rdev_t **devlist = conf->devlist; | ||
33 | int raid_disks = conf->strip_zone[0].nb_dev; | ||
34 | int i; | ||
35 | |||
36 | for (i=0; i < raid_disks; i++) { | ||
37 | struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev); | ||
38 | |||
39 | blk_unplug(r_queue); | ||
40 | } | ||
41 | } | ||
42 | |||
43 | static int raid0_congested(void *data, int bits) | 28 | static int raid0_congested(void *data, int bits) |
44 | { | 29 | { |
45 | mddev_t *mddev = data; | 30 | mddev_t *mddev = data; |
@@ -179,6 +164,14 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) | |||
179 | rdev1->new_raid_disk = j; | 164 | rdev1->new_raid_disk = j; |
180 | } | 165 | } |
181 | 166 | ||
167 | if (mddev->level == 1) { | ||
168 | /* taking over a raid1 array - | ||
169 | * we have only one active disk | ||
170 | */ | ||
171 | j = 0; | ||
172 | rdev1->new_raid_disk = j; | ||
173 | } | ||
174 | |||
182 | if (j < 0 || j >= mddev->raid_disks) { | 175 | if (j < 0 || j >= mddev->raid_disks) { |
183 | printk(KERN_ERR "md/raid0:%s: bad disk number %d - " | 176 | printk(KERN_ERR "md/raid0:%s: bad disk number %d - " |
184 | "aborting!\n", mdname(mddev), j); | 177 | "aborting!\n", mdname(mddev), j); |
@@ -264,7 +257,6 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) | |||
264 | mdname(mddev), | 257 | mdname(mddev), |
265 | (unsigned long long)smallest->sectors); | 258 | (unsigned long long)smallest->sectors); |
266 | } | 259 | } |
267 | mddev->queue->unplug_fn = raid0_unplug; | ||
268 | mddev->queue->backing_dev_info.congested_fn = raid0_congested; | 260 | mddev->queue->backing_dev_info.congested_fn = raid0_congested; |
269 | mddev->queue->backing_dev_info.congested_data = mddev; | 261 | mddev->queue->backing_dev_info.congested_data = mddev; |
270 | 262 | ||
@@ -353,7 +345,6 @@ static int raid0_run(mddev_t *mddev) | |||
353 | if (md_check_no_bitmap(mddev)) | 345 | if (md_check_no_bitmap(mddev)) |
354 | return -EINVAL; | 346 | return -EINVAL; |
355 | blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); | 347 | blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); |
356 | mddev->queue->queue_lock = &mddev->queue->__queue_lock; | ||
357 | 348 | ||
358 | /* if private is not null, we are here after takeover */ | 349 | /* if private is not null, we are here after takeover */ |
359 | if (mddev->private == NULL) { | 350 | if (mddev->private == NULL) { |
@@ -388,8 +379,7 @@ static int raid0_run(mddev_t *mddev) | |||
388 | 379 | ||
389 | blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); | 380 | blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); |
390 | dump_zones(mddev); | 381 | dump_zones(mddev); |
391 | md_integrity_register(mddev); | 382 | return md_integrity_register(mddev); |
392 | return 0; | ||
393 | } | 383 | } |
394 | 384 | ||
395 | static int raid0_stop(mddev_t *mddev) | 385 | static int raid0_stop(mddev_t *mddev) |
@@ -483,8 +473,8 @@ static int raid0_make_request(mddev_t *mddev, struct bio *bio) | |||
483 | struct strip_zone *zone; | 473 | struct strip_zone *zone; |
484 | mdk_rdev_t *tmp_dev; | 474 | mdk_rdev_t *tmp_dev; |
485 | 475 | ||
486 | if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { | 476 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { |
487 | md_barrier_request(mddev, bio); | 477 | md_flush_request(mddev, bio); |
488 | return 0; | 478 | return 0; |
489 | } | 479 | } |
490 | 480 | ||
@@ -644,12 +634,39 @@ static void *raid0_takeover_raid10(mddev_t *mddev) | |||
644 | return priv_conf; | 634 | return priv_conf; |
645 | } | 635 | } |
646 | 636 | ||
637 | static void *raid0_takeover_raid1(mddev_t *mddev) | ||
638 | { | ||
639 | raid0_conf_t *priv_conf; | ||
640 | |||
641 | /* Check layout: | ||
642 | * - (N - 1) mirror drives must be already faulty | ||
643 | */ | ||
644 | if ((mddev->raid_disks - 1) != mddev->degraded) { | ||
645 | printk(KERN_ERR "md/raid0:%s: (N - 1) mirror drives must be already faulty!\n", | ||
646 | mdname(mddev)); | ||
647 | return ERR_PTR(-EINVAL); | ||
648 | } | ||
649 | |||
650 | /* Set new parameters */ | ||
651 | mddev->new_level = 0; | ||
652 | mddev->new_layout = 0; | ||
653 | mddev->new_chunk_sectors = 128; /* by default set chunk size to 64k */ | ||
654 | mddev->delta_disks = 1 - mddev->raid_disks; | ||
655 | mddev->raid_disks = 1; | ||
656 | /* make sure it will be not marked as dirty */ | ||
657 | mddev->recovery_cp = MaxSector; | ||
658 | |||
659 | create_strip_zones(mddev, &priv_conf); | ||
660 | return priv_conf; | ||
661 | } | ||
662 | |||
647 | static void *raid0_takeover(mddev_t *mddev) | 663 | static void *raid0_takeover(mddev_t *mddev) |
648 | { | 664 | { |
649 | /* raid0 can take over: | 665 | /* raid0 can take over: |
650 | * raid4 - if all data disks are active. | 666 | * raid4 - if all data disks are active. |
651 | * raid5 - providing it is Raid4 layout and one disk is faulty | 667 | * raid5 - providing it is Raid4 layout and one disk is faulty |
652 | * raid10 - assuming we have all necessary active disks | 668 | * raid10 - assuming we have all necessary active disks |
669 | * raid1 - with (N -1) mirror drives faulty | ||
653 | */ | 670 | */ |
654 | if (mddev->level == 4) | 671 | if (mddev->level == 4) |
655 | return raid0_takeover_raid45(mddev); | 672 | return raid0_takeover_raid45(mddev); |
@@ -665,6 +682,12 @@ static void *raid0_takeover(mddev_t *mddev) | |||
665 | if (mddev->level == 10) | 682 | if (mddev->level == 10) |
666 | return raid0_takeover_raid10(mddev); | 683 | return raid0_takeover_raid10(mddev); |
667 | 684 | ||
685 | if (mddev->level == 1) | ||
686 | return raid0_takeover_raid1(mddev); | ||
687 | |||
688 | printk(KERN_ERR "Takeover from raid%i to raid0 not supported\n", | ||
689 | mddev->level); | ||
690 | |||
668 | return ERR_PTR(-EINVAL); | 691 | return ERR_PTR(-EINVAL); |
669 | } | 692 | } |
670 | 693 | ||
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 0b830bbe1d8b..f7431b6d8447 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -52,23 +52,16 @@ | |||
52 | #define NR_RAID1_BIOS 256 | 52 | #define NR_RAID1_BIOS 256 |
53 | 53 | ||
54 | 54 | ||
55 | static void unplug_slaves(mddev_t *mddev); | ||
56 | |||
57 | static void allow_barrier(conf_t *conf); | 55 | static void allow_barrier(conf_t *conf); |
58 | static void lower_barrier(conf_t *conf); | 56 | static void lower_barrier(conf_t *conf); |
59 | 57 | ||
60 | static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) | 58 | static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) |
61 | { | 59 | { |
62 | struct pool_info *pi = data; | 60 | struct pool_info *pi = data; |
63 | r1bio_t *r1_bio; | ||
64 | int size = offsetof(r1bio_t, bios[pi->raid_disks]); | 61 | int size = offsetof(r1bio_t, bios[pi->raid_disks]); |
65 | 62 | ||
66 | /* allocate a r1bio with room for raid_disks entries in the bios array */ | 63 | /* allocate a r1bio with room for raid_disks entries in the bios array */ |
67 | r1_bio = kzalloc(size, gfp_flags); | 64 | return kzalloc(size, gfp_flags); |
68 | if (!r1_bio && pi->mddev) | ||
69 | unplug_slaves(pi->mddev); | ||
70 | |||
71 | return r1_bio; | ||
72 | } | 65 | } |
73 | 66 | ||
74 | static void r1bio_pool_free(void *r1_bio, void *data) | 67 | static void r1bio_pool_free(void *r1_bio, void *data) |
@@ -91,16 +84,14 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) | |||
91 | int i, j; | 84 | int i, j; |
92 | 85 | ||
93 | r1_bio = r1bio_pool_alloc(gfp_flags, pi); | 86 | r1_bio = r1bio_pool_alloc(gfp_flags, pi); |
94 | if (!r1_bio) { | 87 | if (!r1_bio) |
95 | unplug_slaves(pi->mddev); | ||
96 | return NULL; | 88 | return NULL; |
97 | } | ||
98 | 89 | ||
99 | /* | 90 | /* |
100 | * Allocate bios : 1 for reading, n-1 for writing | 91 | * Allocate bios : 1 for reading, n-1 for writing |
101 | */ | 92 | */ |
102 | for (j = pi->raid_disks ; j-- ; ) { | 93 | for (j = pi->raid_disks ; j-- ; ) { |
103 | bio = bio_alloc(gfp_flags, RESYNC_PAGES); | 94 | bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); |
104 | if (!bio) | 95 | if (!bio) |
105 | goto out_free_bio; | 96 | goto out_free_bio; |
106 | r1_bio->bios[j] = bio; | 97 | r1_bio->bios[j] = bio; |
@@ -306,6 +297,29 @@ static void raid1_end_read_request(struct bio *bio, int error) | |||
306 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); | 297 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); |
307 | } | 298 | } |
308 | 299 | ||
300 | static void r1_bio_write_done(r1bio_t *r1_bio) | ||
301 | { | ||
302 | if (atomic_dec_and_test(&r1_bio->remaining)) | ||
303 | { | ||
304 | /* it really is the end of this request */ | ||
305 | if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { | ||
306 | /* free extra copy of the data pages */ | ||
307 | int i = r1_bio->behind_page_count; | ||
308 | while (i--) | ||
309 | safe_put_page(r1_bio->behind_pages[i]); | ||
310 | kfree(r1_bio->behind_pages); | ||
311 | r1_bio->behind_pages = NULL; | ||
312 | } | ||
313 | /* clear the bitmap if all writes complete successfully */ | ||
314 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, | ||
315 | r1_bio->sectors, | ||
316 | !test_bit(R1BIO_Degraded, &r1_bio->state), | ||
317 | test_bit(R1BIO_BehindIO, &r1_bio->state)); | ||
318 | md_write_end(r1_bio->mddev); | ||
319 | raid_end_bio_io(r1_bio); | ||
320 | } | ||
321 | } | ||
322 | |||
309 | static void raid1_end_write_request(struct bio *bio, int error) | 323 | static void raid1_end_write_request(struct bio *bio, int error) |
310 | { | 324 | { |
311 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 325 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
@@ -319,84 +333,61 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
319 | if (r1_bio->bios[mirror] == bio) | 333 | if (r1_bio->bios[mirror] == bio) |
320 | break; | 334 | break; |
321 | 335 | ||
322 | if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) { | 336 | /* |
323 | set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags); | 337 | * 'one mirror IO has finished' event handler: |
324 | set_bit(R1BIO_BarrierRetry, &r1_bio->state); | 338 | */ |
325 | r1_bio->mddev->barriers_work = 0; | 339 | r1_bio->bios[mirror] = NULL; |
326 | /* Don't rdev_dec_pending in this branch - keep it for the retry */ | 340 | to_put = bio; |
327 | } else { | 341 | if (!uptodate) { |
342 | md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); | ||
343 | /* an I/O failed, we can't clear the bitmap */ | ||
344 | set_bit(R1BIO_Degraded, &r1_bio->state); | ||
345 | } else | ||
328 | /* | 346 | /* |
329 | * this branch is our 'one mirror IO has finished' event handler: | 347 | * Set R1BIO_Uptodate in our master bio, so that we |
348 | * will return a good error code to the higher ||
349 | * levels even if IO on some other mirrored buffer | ||
350 | * fails. | ||
351 | * | ||
352 | * The 'master' represents the composite IO operation | ||
353 | * to user-side. So if something waits for IO, then it | ||
354 | * will wait for the 'master' bio. | ||
330 | */ | 355 | */ |
331 | r1_bio->bios[mirror] = NULL; | 356 | set_bit(R1BIO_Uptodate, &r1_bio->state); |
332 | to_put = bio; | 357 | |
333 | if (!uptodate) { | 358 | update_head_pos(mirror, r1_bio); |
334 | md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); | 359 | |
335 | /* an I/O failed, we can't clear the bitmap */ | 360 | if (behind) { |
336 | set_bit(R1BIO_Degraded, &r1_bio->state); | 361 | if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) |
337 | } else | 362 | atomic_dec(&r1_bio->behind_remaining); |
338 | /* | 363 | |
339 | * Set R1BIO_Uptodate in our master bio, so that | 364 | /* |
340 | * we will return a good error code for to the higher | 365 | * In behind mode, we ACK the master bio once the I/O |
341 | * levels even if IO on some other mirrored buffer fails. | 366 | * has safely reached all non-writemostly |
342 | * | 367 | * disks. Setting the Returned bit ensures that this |
343 | * The 'master' represents the composite IO operation to | 368 | * gets done only once -- we don't ever want to return |
344 | * user-side. So if something waits for IO, then it will | 369 | * -EIO here, instead we'll wait |
345 | * wait for the 'master' bio. | 370 | */ |
346 | */ | 371 | if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && |
347 | set_bit(R1BIO_Uptodate, &r1_bio->state); | 372 | test_bit(R1BIO_Uptodate, &r1_bio->state)) { |
348 | 373 | /* Maybe we can return now */ | |
349 | update_head_pos(mirror, r1_bio); | 374 | if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { |
350 | 375 | struct bio *mbio = r1_bio->master_bio; | |
351 | if (behind) { | 376 | PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", |
352 | if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) | 377 | (unsigned long long) mbio->bi_sector, |
353 | atomic_dec(&r1_bio->behind_remaining); | 378 | (unsigned long long) mbio->bi_sector + |
354 | 379 | (mbio->bi_size >> 9) - 1); | |
355 | /* In behind mode, we ACK the master bio once the I/O has safely | 380 | bio_endio(mbio, 0); |
356 | * reached all non-writemostly disks. Setting the Returned bit | ||
357 | * ensures that this gets done only once -- we don't ever want to | ||
358 | * return -EIO here, instead we'll wait */ | ||
359 | |||
360 | if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && | ||
361 | test_bit(R1BIO_Uptodate, &r1_bio->state)) { | ||
362 | /* Maybe we can return now */ | ||
363 | if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { | ||
364 | struct bio *mbio = r1_bio->master_bio; | ||
365 | PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", | ||
366 | (unsigned long long) mbio->bi_sector, | ||
367 | (unsigned long long) mbio->bi_sector + | ||
368 | (mbio->bi_size >> 9) - 1); | ||
369 | bio_endio(mbio, 0); | ||
370 | } | ||
371 | } | 381 | } |
372 | } | 382 | } |
373 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); | ||
374 | } | 383 | } |
384 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); | ||
385 | |||
375 | /* | 386 | /* |
376 | * | ||
377 | * Let's see if all mirrored write operations have finished | 387 | * Let's see if all mirrored write operations have finished |
378 | * already. | 388 | * already. |
379 | */ | 389 | */ |
380 | if (atomic_dec_and_test(&r1_bio->remaining)) { | 390 | r1_bio_write_done(r1_bio); |
381 | if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) | ||
382 | reschedule_retry(r1_bio); | ||
383 | else { | ||
384 | /* it really is the end of this request */ | ||
385 | if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { | ||
386 | /* free extra copy of the data pages */ | ||
387 | int i = bio->bi_vcnt; | ||
388 | while (i--) | ||
389 | safe_put_page(bio->bi_io_vec[i].bv_page); | ||
390 | } | ||
391 | /* clear the bitmap if all writes complete successfully */ | ||
392 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, | ||
393 | r1_bio->sectors, | ||
394 | !test_bit(R1BIO_Degraded, &r1_bio->state), | ||
395 | behind); | ||
396 | md_write_end(r1_bio->mddev); | ||
397 | raid_end_bio_io(r1_bio); | ||
398 | } | ||
399 | } | ||
400 | 391 | ||
401 | if (to_put) | 392 | if (to_put) |
402 | bio_put(to_put); | 393 | bio_put(to_put); |
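All of the completion bookkeeping in raid1_end_write_request() now funnels through the new r1_bio_write_done() helper, which only acts when the last outstanding mirror write drops the remaining count to zero. A rough userspace sketch of that pattern, with C11 atomics standing in for the kernel's atomic_t:

/* Sketch of the "last completer finishes the request" pattern used by
 * r1_bio_write_done(); plain userspace C, not kernel code. */
#include <stdatomic.h>
#include <stdio.h>

struct request {
    atomic_int remaining;    /* primed to 1, +1 per submitted mirror write */
};

static void write_done(struct request *r)
{
    /* only the caller that drops the count to zero ends the request */
    if (atomic_fetch_sub(&r->remaining, 1) == 1)
        printf("all mirror writes finished, ending master bio\n");
}

int main(void)
{
    struct request r = { .remaining = 1 };
    atomic_fetch_add(&r.remaining, 1);    /* first mirror write queued */
    atomic_fetch_add(&r.remaining, 1);    /* second mirror write queued */
    write_done(&r);    /* submitter drops its priming reference */
    write_done(&r);    /* first mirror completes */
    write_done(&r);    /* second mirror completes -> request ends */
    return 0;
}

make_request() below primes remaining to 1 and calls r1_bio_write_done() itself once all clones are queued, so the request cannot complete while writes are still being set up.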
@@ -420,11 +411,13 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
420 | static int read_balance(conf_t *conf, r1bio_t *r1_bio) | 411 | static int read_balance(conf_t *conf, r1bio_t *r1_bio) |
421 | { | 412 | { |
422 | const sector_t this_sector = r1_bio->sector; | 413 | const sector_t this_sector = r1_bio->sector; |
423 | int new_disk = conf->last_used, disk = new_disk; | ||
424 | int wonly_disk = -1; | ||
425 | const int sectors = r1_bio->sectors; | 414 | const int sectors = r1_bio->sectors; |
426 | sector_t new_distance, current_distance; | 415 | int start_disk; |
416 | int best_disk; | ||
417 | int i; | ||
418 | sector_t best_dist; | ||
427 | mdk_rdev_t *rdev; | 419 | mdk_rdev_t *rdev; |
420 | int choose_first; | ||
428 | 421 | ||
429 | rcu_read_lock(); | 422 | rcu_read_lock(); |
430 | /* | 423 | /* |
@@ -433,100 +426,63 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
433 | * We take the first readable disk when above the resync window. | 426 | * We take the first readable disk when above the resync window. |
434 | */ | 427 | */ |
435 | retry: | 428 | retry: |
429 | best_disk = -1; | ||
430 | best_dist = MaxSector; | ||
436 | if (conf->mddev->recovery_cp < MaxSector && | 431 | if (conf->mddev->recovery_cp < MaxSector && |
437 | (this_sector + sectors >= conf->next_resync)) { | 432 | (this_sector + sectors >= conf->next_resync)) { |
438 | /* Choose the first operational device, for consistancy */ | 433 | choose_first = 1; |
439 | new_disk = 0; | 434 | start_disk = 0; |
440 | 435 | } else { | |
441 | for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); | 436 | choose_first = 0; |
442 | r1_bio->bios[new_disk] == IO_BLOCKED || | 437 | start_disk = conf->last_used; |
443 | !rdev || !test_bit(In_sync, &rdev->flags) | ||
444 | || test_bit(WriteMostly, &rdev->flags); | ||
445 | rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) { | ||
446 | |||
447 | if (rdev && test_bit(In_sync, &rdev->flags) && | ||
448 | r1_bio->bios[new_disk] != IO_BLOCKED) | ||
449 | wonly_disk = new_disk; | ||
450 | |||
451 | if (new_disk == conf->raid_disks - 1) { | ||
452 | new_disk = wonly_disk; | ||
453 | break; | ||
454 | } | ||
455 | } | ||
456 | goto rb_out; | ||
457 | } | ||
458 | |||
459 | |||
460 | /* make sure the disk is operational */ | ||
461 | for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); | ||
462 | r1_bio->bios[new_disk] == IO_BLOCKED || | ||
463 | !rdev || !test_bit(In_sync, &rdev->flags) || | ||
464 | test_bit(WriteMostly, &rdev->flags); | ||
465 | rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) { | ||
466 | |||
467 | if (rdev && test_bit(In_sync, &rdev->flags) && | ||
468 | r1_bio->bios[new_disk] != IO_BLOCKED) | ||
469 | wonly_disk = new_disk; | ||
470 | |||
471 | if (new_disk <= 0) | ||
472 | new_disk = conf->raid_disks; | ||
473 | new_disk--; | ||
474 | if (new_disk == disk) { | ||
475 | new_disk = wonly_disk; | ||
476 | break; | ||
477 | } | ||
478 | } | 438 | } |
479 | 439 | ||
480 | if (new_disk < 0) | 440 | for (i = 0 ; i < conf->raid_disks ; i++) { |
481 | goto rb_out; | 441 | sector_t dist; |
482 | 442 | int disk = start_disk + i; | |
483 | disk = new_disk; | 443 | if (disk >= conf->raid_disks) |
484 | /* now disk == new_disk == starting point for search */ | 444 | disk -= conf->raid_disks; |
485 | |||
486 | /* | ||
487 | * Don't change to another disk for sequential reads: | ||
488 | */ | ||
489 | if (conf->next_seq_sect == this_sector) | ||
490 | goto rb_out; | ||
491 | if (this_sector == conf->mirrors[new_disk].head_position) | ||
492 | goto rb_out; | ||
493 | |||
494 | current_distance = abs(this_sector - conf->mirrors[disk].head_position); | ||
495 | |||
496 | /* Find the disk whose head is closest */ | ||
497 | |||
498 | do { | ||
499 | if (disk <= 0) | ||
500 | disk = conf->raid_disks; | ||
501 | disk--; | ||
502 | 445 | ||
503 | rdev = rcu_dereference(conf->mirrors[disk].rdev); | 446 | rdev = rcu_dereference(conf->mirrors[disk].rdev); |
504 | 447 | if (r1_bio->bios[disk] == IO_BLOCKED | |
505 | if (!rdev || r1_bio->bios[disk] == IO_BLOCKED || | 448 | || rdev == NULL |
506 | !test_bit(In_sync, &rdev->flags) || | 449 | || test_bit(Faulty, &rdev->flags)) |
507 | test_bit(WriteMostly, &rdev->flags)) | ||
508 | continue; | 450 | continue; |
509 | 451 | if (!test_bit(In_sync, &rdev->flags) && | |
510 | if (!atomic_read(&rdev->nr_pending)) { | 452 | rdev->recovery_offset < this_sector + sectors) |
511 | new_disk = disk; | 453 | continue; |
454 | if (test_bit(WriteMostly, &rdev->flags)) { | ||
455 | /* Don't balance among write-mostly, just | ||
456 | * use the first as a last resort */ | ||
457 | if (best_disk < 0) | ||
458 | best_disk = disk; | ||
459 | continue; | ||
460 | } | ||
461 | /* This is a reasonable device to use. It might | ||
462 | * even be best. | ||
463 | */ | ||
464 | dist = abs(this_sector - conf->mirrors[disk].head_position); | ||
465 | if (choose_first | ||
466 | /* Don't change to another disk for sequential reads */ | ||
467 | || conf->next_seq_sect == this_sector | ||
468 | || dist == 0 | ||
469 | /* If device is idle, use it */ | ||
470 | || atomic_read(&rdev->nr_pending) == 0) { | ||
471 | best_disk = disk; | ||
512 | break; | 472 | break; |
513 | } | 473 | } |
514 | new_distance = abs(this_sector - conf->mirrors[disk].head_position); | 474 | if (dist < best_dist) { |
515 | if (new_distance < current_distance) { | 475 | best_dist = dist; |
516 | current_distance = new_distance; | 476 | best_disk = disk; |
517 | new_disk = disk; | ||
518 | } | 477 | } |
519 | } while (disk != conf->last_used); | 478 | } |
520 | |||
521 | rb_out: | ||
522 | |||
523 | 479 | ||
524 | if (new_disk >= 0) { | 480 | if (best_disk >= 0) { |
525 | rdev = rcu_dereference(conf->mirrors[new_disk].rdev); | 481 | rdev = rcu_dereference(conf->mirrors[best_disk].rdev); |
526 | if (!rdev) | 482 | if (!rdev) |
527 | goto retry; | 483 | goto retry; |
528 | atomic_inc(&rdev->nr_pending); | 484 | atomic_inc(&rdev->nr_pending); |
529 | if (!test_bit(In_sync, &rdev->flags)) { | 485 | if (test_bit(Faulty, &rdev->flags)) { |
530 | /* cannot risk returning a device that failed | 486 | /* cannot risk returning a device that failed |
531 | * before we inc'ed nr_pending | 487 | * before we inc'ed nr_pending |
532 | */ | 488 | */ |
@@ -534,59 +490,26 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
534 | goto retry; | 490 | goto retry; |
535 | } | 491 | } |
536 | conf->next_seq_sect = this_sector + sectors; | 492 | conf->next_seq_sect = this_sector + sectors; |
537 | conf->last_used = new_disk; | 493 | conf->last_used = best_disk; |
538 | } | 494 | } |
539 | rcu_read_unlock(); | 495 | rcu_read_unlock(); |
540 | 496 | ||
541 | return new_disk; | 497 | return best_disk; |
542 | } | 498 | } |
543 | 499 | ||
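The rewritten read_balance() collapses the old multi-pass search into a single loop over the mirrors. A simplified userspace model is shown below; the resync-window choose_first case is omitted and the disk array is an illustrative stand-in for conf->mirrors[].

/* Userspace model of the rewritten read_balance() selection loop. */
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>

struct disk {
    int faulty;
    int write_mostly;
    int pending;            /* in-flight requests */
    long head_position;     /* last serviced sector */
};

static int pick_disk(struct disk *d, int ndisks, int start, long sector)
{
    int best = -1;
    long best_dist = LONG_MAX;

    for (int i = 0; i < ndisks; i++) {
        int disk = (start + i) % ndisks;
        long dist;

        if (d[disk].faulty)
            continue;
        if (d[disk].write_mostly) {
            if (best < 0)
                best = disk;        /* last resort only */
            continue;
        }
        dist = labs(sector - d[disk].head_position);
        if (dist == 0 || d[disk].pending == 0)
            return disk;            /* sequential or idle: take it now */
        if (dist < best_dist) {
            best_dist = dist;
            best = disk;
        }
    }
    return best;
}

int main(void)
{
    struct disk d[2] = { { .head_position = 100, .pending = 3 },
                         { .head_position = 5000, .pending = 1 } };
    printf("chose disk %d\n", pick_disk(d, 2, 0, 120));
    return 0;
}

Returning immediately for a zero distance or an idle device keeps the old preference for sequential readers and unloaded disks, while write-mostly members are only ever used when nothing else qualifies.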
544 | static void unplug_slaves(mddev_t *mddev) | 500 | int md_raid1_congested(mddev_t *mddev, int bits) |
545 | { | 501 | { |
546 | conf_t *conf = mddev->private; | 502 | conf_t *conf = mddev->private; |
547 | int i; | ||
548 | |||
549 | rcu_read_lock(); | ||
550 | for (i=0; i<mddev->raid_disks; i++) { | ||
551 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); | ||
552 | if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { | ||
553 | struct request_queue *r_queue = bdev_get_queue(rdev->bdev); | ||
554 | |||
555 | atomic_inc(&rdev->nr_pending); | ||
556 | rcu_read_unlock(); | ||
557 | |||
558 | blk_unplug(r_queue); | ||
559 | |||
560 | rdev_dec_pending(rdev, mddev); | ||
561 | rcu_read_lock(); | ||
562 | } | ||
563 | } | ||
564 | rcu_read_unlock(); | ||
565 | } | ||
566 | |||
567 | static void raid1_unplug(struct request_queue *q) | ||
568 | { | ||
569 | mddev_t *mddev = q->queuedata; | ||
570 | |||
571 | unplug_slaves(mddev); | ||
572 | md_wakeup_thread(mddev->thread); | ||
573 | } | ||
574 | |||
575 | static int raid1_congested(void *data, int bits) | ||
576 | { | ||
577 | mddev_t *mddev = data; | ||
578 | conf_t *conf = mddev->private; | ||
579 | int i, ret = 0; | 503 | int i, ret = 0; |
580 | 504 | ||
581 | if (mddev_congested(mddev, bits)) | ||
582 | return 1; | ||
583 | |||
584 | rcu_read_lock(); | 505 | rcu_read_lock(); |
585 | for (i = 0; i < mddev->raid_disks; i++) { | 506 | for (i = 0; i < mddev->raid_disks; i++) { |
586 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); | 507 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); |
587 | if (rdev && !test_bit(Faulty, &rdev->flags)) { | 508 | if (rdev && !test_bit(Faulty, &rdev->flags)) { |
588 | struct request_queue *q = bdev_get_queue(rdev->bdev); | 509 | struct request_queue *q = bdev_get_queue(rdev->bdev); |
589 | 510 | ||
511 | BUG_ON(!q); | ||
512 | |||
590 | /* Note the '|| 1' - when read_balance prefers | 513 | /* Note the '|| 1' - when read_balance prefers |
591 | * non-congested targets, it can be removed | 514 | * non-congested targets, it can be removed |
592 | */ | 515 | */ |
@@ -599,22 +522,26 @@ static int raid1_congested(void *data, int bits) | |||
599 | rcu_read_unlock(); | 522 | rcu_read_unlock(); |
600 | return ret; | 523 | return ret; |
601 | } | 524 | } |
525 | EXPORT_SYMBOL_GPL(md_raid1_congested); | ||
602 | 526 | ||
527 | static int raid1_congested(void *data, int bits) | ||
528 | { | ||
529 | mddev_t *mddev = data; | ||
603 | 530 | ||
604 | static int flush_pending_writes(conf_t *conf) | 531 | return mddev_congested(mddev, bits) || |
532 | md_raid1_congested(mddev, bits); | ||
533 | } | ||
534 | |||
535 | static void flush_pending_writes(conf_t *conf) | ||
605 | { | 536 | { |
606 | /* Any writes that have been queued but are awaiting | 537 | /* Any writes that have been queued but are awaiting |
607 | * bitmap updates get flushed here. | 538 | * bitmap updates get flushed here. |
608 | * We return 1 if any requests were actually submitted. | ||
609 | */ | 539 | */ |
610 | int rv = 0; | ||
611 | |||
612 | spin_lock_irq(&conf->device_lock); | 540 | spin_lock_irq(&conf->device_lock); |
613 | 541 | ||
614 | if (conf->pending_bio_list.head) { | 542 | if (conf->pending_bio_list.head) { |
615 | struct bio *bio; | 543 | struct bio *bio; |
616 | bio = bio_list_get(&conf->pending_bio_list); | 544 | bio = bio_list_get(&conf->pending_bio_list); |
617 | blk_remove_plug(conf->mddev->queue); | ||
618 | spin_unlock_irq(&conf->device_lock); | 545 | spin_unlock_irq(&conf->device_lock); |
619 | /* flush any pending bitmap writes to | 546 | /* flush any pending bitmap writes to |
620 | * disk before proceeding w/ I/O */ | 547 | * disk before proceeding w/ I/O */ |
@@ -626,10 +553,8 @@ static int flush_pending_writes(conf_t *conf) | |||
626 | generic_make_request(bio); | 553 | generic_make_request(bio); |
627 | bio = next; | 554 | bio = next; |
628 | } | 555 | } |
629 | rv = 1; | ||
630 | } else | 556 | } else |
631 | spin_unlock_irq(&conf->device_lock); | 557 | spin_unlock_irq(&conf->device_lock); |
632 | return rv; | ||
633 | } | 558 | } |
634 | 559 | ||
635 | /* Barriers.... | 560 | /* Barriers.... |
@@ -661,17 +586,15 @@ static void raise_barrier(conf_t *conf) | |||
661 | 586 | ||
662 | /* Wait until no block IO is waiting */ | 587 | /* Wait until no block IO is waiting */ |
663 | wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, | 588 | wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, |
664 | conf->resync_lock, | 589 | conf->resync_lock, ); |
665 | raid1_unplug(conf->mddev->queue)); | ||
666 | 590 | ||
667 | /* block any new IO from starting */ | 591 | /* block any new IO from starting */ |
668 | conf->barrier++; | 592 | conf->barrier++; |
669 | 593 | ||
670 | /* No wait for all pending IO to complete */ | 594 | /* Now wait for all pending IO to complete */ |
671 | wait_event_lock_irq(conf->wait_barrier, | 595 | wait_event_lock_irq(conf->wait_barrier, |
672 | !conf->nr_pending && conf->barrier < RESYNC_DEPTH, | 596 | !conf->nr_pending && conf->barrier < RESYNC_DEPTH, |
673 | conf->resync_lock, | 597 | conf->resync_lock, ); |
674 | raid1_unplug(conf->mddev->queue)); | ||
675 | 598 | ||
676 | spin_unlock_irq(&conf->resync_lock); | 599 | spin_unlock_irq(&conf->resync_lock); |
677 | } | 600 | } |
@@ -693,7 +616,7 @@ static void wait_barrier(conf_t *conf) | |||
693 | conf->nr_waiting++; | 616 | conf->nr_waiting++; |
694 | wait_event_lock_irq(conf->wait_barrier, !conf->barrier, | 617 | wait_event_lock_irq(conf->wait_barrier, !conf->barrier, |
695 | conf->resync_lock, | 618 | conf->resync_lock, |
696 | raid1_unplug(conf->mddev->queue)); | 619 | ); |
697 | conf->nr_waiting--; | 620 | conf->nr_waiting--; |
698 | } | 621 | } |
699 | conf->nr_pending++; | 622 | conf->nr_pending++; |
@@ -729,8 +652,7 @@ static void freeze_array(conf_t *conf) | |||
729 | wait_event_lock_irq(conf->wait_barrier, | 652 | wait_event_lock_irq(conf->wait_barrier, |
730 | conf->nr_pending == conf->nr_queued+1, | 653 | conf->nr_pending == conf->nr_queued+1, |
731 | conf->resync_lock, | 654 | conf->resync_lock, |
732 | ({ flush_pending_writes(conf); | 655 | flush_pending_writes(conf)); |
733 | raid1_unplug(conf->mddev->queue); })); | ||
734 | spin_unlock_irq(&conf->resync_lock); | 656 | spin_unlock_irq(&conf->resync_lock); |
735 | } | 657 | } |
736 | static void unfreeze_array(conf_t *conf) | 658 | static void unfreeze_array(conf_t *conf) |
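The barrier machinery itself is untouched here; only the unplug call in the last wait_event_lock_irq() argument disappears, since the block layer no longer needs explicit unplugging. For reference, a simplified pthread model of the raise_barrier()/wait_barrier() counters (lower_barrier() and allow_barrier(), not shown, would decrement the counters and broadcast):

/* Pthread model of the resync barrier counters; illustrative only, the
 * real code uses resync_lock, wait_barrier and wait_event_lock_irq(). */
#include <pthread.h>

#define RESYNC_DEPTH 32

struct barrier_state {
    pthread_mutex_t lock;
    pthread_cond_t wait;
    int nr_pending;    /* normal I/O in flight */
    int nr_waiting;    /* normal I/O blocked behind a barrier */
    int barrier;       /* active resync requests */
};

static void raise_barrier(struct barrier_state *b)
{
    pthread_mutex_lock(&b->lock);
    while (b->nr_waiting)               /* wait until no block I/O is waiting */
        pthread_cond_wait(&b->wait, &b->lock);
    b->barrier++;                       /* block any new I/O from starting */
    while (b->nr_pending || b->barrier >= RESYNC_DEPTH)
        pthread_cond_wait(&b->wait, &b->lock);
    pthread_mutex_unlock(&b->lock);
}

static void wait_barrier(struct barrier_state *b)
{
    pthread_mutex_lock(&b->lock);
    if (b->barrier) {
        b->nr_waiting++;
        while (b->barrier)
            pthread_cond_wait(&b->wait, &b->lock);
        b->nr_waiting--;
    }
    b->nr_pending++;
    pthread_mutex_unlock(&b->lock);
}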
@@ -744,15 +666,16 @@ static void unfreeze_array(conf_t *conf) | |||
744 | } | 666 | } |
745 | 667 | ||
746 | 668 | ||
747 | /* duplicate the data pages for behind I/O */ | 669 | /* duplicate the data pages for behind I/O |
748 | static struct page **alloc_behind_pages(struct bio *bio) | 670 | */ |
671 | static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio) | ||
749 | { | 672 | { |
750 | int i; | 673 | int i; |
751 | struct bio_vec *bvec; | 674 | struct bio_vec *bvec; |
752 | struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *), | 675 | struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*), |
753 | GFP_NOIO); | 676 | GFP_NOIO); |
754 | if (unlikely(!pages)) | 677 | if (unlikely(!pages)) |
755 | goto do_sync_io; | 678 | return; |
756 | 679 | ||
757 | bio_for_each_segment(bvec, bio, i) { | 680 | bio_for_each_segment(bvec, bio, i) { |
758 | pages[i] = alloc_page(GFP_NOIO); | 681 | pages[i] = alloc_page(GFP_NOIO); |
@@ -763,16 +686,17 @@ static struct page **alloc_behind_pages(struct bio *bio) | |||
763 | kunmap(pages[i]); | 686 | kunmap(pages[i]); |
764 | kunmap(bvec->bv_page); | 687 | kunmap(bvec->bv_page); |
765 | } | 688 | } |
766 | 689 | r1_bio->behind_pages = pages; | |
767 | return pages; | 690 | r1_bio->behind_page_count = bio->bi_vcnt; |
691 | set_bit(R1BIO_BehindIO, &r1_bio->state); | ||
692 | return; | ||
768 | 693 | ||
769 | do_sync_io: | 694 | do_sync_io: |
770 | if (pages) | 695 | for (i = 0; i < bio->bi_vcnt; i++) |
771 | for (i = 0; i < bio->bi_vcnt && pages[i]; i++) | 696 | if (pages[i]) |
772 | put_page(pages[i]); | 697 | put_page(pages[i]); |
773 | kfree(pages); | 698 | kfree(pages); |
774 | PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); | 699 | PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); |
775 | return NULL; | ||
776 | } | 700 | } |
777 | 701 | ||
778 | static int make_request(mddev_t *mddev, struct bio * bio) | 702 | static int make_request(mddev_t *mddev, struct bio * bio) |
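alloc_behind_pages() now stashes the duplicated pages in the r1_bio itself (behind_pages and behind_page_count) rather than returning an array, which lets r1_bio_write_done() free them when the last write completes. A hedged userspace sketch of the duplicate-and-attach step, with illustrative names:

/* Userspace sketch of the behind-write page duplication: copy each source
 * buffer and attach the copies to the request so completion can free them.
 * Names are illustrative, not the kernel's bio/r1_bio fields. */
#include <stdlib.h>
#include <string.h>

struct request {
    void **behind_pages;
    int behind_page_count;
};

static void alloc_behind_copies(struct request *req,
                                void **src, size_t *len, int count)
{
    void **pages = calloc(count, sizeof(*pages));
    if (!pages)
        return;                 /* fall back to a normal (sync) write */

    for (int i = 0; i < count; i++) {
        pages[i] = malloc(len[i]);
        if (!pages[i])
            goto fail;
        memcpy(pages[i], src[i], len[i]);
    }
    req->behind_pages = pages;  /* freed when the last write completes */
    req->behind_page_count = count;
    return;

fail:
    for (int i = 0; i < count; i++)
        free(pages[i]);
    free(pages);
}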
@@ -784,20 +708,16 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
784 | int i, targets = 0, disks; | 708 | int i, targets = 0, disks; |
785 | struct bitmap *bitmap; | 709 | struct bitmap *bitmap; |
786 | unsigned long flags; | 710 | unsigned long flags; |
787 | struct bio_list bl; | ||
788 | struct page **behind_pages = NULL; | ||
789 | const int rw = bio_data_dir(bio); | 711 | const int rw = bio_data_dir(bio); |
790 | const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); | 712 | const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); |
791 | unsigned long do_barriers; | 713 | const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); |
792 | mdk_rdev_t *blocked_rdev; | 714 | mdk_rdev_t *blocked_rdev; |
715 | int plugged; | ||
793 | 716 | ||
794 | /* | 717 | /* |
795 | * Register the new request and wait if the reconstruction | 718 | * Register the new request and wait if the reconstruction |
796 | * thread has put up a bar for new requests. | 719 | * thread has put up a bar for new requests. |
797 | * Continue immediately if no resync is active currently. | 720 | * Continue immediately if no resync is active currently. |
798 | * We test barriers_work *after* md_write_start as md_write_start | ||
799 | * may cause the first superblock write, and that will check out | ||
800 | * if barriers work. | ||
801 | */ | 721 | */ |
802 | 722 | ||
803 | md_write_start(mddev, bio); /* wait on superblock update early */ | 723 | md_write_start(mddev, bio); /* wait on superblock update early */ |
@@ -821,13 +741,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
821 | } | 741 | } |
822 | finish_wait(&conf->wait_barrier, &w); | 742 | finish_wait(&conf->wait_barrier, &w); |
823 | } | 743 | } |
824 | if (unlikely(!mddev->barriers_work && | ||
825 | (bio->bi_rw & REQ_HARDBARRIER))) { | ||
826 | if (rw == WRITE) | ||
827 | md_write_end(mddev); | ||
828 | bio_endio(bio, -EOPNOTSUPP); | ||
829 | return 0; | ||
830 | } | ||
831 | 744 | ||
832 | wait_barrier(conf); | 745 | wait_barrier(conf); |
833 | 746 | ||
@@ -870,7 +783,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
870 | } | 783 | } |
871 | r1_bio->read_disk = rdisk; | 784 | r1_bio->read_disk = rdisk; |
872 | 785 | ||
873 | read_bio = bio_clone(bio, GFP_NOIO); | 786 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
874 | 787 | ||
875 | r1_bio->bios[rdisk] = read_bio; | 788 | r1_bio->bios[rdisk] = read_bio; |
876 | 789 | ||
@@ -891,14 +804,9 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
891 | * inc refcount on their rdev. Record them by setting | 804 | * inc refcount on their rdev. Record them by setting |
892 | * bios[x] to bio | 805 | * bios[x] to bio |
893 | */ | 806 | */ |
807 | plugged = mddev_check_plugged(mddev); | ||
808 | |||
894 | disks = conf->raid_disks; | 809 | disks = conf->raid_disks; |
895 | #if 0 | ||
896 | { static int first=1; | ||
897 | if (first) printk("First Write sector %llu disks %d\n", | ||
898 | (unsigned long long)r1_bio->sector, disks); | ||
899 | first = 0; | ||
900 | } | ||
901 | #endif | ||
902 | retry_write: | 810 | retry_write: |
903 | blocked_rdev = NULL; | 811 | blocked_rdev = NULL; |
904 | rcu_read_lock(); | 812 | rcu_read_lock(); |
@@ -952,33 +860,29 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
952 | if (bitmap && | 860 | if (bitmap && |
953 | (atomic_read(&bitmap->behind_writes) | 861 | (atomic_read(&bitmap->behind_writes) |
954 | < mddev->bitmap_info.max_write_behind) && | 862 | < mddev->bitmap_info.max_write_behind) && |
955 | !waitqueue_active(&bitmap->behind_wait) && | 863 | !waitqueue_active(&bitmap->behind_wait)) |
956 | (behind_pages = alloc_behind_pages(bio)) != NULL) | 864 | alloc_behind_pages(bio, r1_bio); |
957 | set_bit(R1BIO_BehindIO, &r1_bio->state); | ||
958 | 865 | ||
959 | atomic_set(&r1_bio->remaining, 0); | 866 | atomic_set(&r1_bio->remaining, 1); |
960 | atomic_set(&r1_bio->behind_remaining, 0); | 867 | atomic_set(&r1_bio->behind_remaining, 0); |
961 | 868 | ||
962 | do_barriers = bio->bi_rw & REQ_HARDBARRIER; | 869 | bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, |
963 | if (do_barriers) | 870 | test_bit(R1BIO_BehindIO, &r1_bio->state)); |
964 | set_bit(R1BIO_Barrier, &r1_bio->state); | ||
965 | |||
966 | bio_list_init(&bl); | ||
967 | for (i = 0; i < disks; i++) { | 871 | for (i = 0; i < disks; i++) { |
968 | struct bio *mbio; | 872 | struct bio *mbio; |
969 | if (!r1_bio->bios[i]) | 873 | if (!r1_bio->bios[i]) |
970 | continue; | 874 | continue; |
971 | 875 | ||
972 | mbio = bio_clone(bio, GFP_NOIO); | 876 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
973 | r1_bio->bios[i] = mbio; | 877 | r1_bio->bios[i] = mbio; |
974 | 878 | ||
975 | mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; | 879 | mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; |
976 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; | 880 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; |
977 | mbio->bi_end_io = raid1_end_write_request; | 881 | mbio->bi_end_io = raid1_end_write_request; |
978 | mbio->bi_rw = WRITE | do_barriers | do_sync; | 882 | mbio->bi_rw = WRITE | do_flush_fua | do_sync; |
979 | mbio->bi_private = r1_bio; | 883 | mbio->bi_private = r1_bio; |
980 | 884 | ||
981 | if (behind_pages) { | 885 | if (r1_bio->behind_pages) { |
982 | struct bio_vec *bvec; | 886 | struct bio_vec *bvec; |
983 | int j; | 887 | int j; |
984 | 888 | ||
@@ -986,39 +890,27 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
986 | * we clear any unused pointer in the io_vec, rather | 890 | * we clear any unused pointer in the io_vec, rather |
987 | * than leave them unchanged. This is important | 891 | * than leave them unchanged. This is important |
988 | * because when we come to free the pages, we won't | 892 | * because when we come to free the pages, we won't |
989 | * know the originial bi_idx, so we just free | 893 | * know the original bi_idx, so we just free |
990 | * them all | 894 | * them all |
991 | */ | 895 | */ |
992 | __bio_for_each_segment(bvec, mbio, j, 0) | 896 | __bio_for_each_segment(bvec, mbio, j, 0) |
993 | bvec->bv_page = behind_pages[j]; | 897 | bvec->bv_page = r1_bio->behind_pages[j]; |
994 | if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) | 898 | if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) |
995 | atomic_inc(&r1_bio->behind_remaining); | 899 | atomic_inc(&r1_bio->behind_remaining); |
996 | } | 900 | } |
997 | 901 | ||
998 | atomic_inc(&r1_bio->remaining); | 902 | atomic_inc(&r1_bio->remaining); |
999 | 903 | spin_lock_irqsave(&conf->device_lock, flags); | |
1000 | bio_list_add(&bl, mbio); | 904 | bio_list_add(&conf->pending_bio_list, mbio); |
905 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
1001 | } | 906 | } |
1002 | kfree(behind_pages); /* the behind pages are attached to the bios now */ | 907 | r1_bio_write_done(r1_bio); |
1003 | |||
1004 | bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, | ||
1005 | test_bit(R1BIO_BehindIO, &r1_bio->state)); | ||
1006 | spin_lock_irqsave(&conf->device_lock, flags); | ||
1007 | bio_list_merge(&conf->pending_bio_list, &bl); | ||
1008 | bio_list_init(&bl); | ||
1009 | 908 | ||
1010 | blk_plug_device(mddev->queue); | 909 | /* In case raid1d snuck in to freeze_array */ |
1011 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
1012 | |||
1013 | /* In case raid1d snuck into freeze_array */ | ||
1014 | wake_up(&conf->wait_barrier); | 910 | wake_up(&conf->wait_barrier); |
1015 | 911 | ||
1016 | if (do_sync) | 912 | if (do_sync || !bitmap || !plugged) |
1017 | md_wakeup_thread(mddev->thread); | 913 | md_wakeup_thread(mddev->thread); |
1018 | #if 0 | ||
1019 | while ((bio = bio_list_pop(&bl)) != NULL) | ||
1020 | generic_make_request(bio); | ||
1021 | #endif | ||
1022 | 914 | ||
1023 | return 0; | 915 | return 0; |
1024 | } | 916 | } |
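The write path now adds every cloned mirror bio to conf->pending_bio_list under device_lock and leaves submission to flush_pending_writes(), which runs after the bitmap has been written out, rather than plugging the device queue by hand. A simplified pthread model of that producer/drainer split (the node struct and printf stand in for bios and generic_make_request()):

/* Pthread model of the deferred write submission used here: writers queue
 * cloned bios under a lock, a single daemon drains and submits them after
 * the bitmap update.  Illustrative only. */
#include <pthread.h>
#include <stdio.h>

struct node { struct node *next; int id; };

static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *pending_head;

static void queue_write(struct node *n)
{
    pthread_mutex_lock(&pending_lock);
    n->next = pending_head;     /* the kernel uses a FIFO bio_list */
    pending_head = n;
    pthread_mutex_unlock(&pending_lock);
}

static void flush_pending_writes(void)
{
    pthread_mutex_lock(&pending_lock);
    struct node *n = pending_head;      /* grab the whole list at once */
    pending_head = NULL;
    pthread_mutex_unlock(&pending_lock);

    /* bitmap_unplug() would run here, before any data write is issued */
    while (n) {
        printf("submitting queued write %d\n", n->id);
        n = n->next;
    }
}

int main(void)
{
    struct node a = { .id = 1 }, b = { .id = 2 };
    queue_write(&a);
    queue_write(&b);
    flush_pending_writes();
    return 0;
}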
@@ -1076,8 +968,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1076 | } else | 968 | } else |
1077 | set_bit(Faulty, &rdev->flags); | 969 | set_bit(Faulty, &rdev->flags); |
1078 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 970 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
1079 | printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" | 971 | printk(KERN_ALERT |
1080 | KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n", | 972 | "md/raid1:%s: Disk failure on %s, disabling device.\n" |
973 | "md/raid1:%s: Operation continuing on %d devices.\n", | ||
1081 | mdname(mddev), bdevname(rdev->bdev, b), | 974 | mdname(mddev), bdevname(rdev->bdev, b), |
1082 | mdname(mddev), conf->raid_disks - mddev->degraded); | 975 | mdname(mddev), conf->raid_disks - mddev->degraded); |
1083 | } | 976 | } |
@@ -1206,10 +1099,11 @@ static int raid1_remove_disk(mddev_t *mddev, int number) | |||
1206 | err = -EBUSY; | 1099 | err = -EBUSY; |
1207 | goto abort; | 1100 | goto abort; |
1208 | } | 1101 | } |
1209 | /* Only remove non-faulty devices is recovery | 1102 | /* Only remove non-faulty devices if recovery |
1210 | * is not possible. | 1103 | * is not possible. |
1211 | */ | 1104 | */ |
1212 | if (!test_bit(Faulty, &rdev->flags) && | 1105 | if (!test_bit(Faulty, &rdev->flags) && |
1106 | !mddev->recovery_disabled && | ||
1213 | mddev->degraded < conf->raid_disks) { | 1107 | mddev->degraded < conf->raid_disks) { |
1214 | err = -EBUSY; | 1108 | err = -EBUSY; |
1215 | goto abort; | 1109 | goto abort; |
@@ -1222,7 +1116,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number) | |||
1222 | p->rdev = rdev; | 1116 | p->rdev = rdev; |
1223 | goto abort; | 1117 | goto abort; |
1224 | } | 1118 | } |
1225 | md_integrity_register(mddev); | 1119 | err = md_integrity_register(mddev); |
1226 | } | 1120 | } |
1227 | abort: | 1121 | abort: |
1228 | 1122 | ||
@@ -1268,7 +1162,7 @@ static void end_sync_write(struct bio *bio, int error) | |||
1268 | break; | 1162 | break; |
1269 | } | 1163 | } |
1270 | if (!uptodate) { | 1164 | if (!uptodate) { |
1271 | int sync_blocks = 0; | 1165 | sector_t sync_blocks = 0; |
1272 | sector_t s = r1_bio->sector; | 1166 | sector_t s = r1_bio->sector; |
1273 | long sectors_to_go = r1_bio->sectors; | 1167 | long sectors_to_go = r1_bio->sectors; |
1274 | /* make sure these bits don't get cleared. */ | 1168 | /* make sure these bits don't get cleared. */ |
@@ -1290,194 +1184,210 @@ static void end_sync_write(struct bio *bio, int error) | |||
1290 | } | 1184 | } |
1291 | } | 1185 | } |
1292 | 1186 | ||
1293 | static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) | 1187 | static int fix_sync_read_error(r1bio_t *r1_bio) |
1294 | { | 1188 | { |
1189 | /* Try some synchronous reads of other devices to get | ||
1190 | * good data, much like with normal read errors. Only | ||
1191 | * read into the pages we already have so we don't | ||
1192 | * need to re-issue the read request. | ||
1193 | * We don't need to freeze the array, because being in an | ||
1194 | * active sync request, there is no normal IO, and | ||
1195 | * no overlapping syncs. | ||
1196 | */ | ||
1197 | mddev_t *mddev = r1_bio->mddev; | ||
1295 | conf_t *conf = mddev->private; | 1198 | conf_t *conf = mddev->private; |
1296 | int i; | 1199 | struct bio *bio = r1_bio->bios[r1_bio->read_disk]; |
1297 | int disks = conf->raid_disks; | 1200 | sector_t sect = r1_bio->sector; |
1298 | struct bio *bio, *wbio; | 1201 | int sectors = r1_bio->sectors; |
1299 | 1202 | int idx = 0; | |
1300 | bio = r1_bio->bios[r1_bio->read_disk]; | ||
1301 | 1203 | ||
1204 | while(sectors) { | ||
1205 | int s = sectors; | ||
1206 | int d = r1_bio->read_disk; | ||
1207 | int success = 0; | ||
1208 | mdk_rdev_t *rdev; | ||
1209 | int start; | ||
1302 | 1210 | ||
1303 | if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { | 1211 | if (s > (PAGE_SIZE>>9)) |
1304 | /* We have read all readable devices. If we haven't | 1212 | s = PAGE_SIZE >> 9; |
1305 | * got the block, then there is no hope left. | 1213 | do { |
1306 | * If we have, then we want to do a comparison | 1214 | if (r1_bio->bios[d]->bi_end_io == end_sync_read) { |
1307 | * and skip the write if everything is the same. | 1215 | /* No rcu protection needed here devices |
1308 | * If any blocks failed to read, then we need to | 1216 | * can only be removed when no resync is |
1309 | * attempt an over-write | 1217 | * active, and resync is currently active |
1310 | */ | 1218 | */ |
1311 | int primary; | 1219 | rdev = conf->mirrors[d].rdev; |
1312 | if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { | 1220 | if (sync_page_io(rdev, |
1313 | for (i=0; i<mddev->raid_disks; i++) | 1221 | sect, |
1314 | if (r1_bio->bios[i]->bi_end_io == end_sync_read) | 1222 | s<<9, |
1315 | md_error(mddev, conf->mirrors[i].rdev); | 1223 | bio->bi_io_vec[idx].bv_page, |
1224 | READ, false)) { | ||
1225 | success = 1; | ||
1226 | break; | ||
1227 | } | ||
1228 | } | ||
1229 | d++; | ||
1230 | if (d == conf->raid_disks) | ||
1231 | d = 0; | ||
1232 | } while (!success && d != r1_bio->read_disk); | ||
1316 | 1233 | ||
1317 | md_done_sync(mddev, r1_bio->sectors, 1); | 1234 | if (!success) { |
1235 | char b[BDEVNAME_SIZE]; | ||
1236 | /* Cannot read from anywhere, array is toast */ | ||
1237 | md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); | ||
1238 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" | ||
1239 | " for block %llu\n", | ||
1240 | mdname(mddev), | ||
1241 | bdevname(bio->bi_bdev, b), | ||
1242 | (unsigned long long)r1_bio->sector); | ||
1243 | md_done_sync(mddev, r1_bio->sectors, 0); | ||
1318 | put_buf(r1_bio); | 1244 | put_buf(r1_bio); |
1319 | return; | 1245 | return 0; |
1320 | } | 1246 | } |
1321 | for (primary=0; primary<mddev->raid_disks; primary++) | ||
1322 | if (r1_bio->bios[primary]->bi_end_io == end_sync_read && | ||
1323 | test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { | ||
1324 | r1_bio->bios[primary]->bi_end_io = NULL; | ||
1325 | rdev_dec_pending(conf->mirrors[primary].rdev, mddev); | ||
1326 | break; | ||
1327 | } | ||
1328 | r1_bio->read_disk = primary; | ||
1329 | for (i=0; i<mddev->raid_disks; i++) | ||
1330 | if (r1_bio->bios[i]->bi_end_io == end_sync_read) { | ||
1331 | int j; | ||
1332 | int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); | ||
1333 | struct bio *pbio = r1_bio->bios[primary]; | ||
1334 | struct bio *sbio = r1_bio->bios[i]; | ||
1335 | |||
1336 | if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { | ||
1337 | for (j = vcnt; j-- ; ) { | ||
1338 | struct page *p, *s; | ||
1339 | p = pbio->bi_io_vec[j].bv_page; | ||
1340 | s = sbio->bi_io_vec[j].bv_page; | ||
1341 | if (memcmp(page_address(p), | ||
1342 | page_address(s), | ||
1343 | PAGE_SIZE)) | ||
1344 | break; | ||
1345 | } | ||
1346 | } else | ||
1347 | j = 0; | ||
1348 | if (j >= 0) | ||
1349 | mddev->resync_mismatches += r1_bio->sectors; | ||
1350 | if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) | ||
1351 | && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { | ||
1352 | sbio->bi_end_io = NULL; | ||
1353 | rdev_dec_pending(conf->mirrors[i].rdev, mddev); | ||
1354 | } else { | ||
1355 | /* fixup the bio for reuse */ | ||
1356 | int size; | ||
1357 | sbio->bi_vcnt = vcnt; | ||
1358 | sbio->bi_size = r1_bio->sectors << 9; | ||
1359 | sbio->bi_idx = 0; | ||
1360 | sbio->bi_phys_segments = 0; | ||
1361 | sbio->bi_flags &= ~(BIO_POOL_MASK - 1); | ||
1362 | sbio->bi_flags |= 1 << BIO_UPTODATE; | ||
1363 | sbio->bi_next = NULL; | ||
1364 | sbio->bi_sector = r1_bio->sector + | ||
1365 | conf->mirrors[i].rdev->data_offset; | ||
1366 | sbio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
1367 | size = sbio->bi_size; | ||
1368 | for (j = 0; j < vcnt ; j++) { | ||
1369 | struct bio_vec *bi; | ||
1370 | bi = &sbio->bi_io_vec[j]; | ||
1371 | bi->bv_offset = 0; | ||
1372 | if (size > PAGE_SIZE) | ||
1373 | bi->bv_len = PAGE_SIZE; | ||
1374 | else | ||
1375 | bi->bv_len = size; | ||
1376 | size -= PAGE_SIZE; | ||
1377 | memcpy(page_address(bi->bv_page), | ||
1378 | page_address(pbio->bi_io_vec[j].bv_page), | ||
1379 | PAGE_SIZE); | ||
1380 | } | ||
1381 | 1247 | ||
1382 | } | 1248 | start = d; |
1383 | } | 1249 | /* write it back and re-read */ |
1250 | while (d != r1_bio->read_disk) { | ||
1251 | if (d == 0) | ||
1252 | d = conf->raid_disks; | ||
1253 | d--; | ||
1254 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) | ||
1255 | continue; | ||
1256 | rdev = conf->mirrors[d].rdev; | ||
1257 | if (sync_page_io(rdev, | ||
1258 | sect, | ||
1259 | s<<9, | ||
1260 | bio->bi_io_vec[idx].bv_page, | ||
1261 | WRITE, false) == 0) { | ||
1262 | r1_bio->bios[d]->bi_end_io = NULL; | ||
1263 | rdev_dec_pending(rdev, mddev); | ||
1264 | md_error(mddev, rdev); | ||
1265 | } else | ||
1266 | atomic_add(s, &rdev->corrected_errors); | ||
1267 | } | ||
1268 | d = start; | ||
1269 | while (d != r1_bio->read_disk) { | ||
1270 | if (d == 0) | ||
1271 | d = conf->raid_disks; | ||
1272 | d--; | ||
1273 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) | ||
1274 | continue; | ||
1275 | rdev = conf->mirrors[d].rdev; | ||
1276 | if (sync_page_io(rdev, | ||
1277 | sect, | ||
1278 | s<<9, | ||
1279 | bio->bi_io_vec[idx].bv_page, | ||
1280 | READ, false) == 0) | ||
1281 | md_error(mddev, rdev); | ||
1282 | } | ||
1283 | sectors -= s; | ||
1284 | sect += s; | ||
1285 | idx ++; | ||
1384 | } | 1286 | } |
1385 | if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { | 1287 | set_bit(R1BIO_Uptodate, &r1_bio->state); |
1386 | /* ouch - failed to read all of that. | 1288 | set_bit(BIO_UPTODATE, &bio->bi_flags); |
1387 | * Try some synchronous reads of other devices to get | 1289 | return 1; |
1388 | * good data, much like with normal read errors. Only | 1290 | } |
1389 | * read into the pages we already have so we don't | 1291 | |
1390 | * need to re-issue the read request. | 1292 | static int process_checks(r1bio_t *r1_bio) |
1391 | * We don't need to freeze the array, because being in an | 1293 | { |
1392 | * active sync request, there is no normal IO, and | 1294 | /* We have read all readable devices. If we haven't |
1393 | * no overlapping syncs. | 1295 | * got the block, then there is no hope left. |
1394 | */ | 1296 | * If we have, then we want to do a comparison |
1395 | sector_t sect = r1_bio->sector; | 1297 | * and skip the write if everything is the same. |
1396 | int sectors = r1_bio->sectors; | 1298 | * If any blocks failed to read, then we need to |
1397 | int idx = 0; | 1299 | * attempt an over-write |
1398 | 1300 | */ | |
1399 | while(sectors) { | 1301 | mddev_t *mddev = r1_bio->mddev; |
1400 | int s = sectors; | 1302 | conf_t *conf = mddev->private; |
1401 | int d = r1_bio->read_disk; | 1303 | int primary; |
1402 | int success = 0; | 1304 | int i; |
1403 | mdk_rdev_t *rdev; | 1305 | |
1404 | 1306 | for (primary = 0; primary < conf->raid_disks; primary++) | |
1405 | if (s > (PAGE_SIZE>>9)) | 1307 | if (r1_bio->bios[primary]->bi_end_io == end_sync_read && |
1406 | s = PAGE_SIZE >> 9; | 1308 | test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { |
1407 | do { | 1309 | r1_bio->bios[primary]->bi_end_io = NULL; |
1408 | if (r1_bio->bios[d]->bi_end_io == end_sync_read) { | 1310 | rdev_dec_pending(conf->mirrors[primary].rdev, mddev); |
1409 | /* No rcu protection needed here devices | 1311 | break; |
1410 | * can only be removed when no resync is | 1312 | } |
1411 | * active, and resync is currently active | 1313 | r1_bio->read_disk = primary; |
1412 | */ | 1314 | for (i = 0; i < conf->raid_disks; i++) { |
1413 | rdev = conf->mirrors[d].rdev; | 1315 | int j; |
1414 | if (sync_page_io(rdev->bdev, | 1316 | int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); |
1415 | sect + rdev->data_offset, | 1317 | struct bio *pbio = r1_bio->bios[primary]; |
1416 | s<<9, | 1318 | struct bio *sbio = r1_bio->bios[i]; |
1417 | bio->bi_io_vec[idx].bv_page, | 1319 | int size; |
1418 | READ)) { | 1320 | |
1419 | success = 1; | 1321 | if (r1_bio->bios[i]->bi_end_io != end_sync_read) |
1420 | break; | 1322 | continue; |
1421 | } | 1323 | |
1422 | } | 1324 | if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { |
1423 | d++; | 1325 | for (j = vcnt; j-- ; ) { |
1424 | if (d == conf->raid_disks) | 1326 | struct page *p, *s; |
1425 | d = 0; | 1327 | p = pbio->bi_io_vec[j].bv_page; |
1426 | } while (!success && d != r1_bio->read_disk); | 1328 | s = sbio->bi_io_vec[j].bv_page; |
1427 | 1329 | if (memcmp(page_address(p), | |
1428 | if (success) { | 1330 | page_address(s), |
1429 | int start = d; | 1331 | PAGE_SIZE)) |
1430 | /* write it back and re-read */ | 1332 | break; |
1431 | set_bit(R1BIO_Uptodate, &r1_bio->state); | ||
1432 | while (d != r1_bio->read_disk) { | ||
1433 | if (d == 0) | ||
1434 | d = conf->raid_disks; | ||
1435 | d--; | ||
1436 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) | ||
1437 | continue; | ||
1438 | rdev = conf->mirrors[d].rdev; | ||
1439 | atomic_add(s, &rdev->corrected_errors); | ||
1440 | if (sync_page_io(rdev->bdev, | ||
1441 | sect + rdev->data_offset, | ||
1442 | s<<9, | ||
1443 | bio->bi_io_vec[idx].bv_page, | ||
1444 | WRITE) == 0) | ||
1445 | md_error(mddev, rdev); | ||
1446 | } | ||
1447 | d = start; | ||
1448 | while (d != r1_bio->read_disk) { | ||
1449 | if (d == 0) | ||
1450 | d = conf->raid_disks; | ||
1451 | d--; | ||
1452 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) | ||
1453 | continue; | ||
1454 | rdev = conf->mirrors[d].rdev; | ||
1455 | if (sync_page_io(rdev->bdev, | ||
1456 | sect + rdev->data_offset, | ||
1457 | s<<9, | ||
1458 | bio->bi_io_vec[idx].bv_page, | ||
1459 | READ) == 0) | ||
1460 | md_error(mddev, rdev); | ||
1461 | } | ||
1462 | } else { | ||
1463 | char b[BDEVNAME_SIZE]; | ||
1464 | /* Cannot read from anywhere, array is toast */ | ||
1465 | md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); | ||
1466 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" | ||
1467 | " for block %llu\n", | ||
1468 | mdname(mddev), | ||
1469 | bdevname(bio->bi_bdev, b), | ||
1470 | (unsigned long long)r1_bio->sector); | ||
1471 | md_done_sync(mddev, r1_bio->sectors, 0); | ||
1472 | put_buf(r1_bio); | ||
1473 | return; | ||
1474 | } | 1333 | } |
1475 | sectors -= s; | 1334 | } else |
1476 | sect += s; | 1335 | j = 0; |
1477 | idx ++; | 1336 | if (j >= 0) |
1337 | mddev->resync_mismatches += r1_bio->sectors; | ||
1338 | if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) | ||
1339 | && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { | ||
1340 | /* No need to write to this device. */ | ||
1341 | sbio->bi_end_io = NULL; | ||
1342 | rdev_dec_pending(conf->mirrors[i].rdev, mddev); | ||
1343 | continue; | ||
1344 | } | ||
1345 | /* fixup the bio for reuse */ | ||
1346 | sbio->bi_vcnt = vcnt; | ||
1347 | sbio->bi_size = r1_bio->sectors << 9; | ||
1348 | sbio->bi_idx = 0; | ||
1349 | sbio->bi_phys_segments = 0; | ||
1350 | sbio->bi_flags &= ~(BIO_POOL_MASK - 1); | ||
1351 | sbio->bi_flags |= 1 << BIO_UPTODATE; | ||
1352 | sbio->bi_next = NULL; | ||
1353 | sbio->bi_sector = r1_bio->sector + | ||
1354 | conf->mirrors[i].rdev->data_offset; | ||
1355 | sbio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
1356 | size = sbio->bi_size; | ||
1357 | for (j = 0; j < vcnt ; j++) { | ||
1358 | struct bio_vec *bi; | ||
1359 | bi = &sbio->bi_io_vec[j]; | ||
1360 | bi->bv_offset = 0; | ||
1361 | if (size > PAGE_SIZE) | ||
1362 | bi->bv_len = PAGE_SIZE; | ||
1363 | else | ||
1364 | bi->bv_len = size; | ||
1365 | size -= PAGE_SIZE; | ||
1366 | memcpy(page_address(bi->bv_page), | ||
1367 | page_address(pbio->bi_io_vec[j].bv_page), | ||
1368 | PAGE_SIZE); | ||
1478 | } | 1369 | } |
1479 | } | 1370 | } |
1371 | return 0; | ||
1372 | } | ||
1373 | |||
1374 | static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) | ||
1375 | { | ||
1376 | conf_t *conf = mddev->private; | ||
1377 | int i; | ||
1378 | int disks = conf->raid_disks; | ||
1379 | struct bio *bio, *wbio; | ||
1380 | |||
1381 | bio = r1_bio->bios[r1_bio->read_disk]; | ||
1480 | 1382 | ||
1383 | if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) | ||
1384 | /* ouch - failed to read all of that. */ | ||
1385 | if (!fix_sync_read_error(r1_bio)) | ||
1386 | return; | ||
1387 | |||
1388 | if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) | ||
1389 | if (process_checks(r1_bio) < 0) | ||
1390 | return; | ||
1481 | /* | 1391 | /* |
1482 | * schedule writes | 1392 | * schedule writes |
1483 | */ | 1393 | */ |
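fix_sync_read_error(), factored out above, repairs a failed resync read in three passes over each chunk: read from alternate mirrors until one succeeds, write the good data back to the other members, then re-read from them to verify. A simplified userspace model with stubbed device I/O (error handling such as md_error() is omitted):

/* Userspace model of the fix_sync_read_error() sequence; dev_read() and
 * dev_write() are illustrative stubs, not kernel interfaces. */
#include <stdbool.h>
#include <stdio.h>

#define NDISKS 3

static bool dev_read(int d, long sect)  { (void)sect; return d != 0; }
static bool dev_write(int d, long sect) { (void)d; (void)sect; return true; }

static bool repair_chunk(int read_disk, long sect)
{
    int d = read_disk;
    bool ok = false;

    /* 1. try every mirror in turn, starting with the one that failed */
    do {
        if (dev_read(d, sect)) {
            ok = true;
            break;
        }
        d = (d + 1) % NDISKS;
    } while (d != read_disk);
    if (!ok)
        return false;           /* nowhere left to read from */

    /* 2. write the good data back to the other mirrors */
    for (int i = 0; i < NDISKS; i++)
        if (i != d && !dev_write(i, sect))
            printf("disk %d failed on write-back\n", i);

    /* 3. re-read what was just written to verify the repair */
    for (int i = 0; i < NDISKS; i++)
        if (i != d && !dev_read(i, sect))
            printf("disk %d failed on verify read\n", i);

    return true;
}

int main(void)
{
    printf("repair %s\n", repair_chunk(0, 4096) ? "succeeded" : "failed");
    return 0;
}

In the real code the write-back and verify loops walk backwards from the disk that supplied the data and skip any member whose bio is not an end_sync_read bio; a failure in either loop fails that member via md_error().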
@@ -1536,10 +1446,8 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
1536 | rdev = conf->mirrors[d].rdev; | 1446 | rdev = conf->mirrors[d].rdev; |
1537 | if (rdev && | 1447 | if (rdev && |
1538 | test_bit(In_sync, &rdev->flags) && | 1448 | test_bit(In_sync, &rdev->flags) && |
1539 | sync_page_io(rdev->bdev, | 1449 | sync_page_io(rdev, sect, s<<9, |
1540 | sect + rdev->data_offset, | 1450 | conf->tmppage, READ, false)) |
1541 | s<<9, | ||
1542 | conf->tmppage, READ)) | ||
1543 | success = 1; | 1451 | success = 1; |
1544 | else { | 1452 | else { |
1545 | d++; | 1453 | d++; |
@@ -1562,9 +1470,8 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
1562 | rdev = conf->mirrors[d].rdev; | 1470 | rdev = conf->mirrors[d].rdev; |
1563 | if (rdev && | 1471 | if (rdev && |
1564 | test_bit(In_sync, &rdev->flags)) { | 1472 | test_bit(In_sync, &rdev->flags)) { |
1565 | if (sync_page_io(rdev->bdev, | 1473 | if (sync_page_io(rdev, sect, s<<9, |
1566 | sect + rdev->data_offset, | 1474 | conf->tmppage, WRITE, false) |
1567 | s<<9, conf->tmppage, WRITE) | ||
1568 | == 0) | 1475 | == 0) |
1569 | /* Well, this device is dead */ | 1476 | /* Well, this device is dead */ |
1570 | md_error(mddev, rdev); | 1477 | md_error(mddev, rdev); |
@@ -1579,9 +1486,8 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
1579 | rdev = conf->mirrors[d].rdev; | 1486 | rdev = conf->mirrors[d].rdev; |
1580 | if (rdev && | 1487 | if (rdev && |
1581 | test_bit(In_sync, &rdev->flags)) { | 1488 | test_bit(In_sync, &rdev->flags)) { |
1582 | if (sync_page_io(rdev->bdev, | 1489 | if (sync_page_io(rdev, sect, s<<9, |
1583 | sect + rdev->data_offset, | 1490 | conf->tmppage, READ, false) |
1584 | s<<9, conf->tmppage, READ) | ||
1585 | == 0) | 1491 | == 0) |
1586 | /* Well, this device is dead */ | 1492 | /* Well, this device is dead */ |
1587 | md_error(mddev, rdev); | 1493 | md_error(mddev, rdev); |
@@ -1609,15 +1515,17 @@ static void raid1d(mddev_t *mddev) | |||
1609 | unsigned long flags; | 1515 | unsigned long flags; |
1610 | conf_t *conf = mddev->private; | 1516 | conf_t *conf = mddev->private; |
1611 | struct list_head *head = &conf->retry_list; | 1517 | struct list_head *head = &conf->retry_list; |
1612 | int unplug=0; | ||
1613 | mdk_rdev_t *rdev; | 1518 | mdk_rdev_t *rdev; |
1519 | struct blk_plug plug; | ||
1614 | 1520 | ||
1615 | md_check_recovery(mddev); | 1521 | md_check_recovery(mddev); |
1616 | 1522 | ||
1523 | blk_start_plug(&plug); | ||
1617 | for (;;) { | 1524 | for (;;) { |
1618 | char b[BDEVNAME_SIZE]; | 1525 | char b[BDEVNAME_SIZE]; |
1619 | 1526 | ||
1620 | unplug += flush_pending_writes(conf); | 1527 | if (atomic_read(&mddev->plug_cnt) == 0) |
1528 | flush_pending_writes(conf); | ||
1621 | 1529 | ||
1622 | spin_lock_irqsave(&conf->device_lock, flags); | 1530 | spin_lock_irqsave(&conf->device_lock, flags); |
1623 | if (list_empty(head)) { | 1531 | if (list_empty(head)) { |
@@ -1631,45 +1539,9 @@ static void raid1d(mddev_t *mddev) | |||
1631 | 1539 | ||
1632 | mddev = r1_bio->mddev; | 1540 | mddev = r1_bio->mddev; |
1633 | conf = mddev->private; | 1541 | conf = mddev->private; |
1634 | if (test_bit(R1BIO_IsSync, &r1_bio->state)) { | 1542 | if (test_bit(R1BIO_IsSync, &r1_bio->state)) |
1635 | sync_request_write(mddev, r1_bio); | 1543 | sync_request_write(mddev, r1_bio); |
1636 | unplug = 1; | 1544 | else { |
1637 | } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) { | ||
1638 | /* some requests in the r1bio were REQ_HARDBARRIER | ||
1639 | * requests which failed with -EOPNOTSUPP. Hohumm.. | ||
1640 | * Better resubmit without the barrier. | ||
1641 | * We know which devices to resubmit for, because | ||
1642 | * all others have had their bios[] entry cleared. | ||
1643 | * We already have a nr_pending reference on these rdevs. | ||
1644 | */ | ||
1645 | int i; | ||
1646 | const unsigned long do_sync = (r1_bio->master_bio->bi_rw & REQ_SYNC); | ||
1647 | clear_bit(R1BIO_BarrierRetry, &r1_bio->state); | ||
1648 | clear_bit(R1BIO_Barrier, &r1_bio->state); | ||
1649 | for (i=0; i < conf->raid_disks; i++) | ||
1650 | if (r1_bio->bios[i]) | ||
1651 | atomic_inc(&r1_bio->remaining); | ||
1652 | for (i=0; i < conf->raid_disks; i++) | ||
1653 | if (r1_bio->bios[i]) { | ||
1654 | struct bio_vec *bvec; | ||
1655 | int j; | ||
1656 | |||
1657 | bio = bio_clone(r1_bio->master_bio, GFP_NOIO); | ||
1658 | /* copy pages from the failed bio, as | ||
1659 | * this might be a write-behind device */ | ||
1660 | __bio_for_each_segment(bvec, bio, j, 0) | ||
1661 | bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page; | ||
1662 | bio_put(r1_bio->bios[i]); | ||
1663 | bio->bi_sector = r1_bio->sector + | ||
1664 | conf->mirrors[i].rdev->data_offset; | ||
1665 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
1666 | bio->bi_end_io = raid1_end_write_request; | ||
1667 | bio->bi_rw = WRITE | do_sync; | ||
1668 | bio->bi_private = r1_bio; | ||
1669 | r1_bio->bios[i] = bio; | ||
1670 | generic_make_request(bio); | ||
1671 | } | ||
1672 | } else { | ||
1673 | int disk; | 1545 | int disk; |
1674 | 1546 | ||
1675 | /* we got a read error. Maybe the drive is bad. Maybe just | 1547 | /* we got a read error. Maybe the drive is bad. Maybe just |
@@ -1704,7 +1576,8 @@ static void raid1d(mddev_t *mddev) | |||
1704 | mddev->ro ? IO_BLOCKED : NULL; | 1576 | mddev->ro ? IO_BLOCKED : NULL; |
1705 | r1_bio->read_disk = disk; | 1577 | r1_bio->read_disk = disk; |
1706 | bio_put(bio); | 1578 | bio_put(bio); |
1707 | bio = bio_clone(r1_bio->master_bio, GFP_NOIO); | 1579 | bio = bio_clone_mddev(r1_bio->master_bio, |
1580 | GFP_NOIO, mddev); | ||
1708 | r1_bio->bios[r1_bio->read_disk] = bio; | 1581 | r1_bio->bios[r1_bio->read_disk] = bio; |
1709 | rdev = conf->mirrors[disk].rdev; | 1582 | rdev = conf->mirrors[disk].rdev; |
1710 | if (printk_ratelimit()) | 1583 | if (printk_ratelimit()) |
@@ -1718,14 +1591,12 @@ static void raid1d(mddev_t *mddev) | |||
1718 | bio->bi_end_io = raid1_end_read_request; | 1591 | bio->bi_end_io = raid1_end_read_request; |
1719 | bio->bi_rw = READ | do_sync; | 1592 | bio->bi_rw = READ | do_sync; |
1720 | bio->bi_private = r1_bio; | 1593 | bio->bi_private = r1_bio; |
1721 | unplug = 1; | ||
1722 | generic_make_request(bio); | 1594 | generic_make_request(bio); |
1723 | } | 1595 | } |
1724 | } | 1596 | } |
1725 | cond_resched(); | 1597 | cond_resched(); |
1726 | } | 1598 | } |
1727 | if (unplug) | 1599 | blk_finish_plug(&plug); |
1728 | unplug_slaves(mddev); | ||
1729 | } | 1600 | } |
1730 | 1601 | ||
1731 | 1602 | ||
@@ -1763,7 +1634,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1763 | int i; | 1634 | int i; |
1764 | int wonly = -1; | 1635 | int wonly = -1; |
1765 | int write_targets = 0, read_targets = 0; | 1636 | int write_targets = 0, read_targets = 0; |
1766 | int sync_blocks; | 1637 | sector_t sync_blocks; |
1767 | int still_degraded = 0; | 1638 | int still_degraded = 0; |
1768 | 1639 | ||
1769 | if (!conf->r1buf_pool) | 1640 | if (!conf->r1buf_pool) |
@@ -1813,11 +1684,11 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1813 | msleep_interruptible(1000); | 1684 | msleep_interruptible(1000); |
1814 | 1685 | ||
1815 | bitmap_cond_end_sync(mddev->bitmap, sector_nr); | 1686 | bitmap_cond_end_sync(mddev->bitmap, sector_nr); |
1687 | r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); | ||
1816 | raise_barrier(conf); | 1688 | raise_barrier(conf); |
1817 | 1689 | ||
1818 | conf->next_resync = sector_nr; | 1690 | conf->next_resync = sector_nr; |
1819 | 1691 | ||
1820 | r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); | ||
1821 | rcu_read_lock(); | 1692 | rcu_read_lock(); |
1822 | /* | 1693 | /* |
1823 | * If we get a correctably read error during resync or recovery, | 1694 | * If we get a correctably read error during resync or recovery, |
@@ -2029,7 +1900,6 @@ static conf_t *setup_conf(mddev_t *mddev) | |||
2029 | init_waitqueue_head(&conf->wait_barrier); | 1900 | init_waitqueue_head(&conf->wait_barrier); |
2030 | 1901 | ||
2031 | bio_list_init(&conf->pending_bio_list); | 1902 | bio_list_init(&conf->pending_bio_list); |
2032 | bio_list_init(&conf->flushing_bio_list); | ||
2033 | 1903 | ||
2034 | conf->last_used = -1; | 1904 | conf->last_used = -1; |
2035 | for (i = 0; i < conf->raid_disks; i++) { | 1905 | for (i = 0; i < conf->raid_disks; i++) { |
@@ -2107,8 +1977,9 @@ static int run(mddev_t *mddev) | |||
2107 | if (IS_ERR(conf)) | 1977 | if (IS_ERR(conf)) |
2108 | return PTR_ERR(conf); | 1978 | return PTR_ERR(conf); |
2109 | 1979 | ||
2110 | mddev->queue->queue_lock = &conf->device_lock; | ||
2111 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 1980 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
1981 | if (!mddev->gendisk) | ||
1982 | continue; | ||
2112 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 1983 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
2113 | rdev->data_offset << 9); | 1984 | rdev->data_offset << 9); |
2114 | /* as we don't honour merge_bvec_fn, we must never risk | 1985 | /* as we don't honour merge_bvec_fn, we must never risk |
@@ -2150,11 +2021,11 @@ static int run(mddev_t *mddev) | |||
2150 | 2021 | ||
2151 | md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); | 2022 | md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); |
2152 | 2023 | ||
2153 | mddev->queue->unplug_fn = raid1_unplug; | 2024 | if (mddev->queue) { |
2154 | mddev->queue->backing_dev_info.congested_fn = raid1_congested; | 2025 | mddev->queue->backing_dev_info.congested_fn = raid1_congested; |
2155 | mddev->queue->backing_dev_info.congested_data = mddev; | 2026 | mddev->queue->backing_dev_info.congested_data = mddev; |
2156 | md_integrity_register(mddev); | 2027 | } |
2157 | return 0; | 2028 | return md_integrity_register(mddev); |
2158 | } | 2029 | } |
2159 | 2030 | ||
2160 | static int stop(mddev_t *mddev) | 2031 | static int stop(mddev_t *mddev) |
@@ -2176,7 +2047,6 @@ static int stop(mddev_t *mddev) | |||
2176 | 2047 | ||
2177 | md_unregister_thread(mddev->thread); | 2048 | md_unregister_thread(mddev->thread); |
2178 | mddev->thread = NULL; | 2049 | mddev->thread = NULL; |
2179 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | ||
2180 | if (conf->r1bio_pool) | 2050 | if (conf->r1bio_pool) |
2181 | mempool_destroy(conf->r1bio_pool); | 2051 | mempool_destroy(conf->r1bio_pool); |
2182 | kfree(conf->mirrors); | 2052 | kfree(conf->mirrors); |
@@ -2201,7 +2071,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) | |||
2201 | set_capacity(mddev->gendisk, mddev->array_sectors); | 2071 | set_capacity(mddev->gendisk, mddev->array_sectors); |
2202 | revalidate_disk(mddev->gendisk); | 2072 | revalidate_disk(mddev->gendisk); |
2203 | if (sectors > mddev->dev_sectors && | 2073 | if (sectors > mddev->dev_sectors && |
2204 | mddev->recovery_cp == MaxSector) { | 2074 | mddev->recovery_cp > mddev->dev_sectors) { |
2205 | mddev->recovery_cp = mddev->dev_sectors; | 2075 | mddev->recovery_cp = mddev->dev_sectors; |
2206 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 2076 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
2207 | } | 2077 | } |
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 5f2d443ae28a..e743a64fac4f 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h | |||
@@ -35,8 +35,6 @@ struct r1_private_data_s { | |||
35 | struct list_head retry_list; | 35 | struct list_head retry_list; |
36 | /* queue pending writes and submit them on unplug */ | 36 | /* queue pending writes and submit them on unplug */ |
37 | struct bio_list pending_bio_list; | 37 | struct bio_list pending_bio_list; |
38 | /* queue of writes that have been unplugged */ | ||
39 | struct bio_list flushing_bio_list; | ||
40 | 38 | ||
41 | /* for use when syncing mirrors: */ | 39 | /* for use when syncing mirrors: */ |
42 | 40 | ||
@@ -96,7 +94,9 @@ struct r1bio_s { | |||
96 | int read_disk; | 94 | int read_disk; |
97 | 95 | ||
98 | struct list_head retry_list; | 96 | struct list_head retry_list; |
99 | struct bitmap_update *bitmap_update; | 97 | /* Next two are only valid when R1BIO_BehindIO is set */ |
98 | struct page **behind_pages; | ||
99 | int behind_page_count; | ||
100 | /* | 100 | /* |
101 | * if the IO is in WRITE direction, then multiple bios are used. | 101 | * if the IO is in WRITE direction, then multiple bios are used. |
102 | * We choose the number when they are allocated. | 102 | * We choose the number when they are allocated. |
@@ -117,8 +117,6 @@ struct r1bio_s { | |||
117 | #define R1BIO_IsSync 1 | 117 | #define R1BIO_IsSync 1 |
118 | #define R1BIO_Degraded 2 | 118 | #define R1BIO_Degraded 2 |
119 | #define R1BIO_BehindIO 3 | 119 | #define R1BIO_BehindIO 3 |
120 | #define R1BIO_Barrier 4 | ||
121 | #define R1BIO_BarrierRetry 5 | ||
122 | /* For write-behind requests, we call bi_end_io when | 120 | /* For write-behind requests, we call bi_end_io when |
123 | * the last non-write-behind device completes, providing | 121 | * the last non-write-behind device completes, providing |
124 | * any write was successful. Otherwise we call when | 122 | * any write was successful. Otherwise we call when |
@@ -128,4 +126,6 @@ struct r1bio_s { | |||
128 | */ | 126 | */ |
129 | #define R1BIO_Returned 6 | 127 | #define R1BIO_Returned 6 |
130 | 128 | ||
129 | extern int md_raid1_congested(mddev_t *mddev, int bits); | ||
130 | |||
131 | #endif | 131 | #endif |
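Editor's note on the raid1.h comment above about write-behind completion: the original request is acknowledged when the last non-write-behind copy completes, while write-behind copies are allowed to finish later. A minimal userspace model of that counting rule follows; the structure and function names are invented for the sketch, and the error-path nuance ("providing any write was successful") is deliberately left out.

/*
 * Hedged sketch (userspace model, not the md code): acknowledge the
 * original bio once every copy that is NOT marked write-behind has
 * completed; write-behind copies may still be in flight and finish
 * later without the caller waiting for them.
 */
#include <stdio.h>

struct r1_write {
    int remaining_fast;   /* non-write-behind copies still in flight */
    int acked;            /* upper layer already notified? */
};

static void copy_done(struct r1_write *w, int write_behind)
{
    if (write_behind) {
        printf("write-behind copy finished (caller not waiting)\n");
        return;
    }
    if (--w->remaining_fast == 0 && !w->acked) {
        w->acked = 1;
        printf("last non-write-behind copy done: ack original bio\n");
    }
}

int main(void)
{
    /* two ordinary mirrors plus one write-behind (write-mostly) mirror */
    struct r1_write w = { .remaining_fast = 2, .acked = 0 };

    copy_done(&w, 0);   /* first ordinary copy */
    copy_done(&w, 0);   /* second ordinary copy: original bio acked */
    copy_done(&w, 1);   /* write-behind copy trickles in afterwards */
    return 0;
}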
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 84718383124d..6e846688962f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -5,7 +5,7 @@ | |||
5 | * | 5 | * |
6 | * RAID-10 support for md. | 6 | * RAID-10 support for md. |
7 | * | 7 | * |
8 | * Base on code in raid1.c. See raid1.c for futher copyright information. | 8 | * Base on code in raid1.c. See raid1.c for further copyright information. |
9 | * | 9 | * |
10 | * | 10 | * |
11 | * This program is free software; you can redistribute it and/or modify | 11 | * This program is free software; you can redistribute it and/or modify |
@@ -57,23 +57,16 @@ | |||
57 | */ | 57 | */ |
58 | #define NR_RAID10_BIOS 256 | 58 | #define NR_RAID10_BIOS 256 |
59 | 59 | ||
60 | static void unplug_slaves(mddev_t *mddev); | ||
61 | |||
62 | static void allow_barrier(conf_t *conf); | 60 | static void allow_barrier(conf_t *conf); |
63 | static void lower_barrier(conf_t *conf); | 61 | static void lower_barrier(conf_t *conf); |
64 | 62 | ||
65 | static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) | 63 | static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) |
66 | { | 64 | { |
67 | conf_t *conf = data; | 65 | conf_t *conf = data; |
68 | r10bio_t *r10_bio; | ||
69 | int size = offsetof(struct r10bio_s, devs[conf->copies]); | 66 | int size = offsetof(struct r10bio_s, devs[conf->copies]); |
70 | 67 | ||
71 | /* allocate a r10bio with room for raid_disks entries in the bios array */ | 68 | /* allocate a r10bio with room for raid_disks entries in the bios array */ |
72 | r10_bio = kzalloc(size, gfp_flags); | 69 | return kzalloc(size, gfp_flags); |
73 | if (!r10_bio && conf->mddev) | ||
74 | unplug_slaves(conf->mddev); | ||
75 | |||
76 | return r10_bio; | ||
77 | } | 70 | } |
78 | 71 | ||
79 | static void r10bio_pool_free(void *r10_bio, void *data) | 72 | static void r10bio_pool_free(void *r10_bio, void *data) |
@@ -106,10 +99,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) | |||
106 | int nalloc; | 99 | int nalloc; |
107 | 100 | ||
108 | r10_bio = r10bio_pool_alloc(gfp_flags, conf); | 101 | r10_bio = r10bio_pool_alloc(gfp_flags, conf); |
109 | if (!r10_bio) { | 102 | if (!r10_bio) |
110 | unplug_slaves(conf->mddev); | ||
111 | return NULL; | 103 | return NULL; |
112 | } | ||
113 | 104 | ||
114 | if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) | 105 | if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) |
115 | nalloc = conf->copies; /* resync */ | 106 | nalloc = conf->copies; /* resync */ |
@@ -120,7 +111,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) | |||
120 | * Allocate bios. | 111 | * Allocate bios. |
121 | */ | 112 | */ |
122 | for (j = nalloc ; j-- ; ) { | 113 | for (j = nalloc ; j-- ; ) { |
123 | bio = bio_alloc(gfp_flags, RESYNC_PAGES); | 114 | bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); |
124 | if (!bio) | 115 | if (!bio) |
125 | goto out_free_bio; | 116 | goto out_free_bio; |
126 | r10_bio->devs[j].bio = bio; | 117 | r10_bio->devs[j].bio = bio; |
@@ -280,9 +271,10 @@ static void raid10_end_read_request(struct bio *bio, int error) | |||
280 | */ | 271 | */ |
281 | set_bit(R10BIO_Uptodate, &r10_bio->state); | 272 | set_bit(R10BIO_Uptodate, &r10_bio->state); |
282 | raid_end_bio_io(r10_bio); | 273 | raid_end_bio_io(r10_bio); |
274 | rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); | ||
283 | } else { | 275 | } else { |
284 | /* | 276 | /* |
285 | * oops, read error: | 277 | * oops, read error - keep the refcount on the rdev |
286 | */ | 278 | */ |
287 | char b[BDEVNAME_SIZE]; | 279 | char b[BDEVNAME_SIZE]; |
288 | if (printk_ratelimit()) | 280 | if (printk_ratelimit()) |
@@ -291,8 +283,6 @@ static void raid10_end_read_request(struct bio *bio, int error) | |||
291 | bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); | 283 | bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); |
292 | reschedule_retry(r10_bio); | 284 | reschedule_retry(r10_bio); |
293 | } | 285 | } |
294 | |||
295 | rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); | ||
296 | } | 286 | } |
297 | 287 | ||
298 | static void raid10_end_write_request(struct bio *bio, int error) | 288 | static void raid10_end_write_request(struct bio *bio, int error) |
@@ -349,14 +339,14 @@ static void raid10_end_write_request(struct bio *bio, int error) | |||
349 | 339 | ||
350 | /* | 340 | /* |
351 | * RAID10 layout manager | 341 | * RAID10 layout manager |
352 | * Aswell as the chunksize and raid_disks count, there are two | 342 | * As well as the chunksize and raid_disks count, there are two |
353 | * parameters: near_copies and far_copies. | 343 | * parameters: near_copies and far_copies. |
354 | * near_copies * far_copies must be <= raid_disks. | 344 | * near_copies * far_copies must be <= raid_disks. |
355 | * Normally one of these will be 1. | 345 | * Normally one of these will be 1. |
356 | * If both are 1, we get raid0. | 346 | * If both are 1, we get raid0. |
357 | * If near_copies == raid_disks, we get raid1. | 347 | * If near_copies == raid_disks, we get raid1. |
358 | * | 348 | * |
359 | * Chunks are layed out in raid0 style with near_copies copies of the | 349 | * Chunks are laid out in raid0 style with near_copies copies of the |
360 | * first chunk, followed by near_copies copies of the next chunk and | 350 | * first chunk, followed by near_copies copies of the next chunk and |
361 | * so on. | 351 | * so on. |
362 | * If far_copies > 1, then after 1/far_copies of the array has been assigned | 352 | * If far_copies > 1, then after 1/far_copies of the array has been assigned |
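Editor's note on the layout-manager comment above: a tiny userspace illustration of the "near copies" placement can make the geometry concrete. This sketch assumes raid_disks is a multiple of near_copies and ignores far_copies and far_offset entirely, so it is not raid10_find_phys(); it only shows that each chunk lands on near_copies adjacent devices at the same stripe row.

/*
 * Hedged sketch: simplified near-copies placement, illustration only.
 */
#include <stdio.h>

int main(void)
{
    const int raid_disks = 4;
    const int near_copies = 2;
    const int groups = raid_disks / near_copies;  /* distinct chunks per row */

    for (int chunk = 0; chunk < 6; chunk++) {
        int row = chunk / groups;
        int first_dev = (chunk % groups) * near_copies;

        printf("chunk %d: row %d on devices", chunk, row);
        for (int copy = 0; copy < near_copies; copy++)
            printf(" %d", first_dev + copy);
        printf("\n");
    }
    return 0;
}

With 4 disks and near_copies = 2 this prints chunk 0 on devices 0 and 1, chunk 1 on devices 2 and 3, chunk 2 back on devices 0 and 1 in the next row, and so on, which is the raid0-style layout with duplicated chunks the comment describes.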
@@ -497,13 +487,19 @@ static int raid10_mergeable_bvec(struct request_queue *q, | |||
497 | static int read_balance(conf_t *conf, r10bio_t *r10_bio) | 487 | static int read_balance(conf_t *conf, r10bio_t *r10_bio) |
498 | { | 488 | { |
499 | const sector_t this_sector = r10_bio->sector; | 489 | const sector_t this_sector = r10_bio->sector; |
500 | int disk, slot, nslot; | 490 | int disk, slot; |
501 | const int sectors = r10_bio->sectors; | 491 | const int sectors = r10_bio->sectors; |
502 | sector_t new_distance, current_distance; | 492 | sector_t new_distance, best_dist; |
503 | mdk_rdev_t *rdev; | 493 | mdk_rdev_t *rdev; |
494 | int do_balance; | ||
495 | int best_slot; | ||
504 | 496 | ||
505 | raid10_find_phys(conf, r10_bio); | 497 | raid10_find_phys(conf, r10_bio); |
506 | rcu_read_lock(); | 498 | rcu_read_lock(); |
499 | retry: | ||
500 | best_slot = -1; | ||
501 | best_dist = MaxSector; | ||
502 | do_balance = 1; | ||
507 | /* | 503 | /* |
508 | * Check if we can balance. We can balance on the whole | 504 | * Check if we can balance. We can balance on the whole |
509 | * device if no resync is going on (recovery is ok), or below | 505 | * device if no resync is going on (recovery is ok), or below |
@@ -511,123 +507,64 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) | |||
511 | * above the resync window. | 507 | * above the resync window. |
512 | */ | 508 | */ |
513 | if (conf->mddev->recovery_cp < MaxSector | 509 | if (conf->mddev->recovery_cp < MaxSector |
514 | && (this_sector + sectors >= conf->next_resync)) { | 510 | && (this_sector + sectors >= conf->next_resync)) |
515 | /* make sure that disk is operational */ | 511 | do_balance = 0; |
516 | slot = 0; | ||
517 | disk = r10_bio->devs[slot].devnum; | ||
518 | 512 | ||
519 | while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL || | 513 | for (slot = 0; slot < conf->copies ; slot++) { |
520 | r10_bio->devs[slot].bio == IO_BLOCKED || | 514 | if (r10_bio->devs[slot].bio == IO_BLOCKED) |
521 | !test_bit(In_sync, &rdev->flags)) { | 515 | continue; |
522 | slot++; | ||
523 | if (slot == conf->copies) { | ||
524 | slot = 0; | ||
525 | disk = -1; | ||
526 | break; | ||
527 | } | ||
528 | disk = r10_bio->devs[slot].devnum; | ||
529 | } | ||
530 | goto rb_out; | ||
531 | } | ||
532 | |||
533 | |||
534 | /* make sure the disk is operational */ | ||
535 | slot = 0; | ||
536 | disk = r10_bio->devs[slot].devnum; | ||
537 | while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL || | ||
538 | r10_bio->devs[slot].bio == IO_BLOCKED || | ||
539 | !test_bit(In_sync, &rdev->flags)) { | ||
540 | slot ++; | ||
541 | if (slot == conf->copies) { | ||
542 | disk = -1; | ||
543 | goto rb_out; | ||
544 | } | ||
545 | disk = r10_bio->devs[slot].devnum; | 516 | disk = r10_bio->devs[slot].devnum; |
546 | } | 517 | rdev = rcu_dereference(conf->mirrors[disk].rdev); |
547 | 518 | if (rdev == NULL) | |
548 | 519 | continue; | |
549 | current_distance = abs(r10_bio->devs[slot].addr - | 520 | if (!test_bit(In_sync, &rdev->flags)) |
550 | conf->mirrors[disk].head_position); | ||
551 | |||
552 | /* Find the disk whose head is closest, | ||
553 | * or - for far > 1 - find the closest to partition beginning */ | ||
554 | |||
555 | for (nslot = slot; nslot < conf->copies; nslot++) { | ||
556 | int ndisk = r10_bio->devs[nslot].devnum; | ||
557 | |||
558 | |||
559 | if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL || | ||
560 | r10_bio->devs[nslot].bio == IO_BLOCKED || | ||
561 | !test_bit(In_sync, &rdev->flags)) | ||
562 | continue; | 521 | continue; |
563 | 522 | ||
523 | if (!do_balance) | ||
524 | break; | ||
525 | |||
564 | /* This optimisation is debatable, and completely destroys | 526 | /* This optimisation is debatable, and completely destroys |
565 | * sequential read speed for 'far copies' arrays. So only | 527 | * sequential read speed for 'far copies' arrays. So only |
566 | * keep it for 'near' arrays, and review those later. | 528 | * keep it for 'near' arrays, and review those later. |
567 | */ | 529 | */ |
568 | if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) { | 530 | if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) |
569 | disk = ndisk; | ||
570 | slot = nslot; | ||
571 | break; | 531 | break; |
572 | } | ||
573 | 532 | ||
574 | /* for far > 1 always use the lowest address */ | 533 | /* for far > 1 always use the lowest address */ |
575 | if (conf->far_copies > 1) | 534 | if (conf->far_copies > 1) |
576 | new_distance = r10_bio->devs[nslot].addr; | 535 | new_distance = r10_bio->devs[slot].addr; |
577 | else | 536 | else |
578 | new_distance = abs(r10_bio->devs[nslot].addr - | 537 | new_distance = abs(r10_bio->devs[slot].addr - |
579 | conf->mirrors[ndisk].head_position); | 538 | conf->mirrors[disk].head_position); |
580 | if (new_distance < current_distance) { | 539 | if (new_distance < best_dist) { |
581 | current_distance = new_distance; | 540 | best_dist = new_distance; |
582 | disk = ndisk; | 541 | best_slot = slot; |
583 | slot = nslot; | ||
584 | } | 542 | } |
585 | } | 543 | } |
544 | if (slot == conf->copies) | ||
545 | slot = best_slot; | ||
586 | 546 | ||
587 | rb_out: | 547 | if (slot >= 0) { |
588 | r10_bio->read_slot = slot; | 548 | disk = r10_bio->devs[slot].devnum; |
589 | /* conf->next_seq_sect = this_sector + sectors;*/ | 549 | rdev = rcu_dereference(conf->mirrors[disk].rdev); |
590 | 550 | if (!rdev) | |
591 | if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL) | 551 | goto retry; |
592 | atomic_inc(&conf->mirrors[disk].rdev->nr_pending); | 552 | atomic_inc(&rdev->nr_pending); |
593 | else | 553 | if (test_bit(Faulty, &rdev->flags)) { |
554 | /* Cannot risk returning a device that failed | ||
555 | * before we inc'ed nr_pending | ||
556 | */ | ||
557 | rdev_dec_pending(rdev, conf->mddev); | ||
558 | goto retry; | ||
559 | } | ||
560 | r10_bio->read_slot = slot; | ||
561 | } else | ||
594 | disk = -1; | 562 | disk = -1; |
595 | rcu_read_unlock(); | 563 | rcu_read_unlock(); |
596 | 564 | ||
597 | return disk; | 565 | return disk; |
598 | } | 566 | } |
599 | 567 | ||
600 | static void unplug_slaves(mddev_t *mddev) | ||
601 | { | ||
602 | conf_t *conf = mddev->private; | ||
603 | int i; | ||
604 | |||
605 | rcu_read_lock(); | ||
606 | for (i=0; i < conf->raid_disks; i++) { | ||
607 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); | ||
608 | if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { | ||
609 | struct request_queue *r_queue = bdev_get_queue(rdev->bdev); | ||
610 | |||
611 | atomic_inc(&rdev->nr_pending); | ||
612 | rcu_read_unlock(); | ||
613 | |||
614 | blk_unplug(r_queue); | ||
615 | |||
616 | rdev_dec_pending(rdev, mddev); | ||
617 | rcu_read_lock(); | ||
618 | } | ||
619 | } | ||
620 | rcu_read_unlock(); | ||
621 | } | ||
622 | |||
623 | static void raid10_unplug(struct request_queue *q) | ||
624 | { | ||
625 | mddev_t *mddev = q->queuedata; | ||
626 | |||
627 | unplug_slaves(q->queuedata); | ||
628 | md_wakeup_thread(mddev->thread); | ||
629 | } | ||
630 | |||
631 | static int raid10_congested(void *data, int bits) | 568 | static int raid10_congested(void *data, int bits) |
632 | { | 569 | { |
633 | mddev_t *mddev = data; | 570 | mddev_t *mddev = data; |
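Editor's note on the rewritten read_balance() above: the new loop walks every copy once, skips unusable slots, and remembers the slot whose device head is closest to the target address (or simply the lowest address when far_copies > 1). The sketch below is a userspace model of just that selection; the rdev reference counting, RCU, and the retry taken when a device turns Faulty after nr_pending is raised are left out, and all names are invented for the sketch.

/*
 * Hedged sketch (userspace model): distance-based slot selection.
 */
#include <stdio.h>
#include <stdlib.h>

struct copy {
    long long addr;           /* sector of this copy on its device */
    long long head_position;  /* where that device's head last was */
    int usable;               /* in-sync and not blocked */
};

static int pick_slot(const struct copy *c, int ncopies, int far_copies)
{
    long long best_dist = -1;
    int best_slot = -1;

    for (int slot = 0; slot < ncopies; slot++) {
        long long dist;

        if (!c[slot].usable)
            continue;
        if (far_copies > 1)
            dist = c[slot].addr;  /* always prefer the lowest address */
        else
            dist = llabs(c[slot].addr - c[slot].head_position);
        if (best_slot < 0 || dist < best_dist) {
            best_dist = dist;
            best_slot = slot;
        }
    }
    return best_slot;             /* -1 means no readable copy */
}

int main(void)
{
    struct copy copies[2] = {
        { .addr = 1000, .head_position = 5000, .usable = 1 },
        { .addr = 1000, .head_position = 1100, .usable = 1 },
    };

    printf("chosen slot: %d\n", pick_slot(copies, 2, 1)); /* prints 1 */
    return 0;
}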
@@ -649,20 +586,16 @@ static int raid10_congested(void *data, int bits) | |||
649 | return ret; | 586 | return ret; |
650 | } | 587 | } |
651 | 588 | ||
652 | static int flush_pending_writes(conf_t *conf) | 589 | static void flush_pending_writes(conf_t *conf) |
653 | { | 590 | { |
654 | /* Any writes that have been queued but are awaiting | 591 | /* Any writes that have been queued but are awaiting |
655 | * bitmap updates get flushed here. | 592 | * bitmap updates get flushed here. |
656 | * We return 1 if any requests were actually submitted. | ||
657 | */ | 593 | */ |
658 | int rv = 0; | ||
659 | |||
660 | spin_lock_irq(&conf->device_lock); | 594 | spin_lock_irq(&conf->device_lock); |
661 | 595 | ||
662 | if (conf->pending_bio_list.head) { | 596 | if (conf->pending_bio_list.head) { |
663 | struct bio *bio; | 597 | struct bio *bio; |
664 | bio = bio_list_get(&conf->pending_bio_list); | 598 | bio = bio_list_get(&conf->pending_bio_list); |
665 | blk_remove_plug(conf->mddev->queue); | ||
666 | spin_unlock_irq(&conf->device_lock); | 599 | spin_unlock_irq(&conf->device_lock); |
667 | /* flush any pending bitmap writes to disk | 600 | /* flush any pending bitmap writes to disk |
668 | * before proceeding w/ I/O */ | 601 | * before proceeding w/ I/O */ |
@@ -674,11 +607,10 @@ static int flush_pending_writes(conf_t *conf) | |||
674 | generic_make_request(bio); | 607 | generic_make_request(bio); |
675 | bio = next; | 608 | bio = next; |
676 | } | 609 | } |
677 | rv = 1; | ||
678 | } else | 610 | } else |
679 | spin_unlock_irq(&conf->device_lock); | 611 | spin_unlock_irq(&conf->device_lock); |
680 | return rv; | ||
681 | } | 612 | } |
613 | |||
682 | /* Barriers.... | 614 | /* Barriers.... |
683 | * Sometimes we need to suspend IO while we do something else, | 615 | * Sometimes we need to suspend IO while we do something else, |
684 | * either some resync/recovery, or reconfigure the array. | 616 | * either some resync/recovery, or reconfigure the array. |
@@ -708,17 +640,15 @@ static void raise_barrier(conf_t *conf, int force) | |||
708 | 640 | ||
709 | /* Wait until no block IO is waiting (unless 'force') */ | 641 | /* Wait until no block IO is waiting (unless 'force') */ |
710 | wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, | 642 | wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, |
711 | conf->resync_lock, | 643 | conf->resync_lock, ); |
712 | raid10_unplug(conf->mddev->queue)); | ||
713 | 644 | ||
714 | /* block any new IO from starting */ | 645 | /* block any new IO from starting */ |
715 | conf->barrier++; | 646 | conf->barrier++; |
716 | 647 | ||
717 | /* No wait for all pending IO to complete */ | 648 | /* Now wait for all pending IO to complete */ |
718 | wait_event_lock_irq(conf->wait_barrier, | 649 | wait_event_lock_irq(conf->wait_barrier, |
719 | !conf->nr_pending && conf->barrier < RESYNC_DEPTH, | 650 | !conf->nr_pending && conf->barrier < RESYNC_DEPTH, |
720 | conf->resync_lock, | 651 | conf->resync_lock, ); |
721 | raid10_unplug(conf->mddev->queue)); | ||
722 | 652 | ||
723 | spin_unlock_irq(&conf->resync_lock); | 653 | spin_unlock_irq(&conf->resync_lock); |
724 | } | 654 | } |
@@ -739,7 +669,7 @@ static void wait_barrier(conf_t *conf) | |||
739 | conf->nr_waiting++; | 669 | conf->nr_waiting++; |
740 | wait_event_lock_irq(conf->wait_barrier, !conf->barrier, | 670 | wait_event_lock_irq(conf->wait_barrier, !conf->barrier, |
741 | conf->resync_lock, | 671 | conf->resync_lock, |
742 | raid10_unplug(conf->mddev->queue)); | 672 | ); |
743 | conf->nr_waiting--; | 673 | conf->nr_waiting--; |
744 | } | 674 | } |
745 | conf->nr_pending++; | 675 | conf->nr_pending++; |
@@ -775,8 +705,8 @@ static void freeze_array(conf_t *conf) | |||
775 | wait_event_lock_irq(conf->wait_barrier, | 705 | wait_event_lock_irq(conf->wait_barrier, |
776 | conf->nr_pending == conf->nr_queued+1, | 706 | conf->nr_pending == conf->nr_queued+1, |
777 | conf->resync_lock, | 707 | conf->resync_lock, |
778 | ({ flush_pending_writes(conf); | 708 | flush_pending_writes(conf)); |
779 | raid10_unplug(conf->mddev->queue); })); | 709 | |
780 | spin_unlock_irq(&conf->resync_lock); | 710 | spin_unlock_irq(&conf->resync_lock); |
781 | } | 711 | } |
782 | 712 | ||
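Editor's note on the barrier hunks above (raise_barrier, wait_barrier, freeze_array): normal I/O bumps nr_pending when admitted, and resync raises a barrier count that blocks new I/O until it is lowered again. The real code sleeps on wait queues under resync_lock; the userspace model below only reports what each caller would do, and its names are invented for the sketch.

/*
 * Hedged sketch (userspace model): the counter discipline behind the
 * raid10 resync barrier.
 */
#include <stdio.h>

struct barrier_state {
    int nr_pending;   /* normal I/O currently in flight */
    int barrier;      /* resync/reconfig sections raised */
};

static int try_start_io(struct barrier_state *s)
{
    if (s->barrier) {
        printf("I/O must wait: barrier=%d\n", s->barrier);
        return 0;                 /* wait_barrier() would sleep here */
    }
    s->nr_pending++;
    return 1;
}

static void end_io(struct barrier_state *s)
{
    s->nr_pending--;              /* allow_barrier() wakes the waiters */
}

static int try_raise_barrier(struct barrier_state *s)
{
    s->barrier++;                 /* block any new I/O from starting */
    if (s->nr_pending) {
        printf("resync must wait: %d I/Os still pending\n", s->nr_pending);
        return 0;                 /* raise_barrier() would sleep here */
    }
    return 1;
}

int main(void)
{
    struct barrier_state s = { 0, 0 };

    try_start_io(&s);             /* admitted, nr_pending = 1 */
    try_raise_barrier(&s);        /* barrier raised, but must wait */
    end_io(&s);                   /* last pending I/O finishes */
    printf("resync may proceed: pending=%d barrier=%d\n",
           s.nr_pending, s.barrier);
    try_start_io(&s);             /* refused until barrier is lowered */
    return 0;
}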
@@ -800,12 +730,13 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
800 | int chunk_sects = conf->chunk_mask + 1; | 730 | int chunk_sects = conf->chunk_mask + 1; |
801 | const int rw = bio_data_dir(bio); | 731 | const int rw = bio_data_dir(bio); |
802 | const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); | 732 | const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); |
803 | struct bio_list bl; | 733 | const unsigned long do_fua = (bio->bi_rw & REQ_FUA); |
804 | unsigned long flags; | 734 | unsigned long flags; |
805 | mdk_rdev_t *blocked_rdev; | 735 | mdk_rdev_t *blocked_rdev; |
736 | int plugged; | ||
806 | 737 | ||
807 | if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { | 738 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { |
808 | md_barrier_request(mddev, bio); | 739 | md_flush_request(mddev, bio); |
809 | return 0; | 740 | return 0; |
810 | } | 741 | } |
811 | 742 | ||
@@ -889,7 +820,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
889 | } | 820 | } |
890 | mirror = conf->mirrors + disk; | 821 | mirror = conf->mirrors + disk; |
891 | 822 | ||
892 | read_bio = bio_clone(bio, GFP_NOIO); | 823 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
893 | 824 | ||
894 | r10_bio->devs[slot].bio = read_bio; | 825 | r10_bio->devs[slot].bio = read_bio; |
895 | 826 | ||
@@ -911,6 +842,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
911 | * inc refcount on their rdev. Record them by setting | 842 | * inc refcount on their rdev. Record them by setting |
912 | * bios[x] to bio | 843 | * bios[x] to bio |
913 | */ | 844 | */ |
845 | plugged = mddev_check_plugged(mddev); | ||
846 | |||
914 | raid10_find_phys(conf, r10_bio); | 847 | raid10_find_phys(conf, r10_bio); |
915 | retry_write: | 848 | retry_write: |
916 | blocked_rdev = NULL; | 849 | blocked_rdev = NULL; |
@@ -949,48 +882,46 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
949 | goto retry_write; | 882 | goto retry_write; |
950 | } | 883 | } |
951 | 884 | ||
952 | atomic_set(&r10_bio->remaining, 0); | 885 | atomic_set(&r10_bio->remaining, 1); |
886 | bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0); | ||
953 | 887 | ||
954 | bio_list_init(&bl); | ||
955 | for (i = 0; i < conf->copies; i++) { | 888 | for (i = 0; i < conf->copies; i++) { |
956 | struct bio *mbio; | 889 | struct bio *mbio; |
957 | int d = r10_bio->devs[i].devnum; | 890 | int d = r10_bio->devs[i].devnum; |
958 | if (!r10_bio->devs[i].bio) | 891 | if (!r10_bio->devs[i].bio) |
959 | continue; | 892 | continue; |
960 | 893 | ||
961 | mbio = bio_clone(bio, GFP_NOIO); | 894 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
962 | r10_bio->devs[i].bio = mbio; | 895 | r10_bio->devs[i].bio = mbio; |
963 | 896 | ||
964 | mbio->bi_sector = r10_bio->devs[i].addr+ | 897 | mbio->bi_sector = r10_bio->devs[i].addr+ |
965 | conf->mirrors[d].rdev->data_offset; | 898 | conf->mirrors[d].rdev->data_offset; |
966 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; | 899 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; |
967 | mbio->bi_end_io = raid10_end_write_request; | 900 | mbio->bi_end_io = raid10_end_write_request; |
968 | mbio->bi_rw = WRITE | do_sync; | 901 | mbio->bi_rw = WRITE | do_sync | do_fua; |
969 | mbio->bi_private = r10_bio; | 902 | mbio->bi_private = r10_bio; |
970 | 903 | ||
971 | atomic_inc(&r10_bio->remaining); | 904 | atomic_inc(&r10_bio->remaining); |
972 | bio_list_add(&bl, mbio); | 905 | spin_lock_irqsave(&conf->device_lock, flags); |
906 | bio_list_add(&conf->pending_bio_list, mbio); | ||
907 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
973 | } | 908 | } |
974 | 909 | ||
975 | if (unlikely(!atomic_read(&r10_bio->remaining))) { | 910 | if (atomic_dec_and_test(&r10_bio->remaining)) { |
976 | /* the array is dead */ | 911 | /* This matches the end of raid10_end_write_request() */ |
912 | bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, | ||
913 | r10_bio->sectors, | ||
914 | !test_bit(R10BIO_Degraded, &r10_bio->state), | ||
915 | 0); | ||
977 | md_write_end(mddev); | 916 | md_write_end(mddev); |
978 | raid_end_bio_io(r10_bio); | 917 | raid_end_bio_io(r10_bio); |
979 | return 0; | ||
980 | } | 918 | } |
981 | 919 | ||
982 | bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0); | ||
983 | spin_lock_irqsave(&conf->device_lock, flags); | ||
984 | bio_list_merge(&conf->pending_bio_list, &bl); | ||
985 | blk_plug_device(mddev->queue); | ||
986 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
987 | |||
988 | /* In case raid10d snuck in to freeze_array */ | 920 | /* In case raid10d snuck in to freeze_array */ |
989 | wake_up(&conf->wait_barrier); | 921 | wake_up(&conf->wait_barrier); |
990 | 922 | ||
991 | if (do_sync) | 923 | if (do_sync || !mddev->bitmap || !plugged) |
992 | md_wakeup_thread(mddev->thread); | 924 | md_wakeup_thread(mddev->thread); |
993 | |||
994 | return 0; | 925 | return 0; |
995 | } | 926 | } |
996 | 927 | ||
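Editor's note on the write fan-out hunk above: r10_bio->remaining now starts at 1 rather than 0, so the submitting context holds a reference of its own while it queues the per-device clones and drops it afterwards with atomic_dec_and_test(). Whoever drops the count to zero, submitter or last completing clone, finishes the master request, which removes the window where all clones could complete before queuing was done. A userspace model of the pattern, with names invented for the sketch:

/*
 * Hedged sketch (userspace model): the "remaining starts at 1" pattern.
 */
#include <stdio.h>

struct master {
    int remaining;
    int finished;
};

static void put_ref(struct master *m)
{
    if (--m->remaining == 0 && !m->finished) {
        m->finished = 1;
        printf("master request completed\n");
    }
}

int main(void)
{
    struct master m = { .remaining = 1, .finished = 0 };  /* submitter's ref */
    int clones = 0;

    for (int copy = 0; copy < 2; copy++) {
        m.remaining++;                   /* one ref per queued clone */
        clones++;
    }
    printf("queued %d clones, remaining=%d\n", clones, m.remaining);

    put_ref(&m);   /* submitter drops its reference */
    put_ref(&m);   /* first clone completes */
    put_ref(&m);   /* last clone completes: master done */
    return 0;
}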
@@ -1051,8 +982,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1051 | } | 982 | } |
1052 | set_bit(Faulty, &rdev->flags); | 983 | set_bit(Faulty, &rdev->flags); |
1053 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 984 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
1054 | printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" | 985 | printk(KERN_ALERT |
1055 | KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n", | 986 | "md/raid10:%s: Disk failure on %s, disabling device.\n" |
987 | "md/raid10:%s: Operation continuing on %d devices.\n", | ||
1056 | mdname(mddev), bdevname(rdev->bdev, b), | 988 | mdname(mddev), bdevname(rdev->bdev, b), |
1057 | mdname(mddev), conf->raid_disks - mddev->degraded); | 989 | mdname(mddev), conf->raid_disks - mddev->degraded); |
1058 | } | 990 | } |
@@ -1229,7 +1161,7 @@ static int raid10_remove_disk(mddev_t *mddev, int number) | |||
1229 | p->rdev = rdev; | 1161 | p->rdev = rdev; |
1230 | goto abort; | 1162 | goto abort; |
1231 | } | 1163 | } |
1232 | md_integrity_register(mddev); | 1164 | err = md_integrity_register(mddev); |
1233 | } | 1165 | } |
1234 | abort: | 1166 | abort: |
1235 | 1167 | ||
@@ -1505,40 +1437,33 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1505 | int max_read_errors = atomic_read(&mddev->max_corr_read_errors); | 1437 | int max_read_errors = atomic_read(&mddev->max_corr_read_errors); |
1506 | int d = r10_bio->devs[r10_bio->read_slot].devnum; | 1438 | int d = r10_bio->devs[r10_bio->read_slot].devnum; |
1507 | 1439 | ||
1508 | rcu_read_lock(); | 1440 | /* still own a reference to this rdev, so it cannot |
1509 | rdev = rcu_dereference(conf->mirrors[d].rdev); | 1441 | * have been cleared recently. |
1510 | if (rdev) { /* If rdev is not NULL */ | 1442 | */ |
1511 | char b[BDEVNAME_SIZE]; | 1443 | rdev = conf->mirrors[d].rdev; |
1512 | int cur_read_error_count = 0; | ||
1513 | 1444 | ||
1514 | bdevname(rdev->bdev, b); | 1445 | if (test_bit(Faulty, &rdev->flags)) |
1446 | /* drive has already been failed, just ignore any | ||
1447 | more fix_read_error() attempts */ | ||
1448 | return; | ||
1515 | 1449 | ||
1516 | if (test_bit(Faulty, &rdev->flags)) { | 1450 | check_decay_read_errors(mddev, rdev); |
1517 | rcu_read_unlock(); | 1451 | atomic_inc(&rdev->read_errors); |
1518 | /* drive has already been failed, just ignore any | 1452 | if (atomic_read(&rdev->read_errors) > max_read_errors) { |
1519 | more fix_read_error() attempts */ | 1453 | char b[BDEVNAME_SIZE]; |
1520 | return; | 1454 | bdevname(rdev->bdev, b); |
1521 | } | ||
1522 | 1455 | ||
1523 | check_decay_read_errors(mddev, rdev); | 1456 | printk(KERN_NOTICE |
1524 | atomic_inc(&rdev->read_errors); | 1457 | "md/raid10:%s: %s: Raid device exceeded " |
1525 | cur_read_error_count = atomic_read(&rdev->read_errors); | 1458 | "read_error threshold [cur %d:max %d]\n", |
1526 | if (cur_read_error_count > max_read_errors) { | 1459 | mdname(mddev), b, |
1527 | rcu_read_unlock(); | 1460 | atomic_read(&rdev->read_errors), max_read_errors); |
1528 | printk(KERN_NOTICE | 1461 | printk(KERN_NOTICE |
1529 | "md/raid10:%s: %s: Raid device exceeded " | 1462 | "md/raid10:%s: %s: Failing raid device\n", |
1530 | "read_error threshold " | 1463 | mdname(mddev), b); |
1531 | "[cur %d:max %d]\n", | 1464 | md_error(mddev, conf->mirrors[d].rdev); |
1532 | mdname(mddev), | 1465 | return; |
1533 | b, cur_read_error_count, max_read_errors); | ||
1534 | printk(KERN_NOTICE | ||
1535 | "md/raid10:%s: %s: Failing raid " | ||
1536 | "device\n", mdname(mddev), b); | ||
1537 | md_error(mddev, conf->mirrors[d].rdev); | ||
1538 | return; | ||
1539 | } | ||
1540 | } | 1466 | } |
1541 | rcu_read_unlock(); | ||
1542 | 1467 | ||
1543 | while(sectors) { | 1468 | while(sectors) { |
1544 | int s = sectors; | 1469 | int s = sectors; |
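Editor's note on the restructured error path above: each correctable read error bumps a per-device counter that also decays over time (check_decay_read_errors), and the device is failed outright once the counter passes max_corr_read_errors. The userspace model below illustrates only that bookkeeping; the halve-per-period decay is an illustrative assumption, not a statement of the kernel's exact policy, and the names are invented for the sketch.

/*
 * Hedged sketch (userspace model): decaying read-error counter with a
 * hard failure threshold.
 */
#include <stdio.h>

struct dev_errs {
    int read_errors;
    long last_seen;     /* "time" of the previous error, in periods */
};

static int note_read_error(struct dev_errs *d, long now, int max_errors)
{
    /* decay: halve once per elapsed period (assumed policy) */
    for (long t = d->last_seen; t < now; t++)
        d->read_errors /= 2;
    d->last_seen = now;

    d->read_errors++;
    if (d->read_errors > max_errors) {
        printf("exceeded read_error threshold [cur %d:max %d], failing device\n",
               d->read_errors, max_errors);
        return -1;
    }
    printf("read error corrected, count now %d\n", d->read_errors);
    return 0;
}

int main(void)
{
    struct dev_errs d = { 0, 0 };

    for (int i = 0; i < 4; i++)
        note_read_error(&d, 0, 3);   /* a burst of errors trips the limit */
    return 0;
}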
@@ -1557,11 +1482,11 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1557 | test_bit(In_sync, &rdev->flags)) { | 1482 | test_bit(In_sync, &rdev->flags)) { |
1558 | atomic_inc(&rdev->nr_pending); | 1483 | atomic_inc(&rdev->nr_pending); |
1559 | rcu_read_unlock(); | 1484 | rcu_read_unlock(); |
1560 | success = sync_page_io(rdev->bdev, | 1485 | success = sync_page_io(rdev, |
1561 | r10_bio->devs[sl].addr + | 1486 | r10_bio->devs[sl].addr + |
1562 | sect + rdev->data_offset, | 1487 | sect, |
1563 | s<<9, | 1488 | s<<9, |
1564 | conf->tmppage, READ); | 1489 | conf->tmppage, READ, false); |
1565 | rdev_dec_pending(rdev, mddev); | 1490 | rdev_dec_pending(rdev, mddev); |
1566 | rcu_read_lock(); | 1491 | rcu_read_lock(); |
1567 | if (success) | 1492 | if (success) |
@@ -1596,10 +1521,10 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1596 | atomic_inc(&rdev->nr_pending); | 1521 | atomic_inc(&rdev->nr_pending); |
1597 | rcu_read_unlock(); | 1522 | rcu_read_unlock(); |
1598 | atomic_add(s, &rdev->corrected_errors); | 1523 | atomic_add(s, &rdev->corrected_errors); |
1599 | if (sync_page_io(rdev->bdev, | 1524 | if (sync_page_io(rdev, |
1600 | r10_bio->devs[sl].addr + | 1525 | r10_bio->devs[sl].addr + |
1601 | sect + rdev->data_offset, | 1526 | sect, |
1602 | s<<9, conf->tmppage, WRITE) | 1527 | s<<9, conf->tmppage, WRITE, false) |
1603 | == 0) { | 1528 | == 0) { |
1604 | /* Well, this device is dead */ | 1529 | /* Well, this device is dead */ |
1605 | printk(KERN_NOTICE | 1530 | printk(KERN_NOTICE |
@@ -1607,8 +1532,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1607 | "write failed" | 1532 | "write failed" |
1608 | " (%d sectors at %llu on %s)\n", | 1533 | " (%d sectors at %llu on %s)\n", |
1609 | mdname(mddev), s, | 1534 | mdname(mddev), s, |
1610 | (unsigned long long)(sect+ | 1535 | (unsigned long long)( |
1611 | rdev->data_offset), | 1536 | sect + rdev->data_offset), |
1612 | bdevname(rdev->bdev, b)); | 1537 | bdevname(rdev->bdev, b)); |
1613 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " | 1538 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " |
1614 | "drive\n", | 1539 | "drive\n", |
@@ -1633,19 +1558,19 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1633 | char b[BDEVNAME_SIZE]; | 1558 | char b[BDEVNAME_SIZE]; |
1634 | atomic_inc(&rdev->nr_pending); | 1559 | atomic_inc(&rdev->nr_pending); |
1635 | rcu_read_unlock(); | 1560 | rcu_read_unlock(); |
1636 | if (sync_page_io(rdev->bdev, | 1561 | if (sync_page_io(rdev, |
1637 | r10_bio->devs[sl].addr + | 1562 | r10_bio->devs[sl].addr + |
1638 | sect + rdev->data_offset, | 1563 | sect, |
1639 | s<<9, conf->tmppage, | 1564 | s<<9, conf->tmppage, |
1640 | READ) == 0) { | 1565 | READ, false) == 0) { |
1641 | /* Well, this device is dead */ | 1566 | /* Well, this device is dead */ |
1642 | printk(KERN_NOTICE | 1567 | printk(KERN_NOTICE |
1643 | "md/raid10:%s: unable to read back " | 1568 | "md/raid10:%s: unable to read back " |
1644 | "corrected sectors" | 1569 | "corrected sectors" |
1645 | " (%d sectors at %llu on %s)\n", | 1570 | " (%d sectors at %llu on %s)\n", |
1646 | mdname(mddev), s, | 1571 | mdname(mddev), s, |
1647 | (unsigned long long)(sect+ | 1572 | (unsigned long long)( |
1648 | rdev->data_offset), | 1573 | sect + rdev->data_offset), |
1649 | bdevname(rdev->bdev, b)); | 1574 | bdevname(rdev->bdev, b)); |
1650 | printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n", | 1575 | printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n", |
1651 | mdname(mddev), | 1576 | mdname(mddev), |
@@ -1657,8 +1582,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1657 | "md/raid10:%s: read error corrected" | 1582 | "md/raid10:%s: read error corrected" |
1658 | " (%d sectors at %llu on %s)\n", | 1583 | " (%d sectors at %llu on %s)\n", |
1659 | mdname(mddev), s, | 1584 | mdname(mddev), s, |
1660 | (unsigned long long)(sect+ | 1585 | (unsigned long long)( |
1661 | rdev->data_offset), | 1586 | sect + rdev->data_offset), |
1662 | bdevname(rdev->bdev, b)); | 1587 | bdevname(rdev->bdev, b)); |
1663 | } | 1588 | } |
1664 | 1589 | ||
@@ -1680,15 +1605,16 @@ static void raid10d(mddev_t *mddev) | |||
1680 | unsigned long flags; | 1605 | unsigned long flags; |
1681 | conf_t *conf = mddev->private; | 1606 | conf_t *conf = mddev->private; |
1682 | struct list_head *head = &conf->retry_list; | 1607 | struct list_head *head = &conf->retry_list; |
1683 | int unplug=0; | ||
1684 | mdk_rdev_t *rdev; | 1608 | mdk_rdev_t *rdev; |
1609 | struct blk_plug plug; | ||
1685 | 1610 | ||
1686 | md_check_recovery(mddev); | 1611 | md_check_recovery(mddev); |
1687 | 1612 | ||
1613 | blk_start_plug(&plug); | ||
1688 | for (;;) { | 1614 | for (;;) { |
1689 | char b[BDEVNAME_SIZE]; | 1615 | char b[BDEVNAME_SIZE]; |
1690 | 1616 | ||
1691 | unplug += flush_pending_writes(conf); | 1617 | flush_pending_writes(conf); |
1692 | 1618 | ||
1693 | spin_lock_irqsave(&conf->device_lock, flags); | 1619 | spin_lock_irqsave(&conf->device_lock, flags); |
1694 | if (list_empty(head)) { | 1620 | if (list_empty(head)) { |
@@ -1702,14 +1628,13 @@ static void raid10d(mddev_t *mddev) | |||
1702 | 1628 | ||
1703 | mddev = r10_bio->mddev; | 1629 | mddev = r10_bio->mddev; |
1704 | conf = mddev->private; | 1630 | conf = mddev->private; |
1705 | if (test_bit(R10BIO_IsSync, &r10_bio->state)) { | 1631 | if (test_bit(R10BIO_IsSync, &r10_bio->state)) |
1706 | sync_request_write(mddev, r10_bio); | 1632 | sync_request_write(mddev, r10_bio); |
1707 | unplug = 1; | 1633 | else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) |
1708 | } else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) { | ||
1709 | recovery_request_write(mddev, r10_bio); | 1634 | recovery_request_write(mddev, r10_bio); |
1710 | unplug = 1; | 1635 | else { |
1711 | } else { | 1636 | int slot = r10_bio->read_slot; |
1712 | int mirror; | 1637 | int mirror = r10_bio->devs[slot].devnum; |
1713 | /* we got a read error. Maybe the drive is bad. Maybe just | 1638 | /* we got a read error. Maybe the drive is bad. Maybe just |
1714 | * the block and we can fix it. | 1639 | * the block and we can fix it. |
1715 | * We freeze all other IO, and try reading the block from | 1640 | * We freeze all other IO, and try reading the block from |
@@ -1723,9 +1648,10 @@ static void raid10d(mddev_t *mddev) | |||
1723 | fix_read_error(conf, mddev, r10_bio); | 1648 | fix_read_error(conf, mddev, r10_bio); |
1724 | unfreeze_array(conf); | 1649 | unfreeze_array(conf); |
1725 | } | 1650 | } |
1651 | rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); | ||
1726 | 1652 | ||
1727 | bio = r10_bio->devs[r10_bio->read_slot].bio; | 1653 | bio = r10_bio->devs[slot].bio; |
1728 | r10_bio->devs[r10_bio->read_slot].bio = | 1654 | r10_bio->devs[slot].bio = |
1729 | mddev->ro ? IO_BLOCKED : NULL; | 1655 | mddev->ro ? IO_BLOCKED : NULL; |
1730 | mirror = read_balance(conf, r10_bio); | 1656 | mirror = read_balance(conf, r10_bio); |
1731 | if (mirror == -1) { | 1657 | if (mirror == -1) { |
@@ -1739,6 +1665,7 @@ static void raid10d(mddev_t *mddev) | |||
1739 | } else { | 1665 | } else { |
1740 | const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); | 1666 | const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); |
1741 | bio_put(bio); | 1667 | bio_put(bio); |
1668 | slot = r10_bio->read_slot; | ||
1742 | rdev = conf->mirrors[mirror].rdev; | 1669 | rdev = conf->mirrors[mirror].rdev; |
1743 | if (printk_ratelimit()) | 1670 | if (printk_ratelimit()) |
1744 | printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to" | 1671 | printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to" |
@@ -1746,22 +1673,21 @@ static void raid10d(mddev_t *mddev) | |||
1746 | mdname(mddev), | 1673 | mdname(mddev), |
1747 | bdevname(rdev->bdev,b), | 1674 | bdevname(rdev->bdev,b), |
1748 | (unsigned long long)r10_bio->sector); | 1675 | (unsigned long long)r10_bio->sector); |
1749 | bio = bio_clone(r10_bio->master_bio, GFP_NOIO); | 1676 | bio = bio_clone_mddev(r10_bio->master_bio, |
1750 | r10_bio->devs[r10_bio->read_slot].bio = bio; | 1677 | GFP_NOIO, mddev); |
1751 | bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr | 1678 | r10_bio->devs[slot].bio = bio; |
1679 | bio->bi_sector = r10_bio->devs[slot].addr | ||
1752 | + rdev->data_offset; | 1680 | + rdev->data_offset; |
1753 | bio->bi_bdev = rdev->bdev; | 1681 | bio->bi_bdev = rdev->bdev; |
1754 | bio->bi_rw = READ | do_sync; | 1682 | bio->bi_rw = READ | do_sync; |
1755 | bio->bi_private = r10_bio; | 1683 | bio->bi_private = r10_bio; |
1756 | bio->bi_end_io = raid10_end_read_request; | 1684 | bio->bi_end_io = raid10_end_read_request; |
1757 | unplug = 1; | ||
1758 | generic_make_request(bio); | 1685 | generic_make_request(bio); |
1759 | } | 1686 | } |
1760 | } | 1687 | } |
1761 | cond_resched(); | 1688 | cond_resched(); |
1762 | } | 1689 | } |
1763 | if (unplug) | 1690 | blk_finish_plug(&plug); |
1764 | unplug_slaves(mddev); | ||
1765 | } | 1691 | } |
1766 | 1692 | ||
1767 | 1693 | ||
@@ -1810,16 +1736,16 @@ static int init_resync(conf_t *conf) | |||
1810 | * | 1736 | * |
1811 | */ | 1737 | */ |
1812 | 1738 | ||
1813 | static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) | 1739 | static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, |
1740 | int *skipped, int go_faster) | ||
1814 | { | 1741 | { |
1815 | conf_t *conf = mddev->private; | 1742 | conf_t *conf = mddev->private; |
1816 | r10bio_t *r10_bio; | 1743 | r10bio_t *r10_bio; |
1817 | struct bio *biolist = NULL, *bio; | 1744 | struct bio *biolist = NULL, *bio; |
1818 | sector_t max_sector, nr_sectors; | 1745 | sector_t max_sector, nr_sectors; |
1819 | int disk; | ||
1820 | int i; | 1746 | int i; |
1821 | int max_sync; | 1747 | int max_sync; |
1822 | int sync_blocks; | 1748 | sector_t sync_blocks; |
1823 | 1749 | ||
1824 | sector_t sectors_skipped = 0; | 1750 | sector_t sectors_skipped = 0; |
1825 | int chunks_skipped = 0; | 1751 | int chunks_skipped = 0; |
@@ -1905,108 +1831,114 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1905 | int j, k; | 1831 | int j, k; |
1906 | r10_bio = NULL; | 1832 | r10_bio = NULL; |
1907 | 1833 | ||
1908 | for (i=0 ; i<conf->raid_disks; i++) | 1834 | for (i=0 ; i<conf->raid_disks; i++) { |
1909 | if (conf->mirrors[i].rdev && | 1835 | int still_degraded; |
1910 | !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) { | 1836 | r10bio_t *rb2; |
1911 | int still_degraded = 0; | 1837 | sector_t sect; |
1912 | /* want to reconstruct this device */ | 1838 | int must_sync; |
1913 | r10bio_t *rb2 = r10_bio; | ||
1914 | sector_t sect = raid10_find_virt(conf, sector_nr, i); | ||
1915 | int must_sync; | ||
1916 | /* Unless we are doing a full sync, we only need | ||
1917 | * to recover the block if it is set in the bitmap | ||
1918 | */ | ||
1919 | must_sync = bitmap_start_sync(mddev->bitmap, sect, | ||
1920 | &sync_blocks, 1); | ||
1921 | if (sync_blocks < max_sync) | ||
1922 | max_sync = sync_blocks; | ||
1923 | if (!must_sync && | ||
1924 | !conf->fullsync) { | ||
1925 | /* yep, skip the sync_blocks here, but don't assume | ||
1926 | * that there will never be anything to do here | ||
1927 | */ | ||
1928 | chunks_skipped = -1; | ||
1929 | continue; | ||
1930 | } | ||
1931 | 1839 | ||
1932 | r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); | 1840 | if (conf->mirrors[i].rdev == NULL || |
1933 | raise_barrier(conf, rb2 != NULL); | 1841 | test_bit(In_sync, &conf->mirrors[i].rdev->flags)) |
1934 | atomic_set(&r10_bio->remaining, 0); | 1842 | continue; |
1935 | 1843 | ||
1936 | r10_bio->master_bio = (struct bio*)rb2; | 1844 | still_degraded = 0; |
1937 | if (rb2) | 1845 | /* want to reconstruct this device */ |
1938 | atomic_inc(&rb2->remaining); | 1846 | rb2 = r10_bio; |
1939 | r10_bio->mddev = mddev; | 1847 | sect = raid10_find_virt(conf, sector_nr, i); |
1940 | set_bit(R10BIO_IsRecover, &r10_bio->state); | 1848 | /* Unless we are doing a full sync, we only need |
1941 | r10_bio->sector = sect; | 1849 | * to recover the block if it is set in the bitmap |
1850 | */ | ||
1851 | must_sync = bitmap_start_sync(mddev->bitmap, sect, | ||
1852 | &sync_blocks, 1); | ||
1853 | if (sync_blocks < max_sync) | ||
1854 | max_sync = sync_blocks; | ||
1855 | if (!must_sync && | ||
1856 | !conf->fullsync) { | ||
1857 | /* yep, skip the sync_blocks here, but don't assume | ||
1858 | * that there will never be anything to do here | ||
1859 | */ | ||
1860 | chunks_skipped = -1; | ||
1861 | continue; | ||
1862 | } | ||
1942 | 1863 | ||
1943 | raid10_find_phys(conf, r10_bio); | 1864 | r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); |
1865 | raise_barrier(conf, rb2 != NULL); | ||
1866 | atomic_set(&r10_bio->remaining, 0); | ||
1944 | 1867 | ||
1945 | /* Need to check if the array will still be | 1868 | r10_bio->master_bio = (struct bio*)rb2; |
1946 | * degraded | 1869 | if (rb2) |
1947 | */ | 1870 | atomic_inc(&rb2->remaining); |
1948 | for (j=0; j<conf->raid_disks; j++) | 1871 | r10_bio->mddev = mddev; |
1949 | if (conf->mirrors[j].rdev == NULL || | 1872 | set_bit(R10BIO_IsRecover, &r10_bio->state); |
1950 | test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { | 1873 | r10_bio->sector = sect; |
1951 | still_degraded = 1; | ||
1952 | break; | ||
1953 | } | ||
1954 | |||
1955 | must_sync = bitmap_start_sync(mddev->bitmap, sect, | ||
1956 | &sync_blocks, still_degraded); | ||
1957 | |||
1958 | for (j=0; j<conf->copies;j++) { | ||
1959 | int d = r10_bio->devs[j].devnum; | ||
1960 | if (conf->mirrors[d].rdev && | ||
1961 | test_bit(In_sync, &conf->mirrors[d].rdev->flags)) { | ||
1962 | /* This is where we read from */ | ||
1963 | bio = r10_bio->devs[0].bio; | ||
1964 | bio->bi_next = biolist; | ||
1965 | biolist = bio; | ||
1966 | bio->bi_private = r10_bio; | ||
1967 | bio->bi_end_io = end_sync_read; | ||
1968 | bio->bi_rw = READ; | ||
1969 | bio->bi_sector = r10_bio->devs[j].addr + | ||
1970 | conf->mirrors[d].rdev->data_offset; | ||
1971 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; | ||
1972 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | ||
1973 | atomic_inc(&r10_bio->remaining); | ||
1974 | /* and we write to 'i' */ | ||
1975 | |||
1976 | for (k=0; k<conf->copies; k++) | ||
1977 | if (r10_bio->devs[k].devnum == i) | ||
1978 | break; | ||
1979 | BUG_ON(k == conf->copies); | ||
1980 | bio = r10_bio->devs[1].bio; | ||
1981 | bio->bi_next = biolist; | ||
1982 | biolist = bio; | ||
1983 | bio->bi_private = r10_bio; | ||
1984 | bio->bi_end_io = end_sync_write; | ||
1985 | bio->bi_rw = WRITE; | ||
1986 | bio->bi_sector = r10_bio->devs[k].addr + | ||
1987 | conf->mirrors[i].rdev->data_offset; | ||
1988 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
1989 | |||
1990 | r10_bio->devs[0].devnum = d; | ||
1991 | r10_bio->devs[1].devnum = i; | ||
1992 | 1874 | ||
1993 | break; | 1875 | raid10_find_phys(conf, r10_bio); |
1994 | } | 1876 | |
1995 | } | 1877 | /* Need to check if the array will still be |
1996 | if (j == conf->copies) { | 1878 | * degraded |
1997 | /* Cannot recover, so abort the recovery */ | 1879 | */ |
1998 | put_buf(r10_bio); | 1880 | for (j=0; j<conf->raid_disks; j++) |
1999 | if (rb2) | 1881 | if (conf->mirrors[j].rdev == NULL || |
2000 | atomic_dec(&rb2->remaining); | 1882 | test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { |
2001 | r10_bio = rb2; | 1883 | still_degraded = 1; |
2002 | if (!test_and_set_bit(MD_RECOVERY_INTR, | ||
2003 | &mddev->recovery)) | ||
2004 | printk(KERN_INFO "md/raid10:%s: insufficient " | ||
2005 | "working devices for recovery.\n", | ||
2006 | mdname(mddev)); | ||
2007 | break; | 1884 | break; |
2008 | } | 1885 | } |
1886 | |||
1887 | must_sync = bitmap_start_sync(mddev->bitmap, sect, | ||
1888 | &sync_blocks, still_degraded); | ||
1889 | |||
1890 | for (j=0; j<conf->copies;j++) { | ||
1891 | int d = r10_bio->devs[j].devnum; | ||
1892 | if (!conf->mirrors[d].rdev || | ||
1893 | !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) | ||
1894 | continue; | ||
1895 | /* This is where we read from */ | ||
1896 | bio = r10_bio->devs[0].bio; | ||
1897 | bio->bi_next = biolist; | ||
1898 | biolist = bio; | ||
1899 | bio->bi_private = r10_bio; | ||
1900 | bio->bi_end_io = end_sync_read; | ||
1901 | bio->bi_rw = READ; | ||
1902 | bio->bi_sector = r10_bio->devs[j].addr + | ||
1903 | conf->mirrors[d].rdev->data_offset; | ||
1904 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; | ||
1905 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | ||
1906 | atomic_inc(&r10_bio->remaining); | ||
1907 | /* and we write to 'i' */ | ||
1908 | |||
1909 | for (k=0; k<conf->copies; k++) | ||
1910 | if (r10_bio->devs[k].devnum == i) | ||
1911 | break; | ||
1912 | BUG_ON(k == conf->copies); | ||
1913 | bio = r10_bio->devs[1].bio; | ||
1914 | bio->bi_next = biolist; | ||
1915 | biolist = bio; | ||
1916 | bio->bi_private = r10_bio; | ||
1917 | bio->bi_end_io = end_sync_write; | ||
1918 | bio->bi_rw = WRITE; | ||
1919 | bio->bi_sector = r10_bio->devs[k].addr + | ||
1920 | conf->mirrors[i].rdev->data_offset; | ||
1921 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
1922 | |||
1923 | r10_bio->devs[0].devnum = d; | ||
1924 | r10_bio->devs[1].devnum = i; | ||
1925 | |||
1926 | break; | ||
1927 | } | ||
1928 | if (j == conf->copies) { | ||
1929 | /* Cannot recover, so abort the recovery */ | ||
1930 | put_buf(r10_bio); | ||
1931 | if (rb2) | ||
1932 | atomic_dec(&rb2->remaining); | ||
1933 | r10_bio = rb2; | ||
1934 | if (!test_and_set_bit(MD_RECOVERY_INTR, | ||
1935 | &mddev->recovery)) | ||
1936 | printk(KERN_INFO "md/raid10:%s: insufficient " | ||
1937 | "working devices for recovery.\n", | ||
1938 | mdname(mddev)); | ||
1939 | break; | ||
2009 | } | 1940 | } |
1941 | } | ||
2010 | if (biolist == NULL) { | 1942 | if (biolist == NULL) { |
2011 | while (r10_bio) { | 1943 | while (r10_bio) { |
2012 | r10bio_t *rb2 = r10_bio; | 1944 | r10bio_t *rb2 = r10_bio; |
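Editor's note on the restructured recovery loop above: for a device being rebuilt, the code looks through the copies of the virtual sector, reads from a copy that lives on an in-sync device, and writes to the copy that lives on the device under reconstruction; if no in-sync source exists, recovery is aborted. A trivial userspace model of that pairing, with names invented for the sketch:

/*
 * Hedged sketch (userspace model): pick the read source and write
 * target for one recovered sector.
 */
#include <stdio.h>

struct copy {
    int devnum;
    int in_sync;
};

int main(void)
{
    const int rebuilding = 2;                    /* device being recovered */
    struct copy copies[2] = {
        { .devnum = 0, .in_sync = 1 },
        { .devnum = 2, .in_sync = 0 },           /* the new/empty device */
    };
    int read_from = -1, write_to = -1;

    for (int j = 0; j < 2; j++) {
        if (read_from < 0 && copies[j].in_sync)
            read_from = copies[j].devnum;        /* where we read from */
        if (copies[j].devnum == rebuilding)
            write_to = copies[j].devnum;         /* and we write to 'i' */
    }

    if (read_from < 0)
        printf("insufficient working devices for recovery\n");
    else
        printf("recover by reading dev %d and writing dev %d\n",
               read_from, write_to);
    return 0;
}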
@@ -2024,7 +1956,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
2024 | 1956 | ||
2025 | if (!bitmap_start_sync(mddev->bitmap, sector_nr, | 1957 | if (!bitmap_start_sync(mddev->bitmap, sector_nr, |
2026 | &sync_blocks, mddev->degraded) && | 1958 | &sync_blocks, mddev->degraded) && |
2027 | !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { | 1959 | !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, |
1960 | &mddev->recovery)) { | ||
2028 | /* We can skip this block */ | 1961 | /* We can skip this block */ |
2029 | *skipped = 1; | 1962 | *skipped = 1; |
2030 | return sync_blocks + sectors_skipped; | 1963 | return sync_blocks + sectors_skipped; |
@@ -2069,7 +2002,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
2069 | for (i=0; i<conf->copies; i++) { | 2002 | for (i=0; i<conf->copies; i++) { |
2070 | int d = r10_bio->devs[i].devnum; | 2003 | int d = r10_bio->devs[i].devnum; |
2071 | if (r10_bio->devs[i].bio->bi_end_io) | 2004 | if (r10_bio->devs[i].bio->bi_end_io) |
2072 | rdev_dec_pending(conf->mirrors[d].rdev, mddev); | 2005 | rdev_dec_pending(conf->mirrors[d].rdev, |
2006 | mddev); | ||
2073 | } | 2007 | } |
2074 | put_buf(r10_bio); | 2008 | put_buf(r10_bio); |
2075 | biolist = NULL; | 2009 | biolist = NULL; |
@@ -2094,26 +2028,27 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
2094 | do { | 2028 | do { |
2095 | struct page *page; | 2029 | struct page *page; |
2096 | int len = PAGE_SIZE; | 2030 | int len = PAGE_SIZE; |
2097 | disk = 0; | ||
2098 | if (sector_nr + (len>>9) > max_sector) | 2031 | if (sector_nr + (len>>9) > max_sector) |
2099 | len = (max_sector - sector_nr) << 9; | 2032 | len = (max_sector - sector_nr) << 9; |
2100 | if (len == 0) | 2033 | if (len == 0) |
2101 | break; | 2034 | break; |
2102 | for (bio= biolist ; bio ; bio=bio->bi_next) { | 2035 | for (bio= biolist ; bio ; bio=bio->bi_next) { |
2036 | struct bio *bio2; | ||
2103 | page = bio->bi_io_vec[bio->bi_vcnt].bv_page; | 2037 | page = bio->bi_io_vec[bio->bi_vcnt].bv_page; |
2104 | if (bio_add_page(bio, page, len, 0) == 0) { | 2038 | if (bio_add_page(bio, page, len, 0)) |
2105 | /* stop here */ | 2039 | continue; |
2106 | struct bio *bio2; | 2040 | |
2107 | bio->bi_io_vec[bio->bi_vcnt].bv_page = page; | 2041 | /* stop here */ |
2108 | for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) { | 2042 | bio->bi_io_vec[bio->bi_vcnt].bv_page = page; |
2109 | /* remove last page from this bio */ | 2043 | for (bio2 = biolist; |
2110 | bio2->bi_vcnt--; | 2044 | bio2 && bio2 != bio; |
2111 | bio2->bi_size -= len; | 2045 | bio2 = bio2->bi_next) { |
2112 | bio2->bi_flags &= ~(1<< BIO_SEG_VALID); | 2046 | /* remove last page from this bio */ |
2113 | } | 2047 | bio2->bi_vcnt--; |
2114 | goto bio_full; | 2048 | bio2->bi_size -= len; |
2049 | bio2->bi_flags &= ~(1<< BIO_SEG_VALID); | ||
2115 | } | 2050 | } |
2116 | disk = i; | 2051 | goto bio_full; |
2117 | } | 2052 | } |
2118 | nr_sectors += len>>9; | 2053 | nr_sectors += len>>9; |
2119 | sector_nr += len>>9; | 2054 | sector_nr += len>>9; |
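Editor's note on the reworked page-filling loop above: a page is added to every bio in the resync list for this window; if any bio refuses it (bio_add_page() returning 0), the page is stripped back off the bios that already took it and the window is closed at its current size (the bio_full path). A userspace model of that all-or-nothing append, with names invented for the sketch:

/*
 * Hedged sketch (userspace model): grow all buffers in lockstep and
 * roll back the current round if any one of them is full.
 */
#include <stdio.h>

#define MAX_PAGES 4

struct fake_bio {
    int vcnt;        /* pages currently attached */
    int limit;       /* device-imposed limit */
};

static int add_page(struct fake_bio *b)
{
    if (b->vcnt >= b->limit)
        return 0;    /* refused, like bio_add_page() returning 0 */
    b->vcnt++;
    return 1;
}

int main(void)
{
    struct fake_bio bios[3] = { { 0, 4 }, { 0, 2 }, { 0, 4 } };
    int pages = 0;

    for (int round = 0; round < MAX_PAGES; round++) {
        for (int i = 0; i < 3; i++) {
            if (add_page(&bios[i]))
                continue;
            /* stop here: undo this round on the bios already grown */
            for (int j = 0; j < i; j++)
                bios[j].vcnt--;
            goto bio_full;
        }
        pages++;
    }
bio_full:
    printf("window holds %d pages; vcnts = %d %d %d\n",
           pages, bios[0].vcnt, bios[1].vcnt, bios[2].vcnt);
    return 0;
}

With limits of 4, 2 and 4 pages this stops after two rounds and leaves every bio at two pages, which mirrors the kernel loop keeping all resync bios the same length.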
@@ -2302,8 +2237,6 @@ static int run(mddev_t *mddev) | |||
2302 | if (!conf) | 2237 | if (!conf) |
2303 | goto out; | 2238 | goto out; |
2304 | 2239 | ||
2305 | mddev->queue->queue_lock = &conf->device_lock; | ||
2306 | |||
2307 | mddev->thread = conf->thread; | 2240 | mddev->thread = conf->thread; |
2308 | conf->thread = NULL; | 2241 | conf->thread = NULL; |
2309 | 2242 | ||
@@ -2374,7 +2307,6 @@ static int run(mddev_t *mddev) | |||
2374 | md_set_array_sectors(mddev, size); | 2307 | md_set_array_sectors(mddev, size); |
2375 | mddev->resync_max_sectors = size; | 2308 | mddev->resync_max_sectors = size; |
2376 | 2309 | ||
2377 | mddev->queue->unplug_fn = raid10_unplug; | ||
2378 | mddev->queue->backing_dev_info.congested_fn = raid10_congested; | 2310 | mddev->queue->backing_dev_info.congested_fn = raid10_congested; |
2379 | mddev->queue->backing_dev_info.congested_data = mddev; | 2311 | mddev->queue->backing_dev_info.congested_data = mddev; |
2380 | 2312 | ||
@@ -2392,17 +2324,20 @@ static int run(mddev_t *mddev) | |||
2392 | 2324 | ||
2393 | if (conf->near_copies < conf->raid_disks) | 2325 | if (conf->near_copies < conf->raid_disks) |
2394 | blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); | 2326 | blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); |
2395 | md_integrity_register(mddev); | 2327 | |
2328 | if (md_integrity_register(mddev)) | ||
2329 | goto out_free_conf; | ||
2330 | |||
2396 | return 0; | 2331 | return 0; |
2397 | 2332 | ||
2398 | out_free_conf: | 2333 | out_free_conf: |
2334 | md_unregister_thread(mddev->thread); | ||
2399 | if (conf->r10bio_pool) | 2335 | if (conf->r10bio_pool) |
2400 | mempool_destroy(conf->r10bio_pool); | 2336 | mempool_destroy(conf->r10bio_pool); |
2401 | safe_put_page(conf->tmppage); | 2337 | safe_put_page(conf->tmppage); |
2402 | kfree(conf->mirrors); | 2338 | kfree(conf->mirrors); |
2403 | kfree(conf); | 2339 | kfree(conf); |
2404 | mddev->private = NULL; | 2340 | mddev->private = NULL; |
2405 | md_unregister_thread(mddev->thread); | ||
2406 | out: | 2341 | out: |
2407 | return -EIO; | 2342 | return -EIO; |
2408 | } | 2343 | } |
@@ -2461,11 +2396,13 @@ static void *raid10_takeover_raid0(mddev_t *mddev) | |||
2461 | mddev->recovery_cp = MaxSector; | 2396 | mddev->recovery_cp = MaxSector; |
2462 | 2397 | ||
2463 | conf = setup_conf(mddev); | 2398 | conf = setup_conf(mddev); |
2464 | if (!IS_ERR(conf)) | 2399 | if (!IS_ERR(conf)) { |
2465 | list_for_each_entry(rdev, &mddev->disks, same_set) | 2400 | list_for_each_entry(rdev, &mddev->disks, same_set) |
2466 | if (rdev->raid_disk >= 0) | 2401 | if (rdev->raid_disk >= 0) |
2467 | rdev->new_raid_disk = rdev->raid_disk * 2; | 2402 | rdev->new_raid_disk = rdev->raid_disk * 2; |
2468 | 2403 | conf->barrier = 1; | |
2404 | } | ||
2405 | |||
2469 | return conf; | 2406 | return conf; |
2470 | } | 2407 | } |
2471 | 2408 | ||
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 2316ac2e8e21..944b1104d3b4 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h | |||
@@ -17,8 +17,8 @@ struct r10_private_data_s { | |||
17 | spinlock_t device_lock; | 17 | spinlock_t device_lock; |
18 | 18 | ||
19 | /* geometry */ | 19 | /* geometry */ |
20 | int near_copies; /* number of copies layed out raid0 style */ | 20 | int near_copies; /* number of copies laid out raid0 style */ |
21 | int far_copies; /* number of copies layed out | 21 | int far_copies; /* number of copies laid out |
22 | * at large strides across drives | 22 | * at large strides across drives |
23 | */ | 23 | */ |
24 | int far_offset; /* far_copies are offset by 1 stripe | 24 | int far_offset; /* far_copies are offset by 1 stripe |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 69b0a169e43d..b72edf35ec54 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -27,12 +27,12 @@ | |||
27 | * | 27 | * |
28 | * We group bitmap updates into batches. Each batch has a number. | 28 | * We group bitmap updates into batches. Each batch has a number. |
29 | * We may write out several batches at once, but that isn't very important. | 29 | * We may write out several batches at once, but that isn't very important. |
30 | * conf->bm_write is the number of the last batch successfully written. | 30 | * conf->seq_write is the number of the last batch successfully written. |
31 | * conf->bm_flush is the number of the last batch that was closed to | 31 | * conf->seq_flush is the number of the last batch that was closed to |
32 | * new additions. | 32 | * new additions. |
33 | * When we discover that we will need to write to any block in a stripe | 33 | * When we discover that we will need to write to any block in a stripe |
34 | * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq | 34 | * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq |
35 | * the number of the batch it will be in. This is bm_flush+1. | 35 | * the number of the batch it will be in. This is seq_flush+1. |
36 | * When we are ready to do a write, if that batch hasn't been written yet, | 36 | * When we are ready to do a write, if that batch hasn't been written yet, |
37 | * we plug the array and queue the stripe for later. | 37 | * we plug the array and queue the stripe for later. |
38 | * When an unplug happens, we increment bm_flush, thus closing the current | 38 | * When an unplug happens, we increment bm_flush, thus closing the current |
@@ -129,7 +129,7 @@ static inline int raid5_dec_bi_hw_segments(struct bio *bio) | |||
129 | 129 | ||
130 | static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) | 130 | static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) |
131 | { | 131 | { |
132 | bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16); | 132 | bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16); |
133 | } | 133 | } |
134 | 134 | ||
135 | /* Find first data disk in a raid6 stripe */ | 135 | /* Find first data disk in a raid6 stripe */ |
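The one-character fix in the hunk above matters because bi_phys_segments is overloaded in raid5 as two packed 16-bit counters: with the logical || the shifted count collapses to 0 or 1 and the upper half is lost. A standalone illustration of the difference (plain C, made-up values):

    #include <stdio.h>

    int main(void)
    {
            unsigned int low = 3;   /* stand-in for raid5_bi_phys_segments(bio) */
            unsigned int cnt = 5;   /* value meant for the upper 16 bits */

            unsigned int buggy = low || (cnt << 16);  /* logical OR: just 0 or 1 */
            unsigned int fixed = low |  (cnt << 16);  /* bitwise OR: 0x50003 */

            printf("buggy=%#x fixed=%#x\n", buggy, fixed);
            return 0;
    }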
@@ -199,14 +199,12 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) | |||
199 | BUG_ON(!list_empty(&sh->lru)); | 199 | BUG_ON(!list_empty(&sh->lru)); |
200 | BUG_ON(atomic_read(&conf->active_stripes)==0); | 200 | BUG_ON(atomic_read(&conf->active_stripes)==0); |
201 | if (test_bit(STRIPE_HANDLE, &sh->state)) { | 201 | if (test_bit(STRIPE_HANDLE, &sh->state)) { |
202 | if (test_bit(STRIPE_DELAYED, &sh->state)) { | 202 | if (test_bit(STRIPE_DELAYED, &sh->state)) |
203 | list_add_tail(&sh->lru, &conf->delayed_list); | 203 | list_add_tail(&sh->lru, &conf->delayed_list); |
204 | plugger_set_plug(&conf->plug); | 204 | else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && |
205 | } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && | 205 | sh->bm_seq - conf->seq_write > 0) |
206 | sh->bm_seq - conf->seq_write > 0) { | ||
207 | list_add_tail(&sh->lru, &conf->bitmap_list); | 206 | list_add_tail(&sh->lru, &conf->bitmap_list); |
208 | plugger_set_plug(&conf->plug); | 207 | else { |
209 | } else { | ||
210 | clear_bit(STRIPE_BIT_DELAY, &sh->state); | 208 | clear_bit(STRIPE_BIT_DELAY, &sh->state); |
211 | list_add_tail(&sh->lru, &conf->handle_list); | 209 | list_add_tail(&sh->lru, &conf->handle_list); |
212 | } | 210 | } |
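The test retained in the hunk above, sh->bm_seq - conf->seq_write > 0, is the usual wraparound-tolerant way of asking "is this stripe's batch newer than the last batch written?", in the same style as the kernel's jiffies comparisons. A standalone sketch of the idiom (illustrative types, not the raid5 fields themselves):

    #include <stdio.h>

    /* "a comes after b", stated as a difference so it keeps working
     * when the counters eventually wrap around.
     */
    static int seq_after(unsigned int a, unsigned int b)
    {
            return (int)(a - b) > 0;
    }

    int main(void)
    {
            printf("%d\n", seq_after(5, 3));             /* 1: batch 5 still pending */
            printf("%d\n", seq_after(3, 5));             /* 0: batch 3 already written */
            printf("%d\n", seq_after(2, 0xfffffffeu));   /* 1: correct across the wrap */
            return 0;
    }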
@@ -433,8 +431,6 @@ static int has_failed(raid5_conf_t *conf) | |||
433 | return 0; | 431 | return 0; |
434 | } | 432 | } |
435 | 433 | ||
436 | static void unplug_slaves(mddev_t *mddev); | ||
437 | |||
438 | static struct stripe_head * | 434 | static struct stripe_head * |
439 | get_active_stripe(raid5_conf_t *conf, sector_t sector, | 435 | get_active_stripe(raid5_conf_t *conf, sector_t sector, |
440 | int previous, int noblock, int noquiesce) | 436 | int previous, int noblock, int noquiesce) |
@@ -463,8 +459,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector, | |||
463 | < (conf->max_nr_stripes *3/4) | 459 | < (conf->max_nr_stripes *3/4) |
464 | || !conf->inactive_blocked), | 460 | || !conf->inactive_blocked), |
465 | conf->device_lock, | 461 | conf->device_lock, |
466 | md_raid5_unplug_device(conf) | 462 | ); |
467 | ); | ||
468 | conf->inactive_blocked = 0; | 463 | conf->inactive_blocked = 0; |
469 | } else | 464 | } else |
470 | init_stripe(sh, sector, previous); | 465 | init_stripe(sh, sector, previous); |
@@ -506,9 +501,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
506 | int rw; | 501 | int rw; |
507 | struct bio *bi; | 502 | struct bio *bi; |
508 | mdk_rdev_t *rdev; | 503 | mdk_rdev_t *rdev; |
509 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) | 504 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { |
510 | rw = WRITE; | 505 | if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) |
511 | else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) | 506 | rw = WRITE_FUA; |
507 | else | ||
508 | rw = WRITE; | ||
509 | } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) | ||
512 | rw = READ; | 510 | rw = READ; |
513 | else | 511 | else |
514 | continue; | 512 | continue; |
@@ -516,7 +514,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
516 | bi = &sh->dev[i].req; | 514 | bi = &sh->dev[i].req; |
517 | 515 | ||
518 | bi->bi_rw = rw; | 516 | bi->bi_rw = rw; |
519 | if (rw == WRITE) | 517 | if (rw & WRITE) |
520 | bi->bi_end_io = raid5_end_write_request; | 518 | bi->bi_end_io = raid5_end_write_request; |
521 | else | 519 | else |
522 | bi->bi_end_io = raid5_end_read_request; | 520 | bi->bi_end_io = raid5_end_read_request; |
@@ -550,13 +548,13 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
550 | bi->bi_io_vec[0].bv_offset = 0; | 548 | bi->bi_io_vec[0].bv_offset = 0; |
551 | bi->bi_size = STRIPE_SIZE; | 549 | bi->bi_size = STRIPE_SIZE; |
552 | bi->bi_next = NULL; | 550 | bi->bi_next = NULL; |
553 | if (rw == WRITE && | 551 | if ((rw & WRITE) && |
554 | test_bit(R5_ReWrite, &sh->dev[i].flags)) | 552 | test_bit(R5_ReWrite, &sh->dev[i].flags)) |
555 | atomic_add(STRIPE_SECTORS, | 553 | atomic_add(STRIPE_SECTORS, |
556 | &rdev->corrected_errors); | 554 | &rdev->corrected_errors); |
557 | generic_make_request(bi); | 555 | generic_make_request(bi); |
558 | } else { | 556 | } else { |
559 | if (rw == WRITE) | 557 | if (rw & WRITE) |
560 | set_bit(STRIPE_DEGRADED, &sh->state); | 558 | set_bit(STRIPE_DEGRADED, &sh->state); |
561 | pr_debug("skip op %ld on disc %d for sector %llu\n", | 559 | pr_debug("skip op %ld on disc %d for sector %llu\n", |
562 | bi->bi_rw, i, (unsigned long long)sh->sector); | 560 | bi->bi_rw, i, (unsigned long long)sh->sector); |
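The rw == WRITE comparisons in ops_run_io() become rw & WRITE in the hunks above because rw can now be WRITE_FUA, which is WRITE plus extra request flags; an equality test would no longer recognise such a request as a write. A standalone illustration with made-up flag values (the real REQ_* values are not reproduced here):

    #include <stdio.h>

    #define XWRITE      0x1u            /* made-up stand-ins for illustration */
    #define XFUA        0x2u
    #define XWRITE_FUA  (XWRITE | XFUA)

    int main(void)
    {
            unsigned long rw = XWRITE_FUA;

            printf("rw == WRITE -> %d\n", rw == XWRITE);        /* 0: FUA write missed */
            printf("rw & WRITE  -> %d\n", (rw & XWRITE) != 0);  /* 1: still a write */
            return 0;
    }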
@@ -587,7 +585,7 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, | |||
587 | init_async_submit(&submit, flags, tx, NULL, NULL, NULL); | 585 | init_async_submit(&submit, flags, tx, NULL, NULL, NULL); |
588 | 586 | ||
589 | bio_for_each_segment(bvl, bio, i) { | 587 | bio_for_each_segment(bvl, bio, i) { |
590 | int len = bio_iovec_idx(bio, i)->bv_len; | 588 | int len = bvl->bv_len; |
591 | int clen; | 589 | int clen; |
592 | int b_offset = 0; | 590 | int b_offset = 0; |
593 | 591 | ||
@@ -603,8 +601,8 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, | |||
603 | clen = len; | 601 | clen = len; |
604 | 602 | ||
605 | if (clen > 0) { | 603 | if (clen > 0) { |
606 | b_offset += bio_iovec_idx(bio, i)->bv_offset; | 604 | b_offset += bvl->bv_offset; |
607 | bio_page = bio_iovec_idx(bio, i)->bv_page; | 605 | bio_page = bvl->bv_page; |
608 | if (frombio) | 606 | if (frombio) |
609 | tx = async_memcpy(page, bio_page, page_offset, | 607 | tx = async_memcpy(page, bio_page, page_offset, |
610 | b_offset, clen, &submit); | 608 | b_offset, clen, &submit); |
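The async_copy_data() hunk above is a simplification rather than a behaviour change: bio_for_each_segment() already hands the loop the current bio_vec, so re-deriving it with bio_iovec_idx(bio, i) was redundant. A minimal sketch of the idiom as it looked in this era of the bio API (assumed helper, not from the patch):

    #include <linux/bio.h>

    /* Sum a bio's payload by walking its segments; on each iteration
     * bvl is exactly bio_iovec_idx(bio, i).
     */
    static unsigned int bio_payload_bytes(struct bio *bio)
    {
            struct bio_vec *bvl;
            unsigned int bytes = 0;
            int i;

            bio_for_each_segment(bvl, bio, i)
                    bytes += bvl->bv_len;

            return bytes;
    }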
@@ -1031,6 +1029,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
1031 | 1029 | ||
1032 | while (wbi && wbi->bi_sector < | 1030 | while (wbi && wbi->bi_sector < |
1033 | dev->sector + STRIPE_SECTORS) { | 1031 | dev->sector + STRIPE_SECTORS) { |
1032 | if (wbi->bi_rw & REQ_FUA) | ||
1033 | set_bit(R5_WantFUA, &dev->flags); | ||
1034 | tx = async_copy_data(1, wbi, dev->page, | 1034 | tx = async_copy_data(1, wbi, dev->page, |
1035 | dev->sector, tx); | 1035 | dev->sector, tx); |
1036 | wbi = r5_next_bio(wbi, dev->sector); | 1036 | wbi = r5_next_bio(wbi, dev->sector); |
@@ -1048,15 +1048,22 @@ static void ops_complete_reconstruct(void *stripe_head_ref) | |||
1048 | int pd_idx = sh->pd_idx; | 1048 | int pd_idx = sh->pd_idx; |
1049 | int qd_idx = sh->qd_idx; | 1049 | int qd_idx = sh->qd_idx; |
1050 | int i; | 1050 | int i; |
1051 | bool fua = false; | ||
1051 | 1052 | ||
1052 | pr_debug("%s: stripe %llu\n", __func__, | 1053 | pr_debug("%s: stripe %llu\n", __func__, |
1053 | (unsigned long long)sh->sector); | 1054 | (unsigned long long)sh->sector); |
1054 | 1055 | ||
1056 | for (i = disks; i--; ) | ||
1057 | fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); | ||
1058 | |||
1055 | for (i = disks; i--; ) { | 1059 | for (i = disks; i--; ) { |
1056 | struct r5dev *dev = &sh->dev[i]; | 1060 | struct r5dev *dev = &sh->dev[i]; |
1057 | 1061 | ||
1058 | if (dev->written || i == pd_idx || i == qd_idx) | 1062 | if (dev->written || i == pd_idx || i == qd_idx) { |
1059 | set_bit(R5_UPTODATE, &dev->flags); | 1063 | set_bit(R5_UPTODATE, &dev->flags); |
1064 | if (fua) | ||
1065 | set_bit(R5_WantFUA, &dev->flags); | ||
1066 | } | ||
1060 | } | 1067 | } |
1061 | 1068 | ||
1062 | if (sh->reconstruct_state == reconstruct_state_drain_run) | 1069 | if (sh->reconstruct_state == reconstruct_state_drain_run) |
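The loop added at the top of ops_complete_reconstruct() ORs R5_WantFUA across the stripe so that, if any of the drained writes carried REQ_FUA, the freshly computed parity (and Q) blocks are written with FUA as well; otherwise the data could reach stable media while the matching parity still sat in a volatile drive cache. A trivial standalone illustration of that reduce-then-propagate step (plain C, illustrative only):

    #include <stdbool.h>
    #include <stdio.h>

    int main(void)
    {
            bool want_fua[4] = { false, true, false, false };  /* per-device flags */
            bool fua = false;

            for (int i = 0; i < 4; i++)
                    fua |= want_fua[i];     /* any FUA request taints the stripe */

            printf("write parity with FUA: %s\n", fua ? "yes" : "no");
            return 0;
    }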
@@ -1461,8 +1468,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
1461 | wait_event_lock_irq(conf->wait_for_stripe, | 1468 | wait_event_lock_irq(conf->wait_for_stripe, |
1462 | !list_empty(&conf->inactive_list), | 1469 | !list_empty(&conf->inactive_list), |
1463 | conf->device_lock, | 1470 | conf->device_lock, |
1464 | unplug_slaves(conf->mddev) | 1471 | ); |
1465 | ); | ||
1466 | osh = get_free_stripe(conf); | 1472 | osh = get_free_stripe(conf); |
1467 | spin_unlock_irq(&conf->device_lock); | 1473 | spin_unlock_irq(&conf->device_lock); |
1468 | atomic_set(&nsh->count, 1); | 1474 | atomic_set(&nsh->count, 1); |
@@ -1694,28 +1700,25 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1694 | raid5_conf_t *conf = mddev->private; | 1700 | raid5_conf_t *conf = mddev->private; |
1695 | pr_debug("raid456: error called\n"); | 1701 | pr_debug("raid456: error called\n"); |
1696 | 1702 | ||
1697 | if (!test_bit(Faulty, &rdev->flags)) { | 1703 | if (test_and_clear_bit(In_sync, &rdev->flags)) { |
1698 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 1704 | unsigned long flags; |
1699 | if (test_and_clear_bit(In_sync, &rdev->flags)) { | 1705 | spin_lock_irqsave(&conf->device_lock, flags); |
1700 | unsigned long flags; | 1706 | mddev->degraded++; |
1701 | spin_lock_irqsave(&conf->device_lock, flags); | 1707 | spin_unlock_irqrestore(&conf->device_lock, flags); |
1702 | mddev->degraded++; | 1708 | /* |
1703 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1709 | * if recovery was running, make sure it aborts. |
1704 | /* | 1710 | */ |
1705 | * if recovery was running, make sure it aborts. | 1711 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
1706 | */ | ||
1707 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
1708 | } | ||
1709 | set_bit(Faulty, &rdev->flags); | ||
1710 | printk(KERN_ALERT | ||
1711 | "md/raid:%s: Disk failure on %s, disabling device.\n" | ||
1712 | KERN_ALERT | ||
1713 | "md/raid:%s: Operation continuing on %d devices.\n", | ||
1714 | mdname(mddev), | ||
1715 | bdevname(rdev->bdev, b), | ||
1716 | mdname(mddev), | ||
1717 | conf->raid_disks - mddev->degraded); | ||
1718 | } | 1712 | } |
1713 | set_bit(Faulty, &rdev->flags); | ||
1714 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
1715 | printk(KERN_ALERT | ||
1716 | "md/raid:%s: Disk failure on %s, disabling device.\n" | ||
1717 | "md/raid:%s: Operation continuing on %d devices.\n", | ||
1718 | mdname(mddev), | ||
1719 | bdevname(rdev->bdev, b), | ||
1720 | mdname(mddev), | ||
1721 | conf->raid_disks - mddev->degraded); | ||
1719 | } | 1722 | } |
1720 | 1723 | ||
1721 | /* | 1724 | /* |
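One detail of the error() rewrite above is the printk(): the old call concatenated a second KERN_ALERT into the middle of the format string, and since adjacent string literals simply merge, that prefix became literal text inside a single message rather than marking the second line's log level. A userspace illustration of the concatenation effect (the "<1>" value is what KERN_ALERT expanded to in kernels of this era):

    #include <stdio.h>

    #define KERN_ALERT "<1>"

    int main(void)
    {
            /* Adjacent literals merge into one format string, so the second
             * prefix is printed verbatim inside the message.
             */
            printf(KERN_ALERT "Disk failure on sdX, disabling device.\n"
                   KERN_ALERT "Operation continuing on 3 devices.\n");
            return 0;
    }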
@@ -3281,7 +3284,7 @@ static void handle_stripe5(struct stripe_head *sh) | |||
3281 | 3284 | ||
3282 | if (dec_preread_active) { | 3285 | if (dec_preread_active) { |
3283 | /* We delay this until after ops_run_io so that if make_request | 3286 | /* We delay this until after ops_run_io so that if make_request |
3284 | * is waiting on a barrier, it won't continue until the writes | 3287 | * is waiting on a flush, it won't continue until the writes |
3285 | * have actually been submitted. | 3288 | * have actually been submitted. |
3286 | */ | 3289 | */ |
3287 | atomic_dec(&conf->preread_active_stripes); | 3290 | atomic_dec(&conf->preread_active_stripes); |
@@ -3583,7 +3586,7 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3583 | 3586 | ||
3584 | if (dec_preread_active) { | 3587 | if (dec_preread_active) { |
3585 | /* We delay this until after ops_run_io so that if make_request | 3588 | /* We delay this until after ops_run_io so that if make_request |
3586 | * is waiting on a barrier, it won't continue until the writes | 3589 | * is waiting on a flush, it won't continue until the writes |
3587 | * have actually been submitted. | 3590 | * have actually been submitted. |
3588 | */ | 3591 | */ |
3589 | atomic_dec(&conf->preread_active_stripes); | 3592 | atomic_dec(&conf->preread_active_stripes); |
@@ -3616,8 +3619,7 @@ static void raid5_activate_delayed(raid5_conf_t *conf) | |||
3616 | atomic_inc(&conf->preread_active_stripes); | 3619 | atomic_inc(&conf->preread_active_stripes); |
3617 | list_add_tail(&sh->lru, &conf->hold_list); | 3620 | list_add_tail(&sh->lru, &conf->hold_list); |
3618 | } | 3621 | } |
3619 | } else | 3622 | } |
3620 | plugger_set_plug(&conf->plug); | ||
3621 | } | 3623 | } |
3622 | 3624 | ||
3623 | static void activate_bit_delay(raid5_conf_t *conf) | 3625 | static void activate_bit_delay(raid5_conf_t *conf) |
@@ -3634,60 +3636,6 @@ static void activate_bit_delay(raid5_conf_t *conf) | |||
3634 | } | 3636 | } |
3635 | } | 3637 | } |
3636 | 3638 | ||
3637 | static void unplug_slaves(mddev_t *mddev) | ||
3638 | { | ||
3639 | raid5_conf_t *conf = mddev->private; | ||
3640 | int i; | ||
3641 | int devs = max(conf->raid_disks, conf->previous_raid_disks); | ||
3642 | |||
3643 | rcu_read_lock(); | ||
3644 | for (i = 0; i < devs; i++) { | ||
3645 | mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); | ||
3646 | if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { | ||
3647 | struct request_queue *r_queue = bdev_get_queue(rdev->bdev); | ||
3648 | |||
3649 | atomic_inc(&rdev->nr_pending); | ||
3650 | rcu_read_unlock(); | ||
3651 | |||
3652 | blk_unplug(r_queue); | ||
3653 | |||
3654 | rdev_dec_pending(rdev, mddev); | ||
3655 | rcu_read_lock(); | ||
3656 | } | ||
3657 | } | ||
3658 | rcu_read_unlock(); | ||
3659 | } | ||
3660 | |||
3661 | void md_raid5_unplug_device(raid5_conf_t *conf) | ||
3662 | { | ||
3663 | unsigned long flags; | ||
3664 | |||
3665 | spin_lock_irqsave(&conf->device_lock, flags); | ||
3666 | |||
3667 | if (plugger_remove_plug(&conf->plug)) { | ||
3668 | conf->seq_flush++; | ||
3669 | raid5_activate_delayed(conf); | ||
3670 | } | ||
3671 | md_wakeup_thread(conf->mddev->thread); | ||
3672 | |||
3673 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
3674 | |||
3675 | unplug_slaves(conf->mddev); | ||
3676 | } | ||
3677 | EXPORT_SYMBOL_GPL(md_raid5_unplug_device); | ||
3678 | |||
3679 | static void raid5_unplug(struct plug_handle *plug) | ||
3680 | { | ||
3681 | raid5_conf_t *conf = container_of(plug, raid5_conf_t, plug); | ||
3682 | md_raid5_unplug_device(conf); | ||
3683 | } | ||
3684 | |||
3685 | static void raid5_unplug_queue(struct request_queue *q) | ||
3686 | { | ||
3687 | mddev_t *mddev = q->queuedata; | ||
3688 | md_raid5_unplug_device(mddev->private); | ||
3689 | } | ||
3690 | |||
3691 | int md_raid5_congested(mddev_t *mddev, int bits) | 3639 | int md_raid5_congested(mddev_t *mddev, int bits) |
3692 | { | 3640 | { |
3693 | raid5_conf_t *conf = mddev->private; | 3641 | raid5_conf_t *conf = mddev->private; |
@@ -3864,9 +3812,9 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) | |||
3864 | return 0; | 3812 | return 0; |
3865 | } | 3813 | } |
3866 | /* | 3814 | /* |
3867 | * use bio_clone to make a copy of the bio | 3815 | * use bio_clone_mddev to make a copy of the bio |
3868 | */ | 3816 | */ |
3869 | align_bi = bio_clone(raid_bio, GFP_NOIO); | 3817 | align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); |
3870 | if (!align_bi) | 3818 | if (!align_bi) |
3871 | return 0; | 3819 | return 0; |
3872 | /* | 3820 | /* |
@@ -3977,15 +3925,10 @@ static int make_request(mddev_t *mddev, struct bio * bi) | |||
3977 | struct stripe_head *sh; | 3925 | struct stripe_head *sh; |
3978 | const int rw = bio_data_dir(bi); | 3926 | const int rw = bio_data_dir(bi); |
3979 | int remaining; | 3927 | int remaining; |
3928 | int plugged; | ||
3980 | 3929 | ||
3981 | if (unlikely(bi->bi_rw & REQ_HARDBARRIER)) { | 3930 | if (unlikely(bi->bi_rw & REQ_FLUSH)) { |
3982 | /* Drain all pending writes. We only really need | 3931 | md_flush_request(mddev, bi); |
3983 | * to ensure they have been submitted, but this is | ||
3984 | * easier. | ||
3985 | */ | ||
3986 | mddev->pers->quiesce(mddev, 1); | ||
3987 | mddev->pers->quiesce(mddev, 0); | ||
3988 | md_barrier_request(mddev, bi); | ||
3989 | return 0; | 3932 | return 0; |
3990 | } | 3933 | } |
3991 | 3934 | ||
@@ -4001,6 +3944,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) | |||
4001 | bi->bi_next = NULL; | 3944 | bi->bi_next = NULL; |
4002 | bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ | 3945 | bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ |
4003 | 3946 | ||
3947 | plugged = mddev_check_plugged(mddev); | ||
4004 | for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { | 3948 | for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { |
4005 | DEFINE_WAIT(w); | 3949 | DEFINE_WAIT(w); |
4006 | int disks, data_disks; | 3950 | int disks, data_disks; |
@@ -4014,7 +3958,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) | |||
4014 | /* spinlock is needed as reshape_progress may be | 3958 | /* spinlock is needed as reshape_progress may be |
4015 | * 64bit on a 32bit platform, and so it might be | 3959 | * 64bit on a 32bit platform, and so it might be |
4016 | * possible to see a half-updated value | 3960 | * possible to see a half-updated value |
4017 | * Ofcourse reshape_progress could change after | 3961 | * Of course reshape_progress could change after |
4018 | * the lock is dropped, so once we get a reference | 3962 | * the lock is dropped, so once we get a reference |
4019 | * to the stripe that we think it is, we will have | 3963 | * to the stripe that we think it is, we will have |
4020 | * to check again. | 3964 | * to check again. |
@@ -4095,7 +4039,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) | |||
4095 | * add failed due to overlap. Flush everything | 4039 | * add failed due to overlap. Flush everything |
4096 | * and wait a while | 4040 | * and wait a while |
4097 | */ | 4041 | */ |
4098 | md_raid5_unplug_device(conf); | 4042 | md_wakeup_thread(mddev->thread); |
4099 | release_stripe(sh); | 4043 | release_stripe(sh); |
4100 | schedule(); | 4044 | schedule(); |
4101 | goto retry; | 4045 | goto retry; |
@@ -4103,7 +4047,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) | |||
4103 | finish_wait(&conf->wait_for_overlap, &w); | 4047 | finish_wait(&conf->wait_for_overlap, &w); |
4104 | set_bit(STRIPE_HANDLE, &sh->state); | 4048 | set_bit(STRIPE_HANDLE, &sh->state); |
4105 | clear_bit(STRIPE_DELAYED, &sh->state); | 4049 | clear_bit(STRIPE_DELAYED, &sh->state); |
4106 | if (mddev->barrier && | 4050 | if ((bi->bi_rw & REQ_SYNC) && |
4107 | !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | 4051 | !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
4108 | atomic_inc(&conf->preread_active_stripes); | 4052 | atomic_inc(&conf->preread_active_stripes); |
4109 | release_stripe(sh); | 4053 | release_stripe(sh); |
@@ -4115,6 +4059,9 @@ static int make_request(mddev_t *mddev, struct bio * bi) | |||
4115 | } | 4059 | } |
4116 | 4060 | ||
4117 | } | 4061 | } |
4062 | if (!plugged) | ||
4063 | md_wakeup_thread(mddev->thread); | ||
4064 | |||
4118 | spin_lock_irq(&conf->device_lock); | 4065 | spin_lock_irq(&conf->device_lock); |
4119 | remaining = raid5_dec_bi_phys_segments(bi); | 4066 | remaining = raid5_dec_bi_phys_segments(bi); |
4120 | spin_unlock_irq(&conf->device_lock); | 4067 | spin_unlock_irq(&conf->device_lock); |
@@ -4126,13 +4073,6 @@ static int make_request(mddev_t *mddev, struct bio * bi) | |||
4126 | bio_endio(bi, 0); | 4073 | bio_endio(bi, 0); |
4127 | } | 4074 | } |
4128 | 4075 | ||
4129 | if (mddev->barrier) { | ||
4130 | /* We need to wait for the stripes to all be handled. | ||
4131 | * So: wait for preread_active_stripes to drop to 0. | ||
4132 | */ | ||
4133 | wait_event(mddev->thread->wqueue, | ||
4134 | atomic_read(&conf->preread_active_stripes) == 0); | ||
4135 | } | ||
4136 | return 0; | 4076 | return 0; |
4137 | } | 4077 | } |
4138 | 4078 | ||
@@ -4238,7 +4178,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
4238 | wait_event(conf->wait_for_overlap, | 4178 | wait_event(conf->wait_for_overlap, |
4239 | atomic_read(&conf->reshape_stripes)==0); | 4179 | atomic_read(&conf->reshape_stripes)==0); |
4240 | mddev->reshape_position = conf->reshape_progress; | 4180 | mddev->reshape_position = conf->reshape_progress; |
4241 | mddev->curr_resync_completed = mddev->curr_resync; | 4181 | mddev->curr_resync_completed = sector_nr; |
4242 | conf->reshape_checkpoint = jiffies; | 4182 | conf->reshape_checkpoint = jiffies; |
4243 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 4183 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
4244 | md_wakeup_thread(mddev->thread); | 4184 | md_wakeup_thread(mddev->thread); |
@@ -4339,7 +4279,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
4339 | wait_event(conf->wait_for_overlap, | 4279 | wait_event(conf->wait_for_overlap, |
4340 | atomic_read(&conf->reshape_stripes) == 0); | 4280 | atomic_read(&conf->reshape_stripes) == 0); |
4341 | mddev->reshape_position = conf->reshape_progress; | 4281 | mddev->reshape_position = conf->reshape_progress; |
4342 | mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors; | 4282 | mddev->curr_resync_completed = sector_nr; |
4343 | conf->reshape_checkpoint = jiffies; | 4283 | conf->reshape_checkpoint = jiffies; |
4344 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 4284 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
4345 | md_wakeup_thread(mddev->thread); | 4285 | md_wakeup_thread(mddev->thread); |
@@ -4361,13 +4301,12 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski | |||
4361 | raid5_conf_t *conf = mddev->private; | 4301 | raid5_conf_t *conf = mddev->private; |
4362 | struct stripe_head *sh; | 4302 | struct stripe_head *sh; |
4363 | sector_t max_sector = mddev->dev_sectors; | 4303 | sector_t max_sector = mddev->dev_sectors; |
4364 | int sync_blocks; | 4304 | sector_t sync_blocks; |
4365 | int still_degraded = 0; | 4305 | int still_degraded = 0; |
4366 | int i; | 4306 | int i; |
4367 | 4307 | ||
4368 | if (sector_nr >= max_sector) { | 4308 | if (sector_nr >= max_sector) { |
4369 | /* just being told to finish up .. nothing much to do */ | 4309 | /* just being told to finish up .. nothing much to do */ |
4370 | unplug_slaves(mddev); | ||
4371 | 4310 | ||
4372 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { | 4311 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { |
4373 | end_reshape(conf); | 4312 | end_reshape(conf); |
@@ -4524,24 +4463,30 @@ static void raid5d(mddev_t *mddev) | |||
4524 | struct stripe_head *sh; | 4463 | struct stripe_head *sh; |
4525 | raid5_conf_t *conf = mddev->private; | 4464 | raid5_conf_t *conf = mddev->private; |
4526 | int handled; | 4465 | int handled; |
4466 | struct blk_plug plug; | ||
4527 | 4467 | ||
4528 | pr_debug("+++ raid5d active\n"); | 4468 | pr_debug("+++ raid5d active\n"); |
4529 | 4469 | ||
4530 | md_check_recovery(mddev); | 4470 | md_check_recovery(mddev); |
4531 | 4471 | ||
4472 | blk_start_plug(&plug); | ||
4532 | handled = 0; | 4473 | handled = 0; |
4533 | spin_lock_irq(&conf->device_lock); | 4474 | spin_lock_irq(&conf->device_lock); |
4534 | while (1) { | 4475 | while (1) { |
4535 | struct bio *bio; | 4476 | struct bio *bio; |
4536 | 4477 | ||
4537 | if (conf->seq_flush != conf->seq_write) { | 4478 | if (atomic_read(&mddev->plug_cnt) == 0 && |
4538 | int seq = conf->seq_flush; | 4479 | !list_empty(&conf->bitmap_list)) { |
4480 | /* Now is a good time to flush some bitmap updates */ | ||
4481 | conf->seq_flush++; | ||
4539 | spin_unlock_irq(&conf->device_lock); | 4482 | spin_unlock_irq(&conf->device_lock); |
4540 | bitmap_unplug(mddev->bitmap); | 4483 | bitmap_unplug(mddev->bitmap); |
4541 | spin_lock_irq(&conf->device_lock); | 4484 | spin_lock_irq(&conf->device_lock); |
4542 | conf->seq_write = seq; | 4485 | conf->seq_write = conf->seq_flush; |
4543 | activate_bit_delay(conf); | 4486 | activate_bit_delay(conf); |
4544 | } | 4487 | } |
4488 | if (atomic_read(&mddev->plug_cnt) == 0) | ||
4489 | raid5_activate_delayed(conf); | ||
4545 | 4490 | ||
4546 | while ((bio = remove_bio_from_retry(conf))) { | 4491 | while ((bio = remove_bio_from_retry(conf))) { |
4547 | int ok; | 4492 | int ok; |
@@ -4571,7 +4516,7 @@ static void raid5d(mddev_t *mddev) | |||
4571 | spin_unlock_irq(&conf->device_lock); | 4516 | spin_unlock_irq(&conf->device_lock); |
4572 | 4517 | ||
4573 | async_tx_issue_pending_all(); | 4518 | async_tx_issue_pending_all(); |
4574 | unplug_slaves(mddev); | 4519 | blk_finish_plug(&plug); |
4575 | 4520 | ||
4576 | pr_debug("--- raid5d inactive\n"); | 4521 | pr_debug("--- raid5d inactive\n"); |
4577 | } | 4522 | } |
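raid5d() now brackets its work with the block layer's on-stack plugging (blk_start_plug()/blk_finish_plug()), which is what replaces the md-private plug_handle and the unplug_* helpers deleted earlier in this patch. A minimal sketch of the general pattern (assumed helper, not the raid5d code itself):

    #include <linux/bio.h>
    #include <linux/blkdev.h>

    /* Batch a chain of bios behind an on-stack plug; the block layer holds
     * and merges them until the plug is finished (or the task sleeps).
     */
    static void submit_chain_plugged(struct bio *bios)
    {
            struct blk_plug plug;

            blk_start_plug(&plug);
            while (bios) {
                    struct bio *next = bios->bi_next;

                    bios->bi_next = NULL;
                    generic_make_request(bios);
                    bios = next;
            }
            blk_finish_plug(&plug);
    }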
@@ -4913,7 +4858,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4913 | printk(KERN_INFO "md/raid:%s: device %s operational as raid" | 4858 | printk(KERN_INFO "md/raid:%s: device %s operational as raid" |
4914 | " disk %d\n", | 4859 | " disk %d\n", |
4915 | mdname(mddev), bdevname(rdev->bdev, b), raid_disk); | 4860 | mdname(mddev), bdevname(rdev->bdev, b), raid_disk); |
4916 | } else | 4861 | } else if (rdev->saved_raid_disk != raid_disk) |
4917 | /* Cannot rely on bitmap to complete recovery */ | 4862 | /* Cannot rely on bitmap to complete recovery */ |
4918 | conf->fullsync = 1; | 4863 | conf->fullsync = 1; |
4919 | } | 4864 | } |
@@ -5188,8 +5133,6 @@ static int run(mddev_t *mddev) | |||
5188 | mdname(mddev)); | 5133 | mdname(mddev)); |
5189 | md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); | 5134 | md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); |
5190 | 5135 | ||
5191 | plugger_init(&conf->plug, raid5_unplug); | ||
5192 | mddev->plug = &conf->plug; | ||
5193 | if (mddev->queue) { | 5136 | if (mddev->queue) { |
5194 | int chunk_size; | 5137 | int chunk_size; |
5195 | /* read-ahead size must cover two whole stripes, which | 5138 | /* read-ahead size must cover two whole stripes, which |
@@ -5206,8 +5149,6 @@ static int run(mddev_t *mddev) | |||
5206 | 5149 | ||
5207 | mddev->queue->backing_dev_info.congested_data = mddev; | 5150 | mddev->queue->backing_dev_info.congested_data = mddev; |
5208 | mddev->queue->backing_dev_info.congested_fn = raid5_congested; | 5151 | mddev->queue->backing_dev_info.congested_fn = raid5_congested; |
5209 | mddev->queue->queue_lock = &conf->device_lock; | ||
5210 | mddev->queue->unplug_fn = raid5_unplug_queue; | ||
5211 | 5152 | ||
5212 | chunk_size = mddev->chunk_sectors << 9; | 5153 | chunk_size = mddev->chunk_sectors << 9; |
5213 | blk_queue_io_min(mddev->queue, chunk_size); | 5154 | blk_queue_io_min(mddev->queue, chunk_size); |
@@ -5240,7 +5181,6 @@ static int stop(mddev_t *mddev) | |||
5240 | mddev->thread = NULL; | 5181 | mddev->thread = NULL; |
5241 | if (mddev->queue) | 5182 | if (mddev->queue) |
5242 | mddev->queue->backing_dev_info.congested_fn = NULL; | 5183 | mddev->queue->backing_dev_info.congested_fn = NULL; |
5243 | plugger_flush(&conf->plug); /* the unplug fn references 'conf'*/ | ||
5244 | free_conf(conf); | 5184 | free_conf(conf); |
5245 | mddev->private = NULL; | 5185 | mddev->private = NULL; |
5246 | mddev->to_remove = &raid5_attrs_group; | 5186 | mddev->to_remove = &raid5_attrs_group; |
@@ -5340,7 +5280,7 @@ static int raid5_spare_active(mddev_t *mddev) | |||
5340 | && !test_bit(Faulty, &tmp->rdev->flags) | 5280 | && !test_bit(Faulty, &tmp->rdev->flags) |
5341 | && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { | 5281 | && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { |
5342 | count++; | 5282 | count++; |
5343 | sysfs_notify_dirent(tmp->rdev->sysfs_state); | 5283 | sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); |
5344 | } | 5284 | } |
5345 | } | 5285 | } |
5346 | spin_lock_irqsave(&conf->device_lock, flags); | 5286 | spin_lock_irqsave(&conf->device_lock, flags); |
@@ -5449,7 +5389,8 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) | |||
5449 | return -EINVAL; | 5389 | return -EINVAL; |
5450 | set_capacity(mddev->gendisk, mddev->array_sectors); | 5390 | set_capacity(mddev->gendisk, mddev->array_sectors); |
5451 | revalidate_disk(mddev->gendisk); | 5391 | revalidate_disk(mddev->gendisk); |
5452 | if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { | 5392 | if (sectors > mddev->dev_sectors && |
5393 | mddev->recovery_cp > mddev->dev_sectors) { | ||
5453 | mddev->recovery_cp = mddev->dev_sectors; | 5394 | mddev->recovery_cp = mddev->dev_sectors; |
5454 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 5395 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
5455 | } | 5396 | } |
@@ -5519,7 +5460,6 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
5519 | raid5_conf_t *conf = mddev->private; | 5460 | raid5_conf_t *conf = mddev->private; |
5520 | mdk_rdev_t *rdev; | 5461 | mdk_rdev_t *rdev; |
5521 | int spares = 0; | 5462 | int spares = 0; |
5522 | int added_devices = 0; | ||
5523 | unsigned long flags; | 5463 | unsigned long flags; |
5524 | 5464 | ||
5525 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | 5465 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
@@ -5529,8 +5469,8 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
5529 | return -ENOSPC; | 5469 | return -ENOSPC; |
5530 | 5470 | ||
5531 | list_for_each_entry(rdev, &mddev->disks, same_set) | 5471 | list_for_each_entry(rdev, &mddev->disks, same_set) |
5532 | if (rdev->raid_disk < 0 && | 5472 | if (!test_bit(In_sync, &rdev->flags) |
5533 | !test_bit(Faulty, &rdev->flags)) | 5473 | && !test_bit(Faulty, &rdev->flags)) |
5534 | spares++; | 5474 | spares++; |
5535 | 5475 | ||
5536 | if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) | 5476 | if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) |
@@ -5573,29 +5513,35 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
5573 | * to correctly record the "partially reconstructed" state of | 5513 | * to correctly record the "partially reconstructed" state of |
5574 | * such devices during the reshape and confusion could result. | 5514 | * such devices during the reshape and confusion could result. |
5575 | */ | 5515 | */ |
5576 | if (mddev->delta_disks >= 0) | 5516 | if (mddev->delta_disks >= 0) { |
5577 | list_for_each_entry(rdev, &mddev->disks, same_set) | 5517 | int added_devices = 0; |
5578 | if (rdev->raid_disk < 0 && | 5518 | list_for_each_entry(rdev, &mddev->disks, same_set) |
5579 | !test_bit(Faulty, &rdev->flags)) { | 5519 | if (rdev->raid_disk < 0 && |
5580 | if (raid5_add_disk(mddev, rdev) == 0) { | 5520 | !test_bit(Faulty, &rdev->flags)) { |
5581 | char nm[20]; | 5521 | if (raid5_add_disk(mddev, rdev) == 0) { |
5582 | if (rdev->raid_disk >= conf->previous_raid_disks) { | 5522 | char nm[20]; |
5583 | set_bit(In_sync, &rdev->flags); | 5523 | if (rdev->raid_disk |
5584 | added_devices++; | 5524 | >= conf->previous_raid_disks) { |
5585 | } else | 5525 | set_bit(In_sync, &rdev->flags); |
5586 | rdev->recovery_offset = 0; | 5526 | added_devices++; |
5587 | sprintf(nm, "rd%d", rdev->raid_disk); | 5527 | } else |
5588 | if (sysfs_create_link(&mddev->kobj, | 5528 | rdev->recovery_offset = 0; |
5589 | &rdev->kobj, nm)) | 5529 | sprintf(nm, "rd%d", rdev->raid_disk); |
5590 | /* Failure here is OK */; | 5530 | if (sysfs_create_link(&mddev->kobj, |
5591 | } else | 5531 | &rdev->kobj, nm)) |
5592 | break; | 5532 | /* Failure here is OK */; |
5593 | } | 5533 | } |
5534 | } else if (rdev->raid_disk >= conf->previous_raid_disks | ||
5535 | && !test_bit(Faulty, &rdev->flags)) { | ||
5536 | /* This is a spare that was manually added */ | ||
5537 | set_bit(In_sync, &rdev->flags); | ||
5538 | added_devices++; | ||
5539 | } | ||
5594 | 5540 | ||
5595 | /* When a reshape changes the number of devices, ->degraded | 5541 | /* When a reshape changes the number of devices, |
5596 | * is measured against the larger of the pre and post number of | 5542 | * ->degraded is measured against the larger of the |
5597 | * devices.*/ | 5543 | * pre and post number of devices. |
5598 | if (mddev->delta_disks > 0) { | 5544 | */ |
5599 | spin_lock_irqsave(&conf->device_lock, flags); | 5545 | spin_lock_irqsave(&conf->device_lock, flags); |
5600 | mddev->degraded += (conf->raid_disks - conf->previous_raid_disks) | 5546 | mddev->degraded += (conf->raid_disks - conf->previous_raid_disks) |
5601 | - added_devices; | 5547 | - added_devices; |
@@ -5731,6 +5677,7 @@ static void raid5_quiesce(mddev_t *mddev, int state) | |||
5731 | static void *raid45_takeover_raid0(mddev_t *mddev, int level) | 5677 | static void *raid45_takeover_raid0(mddev_t *mddev, int level) |
5732 | { | 5678 | { |
5733 | struct raid0_private_data *raid0_priv = mddev->private; | 5679 | struct raid0_private_data *raid0_priv = mddev->private; |
5680 | sector_t sectors; | ||
5734 | 5681 | ||
5735 | /* for raid0 takeover only one zone is supported */ | 5682 | /* for raid0 takeover only one zone is supported */ |
5736 | if (raid0_priv->nr_strip_zones > 1) { | 5683 | if (raid0_priv->nr_strip_zones > 1) { |
@@ -5739,6 +5686,9 @@ static void *raid45_takeover_raid0(mddev_t *mddev, int level) | |||
5739 | return ERR_PTR(-EINVAL); | 5686 | return ERR_PTR(-EINVAL); |
5740 | } | 5687 | } |
5741 | 5688 | ||
5689 | sectors = raid0_priv->strip_zone[0].zone_end; | ||
5690 | sector_div(sectors, raid0_priv->strip_zone[0].nb_dev); | ||
5691 | mddev->dev_sectors = sectors; | ||
5742 | mddev->new_level = level; | 5692 | mddev->new_level = level; |
5743 | mddev->new_layout = ALGORITHM_PARITY_N; | 5693 | mddev->new_layout = ALGORITHM_PARITY_N; |
5744 | mddev->new_chunk_sectors = mddev->chunk_sectors; | 5694 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
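The lines added to raid45_takeover_raid0() derive mddev->dev_sectors from the single raid0 zone: zone_end counts the zone's sectors summed over all members, so dividing by nb_dev (sector_div() being the kernel's 64-by-32-bit divide helper) yields the per-device contribution. A standalone arithmetic sketch with illustrative numbers:

    #include <stdio.h>

    int main(void)
    {
            /* Illustrative: one-zone raid0 of 4 members, 1 TiB (2^31 sectors) each */
            unsigned long long zone_end = 4ULL * 2147483648ULL;
            unsigned int nb_dev = 4;

            printf("dev_sectors = %llu\n", zone_end / nb_dev);  /* 2147483648 */
            return 0;
    }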
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 36eaed5dfd6e..3ca77a2613ba 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -275,6 +275,7 @@ struct r6_state { | |||
275 | * filling | 275 | * filling |
276 | */ | 276 | */ |
277 | #define R5_Wantdrain 13 /* dev->towrite needs to be drained */ | 277 | #define R5_Wantdrain 13 /* dev->towrite needs to be drained */ |
278 | #define R5_WantFUA 14 /* Write should be FUA */ | ||
278 | /* | 279 | /* |
279 | * Write method | 280 | * Write method |
280 | */ | 281 | */ |
@@ -399,8 +400,6 @@ struct raid5_private_data { | |||
399 | * Cleared when a sync completes. | 400 | * Cleared when a sync completes. |
400 | */ | 401 | */ |
401 | 402 | ||
402 | struct plug_handle plug; | ||
403 | |||
404 | /* per cpu variables */ | 403 | /* per cpu variables */ |
405 | struct raid5_percpu { | 404 | struct raid5_percpu { |
406 | struct page *spare_page; /* Used when checking P/Q in raid6 */ | 405 | struct page *spare_page; /* Used when checking P/Q in raid6 */ |
@@ -502,6 +501,6 @@ static inline int algorithm_is_DDF(int layout) | |||
502 | } | 501 | } |
503 | 502 | ||
504 | extern int md_raid5_congested(mddev_t *mddev, int bits); | 503 | extern int md_raid5_congested(mddev_t *mddev, int bits); |
505 | extern void md_raid5_unplug_device(raid5_conf_t *conf); | 504 | extern void md_raid5_kick_device(raid5_conf_t *conf); |
506 | extern int raid5_set_cache_size(mddev_t *mddev, int size); | 505 | extern int raid5_set_cache_size(mddev_t *mddev, int size); |
507 | #endif | 506 | #endif |