author    Jonathan Herman <hermanjl@cs.unc.edu>  2013-01-17 16:15:55 -0500
committer Jonathan Herman <hermanjl@cs.unc.edu>  2013-01-17 16:15:55 -0500
commit    8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
tree      a8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /drivers/md
parent    406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig | 66
-rw-r--r--  drivers/md/Makefile | 6
-rw-r--r--  drivers/md/bitmap.c | 1370
-rw-r--r--  drivers/md/bitmap.h | 95
-rw-r--r--  drivers/md/dm-bio-prison.c | 390
-rw-r--r--  drivers/md/dm-bio-prison.h | 71
-rw-r--r--  drivers/md/dm-bufio.c | 1750
-rw-r--r--  drivers/md/dm-bufio.h | 120
-rw-r--r--  drivers/md/dm-crypt.c | 721
-rw-r--r--  drivers/md/dm-delay.c | 16
-rw-r--r--  drivers/md/dm-exception-store.c | 16
-rw-r--r--  drivers/md/dm-flakey.c | 37
-rw-r--r--  drivers/md/dm-io.c | 47
-rw-r--r--  drivers/md/dm-ioctl.c | 87
-rw-r--r--  drivers/md/dm-kcopyd.c | 47
-rw-r--r--  drivers/md/dm-linear.c | 23
-rw-r--r--  drivers/md/dm-log-userspace-base.c | 38
-rw-r--r--  drivers/md/dm-log-userspace-transfer.c | 2
-rw-r--r--  drivers/md/dm-log.c | 16
-rw-r--r--  drivers/md/dm-mpath.c | 172
-rw-r--r--  drivers/md/dm-path-selector.c | 1
-rw-r--r--  drivers/md/dm-queue-length.c | 3
-rw-r--r--  drivers/md/dm-raid.c | 412
-rw-r--r--  drivers/md/dm-raid1.c | 100
-rw-r--r--  drivers/md/dm-region-hash.c | 5
-rw-r--r--  drivers/md/dm-round-robin.c | 4
-rw-r--r--  drivers/md/dm-service-time.c | 6
-rw-r--r--  drivers/md/dm-snap-persistent.c | 1
-rw-r--r--  drivers/md/dm-snap-transient.c | 1
-rw-r--r--  drivers/md/dm-snap.c | 124
-rw-r--r--  drivers/md/dm-stripe.c | 112
-rw-r--r--  drivers/md/dm-table.c | 185
-rw-r--r--  drivers/md/dm-target.c | 5
-rw-r--r--  drivers/md/dm-thin-metadata.c | 1686
-rw-r--r--  drivers/md/dm-thin-metadata.h | 197
-rw-r--r--  drivers/md/dm-thin.c | 2818
-rw-r--r--  drivers/md/dm-uevent.c | 1
-rw-r--r--  drivers/md/dm-verity.c | 898
-rw-r--r--  drivers/md/dm-zero.c | 5
-rw-r--r--  drivers/md/dm.c | 419
-rw-r--r--  drivers/md/dm.h | 10
-rw-r--r--  drivers/md/faulty.c | 56
-rw-r--r--  drivers/md/linear.c | 125
-rw-r--r--  drivers/md/linear.h | 12
-rw-r--r--  drivers/md/md.c | 1848
-rw-r--r--  drivers/md/md.h | 296
-rw-r--r--  drivers/md/multipath.c | 80
-rw-r--r--  drivers/md/multipath.h | 10
-rw-r--r--  drivers/md/persistent-data/Kconfig | 8
-rw-r--r--  drivers/md/persistent-data/Makefile | 10
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.c | 635
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.h | 128
-rw-r--r--  drivers/md/persistent-data/dm-btree-internal.h | 134
-rw-r--r--  drivers/md/persistent-data/dm-btree-remove.c | 590
-rw-r--r--  drivers/md/persistent-data/dm-btree-spine.c | 244
-rw-r--r--  drivers/md/persistent-data/dm-btree.c | 809
-rw-r--r--  drivers/md/persistent-data/dm-btree.h | 145
-rw-r--r--  drivers/md/persistent-data/dm-persistent-data-internal.h | 19
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.c | 712
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.h | 127
-rw-r--r--  drivers/md/persistent-data/dm-space-map-disk.c | 318
-rw-r--r--  drivers/md/persistent-data/dm-space-map-disk.h | 25
-rw-r--r--  drivers/md/persistent-data/dm-space-map-metadata.c | 596
-rw-r--r--  drivers/md/persistent-data/dm-space-map-metadata.h | 33
-rw-r--r--  drivers/md/persistent-data/dm-space-map.h | 134
-rw-r--r--  drivers/md/persistent-data/dm-transaction-manager.c | 382
-rw-r--r--  drivers/md/persistent-data/dm-transaction-manager.h | 131
-rw-r--r--  drivers/md/raid0.c | 417
-rw-r--r--  drivers/md/raid0.h | 19
-rw-r--r--  drivers/md/raid1.c | 917
-rw-r--r--  drivers/md/raid1.h | 108
-rw-r--r--  drivers/md/raid10.c | 2599
-rw-r--r--  drivers/md/raid10.h | 128
-rw-r--r--  drivers/md/raid5.c | 1921
-rw-r--r--  drivers/md/raid5.h | 129
75 files changed, 4259 insertions, 21669 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 91a02eeeb31..f75a66e7d31 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -208,23 +208,6 @@ config DM_DEBUG
 
 	  If unsure, say N.
 
-config DM_BUFIO
-	tristate
-	depends on BLK_DEV_DM && EXPERIMENTAL
-	---help---
-	  This interface allows you to do buffered I/O on a device and acts
-	  as a cache, holding recently-read blocks in memory and performing
-	  delayed writes.
-
-config DM_BIO_PRISON
-	tristate
-	depends on BLK_DEV_DM && EXPERIMENTAL
-	---help---
-	  Some bio locking schemes used by other device-mapper targets
-	  including thin provisioning.
-
-source "drivers/md/persistent-data/Kconfig"
-
 config DM_CRYPT
 	tristate "Crypt target support"
 	depends on BLK_DEV_DM
@@ -250,24 +233,6 @@ config DM_SNAPSHOT
 	---help---
 	  Allow volume managers to take writable snapshots of a device.
 
-config DM_THIN_PROVISIONING
-	tristate "Thin provisioning target (EXPERIMENTAL)"
-	depends on BLK_DEV_DM && EXPERIMENTAL
-	select DM_PERSISTENT_DATA
-	select DM_BIO_PRISON
-	---help---
-	  Provides thin provisioning and snapshots that share a data store.
-
-config DM_DEBUG_BLOCK_STACK_TRACING
-	boolean "Keep stack trace of thin provisioning block lock holders"
-	depends on STACKTRACE_SUPPORT && DM_THIN_PROVISIONING
-	select STACKTRACE
-	---help---
-	  Enable this for messages that may help debug problems with the
-	  block manager locking used by thin provisioning.
-
-	  If unsure, say N.
-
 config DM_MIRROR
 	tristate "Mirror target"
 	depends on BLK_DEV_DM
@@ -276,14 +241,13 @@ config DM_MIRROR
 	  needed for live data migration tools such as 'pvmove'.
 
 config DM_RAID
-	tristate "RAID 1/4/5/6/10 target"
-	depends on BLK_DEV_DM
+	tristate "RAID 1/4/5/6 target (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && EXPERIMENTAL
 	select MD_RAID1
-	select MD_RAID10
 	select MD_RAID456
 	select BLK_DEV_MD
 	---help---
-	  A dm target that supports RAID1, RAID10, RAID4, RAID5 and RAID6 mappings
+	  A dm target that supports RAID1, RAID4, RAID5 and RAID6 mappings
 
 	  A RAID-5 set of N drives with a capacity of C MB per drive provides
 	  the capacity of C * (N - 1) MB, and protects against a failure
@@ -359,8 +323,8 @@ config DM_DELAY
 	  If unsure, say N.
 
 config DM_UEVENT
-	bool "DM uevents"
-	depends on BLK_DEV_DM
+	bool "DM uevents (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && EXPERIMENTAL
 	---help---
 	  Generate udev events for DM events.
 
@@ -370,24 +334,4 @@ config DM_FLAKEY
 	---help---
 	  A target that intermittently fails I/O for debugging purposes.
 
-config DM_VERITY
-	tristate "Verity target support (EXPERIMENTAL)"
-	depends on BLK_DEV_DM && EXPERIMENTAL
-	select CRYPTO
-	select CRYPTO_HASH
-	select DM_BUFIO
-	---help---
-	  This device-mapper target creates a read-only device that
-	  transparently validates the data on one underlying device against
-	  a pre-generated tree of cryptographic checksums stored on a second
-	  device.
-
-	  You'll need to activate the digests you're going to use in the
-	  cryptoapi configuration.
-
-	  To compile this code as a module, choose M here: the module will
-	  be called dm-verity.
-
-	  If unsure, say N.
-
 endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 94dce8b4932..448838b1f92 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -10,7 +10,6 @@ dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \
 dm-mirror-y += dm-raid1.o
 dm-log-userspace-y \
 		+= dm-log-userspace-base.o dm-log-userspace-transfer.o
-dm-thin-pool-y += dm-thin.o dm-thin-metadata.o
 md-mod-y += md.o bitmap.o
 raid456-y += raid5.o
 
@@ -28,8 +27,6 @@ obj-$(CONFIG_MD_MULTIPATH) += multipath.o
 obj-$(CONFIG_MD_FAULTY) += faulty.o
 obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
 obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
-obj-$(CONFIG_DM_BUFIO) += dm-bufio.o
-obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o
 obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
 obj-$(CONFIG_DM_DELAY) += dm-delay.o
 obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o
@@ -37,13 +34,10 @@ obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
 obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o
 obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
-obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/
 obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
 obj-$(CONFIG_DM_ZERO) += dm-zero.o
 obj-$(CONFIG_DM_RAID) += dm-raid.o
-obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
-obj-$(CONFIG_DM_VERITY) += dm-verity.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 7155945f8eb..0dc6546b77a 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -26,16 +26,73 @@
 #include <linux/file.h>
 #include <linux/mount.h>
 #include <linux/buffer_head.h>
-#include <linux/seq_file.h>
 #include "md.h"
 #include "bitmap.h"
 
+/* debug macros */
+
+#define DEBUG 0
+
+#if DEBUG
+/* these are for debugging purposes only! */
+
+/* define one and only one of these */
+#define INJECT_FAULTS_1 0 /* cause bitmap_alloc_page to fail always */
+#define INJECT_FAULTS_2 0 /* cause bitmap file to be kicked when first bit set*/
+#define INJECT_FAULTS_3 0 /* treat bitmap file as kicked at init time */
+#define INJECT_FAULTS_4 0 /* undef */
+#define INJECT_FAULTS_5 0 /* undef */
+#define INJECT_FAULTS_6 0
+
+/* if these are defined, the driver will fail! debug only */
+#define INJECT_FATAL_FAULT_1 0 /* fail kmalloc, causing bitmap_create to fail */
+#define INJECT_FATAL_FAULT_2 0 /* undef */
+#define INJECT_FATAL_FAULT_3 0 /* undef */
+#endif
+
+#ifndef PRINTK
+# if DEBUG > 0
+# define PRINTK(x...) printk(KERN_DEBUG x)
+# else
+# define PRINTK(x...)
+# endif
+#endif
+
 static inline char *bmname(struct bitmap *bitmap)
 {
 	return bitmap->mddev ? mdname(bitmap->mddev) : "mdX";
 }
 
 /*
+ * just a placeholder - calls kmalloc for bitmap pages
+ */
+static unsigned char *bitmap_alloc_page(struct bitmap *bitmap)
+{
+	unsigned char *page;
+
+#ifdef INJECT_FAULTS_1
+	page = NULL;
+#else
+	page = kzalloc(PAGE_SIZE, GFP_NOIO);
+#endif
+	if (!page)
+		printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap));
+	else
+		PRINTK("%s: bitmap_alloc_page: allocated page at %p\n",
+			bmname(bitmap), page);
+	return page;
+}
+
+/*
+ * for now just a placeholder -- just calls kfree for bitmap pages
+ */
+static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page)
+{
+	PRINTK("%s: bitmap_free_page: free page %p\n", bmname(bitmap), page);
+	kfree(page);
+}
+
+/*
  * check a page and, if necessary, allocate it (or hijack it if the alloc fails)
  *
  * 1) check to see if this page is allocated, if it's not then try to alloc
@@ -45,7 +102,7 @@ static inline char *bmname(struct bitmap *bitmap)
  * if we find our page, we increment the page's refcount so that it stays
  * allocated while we're using it
  */
-static int bitmap_checkpage(struct bitmap_counts *bitmap,
+static int bitmap_checkpage(struct bitmap *bitmap,
 			    unsigned long page, int create)
 __releases(bitmap->lock)
 __acquires(bitmap->lock)
@@ -72,11 +129,12 @@ __acquires(bitmap->lock)
 	/* this page has not been allocated yet */
 
 	spin_unlock_irq(&bitmap->lock);
-	mappage = kzalloc(PAGE_SIZE, GFP_NOIO);
+	mappage = bitmap_alloc_page(bitmap);
 	spin_lock_irq(&bitmap->lock);
 
 	if (mappage == NULL) {
-		pr_debug("md/bitmap: map page allocation failed, hijacking\n");
+		PRINTK("%s: bitmap map page allocation failed, hijacking\n",
+			bmname(bitmap));
 		/* failed - set the hijacked flag so that we can use the
 		 * pointer as a counter */
 		if (!bitmap->bp[page].map)
@@ -84,7 +142,7 @@ __acquires(bitmap->lock)
 	} else if (bitmap->bp[page].map ||
 		   bitmap->bp[page].hijacked) {
 		/* somebody beat us to getting the page */
-		kfree(mappage);
+		bitmap_free_page(bitmap, mappage);
 		return 0;
 	} else {
 
@@ -99,7 +157,7 @@ __acquires(bitmap->lock)
 /* if page is completely empty, put it back on the free list, or dealloc it */
 /* if page was hijacked, unmark the flag so it might get alloced next time */
 /* Note: lock should be held when calling this */
-static void bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page)
+static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page)
 {
 	char *ptr;
 
@@ -116,7 +174,7 @@ static void bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page)
 		ptr = bitmap->bp[page].map;
 		bitmap->bp[page].map = NULL;
 		bitmap->missing_pages++;
-		kfree(ptr);
+		bitmap_free_page(bitmap, ptr);
 	}
 }
 
@@ -129,16 +187,24 @@ static void bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page)
  */
 
 /* IO operations when bitmap is stored near all superblocks */
-static int read_sb_page(struct mddev *mddev, loff_t offset,
+static struct page *read_sb_page(mddev_t *mddev, loff_t offset,
 			struct page *page,
 			unsigned long index, int size)
 {
 	/* choose a good rdev and read the page from there */
 
-	struct md_rdev *rdev;
+	mdk_rdev_t *rdev;
 	sector_t target;
+	int did_alloc = 0;
+
+	if (!page) {
+		page = alloc_page(GFP_KERNEL);
+		if (!page)
+			return ERR_PTR(-ENOMEM);
+		did_alloc = 1;
+	}
 
-	rdev_for_each(rdev, mddev) {
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		if (! test_bit(In_sync, &rdev->flags)
 		    || test_bit(Faulty, &rdev->flags))
 			continue;
@@ -149,13 +215,18 @@ static int read_sb_page(struct mddev *mddev, loff_t offset,
 		    roundup(size, bdev_logical_block_size(rdev->bdev)),
 		    page, READ, true)) {
 			page->index = index;
-			return 0;
+			attach_page_buffers(page, NULL); /* so that free_buffer will
+							  * quietly no-op */
+			return page;
 		}
 	}
-	return -EIO;
+	if (did_alloc)
+		put_page(page);
+	return ERR_PTR(-EIO);
+
 }
 
-static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev)
+static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
 {
 	/* Iterate the disks of an mddev, using rcu to protect access to the
 	 * linked list, and raising the refcount of devices we return to ensure
@@ -163,17 +234,20 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
 	 * As devices are only added or removed when raid_disk is < 0 and
 	 * nr_pending is 0 and In_sync is clear, the entries we return will
 	 * still be in the same position on the list when we re-enter
-	 * list_for_each_entry_continue_rcu.
+	 * list_for_each_continue_rcu.
 	 */
+	struct list_head *pos;
 	rcu_read_lock();
 	if (rdev == NULL)
 		/* start at the beginning */
-		rdev = list_entry_rcu(&mddev->disks, struct md_rdev, same_set);
+		pos = &mddev->disks;
 	else {
 		/* release the previous rdev and start from there. */
 		rdev_dec_pending(rdev, mddev);
+		pos = &rdev->same_set;
 	}
-	list_for_each_entry_continue_rcu(rdev, &mddev->disks, same_set) {
+	list_for_each_continue_rcu(pos, &mddev->disks) {
+		rdev = list_entry(pos, mdk_rdev_t, same_set);
 		if (rdev->raid_disk >= 0 &&
 		    !test_bit(Faulty, &rdev->flags)) {
 			/* this is a usable devices */
@@ -188,10 +262,9 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
 
 static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 {
-	struct md_rdev *rdev = NULL;
+	mdk_rdev_t *rdev = NULL;
 	struct block_device *bdev;
-	struct mddev *mddev = bitmap->mddev;
-	struct bitmap_storage *store = &bitmap->storage;
+	mddev_t *mddev = bitmap->mddev;
 
 	while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
 		int size = PAGE_SIZE;
@@ -199,13 +272,9 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 
 		bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
 
-		if (page->index == store->file_pages-1) {
-			int last_page_size = store->bytes & (PAGE_SIZE-1);
-			if (last_page_size == 0)
-				last_page_size = PAGE_SIZE;
-			size = roundup(last_page_size,
+		if (page->index == bitmap->file_pages-1)
+			size = roundup(bitmap->last_page_size,
 				       bdev_logical_block_size(bdev));
-		}
 		/* Just make sure we aren't corrupting data or
 		 * metadata
 		 */
@@ -264,10 +333,10 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait)
 {
 	struct buffer_head *bh;
 
-	if (bitmap->storage.file == NULL) {
+	if (bitmap->file == NULL) {
 		switch (write_sb_page(bitmap, page, wait)) {
 		case -EINVAL:
-			set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
+			bitmap->flags |= BITMAP_WRITE_ERROR;
 		}
 	} else {
 
@@ -285,16 +354,20 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait)
 		wait_event(bitmap->write_wait,
 			   atomic_read(&bitmap->pending_writes)==0);
 	}
-	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
+	if (bitmap->flags & BITMAP_WRITE_ERROR)
 		bitmap_file_kick(bitmap);
 }
 
 static void end_bitmap_write(struct buffer_head *bh, int uptodate)
 {
 	struct bitmap *bitmap = bh->b_private;
+	unsigned long flags;
 
-	if (!uptodate)
-		set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
+	if (!uptodate) {
+		spin_lock_irqsave(&bitmap->lock, flags);
+		bitmap->flags |= BITMAP_WRITE_ERROR;
+		spin_unlock_irqrestore(&bitmap->lock, flags);
+	}
 	if (atomic_dec_and_test(&bitmap->pending_writes))
 		wake_up(&bitmap->write_wait);
 }
@@ -309,12 +382,8 @@ __clear_page_buffers(struct page *page)
 }
 static void free_buffers(struct page *page)
 {
-	struct buffer_head *bh;
+	struct buffer_head *bh = page_buffers(page);
 
-	if (!PagePrivate(page))
-		return;
-
-	bh = page_buffers(page);
 	while (bh) {
 		struct buffer_head *next = bh->b_this_page;
 		free_buffer_head(bh);
@@ -331,22 +400,28 @@ static void free_buffers(struct page *page)
  * This usage is similar to how swap files are handled, and allows us
  * to write to a file with no concerns of memory allocation failing.
  */
-static int read_page(struct file *file, unsigned long index,
+static struct page *read_page(struct file *file, unsigned long index,
 		     struct bitmap *bitmap,
-		     unsigned long count,
-		     struct page *page)
+		     unsigned long count)
 {
-	int ret = 0;
+	struct page *page = NULL;
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct buffer_head *bh;
 	sector_t block;
 
-	pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
+	PRINTK("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
 		(unsigned long long)index << PAGE_SHIFT);
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		page = ERR_PTR(-ENOMEM);
+	if (IS_ERR(page))
+		goto out;
 
 	bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0);
 	if (!bh) {
-		ret = -ENOMEM;
+		put_page(page);
+		page = ERR_PTR(-ENOMEM);
 		goto out;
 	}
 	attach_page_buffers(page, bh);
@@ -358,7 +433,8 @@ static int read_page(struct file *file, unsigned long index,
 		bh->b_blocknr = bmap(inode, block);
 		if (bh->b_blocknr == 0) {
 			/* Cannot use this file! */
-			ret = -EINVAL;
+			free_buffers(page);
+			page = ERR_PTR(-EINVAL);
 			goto out;
 		}
 		bh->b_bdev = inode->i_sb->s_bdev;
@@ -381,15 +457,17 @@ static int read_page(struct file *file, unsigned long index,
 
 	wait_event(bitmap->write_wait,
 		   atomic_read(&bitmap->pending_writes)==0);
-	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
-		ret = -EIO;
+	if (bitmap->flags & BITMAP_WRITE_ERROR) {
+		free_buffers(page);
+		page = ERR_PTR(-EIO);
+	}
 out:
-	if (ret)
-		printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %d\n",
+	if (IS_ERR(page))
+		printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %ld\n",
 			(int)PAGE_SIZE,
 			(unsigned long long)index << PAGE_SHIFT,
-			ret);
-	return ret;
+			PTR_ERR(page));
+	return page;
 }
 
 /*
@@ -400,14 +478,19 @@ out:
 void bitmap_update_sb(struct bitmap *bitmap)
 {
 	bitmap_super_t *sb;
+	unsigned long flags;
 
 	if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
 		return;
 	if (bitmap->mddev->bitmap_info.external)
 		return;
-	if (!bitmap->storage.sb_page) /* no superblock */
+	spin_lock_irqsave(&bitmap->lock, flags);
+	if (!bitmap->sb_page) { /* no superblock */
+		spin_unlock_irqrestore(&bitmap->lock, flags);
 		return;
-	sb = kmap_atomic(bitmap->storage.sb_page);
+	}
+	spin_unlock_irqrestore(&bitmap->lock, flags);
+	sb = kmap_atomic(bitmap->sb_page, KM_USER0);
 	sb->events = cpu_to_le64(bitmap->mddev->events);
 	if (bitmap->mddev->events < bitmap->events_cleared)
 		/* rocking back to read-only */
@@ -417,13 +500,8 @@ void bitmap_update_sb(struct bitmap *bitmap)
 	/* Just in case these have been changed via sysfs: */
 	sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
 	sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
-	/* This might have been changed by a reshape */
-	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
-	sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize);
-	sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
-					   bitmap_info.space);
-	kunmap_atomic(sb);
-	write_page(bitmap, bitmap->storage.sb_page, 1);
+	kunmap_atomic(sb, KM_USER0);
+	write_page(bitmap, bitmap->sb_page, 1);
 }
 
 /* print out the bitmap file superblock */
@@ -431,9 +509,9 @@ void bitmap_print_sb(struct bitmap *bitmap)
 {
 	bitmap_super_t *sb;
 
-	if (!bitmap || !bitmap->storage.sb_page)
+	if (!bitmap || !bitmap->sb_page)
 		return;
-	sb = kmap_atomic(bitmap->storage.sb_page);
+	sb = kmap_atomic(bitmap->sb_page, KM_USER0);
 	printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
 	printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic));
 	printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version));
@@ -452,7 +530,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
 	printk(KERN_DEBUG " sync size: %llu KB\n",
 	       (unsigned long long)le64_to_cpu(sb->sync_size)/2);
 	printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind));
-	kunmap_atomic(sb);
+	kunmap_atomic(sb, KM_USER0);
 }
 
 /*
@@ -470,13 +548,17 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
 {
 	bitmap_super_t *sb;
 	unsigned long chunksize, daemon_sleep, write_behind;
+	int err = -EINVAL;
 
-	bitmap->storage.sb_page = alloc_page(GFP_KERNEL);
-	if (bitmap->storage.sb_page == NULL)
-		return -ENOMEM;
-	bitmap->storage.sb_page->index = 0;
+	bitmap->sb_page = alloc_page(GFP_KERNEL);
+	if (IS_ERR(bitmap->sb_page)) {
+		err = PTR_ERR(bitmap->sb_page);
+		bitmap->sb_page = NULL;
+		return err;
+	}
+	bitmap->sb_page->index = 0;
 
-	sb = kmap_atomic(bitmap->storage.sb_page);
+	sb = kmap_atomic(bitmap->sb_page, KM_USER0);
 
 	sb->magic = cpu_to_le32(BITMAP_MAGIC);
 	sb->version = cpu_to_le32(BITMAP_MAJOR_HI);
@@ -484,7 +566,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
 	chunksize = bitmap->mddev->bitmap_info.chunksize;
 	BUG_ON(!chunksize);
 	if (!is_power_of_2(chunksize)) {
-		kunmap_atomic(sb);
+		kunmap_atomic(sb, KM_USER0);
 		printk(KERN_ERR "bitmap chunksize not a power of 2\n");
 		return -EINVAL;
 	}
@@ -514,12 +596,15 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
 
 	memcpy(sb->uuid, bitmap->mddev->uuid, 16);
 
-	set_bit(BITMAP_STALE, &bitmap->flags);
-	sb->state = cpu_to_le32(bitmap->flags);
+	bitmap->flags |= BITMAP_STALE;
+	sb->state |= cpu_to_le32(BITMAP_STALE);
 	bitmap->events_cleared = bitmap->mddev->events;
 	sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
 
-	kunmap_atomic(sb);
+	bitmap->flags |= BITMAP_HOSTENDIAN;
+	sb->version = cpu_to_le32(BITMAP_MAJOR_HOSTENDIAN);
+
+	kunmap_atomic(sb, KM_USER0);
 
 	return 0;
 }
@@ -531,45 +616,31 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	bitmap_super_t *sb;
 	unsigned long chunksize, daemon_sleep, write_behind;
 	unsigned long long events;
-	unsigned long sectors_reserved = 0;
 	int err = -EINVAL;
-	struct page *sb_page;
 
-	if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) {
-		chunksize = 128 * 1024 * 1024;
-		daemon_sleep = 5 * HZ;
-		write_behind = 0;
-		set_bit(BITMAP_STALE, &bitmap->flags);
-		err = 0;
-		goto out_no_sb;
-	}
 	/* page 0 is the superblock, read it... */
-	sb_page = alloc_page(GFP_KERNEL);
-	if (!sb_page)
-		return -ENOMEM;
-	bitmap->storage.sb_page = sb_page;
-
-	if (bitmap->storage.file) {
-		loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host);
+	if (bitmap->file) {
+		loff_t isize = i_size_read(bitmap->file->f_mapping->host);
 		int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize;
 
-		err = read_page(bitmap->storage.file, 0,
-				bitmap, bytes, sb_page);
+		bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes);
 	} else {
-		err = read_sb_page(bitmap->mddev,
+		bitmap->sb_page = read_sb_page(bitmap->mddev,
 				   bitmap->mddev->bitmap_info.offset,
-				   sb_page,
+				   NULL,
 				   0, sizeof(bitmap_super_t));
 	}
-	if (err)
+	if (IS_ERR(bitmap->sb_page)) {
+		err = PTR_ERR(bitmap->sb_page);
+		bitmap->sb_page = NULL;
 		return err;
+	}
 
-	sb = kmap_atomic(sb_page);
+	sb = kmap_atomic(bitmap->sb_page, KM_USER0);
 
 	chunksize = le32_to_cpu(sb->chunksize);
 	daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
 	write_behind = le32_to_cpu(sb->write_behind);
-	sectors_reserved = le32_to_cpu(sb->sectors_reserved);
 
 	/* verify that the bitmap-specific fields are valid */
 	if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
@@ -594,50 +665,81 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	/* keep the array size field of the bitmap superblock up to date */
 	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
 
-	if (bitmap->mddev->persistent) {
-		/*
-		 * We have a persistent array superblock, so compare the
-		 * bitmap's UUID and event counter to the mddev's
-		 */
-		if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
-			printk(KERN_INFO
-			       "%s: bitmap superblock UUID mismatch\n",
-			       bmname(bitmap));
-			goto out;
-		}
-		events = le64_to_cpu(sb->events);
-		if (events < bitmap->mddev->events) {
-			printk(KERN_INFO
-			       "%s: bitmap file is out of date (%llu < %llu) "
-			       "-- forcing full recovery\n",
-			       bmname(bitmap), events,
-			       (unsigned long long) bitmap->mddev->events);
-			set_bit(BITMAP_STALE, &bitmap->flags);
-		}
-	}
+	if (!bitmap->mddev->persistent)
+		goto success;
 
+	/*
+	 * if we have a persistent array superblock, compare the
+	 * bitmap's UUID and event counter to the mddev's
+	 */
+	if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
+		printk(KERN_INFO "%s: bitmap superblock UUID mismatch\n",
+			bmname(bitmap));
+		goto out;
+	}
+	events = le64_to_cpu(sb->events);
+	if (events < bitmap->mddev->events) {
+		printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) "
+			"-- forcing full recovery\n", bmname(bitmap), events,
+			(unsigned long long) bitmap->mddev->events);
+		sb->state |= cpu_to_le32(BITMAP_STALE);
+	}
+success:
 	/* assign fields using values from superblock */
+	bitmap->mddev->bitmap_info.chunksize = chunksize;
+	bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
+	bitmap->mddev->bitmap_info.max_write_behind = write_behind;
 	bitmap->flags |= le32_to_cpu(sb->state);
 	if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
-		set_bit(BITMAP_HOSTENDIAN, &bitmap->flags);
+		bitmap->flags |= BITMAP_HOSTENDIAN;
 	bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
+	if (bitmap->flags & BITMAP_STALE)
+		bitmap->events_cleared = bitmap->mddev->events;
 	err = 0;
 out:
-	kunmap_atomic(sb);
-out_no_sb:
-	if (test_bit(BITMAP_STALE, &bitmap->flags))
-		bitmap->events_cleared = bitmap->mddev->events;
-	bitmap->mddev->bitmap_info.chunksize = chunksize;
-	bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
-	bitmap->mddev->bitmap_info.max_write_behind = write_behind;
-	if (bitmap->mddev->bitmap_info.space == 0 ||
-	    bitmap->mddev->bitmap_info.space > sectors_reserved)
-		bitmap->mddev->bitmap_info.space = sectors_reserved;
+	kunmap_atomic(sb, KM_USER0);
 	if (err)
 		bitmap_print_sb(bitmap);
 	return err;
 }
 
+enum bitmap_mask_op {
+	MASK_SET,
+	MASK_UNSET
+};
+
+/* record the state of the bitmap in the superblock. Return the old value */
+static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
+			     enum bitmap_mask_op op)
+{
+	bitmap_super_t *sb;
+	unsigned long flags;
+	int old;
+
+	spin_lock_irqsave(&bitmap->lock, flags);
+	if (!bitmap->sb_page) { /* can't set the state */
+		spin_unlock_irqrestore(&bitmap->lock, flags);
+		return 0;
+	}
+	spin_unlock_irqrestore(&bitmap->lock, flags);
+	sb = kmap_atomic(bitmap->sb_page, KM_USER0);
+	old = le32_to_cpu(sb->state) & bits;
+	switch (op) {
+	case MASK_SET:
+		sb->state |= cpu_to_le32(bits);
+		bitmap->flags |= bits;
+		break;
+	case MASK_UNSET:
+		sb->state &= cpu_to_le32(~bits);
+		bitmap->flags &= ~bits;
+		break;
+	default:
+		BUG();
+	}
+	kunmap_atomic(sb, KM_USER0);
+	return old;
+}
+
 /*
  * general bitmap file operations
  */
@@ -649,19 +751,17 @@ out_no_sb:
  * file a page at a time. There's a superblock at the start of the file.
  */
 /* calculate the index of the page that contains this bit */
-static inline unsigned long file_page_index(struct bitmap_storage *store,
-					    unsigned long chunk)
+static inline unsigned long file_page_index(struct bitmap *bitmap, unsigned long chunk)
 {
-	if (store->sb_page)
+	if (!bitmap->mddev->bitmap_info.external)
 		chunk += sizeof(bitmap_super_t) << 3;
 	return chunk >> PAGE_BIT_SHIFT;
 }
 
 /* calculate the (bit) offset of this bit within a page */
-static inline unsigned long file_page_offset(struct bitmap_storage *store,
-					     unsigned long chunk)
+static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned long chunk)
 {
-	if (store->sb_page)
+	if (!bitmap->mddev->bitmap_info.external)
 		chunk += sizeof(bitmap_super_t) << 3;
 	return chunk & (PAGE_BITS - 1);
 }
@@ -673,86 +773,57 @@ static inline unsigned long file_page_offset(struct bitmap_storage *store,
  * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page
  * 0 or page 1
  */
-static inline struct page *filemap_get_page(struct bitmap_storage *store,
+static inline struct page *filemap_get_page(struct bitmap *bitmap,
 					    unsigned long chunk)
 {
-	if (file_page_index(store, chunk) >= store->file_pages)
+	if (file_page_index(bitmap, chunk) >= bitmap->file_pages)
 		return NULL;
-	return store->filemap[file_page_index(store, chunk)
-			      - file_page_index(store, 0)];
-}
-
-static int bitmap_storage_alloc(struct bitmap_storage *store,
-				unsigned long chunks, int with_super)
-{
-	int pnum;
-	unsigned long num_pages;
-	unsigned long bytes;
-
-	bytes = DIV_ROUND_UP(chunks, 8);
-	if (with_super)
-		bytes += sizeof(bitmap_super_t);
-
-	num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
-
-	store->filemap = kmalloc(sizeof(struct page *)
-				 * num_pages, GFP_KERNEL);
-	if (!store->filemap)
-		return -ENOMEM;
-
-	if (with_super && !store->sb_page) {
-		store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
-		if (store->sb_page == NULL)
-			return -ENOMEM;
-		store->sb_page->index = 0;
-	}
-	pnum = 0;
-	if (store->sb_page) {
-		store->filemap[0] = store->sb_page;
-		pnum = 1;
-	}
-	for ( ; pnum < num_pages; pnum++) {
-		store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO);
-		if (!store->filemap[pnum]) {
-			store->file_pages = pnum;
-			return -ENOMEM;
-		}
-		store->filemap[pnum]->index = pnum;
-	}
-	store->file_pages = pnum;
-
-	/* We need 4 bits per page, rounded up to a multiple
-	 * of sizeof(unsigned long) */
-	store->filemap_attr = kzalloc(
-		roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)),
-		GFP_KERNEL);
-	if (!store->filemap_attr)
-		return -ENOMEM;
-
-	store->bytes = bytes;
-
-	return 0;
+	return bitmap->filemap[file_page_index(bitmap, chunk)
+			       - file_page_index(bitmap, 0)];
 }
 
-static void bitmap_file_unmap(struct bitmap_storage *store)
+static void bitmap_file_unmap(struct bitmap *bitmap)
 {
 	struct page **map, *sb_page;
+	unsigned long *attr;
 	int pages;
-	struct file *file;
+	unsigned long flags;
 
-	file = store->file;
-	map = store->filemap;
-	pages = store->file_pages;
-	sb_page = store->sb_page;
+	spin_lock_irqsave(&bitmap->lock, flags);
+	map = bitmap->filemap;
+	bitmap->filemap = NULL;
+	attr = bitmap->filemap_attr;
+	bitmap->filemap_attr = NULL;
+	pages = bitmap->file_pages;
+	bitmap->file_pages = 0;
+	sb_page = bitmap->sb_page;
+	bitmap->sb_page = NULL;
+	spin_unlock_irqrestore(&bitmap->lock, flags);
 
 	while (pages--)
 		if (map[pages] != sb_page) /* 0 is sb_page, release it below */
 			free_buffers(map[pages]);
 	kfree(map);
-	kfree(store->filemap_attr);
+	kfree(attr);
 
 	if (sb_page)
 		free_buffers(sb_page);
+}
+
+static void bitmap_file_put(struct bitmap *bitmap)
+{
+	struct file *file;
+	unsigned long flags;
+
+	spin_lock_irqsave(&bitmap->lock, flags);
+	file = bitmap->file;
+	bitmap->file = NULL;
+	spin_unlock_irqrestore(&bitmap->lock, flags);
+
+	if (file)
+		wait_event(bitmap->write_wait,
+			   atomic_read(&bitmap->pending_writes)==0);
+	bitmap_file_unmap(bitmap);
 
 	if (file) {
 		struct inode *inode = file->f_path.dentry->d_inode;
@@ -770,14 +841,14 @@ static void bitmap_file_kick(struct bitmap *bitmap)
 {
 	char *path, *ptr = NULL;
 
-	if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) {
+	if (bitmap_mask_state(bitmap, BITMAP_STALE, MASK_SET) == 0) {
 		bitmap_update_sb(bitmap);
 
-		if (bitmap->storage.file) {
+		if (bitmap->file) {
 			path = kmalloc(PAGE_SIZE, GFP_KERNEL);
 			if (path)
-				ptr = d_path(&bitmap->storage.file->f_path,
-					     path, PAGE_SIZE);
+				ptr = d_path(&bitmap->file->f_path, path,
+					     PAGE_SIZE);
 
 			printk(KERN_ALERT
 			       "%s: kicking failed bitmap file %s from array!\n",
@@ -789,39 +860,36 @@ static void bitmap_file_kick(struct bitmap *bitmap)
 			       "%s: disabling internal bitmap due to errors\n",
 			       bmname(bitmap));
 		}
+
+	bitmap_file_put(bitmap);
+
+	return;
 }
 
 enum bitmap_page_attr {
 	BITMAP_PAGE_DIRTY = 0, /* there are set bits that need to be synced */
-	BITMAP_PAGE_PENDING = 1, /* there are bits that are being cleaned.
-				  * i.e. counter is 1 or 2. */
+	BITMAP_PAGE_CLEAN = 1, /* there are bits that might need to be cleared */
 	BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */
 };
 
-static inline void set_page_attr(struct bitmap *bitmap, int pnum,
+static inline void set_page_attr(struct bitmap *bitmap, struct page *page,
 				enum bitmap_page_attr attr)
-{
-	set_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
-}
-
-static inline void clear_page_attr(struct bitmap *bitmap, int pnum,
-				   enum bitmap_page_attr attr)
 {
-	clear_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
+	__set_bit((page->index<<2) + attr, bitmap->filemap_attr);
 }
 
-static inline int test_page_attr(struct bitmap *bitmap, int pnum,
-				 enum bitmap_page_attr attr)
+static inline void clear_page_attr(struct bitmap *bitmap, struct page *page,
+				enum bitmap_page_attr attr)
 {
-	return test_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
+	__clear_bit((page->index<<2) + attr, bitmap->filemap_attr);
 }
 
-static inline int test_and_clear_page_attr(struct bitmap *bitmap, int pnum,
-					   enum bitmap_page_attr attr)
+static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page,
+					   enum bitmap_page_attr attr)
 {
-	return test_and_clear_bit((pnum<<2) + attr,
-				  bitmap->storage.filemap_attr);
+	return test_bit((page->index<<2) + attr, bitmap->filemap_attr);
 }
+
 /*
  * bitmap_file_set_bit -- called before performing a write to the md device
  * to set (and eventually sync) a particular bit in the bitmap file
@@ -834,46 +902,26 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 	unsigned long bit;
 	struct page *page;
 	void *kaddr;
-	unsigned long chunk = block >> bitmap->counts.chunkshift;
+	unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap);
+
+	if (!bitmap->filemap)
+		return;
 
-	page = filemap_get_page(&bitmap->storage, chunk);
+	page = filemap_get_page(bitmap, chunk);
 	if (!page)
 		return;
-	bit = file_page_offset(&bitmap->storage, chunk);
+	bit = file_page_offset(bitmap, chunk);
 
 	/* set the bit */
-	kaddr = kmap_atomic(page);
-	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
+	kaddr = kmap_atomic(page, KM_USER0);
+	if (bitmap->flags & BITMAP_HOSTENDIAN)
 		set_bit(bit, kaddr);
 	else
-		test_and_set_bit_le(bit, kaddr);
-	kunmap_atomic(kaddr);
-	pr_debug("set file bit %lu page %lu\n", bit, page->index);
+		__set_bit_le(bit, kaddr);
+	kunmap_atomic(kaddr, KM_USER0);
+	PRINTK("set file bit %lu page %lu\n", bit, page->index);
 	/* record page number so it gets flushed to disk when unplug occurs */
-	set_page_attr(bitmap, page->index, BITMAP_PAGE_DIRTY);
-}
-
-static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
-{
-	unsigned long bit;
-	struct page *page;
-	void *paddr;
-	unsigned long chunk = block >> bitmap->counts.chunkshift;
-
-	page = filemap_get_page(&bitmap->storage, chunk);
-	if (!page)
-		return;
-	bit = file_page_offset(&bitmap->storage, chunk);
-	paddr = kmap_atomic(page);
-	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
-		clear_bit(bit, paddr);
-	else
-		test_and_clear_bit_le(bit, paddr);
-	kunmap_atomic(paddr);
-	if (!test_page_attr(bitmap, page->index, BITMAP_PAGE_NEEDWRITE)) {
-		set_page_attr(bitmap, page->index, BITMAP_PAGE_PENDING);
-		bitmap->allclean = 0;
-	}
+	set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
 }
 
 /* this gets called when the md device is ready to unplug its underlying
@@ -881,37 +929,42 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
  * sync the dirty pages of the bitmap file to disk */
 void bitmap_unplug(struct bitmap *bitmap)
 {
-	unsigned long i;
+	unsigned long i, flags;
 	int dirty, need_write;
+	struct page *page;
 	int wait = 0;
 
-	if (!bitmap || !bitmap->storage.filemap ||
-	    test_bit(BITMAP_STALE, &bitmap->flags))
+	if (!bitmap)
 		return;
 
 	/* look at each page to see if there are any set bits that need to be
 	 * flushed out to disk */
-	for (i = 0; i < bitmap->storage.file_pages; i++) {
-		if (!bitmap->storage.filemap)
+	for (i = 0; i < bitmap->file_pages; i++) {
+		spin_lock_irqsave(&bitmap->lock, flags);
+		if (!bitmap->filemap) {
+			spin_unlock_irqrestore(&bitmap->lock, flags);
 			return;
-		dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
-		need_write = test_and_clear_page_attr(bitmap, i,
-						      BITMAP_PAGE_NEEDWRITE);
-		if (dirty || need_write) {
-			clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
-			write_page(bitmap, bitmap->storage.filemap[i], 0);
 		}
+		page = bitmap->filemap[i];
+		dirty = test_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
+		need_write = test_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
+		clear_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
+		clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
 		if (dirty)
 			wait = 1;
+		spin_unlock_irqrestore(&bitmap->lock, flags);
+
+		if (dirty || need_write)
+			write_page(bitmap, page, 0);
 	}
 	if (wait) { /* if any writes were performed, we need to wait on them */
-		if (bitmap->storage.file)
+		if (bitmap->file)
 			wait_event(bitmap->write_wait,
 				   atomic_read(&bitmap->pending_writes)==0);
 		else
 			md_super_wait(bitmap->mddev);
 	}
-	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
+	if (bitmap->flags & BITMAP_WRITE_ERROR)
 		bitmap_file_kick(bitmap);
 }
 EXPORT_SYMBOL(bitmap_unplug);
@@ -931,117 +984,149 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
931static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) 984static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
932{ 985{
933 unsigned long i, chunks, index, oldindex, bit; 986 unsigned long i, chunks, index, oldindex, bit;
934 struct page *page = NULL; 987 struct page *page = NULL, *oldpage = NULL;
935 unsigned long bit_cnt = 0; 988 unsigned long num_pages, bit_cnt = 0;
936 struct file *file; 989 struct file *file;
937 unsigned long offset; 990 unsigned long bytes, offset;
938 int outofdate; 991 int outofdate;
939 int ret = -ENOSPC; 992 int ret = -ENOSPC;
940 void *paddr; 993 void *paddr;
941 struct bitmap_storage *store = &bitmap->storage;
942 994
943 chunks = bitmap->counts.chunks; 995 chunks = bitmap->chunks;
944 file = store->file; 996 file = bitmap->file;
945 997
946 if (!file && !bitmap->mddev->bitmap_info.offset) { 998 BUG_ON(!file && !bitmap->mddev->bitmap_info.offset);
947 /* No permanent bitmap - fill with '1s'. */
948 store->filemap = NULL;
949 store->file_pages = 0;
950 for (i = 0; i < chunks ; i++) {
951 /* if the disk bit is set, set the memory bit */
952 int needed = ((sector_t)(i+1) << (bitmap->counts.chunkshift)
953 >= start);
954 bitmap_set_memory_bits(bitmap,
955 (sector_t)i << bitmap->counts.chunkshift,
956 needed);
957 }
958 return 0;
959 }
960 999
961 outofdate = test_bit(BITMAP_STALE, &bitmap->flags); 1000#ifdef INJECT_FAULTS_3
1001 outofdate = 1;
1002#else
1003 outofdate = bitmap->flags & BITMAP_STALE;
1004#endif
962 if (outofdate) 1005 if (outofdate)
963 printk(KERN_INFO "%s: bitmap file is out of date, doing full " 1006 printk(KERN_INFO "%s: bitmap file is out of date, doing full "
964 "recovery\n", bmname(bitmap)); 1007 "recovery\n", bmname(bitmap));
965 1008
966 if (file && i_size_read(file->f_mapping->host) < store->bytes) { 1009 bytes = DIV_ROUND_UP(bitmap->chunks, 8);
1010 if (!bitmap->mddev->bitmap_info.external)
1011 bytes += sizeof(bitmap_super_t);
1012
1013 num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
1014
1015 if (file && i_size_read(file->f_mapping->host) < bytes) {
967 printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", 1016 printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n",
968 bmname(bitmap), 1017 bmname(bitmap),
969 (unsigned long) i_size_read(file->f_mapping->host), 1018 (unsigned long) i_size_read(file->f_mapping->host),
970 store->bytes); 1019 bytes);
971 goto err; 1020 goto err;
972 } 1021 }
973 1022
1023 ret = -ENOMEM;
1024
1025 bitmap->filemap = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL);
1026 if (!bitmap->filemap)
1027 goto err;
1028
1029 /* We need 4 bits per page, rounded up to a multiple of sizeof(unsigned long) */
1030 bitmap->filemap_attr = kzalloc(
1031 roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)),
1032 GFP_KERNEL);
1033 if (!bitmap->filemap_attr)
1034 goto err;
1035
974 oldindex = ~0L; 1036 oldindex = ~0L;
975 offset = 0;
976 if (!bitmap->mddev->bitmap_info.external)
977 offset = sizeof(bitmap_super_t);
978 1037
979 for (i = 0; i < chunks; i++) { 1038 for (i = 0; i < chunks; i++) {
980 int b; 1039 int b;
981 index = file_page_index(&bitmap->storage, i); 1040 index = file_page_index(bitmap, i);
982 bit = file_page_offset(&bitmap->storage, i); 1041 bit = file_page_offset(bitmap, i);
983 if (index != oldindex) { /* this is a new page, read it in */ 1042 if (index != oldindex) { /* this is a new page, read it in */
984 int count; 1043 int count;
985 /* unmap the old page, we're done with it */ 1044 /* unmap the old page, we're done with it */
986 if (index == store->file_pages-1) 1045 if (index == num_pages-1)
987 count = store->bytes - index * PAGE_SIZE; 1046 count = bytes - index * PAGE_SIZE;
988 else 1047 else
989 count = PAGE_SIZE; 1048 count = PAGE_SIZE;
990 page = store->filemap[index]; 1049 if (index == 0 && bitmap->sb_page) {
991 if (file) 1050 /*
992 ret = read_page(file, index, bitmap, 1051 * if we're here then the superblock page
993 count, page); 1052 * contains some bits (PAGE_SIZE != sizeof sb)
994 else 1053 * we've already read it in, so just use it
995 ret = read_sb_page( 1054 */
996 bitmap->mddev, 1055 page = bitmap->sb_page;
997 bitmap->mddev->bitmap_info.offset, 1056 offset = sizeof(bitmap_super_t);
998 page, 1057 if (!file)
999 index, count); 1058 page = read_sb_page(
1000 1059 bitmap->mddev,
1001 if (ret) 1060 bitmap->mddev->bitmap_info.offset,
1061 page,
1062 index, count);
1063 } else if (file) {
1064 page = read_page(file, index, bitmap, count);
1065 offset = 0;
1066 } else {
1067 page = read_sb_page(bitmap->mddev,
1068 bitmap->mddev->bitmap_info.offset,
1069 NULL,
1070 index, count);
1071 offset = 0;
1072 }
1073 if (IS_ERR(page)) { /* read error */
1074 ret = PTR_ERR(page);
1002 goto err; 1075 goto err;
1076 }
1003 1077
1004 oldindex = index; 1078 oldindex = index;
1079 oldpage = page;
1080
1081 bitmap->filemap[bitmap->file_pages++] = page;
1082 bitmap->last_page_size = count;
1005 1083
1006 if (outofdate) { 1084 if (outofdate) {
1007 /* 1085 /*
1008 * if bitmap is out of date, dirty the 1086 * if bitmap is out of date, dirty the
1009 * whole page and write it out 1087 * whole page and write it out
1010 */ 1088 */
1011 paddr = kmap_atomic(page); 1089 paddr = kmap_atomic(page, KM_USER0);
1012 memset(paddr + offset, 0xff, 1090 memset(paddr + offset, 0xff,
1013 PAGE_SIZE - offset); 1091 PAGE_SIZE - offset);
1014 kunmap_atomic(paddr); 1092 kunmap_atomic(paddr, KM_USER0);
1015 write_page(bitmap, page, 1); 1093 write_page(bitmap, page, 1);
1016 1094
1017 ret = -EIO; 1095 ret = -EIO;
1018 if (test_bit(BITMAP_WRITE_ERROR, 1096 if (bitmap->flags & BITMAP_WRITE_ERROR)
1019 &bitmap->flags))
1020 goto err; 1097 goto err;
1021 } 1098 }
1022 } 1099 }
1023 paddr = kmap_atomic(page); 1100 paddr = kmap_atomic(page, KM_USER0);
1024 if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) 1101 if (bitmap->flags & BITMAP_HOSTENDIAN)
1025 b = test_bit(bit, paddr); 1102 b = test_bit(bit, paddr);
1026 else 1103 else
1027 b = test_bit_le(bit, paddr); 1104 b = test_bit_le(bit, paddr);
1028 kunmap_atomic(paddr); 1105 kunmap_atomic(paddr, KM_USER0);
1029 if (b) { 1106 if (b) {
1030 /* if the disk bit is set, set the memory bit */ 1107 /* if the disk bit is set, set the memory bit */
1031 int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift 1108 int needed = ((sector_t)(i+1) << (CHUNK_BLOCK_SHIFT(bitmap))
1032 >= start); 1109 >= start);
1033 bitmap_set_memory_bits(bitmap, 1110 bitmap_set_memory_bits(bitmap,
1034 (sector_t)i << bitmap->counts.chunkshift, 1111 (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap),
1035 needed); 1112 needed);
1036 bit_cnt++; 1113 bit_cnt++;
1114 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
1037 } 1115 }
1038 offset = 0; 1116 }
1117
1118 /* everything went OK */
1119 ret = 0;
1120 bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET);
1121
1122 if (bit_cnt) { /* Kick recovery if any bits were set */
1123 set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
1124 md_wakeup_thread(bitmap->mddev->thread);
1039 } 1125 }
1040 1126
1041 printk(KERN_INFO "%s: bitmap initialized from disk: " 1127 printk(KERN_INFO "%s: bitmap initialized from disk: "
1042 "read %lu pages, set %lu of %lu bits\n", 1128 "read %lu/%lu pages, set %lu of %lu bits\n",
1043 bmname(bitmap), store->file_pages, 1129 bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt, chunks);
1044 bit_cnt, chunks);
1045 1130
1046 return 0; 1131 return 0;
1047 1132
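The sizing arithmetic at the top of this hunk is easy to sanity-check outside the kernel: one bit per chunk, an optional 256-byte superblock (the BUILD_BUG_ON elsewhere in the patch pins sizeof(bitmap_super_t) at 256), rounded up to whole pages. A minimal user-space sketch; bitmap_file_pages() is an illustrative helper name and the 4 KiB PAGE_SIZE is an assumption, not something taken from the patch:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))
#define PAGE_SIZE           4096UL              /* assumed; arch dependent */
#define BITMAP_SUPER_SIZE   256UL               /* sizeof(bitmap_super_t), per the BUILD_BUG_ON */

static unsigned long bitmap_file_pages(unsigned long chunks, int internal_sb)
{
	unsigned long bytes = DIV_ROUND_UP(chunks, 8);   /* one bit per chunk */

	if (internal_sb)
		bytes += BITMAP_SUPER_SIZE;              /* superblock shares page 0 */

	return DIV_ROUND_UP(bytes, PAGE_SIZE);
}

int main(void)
{
	/* e.g. 16384 chunks (1 TiB array, 64 MiB chunks) -> 2304 bytes -> 1 page */
	printf("%lu page(s)\n", bitmap_file_pages(16384, 1));
	return 0;
}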
@@ -1058,38 +1143,19 @@ void bitmap_write_all(struct bitmap *bitmap)
1058 */ 1143 */
1059 int i; 1144 int i;
1060 1145
1061 if (!bitmap || !bitmap->storage.filemap) 1146 for (i = 0; i < bitmap->file_pages; i++)
1062 return; 1147 set_page_attr(bitmap, bitmap->filemap[i],
1063 if (bitmap->storage.file)
1064 /* Only one copy, so nothing needed */
1065 return;
1066
1067 for (i = 0; i < bitmap->storage.file_pages; i++)
1068 set_page_attr(bitmap, i,
1069 BITMAP_PAGE_NEEDWRITE); 1148 BITMAP_PAGE_NEEDWRITE);
1070 bitmap->allclean = 0;
1071} 1149}
1072 1150
1073static void bitmap_count_page(struct bitmap_counts *bitmap, 1151static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc)
1074 sector_t offset, int inc)
1075{ 1152{
1076 sector_t chunk = offset >> bitmap->chunkshift; 1153 sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap);
1077 unsigned long page = chunk >> PAGE_COUNTER_SHIFT; 1154 unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
1078 bitmap->bp[page].count += inc; 1155 bitmap->bp[page].count += inc;
1079 bitmap_checkfree(bitmap, page); 1156 bitmap_checkfree(bitmap, page);
1080} 1157}
1081 1158static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
1082static void bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset)
1083{
1084 sector_t chunk = offset >> bitmap->chunkshift;
1085 unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
1086 struct bitmap_page *bp = &bitmap->bp[page];
1087
1088 if (!bp->pending)
1089 bp->pending = 1;
1090}
1091
1092static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap,
1093 sector_t offset, sector_t *blocks, 1159 sector_t offset, sector_t *blocks,
1094 int create); 1160 int create);
1095 1161
@@ -1098,13 +1164,14 @@ static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap,
1098 * out to disk 1164 * out to disk
1099 */ 1165 */
1100 1166
1101void bitmap_daemon_work(struct mddev *mddev) 1167void bitmap_daemon_work(mddev_t *mddev)
1102{ 1168{
1103 struct bitmap *bitmap; 1169 struct bitmap *bitmap;
1104 unsigned long j; 1170 unsigned long j;
1105 unsigned long nextpage; 1171 unsigned long flags;
1172 struct page *page = NULL, *lastpage = NULL;
1106 sector_t blocks; 1173 sector_t blocks;
1107 struct bitmap_counts *counts; 1174 void *paddr;
1108 1175
1109 /* Use a mutex to guard daemon_work against 1176 /* Use a mutex to guard daemon_work against
1110 * bitmap_destroy. 1177 * bitmap_destroy.
@@ -1116,111 +1183,129 @@ void bitmap_daemon_work(struct mddev *mddev)
1116 return; 1183 return;
1117 } 1184 }
1118 if (time_before(jiffies, bitmap->daemon_lastrun 1185 if (time_before(jiffies, bitmap->daemon_lastrun
1119 + mddev->bitmap_info.daemon_sleep)) 1186 + bitmap->mddev->bitmap_info.daemon_sleep))
1120 goto done; 1187 goto done;
1121 1188
1122 bitmap->daemon_lastrun = jiffies; 1189 bitmap->daemon_lastrun = jiffies;
1123 if (bitmap->allclean) { 1190 if (bitmap->allclean) {
1124 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; 1191 bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
1125 goto done; 1192 goto done;
1126 } 1193 }
1127 bitmap->allclean = 1; 1194 bitmap->allclean = 1;
1128 1195
1129 /* Any file-page which is PENDING now needs to be written. 1196 spin_lock_irqsave(&bitmap->lock, flags);
1130 * So set NEEDWRITE now, then after we make any last-minute changes 1197 for (j = 0; j < bitmap->chunks; j++) {
1131 * we will write it.
1132 */
1133 for (j = 0; j < bitmap->storage.file_pages; j++)
1134 if (test_and_clear_page_attr(bitmap, j,
1135 BITMAP_PAGE_PENDING))
1136 set_page_attr(bitmap, j,
1137 BITMAP_PAGE_NEEDWRITE);
1138
1139 if (bitmap->need_sync &&
1140 mddev->bitmap_info.external == 0) {
1141 /* Arrange for superblock update as well as
1142 * other changes */
1143 bitmap_super_t *sb;
1144 bitmap->need_sync = 0;
1145 if (bitmap->storage.filemap) {
1146 sb = kmap_atomic(bitmap->storage.sb_page);
1147 sb->events_cleared =
1148 cpu_to_le64(bitmap->events_cleared);
1149 kunmap_atomic(sb);
1150 set_page_attr(bitmap, 0,
1151 BITMAP_PAGE_NEEDWRITE);
1152 }
1153 }
1154 /* Now look at the bitmap counters and if any are '2' or '1',
1155 * decrement and handle accordingly.
1156 */
1157 counts = &bitmap->counts;
1158 spin_lock_irq(&counts->lock);
1159 nextpage = 0;
1160 for (j = 0; j < counts->chunks; j++) {
1161 bitmap_counter_t *bmc; 1198 bitmap_counter_t *bmc;
1162 sector_t block = (sector_t)j << counts->chunkshift; 1199 if (!bitmap->filemap)
1200 /* error or shutdown */
1201 break;
1202
1203 page = filemap_get_page(bitmap, j);
1204
1205 if (page != lastpage) {
1206 /* skip this page unless it's marked as needing cleaning */
1207 if (!test_page_attr(bitmap, page, BITMAP_PAGE_CLEAN)) {
1208 int need_write = test_page_attr(bitmap, page,
1209 BITMAP_PAGE_NEEDWRITE);
1210 if (need_write)
1211 clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
1163 1212
1164 if (j == nextpage) { 1213 spin_unlock_irqrestore(&bitmap->lock, flags);
1165 nextpage += PAGE_COUNTER_RATIO; 1214 if (need_write) {
1166 if (!counts->bp[j >> PAGE_COUNTER_SHIFT].pending) { 1215 write_page(bitmap, page, 0);
1167 j |= PAGE_COUNTER_MASK; 1216 bitmap->allclean = 0;
1217 }
1218 spin_lock_irqsave(&bitmap->lock, flags);
1219 j |= (PAGE_BITS - 1);
1168 continue; 1220 continue;
1169 } 1221 }
1170 counts->bp[j >> PAGE_COUNTER_SHIFT].pending = 0; 1222
1223 /* grab the new page, sync and release the old */
1224 if (lastpage != NULL) {
1225 if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) {
1226 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1227 spin_unlock_irqrestore(&bitmap->lock, flags);
1228 write_page(bitmap, lastpage, 0);
1229 } else {
1230 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1231 spin_unlock_irqrestore(&bitmap->lock, flags);
1232 }
1233 } else
1234 spin_unlock_irqrestore(&bitmap->lock, flags);
1235 lastpage = page;
1236
1237 /* We are possibly going to clear some bits, so make
1238 * sure that events_cleared is up-to-date.
1239 */
1240 if (bitmap->need_sync &&
1241 bitmap->mddev->bitmap_info.external == 0) {
1242 bitmap_super_t *sb;
1243 bitmap->need_sync = 0;
1244 sb = kmap_atomic(bitmap->sb_page, KM_USER0);
1245 sb->events_cleared =
1246 cpu_to_le64(bitmap->events_cleared);
1247 kunmap_atomic(sb, KM_USER0);
1248 write_page(bitmap, bitmap->sb_page, 1);
1249 }
1250 spin_lock_irqsave(&bitmap->lock, flags);
1251 if (!bitmap->need_sync)
1252 clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
1171 } 1253 }
1172 bmc = bitmap_get_counter(counts, 1254 bmc = bitmap_get_counter(bitmap,
1173 block, 1255 (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
1174 &blocks, 0); 1256 &blocks, 0);
1257 if (bmc) {
1258 if (*bmc)
1259 bitmap->allclean = 0;
1175 1260
1176 if (!bmc) { 1261 if (*bmc == 2) {
1262 *bmc = 1; /* maybe clear the bit next time */
1263 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
1264 } else if (*bmc == 1 && !bitmap->need_sync) {
1265 /* we can clear the bit */
1266 *bmc = 0;
1267 bitmap_count_page(bitmap,
1268 (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
1269 -1);
1270
1271 /* clear the bit */
1272 paddr = kmap_atomic(page, KM_USER0);
1273 if (bitmap->flags & BITMAP_HOSTENDIAN)
1274 clear_bit(file_page_offset(bitmap, j),
1275 paddr);
1276 else
1277 __clear_bit_le(
1278 file_page_offset(bitmap,
1279 j),
1280 paddr);
1281 kunmap_atomic(paddr, KM_USER0);
1282 }
1283 } else
1177 j |= PAGE_COUNTER_MASK; 1284 j |= PAGE_COUNTER_MASK;
1178 continue;
1179 }
1180 if (*bmc == 1 && !bitmap->need_sync) {
1181 /* We can clear the bit */
1182 *bmc = 0;
1183 bitmap_count_page(counts, block, -1);
1184 bitmap_file_clear_bit(bitmap, block);
1185 } else if (*bmc && *bmc <= 2) {
1186 *bmc = 1;
1187 bitmap_set_pending(counts, block);
1188 bitmap->allclean = 0;
1189 }
1190 } 1285 }
1191 spin_unlock_irq(&counts->lock); 1286 spin_unlock_irqrestore(&bitmap->lock, flags);
1192 1287
1193 /* Now start writeout on any page in NEEDWRITE that isn't DIRTY. 1288 /* now sync the final page */
1194 * DIRTY pages need to be written by bitmap_unplug so it can wait 1289 if (lastpage != NULL) {
1195 * for them. 1290 spin_lock_irqsave(&bitmap->lock, flags);
1196 * If we find any DIRTY page we stop there and let bitmap_unplug 1291 if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) {
1197 * handle all the rest. This is important in the case where 1292 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1198 * the first blocking holds the superblock and it has been updated. 1293 spin_unlock_irqrestore(&bitmap->lock, flags);
1199 * We mustn't write any other blocks before the superblock. 1294 write_page(bitmap, lastpage, 0);
1200 */ 1295 } else {
1201 for (j = 0; 1296 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1202 j < bitmap->storage.file_pages 1297 spin_unlock_irqrestore(&bitmap->lock, flags);
1203 && !test_bit(BITMAP_STALE, &bitmap->flags);
1204 j++) {
1205
1206 if (test_page_attr(bitmap, j,
1207 BITMAP_PAGE_DIRTY))
1208 /* bitmap_unplug will handle the rest */
1209 break;
1210 if (test_and_clear_page_attr(bitmap, j,
1211 BITMAP_PAGE_NEEDWRITE)) {
1212 write_page(bitmap, bitmap->storage.filemap[j], 0);
1213 } 1298 }
1214 } 1299 }
1215 1300
1216 done: 1301 done:
1217 if (bitmap->allclean == 0) 1302 if (bitmap->allclean == 0)
1218 mddev->thread->timeout = 1303 bitmap->mddev->thread->timeout =
1219 mddev->bitmap_info.daemon_sleep; 1304 bitmap->mddev->bitmap_info.daemon_sleep;
1220 mutex_unlock(&mddev->bitmap_info.mutex); 1305 mutex_unlock(&mddev->bitmap_info.mutex);
1221} 1306}
1222 1307
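The per-chunk counters handled by bitmap_daemon_work() above and by bitmap_startwrite()/bitmap_endwrite() further down follow a small state machine: starting a write pushes the counter to at least 3, completion drops it back to 2, and two idle daemon passes (2 -> 1 -> 0) are needed before the on-disk bit may be cleared. A rough user-space model of just that behaviour (helper names are illustrative, and the NEEDED/RESYNC flag bits the real counter also carries are ignored):

#include <stdio.h>

typedef unsigned short counter_t;   /* stands in for bitmap_counter_t (__u16) */

static void start_write(counter_t *bmc)
{
	if (*bmc < 2)
		*bmc = 2;       /* chunk becomes dirty on disk */
	(*bmc)++;               /* one more write in flight */
}

static void end_write(counter_t *bmc)
{
	(*bmc)--;
}

/* one daemon pass: 2 -> 1 ("maybe clear next time"), 1 -> 0 (clear the disk bit) */
static void daemon_pass(counter_t *bmc)
{
	if (*bmc == 2)
		*bmc = 1;
	else if (*bmc == 1) {
		*bmc = 0;
		printf("on-disk bit cleared\n");
	}
}

int main(void)
{
	counter_t bmc = 0;

	start_write(&bmc);      /* bmc == 3 */
	end_write(&bmc);        /* bmc == 2: dirty but idle */
	daemon_pass(&bmc);      /* bmc == 1 */
	daemon_pass(&bmc);      /* bmc == 0, bit cleared */
	printf("final counter value %u\n", (unsigned)bmc);
	return 0;
}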
1223static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap, 1308static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
1224 sector_t offset, sector_t *blocks, 1309 sector_t offset, sector_t *blocks,
1225 int create) 1310 int create)
1226__releases(bitmap->lock) 1311__releases(bitmap->lock)
@@ -1230,7 +1315,7 @@ __acquires(bitmap->lock)
1230 * The lock must have been taken with interrupts enabled. 1315 * The lock must have been taken with interrupts enabled.
1231 * If !create, we don't release the lock. 1316 * If !create, we don't release the lock.
1232 */ 1317 */
1233 sector_t chunk = offset >> bitmap->chunkshift; 1318 sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap);
1234 unsigned long page = chunk >> PAGE_COUNTER_SHIFT; 1319 unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
1235 unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; 1320 unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
1236 sector_t csize; 1321 sector_t csize;
@@ -1240,10 +1325,10 @@ __acquires(bitmap->lock)
1240 1325
1241 if (bitmap->bp[page].hijacked || 1326 if (bitmap->bp[page].hijacked ||
1242 bitmap->bp[page].map == NULL) 1327 bitmap->bp[page].map == NULL)
1243 csize = ((sector_t)1) << (bitmap->chunkshift + 1328 csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) +
1244 PAGE_COUNTER_SHIFT - 1); 1329 PAGE_COUNTER_SHIFT - 1);
1245 else 1330 else
1246 csize = ((sector_t)1) << bitmap->chunkshift; 1331 csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap));
1247 *blocks = csize - (offset & (csize - 1)); 1332 *blocks = csize - (offset & (csize - 1));
1248 1333
1249 if (err < 0) 1334 if (err < 0)
@@ -1274,18 +1359,18 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
1274 if (bw > bitmap->behind_writes_used) 1359 if (bw > bitmap->behind_writes_used)
1275 bitmap->behind_writes_used = bw; 1360 bitmap->behind_writes_used = bw;
1276 1361
1277 pr_debug("inc write-behind count %d/%lu\n", 1362 PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n",
1278 bw, bitmap->mddev->bitmap_info.max_write_behind); 1363 bw, bitmap->max_write_behind);
1279 } 1364 }
1280 1365
1281 while (sectors) { 1366 while (sectors) {
1282 sector_t blocks; 1367 sector_t blocks;
1283 bitmap_counter_t *bmc; 1368 bitmap_counter_t *bmc;
1284 1369
1285 spin_lock_irq(&bitmap->counts.lock); 1370 spin_lock_irq(&bitmap->lock);
1286 bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 1); 1371 bmc = bitmap_get_counter(bitmap, offset, &blocks, 1);
1287 if (!bmc) { 1372 if (!bmc) {
1288 spin_unlock_irq(&bitmap->counts.lock); 1373 spin_unlock_irq(&bitmap->lock);
1289 return 0; 1374 return 0;
1290 } 1375 }
1291 1376
@@ -1297,8 +1382,8 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
1297 */ 1382 */
1298 prepare_to_wait(&bitmap->overflow_wait, &__wait, 1383 prepare_to_wait(&bitmap->overflow_wait, &__wait,
1299 TASK_UNINTERRUPTIBLE); 1384 TASK_UNINTERRUPTIBLE);
1300 spin_unlock_irq(&bitmap->counts.lock); 1385 spin_unlock_irq(&bitmap->lock);
1301 schedule(); 1386 io_schedule();
1302 finish_wait(&bitmap->overflow_wait, &__wait); 1387 finish_wait(&bitmap->overflow_wait, &__wait);
1303 continue; 1388 continue;
1304 } 1389 }
@@ -1306,7 +1391,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
1306 switch (*bmc) { 1391 switch (*bmc) {
1307 case 0: 1392 case 0:
1308 bitmap_file_set_bit(bitmap, offset); 1393 bitmap_file_set_bit(bitmap, offset);
1309 bitmap_count_page(&bitmap->counts, offset, 1); 1394 bitmap_count_page(bitmap, offset, 1);
1310 /* fall through */ 1395 /* fall through */
1311 case 1: 1396 case 1:
1312 *bmc = 2; 1397 *bmc = 2;
@@ -1314,7 +1399,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
1314 1399
1315 (*bmc)++; 1400 (*bmc)++;
1316 1401
1317 spin_unlock_irq(&bitmap->counts.lock); 1402 spin_unlock_irq(&bitmap->lock);
1318 1403
1319 offset += blocks; 1404 offset += blocks;
1320 if (sectors > blocks) 1405 if (sectors > blocks)
@@ -1322,6 +1407,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
1322 else 1407 else
1323 sectors = 0; 1408 sectors = 0;
1324 } 1409 }
1410 bitmap->allclean = 0;
1325 return 0; 1411 return 0;
1326} 1412}
1327EXPORT_SYMBOL(bitmap_startwrite); 1413EXPORT_SYMBOL(bitmap_startwrite);
@@ -1334,24 +1420,26 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
1334 if (behind) { 1420 if (behind) {
1335 if (atomic_dec_and_test(&bitmap->behind_writes)) 1421 if (atomic_dec_and_test(&bitmap->behind_writes))
1336 wake_up(&bitmap->behind_wait); 1422 wake_up(&bitmap->behind_wait);
1337 pr_debug("dec write-behind count %d/%lu\n", 1423 PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n",
1338 atomic_read(&bitmap->behind_writes), 1424 atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
1339 bitmap->mddev->bitmap_info.max_write_behind);
1340 } 1425 }
1426 if (bitmap->mddev->degraded)
1427 /* Never clear bits or update events_cleared when degraded */
1428 success = 0;
1341 1429
1342 while (sectors) { 1430 while (sectors) {
1343 sector_t blocks; 1431 sector_t blocks;
1344 unsigned long flags; 1432 unsigned long flags;
1345 bitmap_counter_t *bmc; 1433 bitmap_counter_t *bmc;
1346 1434
1347 spin_lock_irqsave(&bitmap->counts.lock, flags); 1435 spin_lock_irqsave(&bitmap->lock, flags);
1348 bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 0); 1436 bmc = bitmap_get_counter(bitmap, offset, &blocks, 0);
1349 if (!bmc) { 1437 if (!bmc) {
1350 spin_unlock_irqrestore(&bitmap->counts.lock, flags); 1438 spin_unlock_irqrestore(&bitmap->lock, flags);
1351 return; 1439 return;
1352 } 1440 }
1353 1441
1354 if (success && !bitmap->mddev->degraded && 1442 if (success &&
1355 bitmap->events_cleared < bitmap->mddev->events) { 1443 bitmap->events_cleared < bitmap->mddev->events) {
1356 bitmap->events_cleared = bitmap->mddev->events; 1444 bitmap->events_cleared = bitmap->mddev->events;
1357 bitmap->need_sync = 1; 1445 bitmap->need_sync = 1;
@@ -1365,11 +1453,14 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
1365 wake_up(&bitmap->overflow_wait); 1453 wake_up(&bitmap->overflow_wait);
1366 1454
1367 (*bmc)--; 1455 (*bmc)--;
1368 if (*bmc <= 2) { 1456 if (*bmc <= 2)
1369 bitmap_set_pending(&bitmap->counts, offset); 1457 set_page_attr(bitmap,
1370 bitmap->allclean = 0; 1458 filemap_get_page(
1371 } 1459 bitmap,
1372 spin_unlock_irqrestore(&bitmap->counts.lock, flags); 1460 offset >> CHUNK_BLOCK_SHIFT(bitmap)),
1461 BITMAP_PAGE_CLEAN);
1462
1463 spin_unlock_irqrestore(&bitmap->lock, flags);
1373 offset += blocks; 1464 offset += blocks;
1374 if (sectors > blocks) 1465 if (sectors > blocks)
1375 sectors -= blocks; 1466 sectors -= blocks;
@@ -1388,8 +1479,8 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t
1388 *blocks = 1024; 1479 *blocks = 1024;
1389 return 1; /* always resync if no bitmap */ 1480 return 1; /* always resync if no bitmap */
1390 } 1481 }
1391 spin_lock_irq(&bitmap->counts.lock); 1482 spin_lock_irq(&bitmap->lock);
1392 bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0); 1483 bmc = bitmap_get_counter(bitmap, offset, blocks, 0);
1393 rv = 0; 1484 rv = 0;
1394 if (bmc) { 1485 if (bmc) {
1395 /* locked */ 1486 /* locked */
@@ -1403,7 +1494,8 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t
1403 } 1494 }
1404 } 1495 }
1405 } 1496 }
1406 spin_unlock_irq(&bitmap->counts.lock); 1497 spin_unlock_irq(&bitmap->lock);
1498 bitmap->allclean = 0;
1407 return rv; 1499 return rv;
1408} 1500}
1409 1501
@@ -1440,8 +1532,8 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i
1440 *blocks = 1024; 1532 *blocks = 1024;
1441 return; 1533 return;
1442 } 1534 }
1443 spin_lock_irqsave(&bitmap->counts.lock, flags); 1535 spin_lock_irqsave(&bitmap->lock, flags);
1444 bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0); 1536 bmc = bitmap_get_counter(bitmap, offset, blocks, 0);
1445 if (bmc == NULL) 1537 if (bmc == NULL)
1446 goto unlock; 1538 goto unlock;
1447 /* locked */ 1539 /* locked */
@@ -1451,14 +1543,15 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i
1451 if (!NEEDED(*bmc) && aborted) 1543 if (!NEEDED(*bmc) && aborted)
1452 *bmc |= NEEDED_MASK; 1544 *bmc |= NEEDED_MASK;
1453 else { 1545 else {
1454 if (*bmc <= 2) { 1546 if (*bmc <= 2)
1455 bitmap_set_pending(&bitmap->counts, offset); 1547 set_page_attr(bitmap,
1456 bitmap->allclean = 0; 1548 filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)),
1457 } 1549 BITMAP_PAGE_CLEAN);
1458 } 1550 }
1459 } 1551 }
1460 unlock: 1552 unlock:
1461 spin_unlock_irqrestore(&bitmap->counts.lock, flags); 1553 spin_unlock_irqrestore(&bitmap->lock, flags);
1554 bitmap->allclean = 0;
1462} 1555}
1463EXPORT_SYMBOL(bitmap_end_sync); 1556EXPORT_SYMBOL(bitmap_end_sync);
1464 1557
@@ -1498,7 +1591,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
1498 1591
1499 bitmap->mddev->curr_resync_completed = sector; 1592 bitmap->mddev->curr_resync_completed = sector;
1500 set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); 1593 set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
1501 sector &= ~((1ULL << bitmap->counts.chunkshift) - 1); 1594 sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1);
1502 s = 0; 1595 s = 0;
1503 while (s < sector && s < bitmap->mddev->resync_max_sectors) { 1596 while (s < sector && s < bitmap->mddev->resync_max_sectors) {
1504 bitmap_end_sync(bitmap, s, &blocks, 0); 1597 bitmap_end_sync(bitmap, s, &blocks, 0);
@@ -1512,25 +1605,27 @@ EXPORT_SYMBOL(bitmap_cond_end_sync);
1512static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) 1605static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
1513{ 1606{
1514 /* For each chunk covered by any of these sectors, set the 1607 /* For each chunk covered by any of these sectors, set the
1515 * counter to 2 and possibly set resync_needed. They should all 1608 * counter to 1 and set resync_needed. They should all
1516 * be 0 at this point 1609 * be 0 at this point
1517 */ 1610 */
1518 1611
1519 sector_t secs; 1612 sector_t secs;
1520 bitmap_counter_t *bmc; 1613 bitmap_counter_t *bmc;
1521 spin_lock_irq(&bitmap->counts.lock); 1614 spin_lock_irq(&bitmap->lock);
1522 bmc = bitmap_get_counter(&bitmap->counts, offset, &secs, 1); 1615 bmc = bitmap_get_counter(bitmap, offset, &secs, 1);
1523 if (!bmc) { 1616 if (!bmc) {
1524 spin_unlock_irq(&bitmap->counts.lock); 1617 spin_unlock_irq(&bitmap->lock);
1525 return; 1618 return;
1526 } 1619 }
1527 if (!*bmc) { 1620 if (!*bmc) {
1528 *bmc = 2 | (needed ? NEEDED_MASK : 0); 1621 struct page *page;
1529 bitmap_count_page(&bitmap->counts, offset, 1); 1622 *bmc = 1 | (needed ? NEEDED_MASK : 0);
1530 bitmap_set_pending(&bitmap->counts, offset); 1623 bitmap_count_page(bitmap, offset, 1);
1531 bitmap->allclean = 0; 1624 page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap));
1625 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
1532 } 1626 }
1533 spin_unlock_irq(&bitmap->counts.lock); 1627 spin_unlock_irq(&bitmap->lock);
1628 bitmap->allclean = 0;
1534} 1629}
1535 1630
1536/* dirty the memory and file bits for bitmap chunks "s" to "e" */ 1631/* dirty the memory and file bits for bitmap chunks "s" to "e" */
@@ -1539,7 +1634,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
1539 unsigned long chunk; 1634 unsigned long chunk;
1540 1635
1541 for (chunk = s; chunk <= e; chunk++) { 1636 for (chunk = s; chunk <= e; chunk++) {
1542 sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift; 1637 sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap);
1543 bitmap_set_memory_bits(bitmap, sec, 1); 1638 bitmap_set_memory_bits(bitmap, sec, 1);
1544 bitmap_file_set_bit(bitmap, sec); 1639 bitmap_file_set_bit(bitmap, sec);
1545 if (sec < bitmap->mddev->recovery_cp) 1640 if (sec < bitmap->mddev->recovery_cp)
@@ -1554,7 +1649,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
1554/* 1649/*
1555 * flush out any pending updates 1650 * flush out any pending updates
1556 */ 1651 */
1557void bitmap_flush(struct mddev *mddev) 1652void bitmap_flush(mddev_t *mddev)
1558{ 1653{
1559 struct bitmap *bitmap = mddev->bitmap; 1654 struct bitmap *bitmap = mddev->bitmap;
1560 long sleep; 1655 long sleep;
@@ -1586,15 +1681,11 @@ static void bitmap_free(struct bitmap *bitmap)
1586 if (!bitmap) /* there was no bitmap */ 1681 if (!bitmap) /* there was no bitmap */
1587 return; 1682 return;
1588 1683
1589 /* Shouldn't be needed - but just in case.... */ 1684 /* release the bitmap file and kill the daemon */
1590 wait_event(bitmap->write_wait, 1685 bitmap_file_put(bitmap);
1591 atomic_read(&bitmap->pending_writes) == 0);
1592
1593 /* release the bitmap file */
1594 bitmap_file_unmap(&bitmap->storage);
1595 1686
1596 bp = bitmap->counts.bp; 1687 bp = bitmap->bp;
1597 pages = bitmap->counts.pages; 1688 pages = bitmap->pages;
1598 1689
1599 /* free all allocated memory */ 1690 /* free all allocated memory */
1600 1691
@@ -1606,7 +1697,7 @@ static void bitmap_free(struct bitmap *bitmap)
1606 kfree(bitmap); 1697 kfree(bitmap);
1607} 1698}
1608 1699
1609void bitmap_destroy(struct mddev *mddev) 1700void bitmap_destroy(mddev_t *mddev)
1610{ 1701{
1611 struct bitmap *bitmap = mddev->bitmap; 1702 struct bitmap *bitmap = mddev->bitmap;
1612 1703
@@ -1629,23 +1720,29 @@ void bitmap_destroy(struct mddev *mddev)
1629 * initialize the bitmap structure 1720 * initialize the bitmap structure
1630 * if this returns an error, bitmap_destroy must be called to do clean up 1721 * if this returns an error, bitmap_destroy must be called to do clean up
1631 */ 1722 */
1632int bitmap_create(struct mddev *mddev) 1723int bitmap_create(mddev_t *mddev)
1633{ 1724{
1634 struct bitmap *bitmap; 1725 struct bitmap *bitmap;
1635 sector_t blocks = mddev->resync_max_sectors; 1726 sector_t blocks = mddev->resync_max_sectors;
1727 unsigned long chunks;
1728 unsigned long pages;
1636 struct file *file = mddev->bitmap_info.file; 1729 struct file *file = mddev->bitmap_info.file;
1637 int err; 1730 int err;
1638 struct sysfs_dirent *bm = NULL; 1731 struct sysfs_dirent *bm = NULL;
1639 1732
1640 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); 1733 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
1641 1734
1735 if (!file
1736 && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */
1737 return 0;
1738
1642 BUG_ON(file && mddev->bitmap_info.offset); 1739 BUG_ON(file && mddev->bitmap_info.offset);
1643 1740
1644 bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); 1741 bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
1645 if (!bitmap) 1742 if (!bitmap)
1646 return -ENOMEM; 1743 return -ENOMEM;
1647 1744
1648 spin_lock_init(&bitmap->counts.lock); 1745 spin_lock_init(&bitmap->lock);
1649 atomic_set(&bitmap->pending_writes, 0); 1746 atomic_set(&bitmap->pending_writes, 0);
1650 init_waitqueue_head(&bitmap->write_wait); 1747 init_waitqueue_head(&bitmap->write_wait);
1651 init_waitqueue_head(&bitmap->overflow_wait); 1748 init_waitqueue_head(&bitmap->overflow_wait);
@@ -1661,7 +1758,7 @@ int bitmap_create(struct mddev *mddev)
1661 } else 1758 } else
1662 bitmap->sysfs_can_clear = NULL; 1759 bitmap->sysfs_can_clear = NULL;
1663 1760
1664 bitmap->storage.file = file; 1761 bitmap->file = file;
1665 if (file) { 1762 if (file) {
1666 get_file(file); 1763 get_file(file);
1667 /* As future accesses to this file will use bmap, 1764 /* As future accesses to this file will use bmap,
@@ -1692,22 +1789,42 @@ int bitmap_create(struct mddev *mddev)
1692 goto error; 1789 goto error;
1693 1790
1694 bitmap->daemon_lastrun = jiffies; 1791 bitmap->daemon_lastrun = jiffies;
1695 err = bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1); 1792 bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize);
1696 if (err) 1793
1794 /* now that chunksize and chunkshift are set, we can use these macros */
1795 chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >>
1796 CHUNK_BLOCK_SHIFT(bitmap);
1797 pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO;
1798
1799 BUG_ON(!pages);
1800
1801 bitmap->chunks = chunks;
1802 bitmap->pages = pages;
1803 bitmap->missing_pages = pages;
1804
1805#ifdef INJECT_FATAL_FAULT_1
1806 bitmap->bp = NULL;
1807#else
1808 bitmap->bp = kzalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL);
1809#endif
1810 err = -ENOMEM;
1811 if (!bitmap->bp)
1697 goto error; 1812 goto error;
1698 1813
1699 printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", 1814 printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
1700 bitmap->counts.pages, bmname(bitmap)); 1815 pages, bmname(bitmap));
1701 1816
1702 mddev->bitmap = bitmap; 1817 mddev->bitmap = bitmap;
1703 return test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; 1818
1819
1820 return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0;
1704 1821
1705 error: 1822 error:
1706 bitmap_free(bitmap); 1823 bitmap_free(bitmap);
1707 return err; 1824 return err;
1708} 1825}
1709 1826
1710int bitmap_load(struct mddev *mddev) 1827int bitmap_load(mddev_t *mddev)
1711{ 1828{
1712 int err = 0; 1829 int err = 0;
1713 sector_t start = 0; 1830 sector_t start = 0;
@@ -1735,222 +1852,25 @@ int bitmap_load(struct mddev *mddev)
1735 * re-add of a missing device */ 1852 * re-add of a missing device */
1736 start = mddev->recovery_cp; 1853 start = mddev->recovery_cp;
1737 1854
1738 mutex_lock(&mddev->bitmap_info.mutex);
1739 err = bitmap_init_from_disk(bitmap, start); 1855 err = bitmap_init_from_disk(bitmap, start);
1740 mutex_unlock(&mddev->bitmap_info.mutex);
1741 1856
1742 if (err) 1857 if (err)
1743 goto out; 1858 goto out;
1744 clear_bit(BITMAP_STALE, &bitmap->flags);
1745
1746 /* Kick recovery in case any bits were set */
1747 set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
1748 1859
1749 mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; 1860 mddev->thread->timeout = mddev->bitmap_info.daemon_sleep;
1750 md_wakeup_thread(mddev->thread); 1861 md_wakeup_thread(mddev->thread);
1751 1862
1752 bitmap_update_sb(bitmap); 1863 bitmap_update_sb(bitmap);
1753 1864
1754 if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) 1865 if (bitmap->flags & BITMAP_WRITE_ERROR)
1755 err = -EIO; 1866 err = -EIO;
1756out: 1867out:
1757 return err; 1868 return err;
1758} 1869}
1759EXPORT_SYMBOL_GPL(bitmap_load); 1870EXPORT_SYMBOL_GPL(bitmap_load);
1760 1871
1761void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
1762{
1763 unsigned long chunk_kb;
1764 struct bitmap_counts *counts;
1765
1766 if (!bitmap)
1767 return;
1768
1769 counts = &bitmap->counts;
1770
1771 chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10;
1772 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
1773 "%lu%s chunk",
1774 counts->pages - counts->missing_pages,
1775 counts->pages,
1776 (counts->pages - counts->missing_pages)
1777 << (PAGE_SHIFT - 10),
1778 chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize,
1779 chunk_kb ? "KB" : "B");
1780 if (bitmap->storage.file) {
1781 seq_printf(seq, ", file: ");
1782 seq_path(seq, &bitmap->storage.file->f_path, " \t\n");
1783 }
1784
1785 seq_printf(seq, "\n");
1786}
1787
1788int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
1789 int chunksize, int init)
1790{
1791 /* If chunk_size is 0, choose an appropriate chunk size.
1792 * Then possibly allocate new storage space.
1793 * Then quiesce, copy bits, replace bitmap, and re-start
1794 *
1795 * This function is called both to set up the initial bitmap
1796 * and to resize the bitmap while the array is active.
1797 * If this happens as a result of the array being resized,
1798 * chunksize will be zero, and we need to choose a suitable
1799 * chunksize, otherwise we use what we are given.
1800 */
1801 struct bitmap_storage store;
1802 struct bitmap_counts old_counts;
1803 unsigned long chunks;
1804 sector_t block;
1805 sector_t old_blocks, new_blocks;
1806 int chunkshift;
1807 int ret = 0;
1808 long pages;
1809 struct bitmap_page *new_bp;
1810
1811 if (chunksize == 0) {
1812 /* If there is enough space, leave the chunk size unchanged,
1813 * else increase by factor of two until there is enough space.
1814 */
1815 long bytes;
1816 long space = bitmap->mddev->bitmap_info.space;
1817
1818 if (space == 0) {
1819 /* We don't know how much space there is, so limit
1820 * to current size - in sectors.
1821 */
1822 bytes = DIV_ROUND_UP(bitmap->counts.chunks, 8);
1823 if (!bitmap->mddev->bitmap_info.external)
1824 bytes += sizeof(bitmap_super_t);
1825 space = DIV_ROUND_UP(bytes, 512);
1826 bitmap->mddev->bitmap_info.space = space;
1827 }
1828 chunkshift = bitmap->counts.chunkshift;
1829 chunkshift--;
1830 do {
1831 /* 'chunkshift' is shift from block size to chunk size */
1832 chunkshift++;
1833 chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift);
1834 bytes = DIV_ROUND_UP(chunks, 8);
1835 if (!bitmap->mddev->bitmap_info.external)
1836 bytes += sizeof(bitmap_super_t);
1837 } while (bytes > (space << 9));
1838 } else
1839 chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT;
1840
1841 chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift);
1842 memset(&store, 0, sizeof(store));
1843 if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file)
1844 ret = bitmap_storage_alloc(&store, chunks,
1845 !bitmap->mddev->bitmap_info.external);
1846 if (ret)
1847 goto err;
1848
1849 pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO);
1850
1851 new_bp = kzalloc(pages * sizeof(*new_bp), GFP_KERNEL);
1852 ret = -ENOMEM;
1853 if (!new_bp) {
1854 bitmap_file_unmap(&store);
1855 goto err;
1856 }
1857
1858 if (!init)
1859 bitmap->mddev->pers->quiesce(bitmap->mddev, 1);
1860
1861 store.file = bitmap->storage.file;
1862 bitmap->storage.file = NULL;
1863
1864 if (store.sb_page && bitmap->storage.sb_page)
1865 memcpy(page_address(store.sb_page),
1866 page_address(bitmap->storage.sb_page),
1867 sizeof(bitmap_super_t));
1868 bitmap_file_unmap(&bitmap->storage);
1869 bitmap->storage = store;
1870
1871 old_counts = bitmap->counts;
1872 bitmap->counts.bp = new_bp;
1873 bitmap->counts.pages = pages;
1874 bitmap->counts.missing_pages = pages;
1875 bitmap->counts.chunkshift = chunkshift;
1876 bitmap->counts.chunks = chunks;
1877 bitmap->mddev->bitmap_info.chunksize = 1 << (chunkshift +
1878 BITMAP_BLOCK_SHIFT);
1879
1880 blocks = min(old_counts.chunks << old_counts.chunkshift,
1881 chunks << chunkshift);
1882
1883 spin_lock_irq(&bitmap->counts.lock);
1884 for (block = 0; block < blocks; ) {
1885 bitmap_counter_t *bmc_old, *bmc_new;
1886 int set;
1887
1888 bmc_old = bitmap_get_counter(&old_counts, block,
1889 &old_blocks, 0);
1890 set = bmc_old && NEEDED(*bmc_old);
1891
1892 if (set) {
1893 bmc_new = bitmap_get_counter(&bitmap->counts, block,
1894 &new_blocks, 1);
1895 if (*bmc_new == 0) {
1896 /* need to set on-disk bits too. */
1897 sector_t end = block + new_blocks;
1898 sector_t start = block >> chunkshift;
1899 start <<= chunkshift;
1900 while (start < end) {
1901 bitmap_file_set_bit(bitmap, block);
1902 start += 1 << chunkshift;
1903 }
1904 *bmc_new = 2;
1905 bitmap_count_page(&bitmap->counts,
1906 block, 1);
1907 bitmap_set_pending(&bitmap->counts,
1908 block);
1909 }
1910 *bmc_new |= NEEDED_MASK;
1911 if (new_blocks < old_blocks)
1912 old_blocks = new_blocks;
1913 }
1914 block += old_blocks;
1915 }
1916
1917 if (!init) {
1918 int i;
1919 while (block < (chunks << chunkshift)) {
1920 bitmap_counter_t *bmc;
1921 bmc = bitmap_get_counter(&bitmap->counts, block,
1922 &new_blocks, 1);
1923 if (bmc) {
1924 /* new space. It needs to be resynced, so
1925 * we set NEEDED_MASK.
1926 */
1927 if (*bmc == 0) {
1928 *bmc = NEEDED_MASK | 2;
1929 bitmap_count_page(&bitmap->counts,
1930 block, 1);
1931 bitmap_set_pending(&bitmap->counts,
1932 block);
1933 }
1934 }
1935 block += new_blocks;
1936 }
1937 for (i = 0; i < bitmap->storage.file_pages; i++)
1938 set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
1939 }
1940 spin_unlock_irq(&bitmap->counts.lock);
1941
1942 if (!init) {
1943 bitmap_unplug(bitmap);
1944 bitmap->mddev->pers->quiesce(bitmap->mddev, 0);
1945 }
1946 ret = 0;
1947err:
1948 return ret;
1949}
1950EXPORT_SYMBOL_GPL(bitmap_resize);
1951
1952static ssize_t 1872static ssize_t
1953location_show(struct mddev *mddev, char *page) 1873location_show(mddev_t *mddev, char *page)
1954{ 1874{
1955 ssize_t len; 1875 ssize_t len;
1956 if (mddev->bitmap_info.file) 1876 if (mddev->bitmap_info.file)
@@ -1964,7 +1884,7 @@ location_show(struct mddev *mddev, char *page)
1964} 1884}
1965 1885
1966static ssize_t 1886static ssize_t
1967location_store(struct mddev *mddev, const char *buf, size_t len) 1887location_store(mddev_t *mddev, const char *buf, size_t len)
1968{ 1888{
1969 1889
1970 if (mddev->pers) { 1890 if (mddev->pers) {
@@ -2017,8 +1937,6 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
2017 if (mddev->pers) { 1937 if (mddev->pers) {
2018 mddev->pers->quiesce(mddev, 1); 1938 mddev->pers->quiesce(mddev, 1);
2019 rv = bitmap_create(mddev); 1939 rv = bitmap_create(mddev);
2020 if (!rv)
2021 rv = bitmap_load(mddev);
2022 if (rv) { 1940 if (rv) {
2023 bitmap_destroy(mddev); 1941 bitmap_destroy(mddev);
2024 mddev->bitmap_info.offset = 0; 1942 mddev->bitmap_info.offset = 0;
@@ -2042,45 +1960,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
2042static struct md_sysfs_entry bitmap_location = 1960static struct md_sysfs_entry bitmap_location =
2043__ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store); 1961__ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store);
2044 1962
2045/* 'bitmap/space' is the space available at 'location' for the
2046 * bitmap. This allows the kernel to know when it is safe to
2047 * resize the bitmap to match a resized array.
2048 */
2049static ssize_t
2050space_show(struct mddev *mddev, char *page)
2051{
2052 return sprintf(page, "%lu\n", mddev->bitmap_info.space);
2053}
2054
2055static ssize_t
2056space_store(struct mddev *mddev, const char *buf, size_t len)
2057{
2058 unsigned long sectors;
2059 int rv;
2060
2061 rv = kstrtoul(buf, 10, &sectors);
2062 if (rv)
2063 return rv;
2064
2065 if (sectors == 0)
2066 return -EINVAL;
2067
2068 if (mddev->bitmap &&
2069 sectors < (mddev->bitmap->storage.bytes + 511) >> 9)
2070 return -EFBIG; /* Bitmap is too big for this small space */
2071
2072 /* could make sure it isn't too big, but that isn't really
2073 * needed - user-space should be careful.
2074 */
2075 mddev->bitmap_info.space = sectors;
2076 return len;
2077}
2078
2079static struct md_sysfs_entry bitmap_space =
2080__ATTR(space, S_IRUGO|S_IWUSR, space_show, space_store);
2081
2082static ssize_t 1963static ssize_t
2083timeout_show(struct mddev *mddev, char *page) 1964timeout_show(mddev_t *mddev, char *page)
2084{ 1965{
2085 ssize_t len; 1966 ssize_t len;
2086 unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ; 1967 unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ;
@@ -2094,7 +1975,7 @@ timeout_show(struct mddev *mddev, char *page)
2094} 1975}
2095 1976
2096static ssize_t 1977static ssize_t
2097timeout_store(struct mddev *mddev, const char *buf, size_t len) 1978timeout_store(mddev_t *mddev, const char *buf, size_t len)
2098{ 1979{
2099 /* timeout can be set at any time */ 1980 /* timeout can be set at any time */
2100 unsigned long timeout; 1981 unsigned long timeout;
@@ -2130,13 +2011,13 @@ static struct md_sysfs_entry bitmap_timeout =
2130__ATTR(time_base, S_IRUGO|S_IWUSR, timeout_show, timeout_store); 2011__ATTR(time_base, S_IRUGO|S_IWUSR, timeout_show, timeout_store);
2131 2012
2132static ssize_t 2013static ssize_t
2133backlog_show(struct mddev *mddev, char *page) 2014backlog_show(mddev_t *mddev, char *page)
2134{ 2015{
2135 return sprintf(page, "%lu\n", mddev->bitmap_info.max_write_behind); 2016 return sprintf(page, "%lu\n", mddev->bitmap_info.max_write_behind);
2136} 2017}
2137 2018
2138static ssize_t 2019static ssize_t
2139backlog_store(struct mddev *mddev, const char *buf, size_t len) 2020backlog_store(mddev_t *mddev, const char *buf, size_t len)
2140{ 2021{
2141 unsigned long backlog; 2022 unsigned long backlog;
2142 int rv = strict_strtoul(buf, 10, &backlog); 2023 int rv = strict_strtoul(buf, 10, &backlog);
@@ -2152,13 +2033,13 @@ static struct md_sysfs_entry bitmap_backlog =
2152__ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store); 2033__ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store);
2153 2034
2154static ssize_t 2035static ssize_t
2155chunksize_show(struct mddev *mddev, char *page) 2036chunksize_show(mddev_t *mddev, char *page)
2156{ 2037{
2157 return sprintf(page, "%lu\n", mddev->bitmap_info.chunksize); 2038 return sprintf(page, "%lu\n", mddev->bitmap_info.chunksize);
2158} 2039}
2159 2040
2160static ssize_t 2041static ssize_t
2161chunksize_store(struct mddev *mddev, const char *buf, size_t len) 2042chunksize_store(mddev_t *mddev, const char *buf, size_t len)
2162{ 2043{
2163 /* Can only be changed when no bitmap is active */ 2044 /* Can only be changed when no bitmap is active */
2164 int rv; 2045 int rv;
@@ -2178,13 +2059,13 @@ chunksize_store(struct mddev *mddev, const char *buf, size_t len)
2178static struct md_sysfs_entry bitmap_chunksize = 2059static struct md_sysfs_entry bitmap_chunksize =
2179__ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store); 2060__ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store);
2180 2061
2181static ssize_t metadata_show(struct mddev *mddev, char *page) 2062static ssize_t metadata_show(mddev_t *mddev, char *page)
2182{ 2063{
2183 return sprintf(page, "%s\n", (mddev->bitmap_info.external 2064 return sprintf(page, "%s\n", (mddev->bitmap_info.external
2184 ? "external" : "internal")); 2065 ? "external" : "internal"));
2185} 2066}
2186 2067
2187static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len) 2068static ssize_t metadata_store(mddev_t *mddev, const char *buf, size_t len)
2188{ 2069{
2189 if (mddev->bitmap || 2070 if (mddev->bitmap ||
2190 mddev->bitmap_info.file || 2071 mddev->bitmap_info.file ||
@@ -2202,7 +2083,7 @@ static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len)
2202static struct md_sysfs_entry bitmap_metadata = 2083static struct md_sysfs_entry bitmap_metadata =
2203__ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 2084__ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
2204 2085
2205static ssize_t can_clear_show(struct mddev *mddev, char *page) 2086static ssize_t can_clear_show(mddev_t *mddev, char *page)
2206{ 2087{
2207 int len; 2088 int len;
2208 if (mddev->bitmap) 2089 if (mddev->bitmap)
@@ -2213,7 +2094,7 @@ static ssize_t can_clear_show(struct mddev *mddev, char *page)
2213 return len; 2094 return len;
2214} 2095}
2215 2096
2216static ssize_t can_clear_store(struct mddev *mddev, const char *buf, size_t len) 2097static ssize_t can_clear_store(mddev_t *mddev, const char *buf, size_t len)
2217{ 2098{
2218 if (mddev->bitmap == NULL) 2099 if (mddev->bitmap == NULL)
2219 return -ENOENT; 2100 return -ENOENT;
@@ -2232,7 +2113,7 @@ static struct md_sysfs_entry bitmap_can_clear =
2232__ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); 2113__ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store);
2233 2114
2234static ssize_t 2115static ssize_t
2235behind_writes_used_show(struct mddev *mddev, char *page) 2116behind_writes_used_show(mddev_t *mddev, char *page)
2236{ 2117{
2237 if (mddev->bitmap == NULL) 2118 if (mddev->bitmap == NULL)
2238 return sprintf(page, "0\n"); 2119 return sprintf(page, "0\n");
@@ -2241,7 +2122,7 @@ behind_writes_used_show(struct mddev *mddev, char *page)
2241} 2122}
2242 2123
2243static ssize_t 2124static ssize_t
2244behind_writes_used_reset(struct mddev *mddev, const char *buf, size_t len) 2125behind_writes_used_reset(mddev_t *mddev, const char *buf, size_t len)
2245{ 2126{
2246 if (mddev->bitmap) 2127 if (mddev->bitmap)
2247 mddev->bitmap->behind_writes_used = 0; 2128 mddev->bitmap->behind_writes_used = 0;
@@ -2254,7 +2135,6 @@ __ATTR(max_backlog_used, S_IRUGO | S_IWUSR,
2254 2135
2255static struct attribute *md_bitmap_attrs[] = { 2136static struct attribute *md_bitmap_attrs[] = {
2256 &bitmap_location.attr, 2137 &bitmap_location.attr,
2257 &bitmap_space.attr,
2258 &bitmap_timeout.attr, 2138 &bitmap_timeout.attr,
2259 &bitmap_backlog.attr, 2139 &bitmap_backlog.attr,
2260 &bitmap_chunksize.attr, 2140 &bitmap_chunksize.attr,
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index df4aeb6ac6f..a28f2e5588c 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -13,6 +13,8 @@
13#define BITMAP_MAJOR_HI 4 13#define BITMAP_MAJOR_HI 4
14#define BITMAP_MAJOR_HOSTENDIAN 3 14#define BITMAP_MAJOR_HOSTENDIAN 3
15 15
16#define BITMAP_MINOR 39
17
16/* 18/*
17 * in-memory bitmap: 19 * in-memory bitmap:
18 * 20 *
@@ -99,8 +101,22 @@ typedef __u16 bitmap_counter_t;
99/* same, except a mask value for more efficient bitops */ 101/* same, except a mask value for more efficient bitops */
100#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) 102#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
101 103
104#define BITMAP_BLOCK_SIZE 512
102#define BITMAP_BLOCK_SHIFT 9 105#define BITMAP_BLOCK_SHIFT 9
103 106
107/* how many blocks per chunk? (this is variable) */
108#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT)
109#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
110#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
111
112/* when hijacked, the counters and bits represent even larger "chunks" */
113/* there will be 1024 chunks represented by each counter in the page pointers */
114#define PAGEPTR_BLOCK_RATIO(bitmap) \
115 (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1)
116#define PAGEPTR_BLOCK_SHIFT(bitmap) \
117 (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
118#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
119
104#endif 120#endif
105 121
106/* 122/*
@@ -111,9 +127,9 @@ typedef __u16 bitmap_counter_t;
111 127
112/* use these for bitmap->flags and bitmap->sb->state bit-fields */ 128/* use these for bitmap->flags and bitmap->sb->state bit-fields */
113enum bitmap_state { 129enum bitmap_state {
114 BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */ 130 BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */
115 BITMAP_WRITE_ERROR = 2, /* A write error has occurred */ 131 BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */
116 BITMAP_HOSTENDIAN =15, 132 BITMAP_HOSTENDIAN = 0x8000,
117}; 133};
118 134
119/* the superblock at the front of the bitmap file -- little endian */ 135/* the superblock at the front of the bitmap file -- little endian */
@@ -128,10 +144,8 @@ typedef struct bitmap_super_s {
128 __le32 chunksize; /* 52 the bitmap chunk size in bytes */ 144 __le32 chunksize; /* 52 the bitmap chunk size in bytes */
129 __le32 daemon_sleep; /* 56 seconds between disk flushes */ 145 __le32 daemon_sleep; /* 56 seconds between disk flushes */
130 __le32 write_behind; /* 60 number of outstanding write-behind writes */ 146 __le32 write_behind; /* 60 number of outstanding write-behind writes */
131 __le32 sectors_reserved; /* 64 number of 512-byte sectors that are
132 * reserved for the bitmap. */
133 147
134 __u8 pad[256 - 68]; /* set to zero */ 148 __u8 pad[256 - 64]; /* set to zero */
135} bitmap_super_t; 149} bitmap_super_t;
136 150
137/* notes: 151/* notes:
@@ -162,48 +176,41 @@ struct bitmap_page {
162 */ 176 */
163 unsigned int hijacked:1; 177 unsigned int hijacked:1;
164 /* 178 /*
165 * If any counter in this page is '1' or '2' - and so could be
166 * cleared then that page is marked as 'pending'
167 */
168 unsigned int pending:1;
169 /*
170 * count of dirty bits on the page 179 * count of dirty bits on the page
171 */ 180 */
172 unsigned int count:30; 181 unsigned int count:31;
182};
183
184/* keep track of bitmap file pages that have pending writes on them */
185struct page_list {
186 struct list_head list;
187 struct page *page;
173}; 188};
174 189
175/* the main bitmap structure - one per mddev */ 190/* the main bitmap structure - one per mddev */
176struct bitmap { 191struct bitmap {
192 struct bitmap_page *bp;
193 unsigned long pages; /* total number of pages in the bitmap */
194 unsigned long missing_pages; /* number of pages not yet allocated */
195
196 mddev_t *mddev; /* the md device that the bitmap is for */
177 197
178 struct bitmap_counts { 198 /* bitmap chunksize -- how much data does each bit represent? */
179 spinlock_t lock; 199 unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
180 struct bitmap_page *bp; 200 unsigned long chunks; /* total number of data chunks for the array */
181 unsigned long pages; /* total number of pages
182 * in the bitmap */
183 unsigned long missing_pages; /* number of pages
184 * not yet allocated */
185 unsigned long chunkshift; /* chunksize = 2^chunkshift
186 * (for bitops) */
187 unsigned long chunks; /* Total number of data
188 * chunks for the array */
189 } counts;
190
191 struct mddev *mddev; /* the md device that the bitmap is for */
192 201
193 __u64 events_cleared; 202 __u64 events_cleared;
194 int need_sync; 203 int need_sync;
195 204
196 struct bitmap_storage { 205 /* bitmap spinlock */
197 struct file *file; /* backing disk file */ 206 spinlock_t lock;
198 struct page *sb_page; /* cached copy of the bitmap 207
199 * file superblock */ 208 struct file *file; /* backing disk file */
200 struct page **filemap; /* list of cache pages for 209 struct page *sb_page; /* cached copy of the bitmap file superblock */
201 * the file */ 210 struct page **filemap; /* list of cache pages for the file */
202 unsigned long *filemap_attr; /* attributes associated 211 unsigned long *filemap_attr; /* attributes associated w/ filemap pages */
203 * w/ filemap pages */ 212 unsigned long file_pages; /* number of pages in the file */
204 unsigned long file_pages; /* number of pages in the file*/ 213 int last_page_size; /* bytes in the last page */
205 unsigned long bytes; /* total bytes in the bitmap */
206 } storage;
207 214
208 unsigned long flags; 215 unsigned long flags;
209 216
@@ -231,14 +238,13 @@ struct bitmap {
231/* the bitmap API */ 238/* the bitmap API */
232 239
233/* these are used only by md/bitmap */ 240/* these are used only by md/bitmap */
234int bitmap_create(struct mddev *mddev); 241int bitmap_create(mddev_t *mddev);
235int bitmap_load(struct mddev *mddev); 242int bitmap_load(mddev_t *mddev);
236void bitmap_flush(struct mddev *mddev); 243void bitmap_flush(mddev_t *mddev);
237void bitmap_destroy(struct mddev *mddev); 244void bitmap_destroy(mddev_t *mddev);
238 245
239void bitmap_print_sb(struct bitmap *bitmap); 246void bitmap_print_sb(struct bitmap *bitmap);
240void bitmap_update_sb(struct bitmap *bitmap); 247void bitmap_update_sb(struct bitmap *bitmap);
241void bitmap_status(struct seq_file *seq, struct bitmap *bitmap);
242 248
243int bitmap_setallbits(struct bitmap *bitmap); 249int bitmap_setallbits(struct bitmap *bitmap);
244void bitmap_write_all(struct bitmap *bitmap); 250void bitmap_write_all(struct bitmap *bitmap);
@@ -256,10 +262,7 @@ void bitmap_close_sync(struct bitmap *bitmap);
256void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); 262void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
257 263
258void bitmap_unplug(struct bitmap *bitmap); 264void bitmap_unplug(struct bitmap *bitmap);
259void bitmap_daemon_work(struct mddev *mddev); 265void bitmap_daemon_work(mddev_t *mddev);
260
261int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
262 int chunksize, int init);
263#endif 266#endif
264 267
265#endif 268#endif
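The CHUNK_BLOCK_* macros reinstated in the header above turn a sector offset into a chunk index, and the PAGE_COUNTER_* constants then locate the 16-bit counter for that chunk. A small stand-alone sketch of that arithmetic; the 64 MiB chunk size and 4 KiB page size are example assumptions, while the 512-byte block shift and the __u16 counter width do come from the header:

#include <stdio.h>
#include <stdint.h>

#define BITMAP_BLOCK_SHIFT 9                                /* 512-byte blocks, from the header */
#define PAGE_SIZE          4096UL                           /* assumed for the example */
#define COUNTERS_PER_PAGE  (PAGE_SIZE / sizeof(uint16_t))   /* bitmap_counter_t is __u16 */

int main(void)
{
	unsigned long chunkshift = 26;                      /* 2^26 = 64 MiB chunks (example) */
	unsigned long chunk_block_shift = chunkshift - BITMAP_BLOCK_SHIFT;  /* CHUNK_BLOCK_SHIFT() */

	uint64_t offset = 123456789;                        /* array offset in 512-byte sectors */
	uint64_t chunk  = offset >> chunk_block_shift;      /* which chunk covers this sector */
	uint64_t page   = chunk / COUNTERS_PER_PAGE;        /* which counter page */
	uint64_t slot   = chunk % COUNTERS_PER_PAGE;        /* counter index within that page */

	printf("sector %llu -> chunk %llu (page %llu, slot %llu)\n",
	       (unsigned long long)offset, (unsigned long long)chunk,
	       (unsigned long long)page, (unsigned long long)slot);
	return 0;
}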
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
deleted file mode 100644
index aefb78e3cbf..00000000000
--- a/drivers/md/dm-bio-prison.c
+++ /dev/null
@@ -1,390 +0,0 @@
1/*
2 * Copyright (C) 2012 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm.h"
8#include "dm-bio-prison.h"
9
10#include <linux/spinlock.h>
11#include <linux/mempool.h>
12#include <linux/module.h>
13#include <linux/slab.h>
14
15/*----------------------------------------------------------------*/
16
17struct dm_bio_prison_cell {
18 struct hlist_node list;
19 struct dm_bio_prison *prison;
20 struct dm_cell_key key;
21 struct bio *holder;
22 struct bio_list bios;
23};
24
25struct dm_bio_prison {
26 spinlock_t lock;
27 mempool_t *cell_pool;
28
29 unsigned nr_buckets;
30 unsigned hash_mask;
31 struct hlist_head *cells;
32};
33
34/*----------------------------------------------------------------*/
35
36static uint32_t calc_nr_buckets(unsigned nr_cells)
37{
38 uint32_t n = 128;
39
40 nr_cells /= 4;
41 nr_cells = min(nr_cells, 8192u);
42
43 while (n < nr_cells)
44 n <<= 1;
45
46 return n;
47}
48
49static struct kmem_cache *_cell_cache;
50
51/*
52 * @nr_cells should be the number of cells you want in use _concurrently_.
53 * Don't confuse it with the number of distinct keys.
54 */
55struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells)
56{
57 unsigned i;
58 uint32_t nr_buckets = calc_nr_buckets(nr_cells);
59 size_t len = sizeof(struct dm_bio_prison) +
60 (sizeof(struct hlist_head) * nr_buckets);
61 struct dm_bio_prison *prison = kmalloc(len, GFP_KERNEL);
62
63 if (!prison)
64 return NULL;
65
66 spin_lock_init(&prison->lock);
67 prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache);
68 if (!prison->cell_pool) {
69 kfree(prison);
70 return NULL;
71 }
72
73 prison->nr_buckets = nr_buckets;
74 prison->hash_mask = nr_buckets - 1;
75 prison->cells = (struct hlist_head *) (prison + 1);
76 for (i = 0; i < nr_buckets; i++)
77 INIT_HLIST_HEAD(prison->cells + i);
78
79 return prison;
80}
81EXPORT_SYMBOL_GPL(dm_bio_prison_create);
82
83void dm_bio_prison_destroy(struct dm_bio_prison *prison)
84{
85 mempool_destroy(prison->cell_pool);
86 kfree(prison);
87}
88EXPORT_SYMBOL_GPL(dm_bio_prison_destroy);
89
90static uint32_t hash_key(struct dm_bio_prison *prison, struct dm_cell_key *key)
91{
92 const unsigned long BIG_PRIME = 4294967291UL;
93 uint64_t hash = key->block * BIG_PRIME;
94
95 return (uint32_t) (hash & prison->hash_mask);
96}
97
98static int keys_equal(struct dm_cell_key *lhs, struct dm_cell_key *rhs)
99{
100 return (lhs->virtual == rhs->virtual) &&
101 (lhs->dev == rhs->dev) &&
102 (lhs->block == rhs->block);
103}
104
105static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket,
106 struct dm_cell_key *key)
107{
108 struct dm_bio_prison_cell *cell;
109 struct hlist_node *tmp;
110
111 hlist_for_each_entry(cell, tmp, bucket, list)
112 if (keys_equal(&cell->key, key))
113 return cell;
114
115 return NULL;
116}
117
118/*
119 * This may block if a new cell needs allocating. You must ensure that
120 * cells will be unlocked even if the calling thread is blocked.
121 *
122 * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
123 */
124int dm_bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key,
125 struct bio *inmate, struct dm_bio_prison_cell **ref)
126{
127 int r = 1;
128 unsigned long flags;
129 uint32_t hash = hash_key(prison, key);
130 struct dm_bio_prison_cell *cell, *cell2;
131
132 BUG_ON(hash > prison->nr_buckets);
133
134 spin_lock_irqsave(&prison->lock, flags);
135
136 cell = __search_bucket(prison->cells + hash, key);
137 if (cell) {
138 bio_list_add(&cell->bios, inmate);
139 goto out;
140 }
141
142 /*
143 * Allocate a new cell
144 */
145 spin_unlock_irqrestore(&prison->lock, flags);
146 cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
147 spin_lock_irqsave(&prison->lock, flags);
148
149 /*
150 * We've been unlocked, so we have to double check that
151 * nobody else has inserted this cell in the meantime.
152 */
153 cell = __search_bucket(prison->cells + hash, key);
154 if (cell) {
155 mempool_free(cell2, prison->cell_pool);
156 bio_list_add(&cell->bios, inmate);
157 goto out;
158 }
159
160 /*
161 * Use new cell.
162 */
163 cell = cell2;
164
165 cell->prison = prison;
166 memcpy(&cell->key, key, sizeof(cell->key));
167 cell->holder = inmate;
168 bio_list_init(&cell->bios);
169 hlist_add_head(&cell->list, prison->cells + hash);
170
171 r = 0;
172
173out:
174 spin_unlock_irqrestore(&prison->lock, flags);
175
176 *ref = cell;
177
178 return r;
179}
180EXPORT_SYMBOL_GPL(dm_bio_detain);
181
182/*
183 * @inmates must have been initialised prior to this call
184 */
185static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
186{
187 struct dm_bio_prison *prison = cell->prison;
188
189 hlist_del(&cell->list);
190
191 if (inmates) {
192 bio_list_add(inmates, cell->holder);
193 bio_list_merge(inmates, &cell->bios);
194 }
195
196 mempool_free(cell, prison->cell_pool);
197}
198
199void dm_cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios)
200{
201 unsigned long flags;
202 struct dm_bio_prison *prison = cell->prison;
203
204 spin_lock_irqsave(&prison->lock, flags);
205 __cell_release(cell, bios);
206 spin_unlock_irqrestore(&prison->lock, flags);
207}
208EXPORT_SYMBOL_GPL(dm_cell_release);
209
210/*
211 * Sometimes we don't want the holder, just the additional bios.
212 */
213static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
214{
215 struct dm_bio_prison *prison = cell->prison;
216
217 hlist_del(&cell->list);
218 bio_list_merge(inmates, &cell->bios);
219
220 mempool_free(cell, prison->cell_pool);
221}
222
223void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
224{
225 unsigned long flags;
226 struct dm_bio_prison *prison = cell->prison;
227
228 spin_lock_irqsave(&prison->lock, flags);
229 __cell_release_no_holder(cell, inmates);
230 spin_unlock_irqrestore(&prison->lock, flags);
231}
232EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
233
234void dm_cell_error(struct dm_bio_prison_cell *cell)
235{
236 struct dm_bio_prison *prison = cell->prison;
237 struct bio_list bios;
238 struct bio *bio;
239 unsigned long flags;
240
241 bio_list_init(&bios);
242
243 spin_lock_irqsave(&prison->lock, flags);
244 __cell_release(cell, &bios);
245 spin_unlock_irqrestore(&prison->lock, flags);
246
247 while ((bio = bio_list_pop(&bios)))
248 bio_io_error(bio);
249}
250EXPORT_SYMBOL_GPL(dm_cell_error);
251
252/*----------------------------------------------------------------*/
253
254#define DEFERRED_SET_SIZE 64
255
256struct dm_deferred_entry {
257 struct dm_deferred_set *ds;
258 unsigned count;
259 struct list_head work_items;
260};
261
262struct dm_deferred_set {
263 spinlock_t lock;
264 unsigned current_entry;
265 unsigned sweeper;
266 struct dm_deferred_entry entries[DEFERRED_SET_SIZE];
267};
268
269struct dm_deferred_set *dm_deferred_set_create(void)
270{
271 int i;
272 struct dm_deferred_set *ds;
273
274 ds = kmalloc(sizeof(*ds), GFP_KERNEL);
275 if (!ds)
276 return NULL;
277
278 spin_lock_init(&ds->lock);
279 ds->current_entry = 0;
280 ds->sweeper = 0;
281 for (i = 0; i < DEFERRED_SET_SIZE; i++) {
282 ds->entries[i].ds = ds;
283 ds->entries[i].count = 0;
284 INIT_LIST_HEAD(&ds->entries[i].work_items);
285 }
286
287 return ds;
288}
289EXPORT_SYMBOL_GPL(dm_deferred_set_create);
290
291void dm_deferred_set_destroy(struct dm_deferred_set *ds)
292{
293 kfree(ds);
294}
295EXPORT_SYMBOL_GPL(dm_deferred_set_destroy);
296
297struct dm_deferred_entry *dm_deferred_entry_inc(struct dm_deferred_set *ds)
298{
299 unsigned long flags;
300 struct dm_deferred_entry *entry;
301
302 spin_lock_irqsave(&ds->lock, flags);
303 entry = ds->entries + ds->current_entry;
304 entry->count++;
305 spin_unlock_irqrestore(&ds->lock, flags);
306
307 return entry;
308}
309EXPORT_SYMBOL_GPL(dm_deferred_entry_inc);
310
311static unsigned ds_next(unsigned index)
312{
313 return (index + 1) % DEFERRED_SET_SIZE;
314}
315
316static void __sweep(struct dm_deferred_set *ds, struct list_head *head)
317{
318 while ((ds->sweeper != ds->current_entry) &&
319 !ds->entries[ds->sweeper].count) {
320 list_splice_init(&ds->entries[ds->sweeper].work_items, head);
321 ds->sweeper = ds_next(ds->sweeper);
322 }
323
324 if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
325 list_splice_init(&ds->entries[ds->sweeper].work_items, head);
326}
327
328void dm_deferred_entry_dec(struct dm_deferred_entry *entry, struct list_head *head)
329{
330 unsigned long flags;
331
332 spin_lock_irqsave(&entry->ds->lock, flags);
333 BUG_ON(!entry->count);
334 --entry->count;
335 __sweep(entry->ds, head);
336 spin_unlock_irqrestore(&entry->ds->lock, flags);
337}
338EXPORT_SYMBOL_GPL(dm_deferred_entry_dec);
339
340/*
341 * Returns 1 if the work was deferred, or 0 if there were no pending items to delay the job.
342 */
343int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work)
344{
345 int r = 1;
346 unsigned long flags;
347 unsigned next_entry;
348
349 spin_lock_irqsave(&ds->lock, flags);
350 if ((ds->sweeper == ds->current_entry) &&
351 !ds->entries[ds->current_entry].count)
352 r = 0;
353 else {
354 list_add(work, &ds->entries[ds->current_entry].work_items);
355 next_entry = ds_next(ds->current_entry);
356 if (!ds->entries[next_entry].count)
357 ds->current_entry = next_entry;
358 }
359 spin_unlock_irqrestore(&ds->lock, flags);
360
361 return r;
362}
363EXPORT_SYMBOL_GPL(dm_deferred_set_add_work);
364
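/*
 * Illustrative sketch (added, not part of the original file): the usual
 * inc/dec/add_work pattern. The work item type and the example_run() handler
 * are hypothetical; only the dm_deferred_* calls are taken from above.
 */
#if 0	/* example only, never compiled */
struct example_work {
	struct list_head list;
	/* ... whatever the deferred job needs ... */
};

/* Issuer side: pin the current generation for the lifetime of a read. */
static struct dm_deferred_entry *example_begin_read(struct dm_deferred_set *ds)
{
	return dm_deferred_entry_inc(ds);
}

/* Completion side: drop the pin and run any work that is now unblocked. */
static void example_end_read(struct dm_deferred_entry *entry)
{
	LIST_HEAD(done);
	struct example_work *w, *tmp;

	dm_deferred_entry_dec(entry, &done);
	list_for_each_entry_safe(w, tmp, &done, list)
		example_run(w);			/* hypothetical handler */
}

/* Writer side: queue work until all earlier reads have drained. */
static void example_queue_or_run(struct dm_deferred_set *ds,
				 struct example_work *w)
{
	if (!dm_deferred_set_add_work(ds, &w->list))
		example_run(w);			/* nothing pending, run now */
}
#endif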
365/*----------------------------------------------------------------*/
366
367static int __init dm_bio_prison_init(void)
368{
369 _cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0);
370 if (!_cell_cache)
371 return -ENOMEM;
372
373 return 0;
374}
375
376static void __exit dm_bio_prison_exit(void)
377{
378 kmem_cache_destroy(_cell_cache);
379 _cell_cache = NULL;
380}
381
382/*
383 * module hooks
384 */
385module_init(dm_bio_prison_init);
386module_exit(dm_bio_prison_exit);
387
388MODULE_DESCRIPTION(DM_NAME " bio prison");
389MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
390MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h
deleted file mode 100644
index 53d1a7a84e2..00000000000
--- a/drivers/md/dm-bio-prison.h
+++ /dev/null
@@ -1,71 +0,0 @@
1/*
2 * Copyright (C) 2011-2012 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_BIO_PRISON_H
8#define DM_BIO_PRISON_H
9
10#include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */
11#include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */
12
13#include <linux/list.h>
14#include <linux/bio.h>
15
16/*----------------------------------------------------------------*/
17
18/*
19 * Sometimes we can't deal with a bio straight away. We put them in prison
20 * where they can't cause any mischief. Bios are put in a cell identified
21 * by a key, multiple bios can be in the same cell. When the cell is
22 * subsequently unlocked the bios become available.
23 */
24struct dm_bio_prison;
25struct dm_bio_prison_cell;
26
27/* FIXME: this needs to be more abstract */
28struct dm_cell_key {
29 int virtual;
30 dm_thin_id dev;
31 dm_block_t block;
32};
33
34struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells);
35void dm_bio_prison_destroy(struct dm_bio_prison *prison);
36
37/*
38 * This may block if a new cell needs allocating. You must ensure that
39 * cells will be unlocked even if the calling thread is blocked.
40 *
41 * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
42 */
43int dm_bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key,
44 struct bio *inmate, struct dm_bio_prison_cell **ref);
45
46void dm_cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios);
47void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates);
48void dm_cell_error(struct dm_bio_prison_cell *cell);
49
50/*----------------------------------------------------------------*/
51
52/*
53 * We use the deferred set to keep track of pending reads to shared blocks.
54 * We do this to ensure the new mapping caused by a write isn't performed
55 * until these prior reads have completed. Otherwise the insertion of the
56 * new mapping could free the old block that the read bios are mapped to.
57 */
58
59struct dm_deferred_set;
60struct dm_deferred_entry;
61
62struct dm_deferred_set *dm_deferred_set_create(void);
63void dm_deferred_set_destroy(struct dm_deferred_set *ds);
64
65struct dm_deferred_entry *dm_deferred_entry_inc(struct dm_deferred_set *ds);
66void dm_deferred_entry_dec(struct dm_deferred_entry *entry, struct list_head *head);
67int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work);
68
69/*----------------------------------------------------------------*/
70
71#endif
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
deleted file mode 100644
index 651ca79881d..00000000000
--- a/drivers/md/dm-bufio.c
+++ /dev/null
@@ -1,1750 +0,0 @@
1/*
2 * Copyright (C) 2009-2011 Red Hat, Inc.
3 *
4 * Author: Mikulas Patocka <mpatocka@redhat.com>
5 *
6 * This file is released under the GPL.
7 */
8
9#include "dm-bufio.h"
10
11#include <linux/device-mapper.h>
12#include <linux/dm-io.h>
13#include <linux/slab.h>
14#include <linux/vmalloc.h>
15#include <linux/shrinker.h>
16#include <linux/module.h>
17
18#define DM_MSG_PREFIX "bufio"
19
20/*
21 * Memory management policy:
22 * Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
23 * or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
24 * Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
25 * Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
26 * dirty buffers.
27 */
28#define DM_BUFIO_MIN_BUFFERS 8
29
30#define DM_BUFIO_MEMORY_PERCENT 2
31#define DM_BUFIO_VMALLOC_PERCENT 25
32#define DM_BUFIO_WRITEBACK_PERCENT 75
33
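/*
 * Worked example (added for illustration; the numbers assume a hypothetical
 * 32-bit machine with 2 GiB of directly mapped memory and a 128 MiB vmalloc
 * arena): 2% of 2 GiB is ~41 MiB and 25% of 128 MiB is 32 MiB, so the default
 * cache size is the lower value, 32 MiB, shared equally between all bufio
 * clients. Each client is still allowed at least DM_BUFIO_MIN_BUFFERS
 * buffers, and background writeback for a client starts once its dirty
 * buffers exceed 75% of its share.
 */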
34/*
35 * Check buffer ages in this interval (seconds)
36 */
37#define DM_BUFIO_WORK_TIMER_SECS 10
38
39/*
40 * Free buffers when they are older than this (seconds)
41 */
42#define DM_BUFIO_DEFAULT_AGE_SECS 60
43
44/*
45 * The number of bvec entries that are embedded directly in the buffer.
46 * If the chunk size is larger, dm-io is used to do the io.
47 */
48#define DM_BUFIO_INLINE_VECS 16
49
50/*
51 * Buffer hash
52 */
53#define DM_BUFIO_HASH_BITS 20
54#define DM_BUFIO_HASH(block) \
55 ((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
56 ((1 << DM_BUFIO_HASH_BITS) - 1))
57
58/*
59 * Don't try to use kmem_cache_alloc for blocks larger than this.
60 * For explanation, see alloc_buffer_data below.
61 */
62#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT (PAGE_SIZE >> 1)
63#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT (PAGE_SIZE << (MAX_ORDER - 1))
64
65/*
66 * dm_buffer->list_mode
67 */
68#define LIST_CLEAN 0
69#define LIST_DIRTY 1
70#define LIST_SIZE 2
71
72/*
73 * Linking of buffers:
74 * All buffers are linked to cache_hash with their hash_list field.
75 *
76 * Clean buffers that are not being written (B_WRITING not set)
77 * are linked to lru[LIST_CLEAN] with their lru_list field.
78 *
79 * Dirty and clean buffers that are being written are linked to
80 * lru[LIST_DIRTY] with their lru_list field. When the write
81 * finishes, the buffer cannot be relinked immediately (because we
82 * are in an interrupt context and relinking requires process
83 * context), so some clean-not-writing buffers can be held on
84 * dirty_lru too. They are later added to lru in the process
85 * context.
86 */
87struct dm_bufio_client {
88 struct mutex lock;
89
90 struct list_head lru[LIST_SIZE];
91 unsigned long n_buffers[LIST_SIZE];
92
93 struct block_device *bdev;
94 unsigned block_size;
95 unsigned char sectors_per_block_bits;
96 unsigned char pages_per_block_bits;
97 unsigned char blocks_per_page_bits;
98 unsigned aux_size;
99 void (*alloc_callback)(struct dm_buffer *);
100 void (*write_callback)(struct dm_buffer *);
101
102 struct dm_io_client *dm_io;
103
104 struct list_head reserved_buffers;
105 unsigned need_reserved_buffers;
106
107 struct hlist_head *cache_hash;
108 wait_queue_head_t free_buffer_wait;
109
110 int async_write_error;
111
112 struct list_head client_list;
113 struct shrinker shrinker;
114};
115
116/*
117 * Buffer state bits.
118 */
119#define B_READING 0
120#define B_WRITING 1
121#define B_DIRTY 2
122
123/*
124 * Describes how the block was allocated:
125 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
126 * See the comment at alloc_buffer_data.
127 */
128enum data_mode {
129 DATA_MODE_SLAB = 0,
130 DATA_MODE_GET_FREE_PAGES = 1,
131 DATA_MODE_VMALLOC = 2,
132 DATA_MODE_LIMIT = 3
133};
134
135struct dm_buffer {
136 struct hlist_node hash_list;
137 struct list_head lru_list;
138 sector_t block;
139 void *data;
140 enum data_mode data_mode;
141 unsigned char list_mode; /* LIST_* */
142 unsigned hold_count;
143 int read_error;
144 int write_error;
145 unsigned long state;
146 unsigned long last_accessed;
147 struct dm_bufio_client *c;
148 struct bio bio;
149 struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
150};
151
152/*----------------------------------------------------------------*/
153
154static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT];
155static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT];
156
157static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
158{
159 unsigned ret = c->blocks_per_page_bits - 1;
160
161 BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches));
162
163 return ret;
164}
165
166#define DM_BUFIO_CACHE(c) (dm_bufio_caches[dm_bufio_cache_index(c)])
167#define DM_BUFIO_CACHE_NAME(c) (dm_bufio_cache_names[dm_bufio_cache_index(c)])
168
169#define dm_bufio_in_request() (!!current->bio_list)
170
171static void dm_bufio_lock(struct dm_bufio_client *c)
172{
173 mutex_lock_nested(&c->lock, dm_bufio_in_request());
174}
175
176static int dm_bufio_trylock(struct dm_bufio_client *c)
177{
178 return mutex_trylock(&c->lock);
179}
180
181static void dm_bufio_unlock(struct dm_bufio_client *c)
182{
183 mutex_unlock(&c->lock);
184}
185
186/*
187 * FIXME Move to sched.h?
188 */
189#ifdef CONFIG_PREEMPT_VOLUNTARY
190# define dm_bufio_cond_resched() \
191do { \
192 if (unlikely(need_resched())) \
193 _cond_resched(); \
194} while (0)
195#else
196# define dm_bufio_cond_resched() do { } while (0)
197#endif
198
199/*----------------------------------------------------------------*/
200
201/*
202 * Default cache size: available memory divided by the ratio.
203 */
204static unsigned long dm_bufio_default_cache_size;
205
206/*
207 * Total cache size set by the user.
208 */
209static unsigned long dm_bufio_cache_size;
210
211/*
212 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
213 * at any time. If it disagrees, the user has changed cache size.
214 */
215static unsigned long dm_bufio_cache_size_latch;
216
217static DEFINE_SPINLOCK(param_spinlock);
218
219/*
220 * Buffers are freed after this timeout
221 */
222static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
223
224static unsigned long dm_bufio_peak_allocated;
225static unsigned long dm_bufio_allocated_kmem_cache;
226static unsigned long dm_bufio_allocated_get_free_pages;
227static unsigned long dm_bufio_allocated_vmalloc;
228static unsigned long dm_bufio_current_allocated;
229
230/*----------------------------------------------------------------*/
231
232/*
233 * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
234 */
235static unsigned long dm_bufio_cache_size_per_client;
236
237/*
238 * The current number of clients.
239 */
240static int dm_bufio_client_count;
241
242/*
243 * The list of all clients.
244 */
245static LIST_HEAD(dm_bufio_all_clients);
246
247/*
248 * This mutex protects dm_bufio_cache_size_latch,
249 * dm_bufio_cache_size_per_client and dm_bufio_client_count
250 */
251static DEFINE_MUTEX(dm_bufio_clients_lock);
252
253/*----------------------------------------------------------------*/
254
255static void adjust_total_allocated(enum data_mode data_mode, long diff)
256{
257 static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
258 &dm_bufio_allocated_kmem_cache,
259 &dm_bufio_allocated_get_free_pages,
260 &dm_bufio_allocated_vmalloc,
261 };
262
263 spin_lock(&param_spinlock);
264
265 *class_ptr[data_mode] += diff;
266
267 dm_bufio_current_allocated += diff;
268
269 if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
270 dm_bufio_peak_allocated = dm_bufio_current_allocated;
271
272 spin_unlock(&param_spinlock);
273}
274
275/*
276 * Change the number of clients and recalculate per-client limit.
277 */
278static void __cache_size_refresh(void)
279{
280 BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
281 BUG_ON(dm_bufio_client_count < 0);
282
283 dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size);
284
285 /*
286 * Use default if set to 0 and report the actual cache size used.
287 */
288 if (!dm_bufio_cache_size_latch) {
289 (void)cmpxchg(&dm_bufio_cache_size, 0,
290 dm_bufio_default_cache_size);
291 dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
292 }
293
294 dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
295 (dm_bufio_client_count ? : 1);
296}
297
298/*
299 * Allocating buffer data.
300 *
301 * Small buffers are allocated with kmem_cache, to use space optimally.
302 *
303 * For large buffers, we choose between get_free_pages and vmalloc.
304 * Each has advantages and disadvantages.
305 *
306 * __get_free_pages can randomly fail if the memory is fragmented.
307 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
308 * as low as 128M) so using it for caching is not appropriate.
309 *
310 * If the allocation may fail we use __get_free_pages. Memory fragmentation
311 * won't have a fatal effect here, but it just causes flushes of some other
312 * buffers and more I/O will be performed. Don't use __get_free_pages if it
313 * always fails (i.e. order >= MAX_ORDER).
314 *
315 * If the allocation shouldn't fail we use __vmalloc. This is only for the
316 * initial reserve allocation, so there's no risk of wasting all vmalloc
317 * space.
318 */
319static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
320 enum data_mode *data_mode)
321{
322 if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
323 *data_mode = DATA_MODE_SLAB;
324 return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
325 }
326
327 if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT &&
328 gfp_mask & __GFP_NORETRY) {
329 *data_mode = DATA_MODE_GET_FREE_PAGES;
330 return (void *)__get_free_pages(gfp_mask,
331 c->pages_per_block_bits);
332 }
333
334 *data_mode = DATA_MODE_VMALLOC;
335 return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
336}
337
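/*
 * Worked example (added for illustration; assumes 4 KiB pages and
 * MAX_ORDER = 11): DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT is 2 KiB and
 * DM_BUFIO_BLOCK_SIZE_GFP_LIMIT is 4 MiB. A 512-byte block therefore comes
 * from a kmem_cache, a 4 KiB block uses __get_free_pages when the caller
 * passes __GFP_NORETRY, and the same 4 KiB block falls through to __vmalloc
 * for the initial reserve allocation, which is done with plain GFP_KERNEL.
 */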
338/*
339 * Free buffer's data.
340 */
341static void free_buffer_data(struct dm_bufio_client *c,
342 void *data, enum data_mode data_mode)
343{
344 switch (data_mode) {
345 case DATA_MODE_SLAB:
346 kmem_cache_free(DM_BUFIO_CACHE(c), data);
347 break;
348
349 case DATA_MODE_GET_FREE_PAGES:
350 free_pages((unsigned long)data, c->pages_per_block_bits);
351 break;
352
353 case DATA_MODE_VMALLOC:
354 vfree(data);
355 break;
356
357 default:
358 DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
359 data_mode);
360 BUG();
361 }
362}
363
364/*
365 * Allocate buffer and its data.
366 */
367static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
368{
369 struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size,
370 gfp_mask);
371
372 if (!b)
373 return NULL;
374
375 b->c = c;
376
377 b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
378 if (!b->data) {
379 kfree(b);
380 return NULL;
381 }
382
383 adjust_total_allocated(b->data_mode, (long)c->block_size);
384
385 return b;
386}
387
388/*
389 * Free buffer and its data.
390 */
391static void free_buffer(struct dm_buffer *b)
392{
393 struct dm_bufio_client *c = b->c;
394
395 adjust_total_allocated(b->data_mode, -(long)c->block_size);
396
397 free_buffer_data(c, b->data, b->data_mode);
398 kfree(b);
399}
400
401/*
402 * Link buffer to the hash list and clean or dirty queue.
403 */
404static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
405{
406 struct dm_bufio_client *c = b->c;
407
408 c->n_buffers[dirty]++;
409 b->block = block;
410 b->list_mode = dirty;
411 list_add(&b->lru_list, &c->lru[dirty]);
412 hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
413 b->last_accessed = jiffies;
414}
415
416/*
417 * Unlink buffer from the hash list and dirty or clean queue.
418 */
419static void __unlink_buffer(struct dm_buffer *b)
420{
421 struct dm_bufio_client *c = b->c;
422
423 BUG_ON(!c->n_buffers[b->list_mode]);
424
425 c->n_buffers[b->list_mode]--;
426 hlist_del(&b->hash_list);
427 list_del(&b->lru_list);
428}
429
430/*
431 * Place the buffer to the head of dirty or clean LRU queue.
432 */
433static void __relink_lru(struct dm_buffer *b, int dirty)
434{
435 struct dm_bufio_client *c = b->c;
436
437 BUG_ON(!c->n_buffers[b->list_mode]);
438
439 c->n_buffers[b->list_mode]--;
440 c->n_buffers[dirty]++;
441 b->list_mode = dirty;
442 list_move(&b->lru_list, &c->lru[dirty]);
443}
444
445/*----------------------------------------------------------------
446 * Submit I/O on the buffer.
447 *
448 * Bio interface is faster but it has some problems:
449 * the vector list is limited (increasing this limit increases
450 * memory-consumption per buffer, so it is not viable);
451 *
452 * the memory must be direct-mapped, not vmalloced;
453 *
454 * the I/O driver can reject requests spuriously if it thinks that
455 * the requests are too big for the device or if they cross a
456 * controller-defined memory boundary.
457 *
458 * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
459 * it is not vmalloced, try using the bio interface.
460 *
461 * If the buffer is big, if it is vmalloced or if the underlying device
462 * rejects the bio because it is too large, use dm-io layer to do the I/O.
463 * The dm-io layer splits the I/O into multiple requests, avoiding the above
464 * shortcomings.
465 *--------------------------------------------------------------*/
466
467/*
468 * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
469 * that the request was handled directly with bio interface.
470 */
471static void dmio_complete(unsigned long error, void *context)
472{
473 struct dm_buffer *b = context;
474
475 b->bio.bi_end_io(&b->bio, error ? -EIO : 0);
476}
477
478static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
479 bio_end_io_t *end_io)
480{
481 int r;
482 struct dm_io_request io_req = {
483 .bi_rw = rw,
484 .notify.fn = dmio_complete,
485 .notify.context = b,
486 .client = b->c->dm_io,
487 };
488 struct dm_io_region region = {
489 .bdev = b->c->bdev,
490 .sector = block << b->c->sectors_per_block_bits,
491 .count = b->c->block_size >> SECTOR_SHIFT,
492 };
493
494 if (b->data_mode != DATA_MODE_VMALLOC) {
495 io_req.mem.type = DM_IO_KMEM;
496 io_req.mem.ptr.addr = b->data;
497 } else {
498 io_req.mem.type = DM_IO_VMA;
499 io_req.mem.ptr.vma = b->data;
500 }
501
502 b->bio.bi_end_io = end_io;
503
504 r = dm_io(&io_req, 1, &region, NULL);
505 if (r)
506 end_io(&b->bio, r);
507}
508
509static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
510 bio_end_io_t *end_io)
511{
512 char *ptr;
513 int len;
514
515 bio_init(&b->bio);
516 b->bio.bi_io_vec = b->bio_vec;
517 b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
518 b->bio.bi_sector = block << b->c->sectors_per_block_bits;
519 b->bio.bi_bdev = b->c->bdev;
520 b->bio.bi_end_io = end_io;
521
522 /*
523 * We assume that if len >= PAGE_SIZE ptr is page-aligned.
524 * If len < PAGE_SIZE the buffer doesn't cross page boundary.
525 */
526 ptr = b->data;
527 len = b->c->block_size;
528
529 if (len >= PAGE_SIZE)
530 BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
531 else
532 BUG_ON((unsigned long)ptr & (len - 1));
533
534 do {
535 if (!bio_add_page(&b->bio, virt_to_page(ptr),
536 len < PAGE_SIZE ? len : PAGE_SIZE,
537 virt_to_phys(ptr) & (PAGE_SIZE - 1))) {
538 BUG_ON(b->c->block_size <= PAGE_SIZE);
539 use_dmio(b, rw, block, end_io);
540 return;
541 }
542
543 len -= PAGE_SIZE;
544 ptr += PAGE_SIZE;
545 } while (len > 0);
546
547 submit_bio(rw, &b->bio);
548}
549
550static void submit_io(struct dm_buffer *b, int rw, sector_t block,
551 bio_end_io_t *end_io)
552{
553 if (rw == WRITE && b->c->write_callback)
554 b->c->write_callback(b);
555
556 if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE &&
557 b->data_mode != DATA_MODE_VMALLOC)
558 use_inline_bio(b, rw, block, end_io);
559 else
560 use_dmio(b, rw, block, end_io);
561}
562
563/*----------------------------------------------------------------
564 * Writing dirty buffers
565 *--------------------------------------------------------------*/
566
567/*
568 * The endio routine for write.
569 *
570 * Set the error, clear B_WRITING bit and wake anyone who was waiting on
571 * it.
572 */
573static void write_endio(struct bio *bio, int error)
574{
575 struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
576
577 b->write_error = error;
578 if (unlikely(error)) {
579 struct dm_bufio_client *c = b->c;
580 (void)cmpxchg(&c->async_write_error, 0, error);
581 }
582
583 BUG_ON(!test_bit(B_WRITING, &b->state));
584
585 smp_mb__before_clear_bit();
586 clear_bit(B_WRITING, &b->state);
587 smp_mb__after_clear_bit();
588
589 wake_up_bit(&b->state, B_WRITING);
590}
591
592/*
593 * This function is called when wait_on_bit is actually waiting.
594 */
595static int do_io_schedule(void *word)
596{
597 io_schedule();
598
599 return 0;
600}
601
602/*
603 * Initiate a write on a dirty buffer, but don't wait for it.
604 *
605 * - If the buffer is not dirty, exit.
606 * - If there is some previous write going on, wait for it to finish (we can't
607 * have two writes on the same buffer simultaneously).
608 * - Submit our write and don't wait on it. We set B_WRITING indicating
609 * that there is a write in progress.
610 */
611static void __write_dirty_buffer(struct dm_buffer *b)
612{
613 if (!test_bit(B_DIRTY, &b->state))
614 return;
615
616 clear_bit(B_DIRTY, &b->state);
617 wait_on_bit_lock(&b->state, B_WRITING,
618 do_io_schedule, TASK_UNINTERRUPTIBLE);
619
620 submit_io(b, WRITE, b->block, write_endio);
621}
622
623/*
624 * Wait until any activity on the buffer finishes. Possibly write the
625 * buffer if it is dirty. When this function finishes, there is no I/O
626 * running on the buffer and the buffer is not dirty.
627 */
628static void __make_buffer_clean(struct dm_buffer *b)
629{
630 BUG_ON(b->hold_count);
631
632 if (!b->state) /* fast case */
633 return;
634
635 wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
636 __write_dirty_buffer(b);
637 wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
638}
639
640/*
641 * Find some buffer that is not held by anybody, clean it, unlink it and
642 * return it.
643 */
644static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
645{
646 struct dm_buffer *b;
647
648 list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
649 BUG_ON(test_bit(B_WRITING, &b->state));
650 BUG_ON(test_bit(B_DIRTY, &b->state));
651
652 if (!b->hold_count) {
653 __make_buffer_clean(b);
654 __unlink_buffer(b);
655 return b;
656 }
657 dm_bufio_cond_resched();
658 }
659
660 list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
661 BUG_ON(test_bit(B_READING, &b->state));
662
663 if (!b->hold_count) {
664 __make_buffer_clean(b);
665 __unlink_buffer(b);
666 return b;
667 }
668 dm_bufio_cond_resched();
669 }
670
671 return NULL;
672}
673
674/*
675 * Wait until some other threads free some buffer or release hold count on
676 * some buffer.
677 *
678 * This function is entered with c->lock held, drops it and regains it
679 * before exiting.
680 */
681static void __wait_for_free_buffer(struct dm_bufio_client *c)
682{
683 DECLARE_WAITQUEUE(wait, current);
684
685 add_wait_queue(&c->free_buffer_wait, &wait);
686 set_task_state(current, TASK_UNINTERRUPTIBLE);
687 dm_bufio_unlock(c);
688
689 io_schedule();
690
691 set_task_state(current, TASK_RUNNING);
692 remove_wait_queue(&c->free_buffer_wait, &wait);
693
694 dm_bufio_lock(c);
695}
696
697enum new_flag {
698 NF_FRESH = 0,
699 NF_READ = 1,
700 NF_GET = 2,
701 NF_PREFETCH = 3
702};
703
704/*
705 * Allocate a new buffer. If the allocation is not possible, wait until
706 * some other thread frees a buffer.
707 *
708 * May drop the lock and regain it.
709 */
710static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
711{
712 struct dm_buffer *b;
713
714 /*
715 * dm-bufio is resistant to allocation failures (it just keeps
716 * one buffer reserved in case all the allocations fail).
717 * So set flags to not try too hard:
718 * GFP_NOIO: don't recurse into the I/O layer
719 * __GFP_NORETRY: don't retry and rather return failure
720 * __GFP_NOMEMALLOC: don't use emergency reserves
721 * __GFP_NOWARN: don't print a warning in case of failure
722 *
723 * For debugging, if we set the cache size to 1, no new buffers will
724 * be allocated.
725 */
726 while (1) {
727 if (dm_bufio_cache_size_latch != 1) {
728 b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
729 if (b)
730 return b;
731 }
732
733 if (nf == NF_PREFETCH)
734 return NULL;
735
736 if (!list_empty(&c->reserved_buffers)) {
737 b = list_entry(c->reserved_buffers.next,
738 struct dm_buffer, lru_list);
739 list_del(&b->lru_list);
740 c->need_reserved_buffers++;
741
742 return b;
743 }
744
745 b = __get_unclaimed_buffer(c);
746 if (b)
747 return b;
748
749 __wait_for_free_buffer(c);
750 }
751}
752
753static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
754{
755 struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
756
757 if (!b)
758 return NULL;
759
760 if (c->alloc_callback)
761 c->alloc_callback(b);
762
763 return b;
764}
765
766/*
767 * Free a buffer and wake other threads waiting for free buffers.
768 */
769static void __free_buffer_wake(struct dm_buffer *b)
770{
771 struct dm_bufio_client *c = b->c;
772
773 if (!c->need_reserved_buffers)
774 free_buffer(b);
775 else {
776 list_add(&b->lru_list, &c->reserved_buffers);
777 c->need_reserved_buffers--;
778 }
779
780 wake_up(&c->free_buffer_wait);
781}
782
783static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait)
784{
785 struct dm_buffer *b, *tmp;
786
787 list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
788 BUG_ON(test_bit(B_READING, &b->state));
789
790 if (!test_bit(B_DIRTY, &b->state) &&
791 !test_bit(B_WRITING, &b->state)) {
792 __relink_lru(b, LIST_CLEAN);
793 continue;
794 }
795
796 if (no_wait && test_bit(B_WRITING, &b->state))
797 return;
798
799 __write_dirty_buffer(b);
800 dm_bufio_cond_resched();
801 }
802}
803
804/*
805 * Get writeback threshold and buffer limit for a given client.
806 */
807static void __get_memory_limit(struct dm_bufio_client *c,
808 unsigned long *threshold_buffers,
809 unsigned long *limit_buffers)
810{
811 unsigned long buffers;
812
813 if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) {
814 mutex_lock(&dm_bufio_clients_lock);
815 __cache_size_refresh();
816 mutex_unlock(&dm_bufio_clients_lock);
817 }
818
819 buffers = dm_bufio_cache_size_per_client >>
820 (c->sectors_per_block_bits + SECTOR_SHIFT);
821
822 if (buffers < DM_BUFIO_MIN_BUFFERS)
823 buffers = DM_BUFIO_MIN_BUFFERS;
824
825 *limit_buffers = buffers;
826 *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
827}
828
829/*
830 * Check if we're over watermark.
831 * If we are over threshold_buffers, start freeing buffers.
832 * If we're over "limit_buffers", block until we get under the limit.
833 */
834static void __check_watermark(struct dm_bufio_client *c)
835{
836 unsigned long threshold_buffers, limit_buffers;
837
838 __get_memory_limit(c, &threshold_buffers, &limit_buffers);
839
840 while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
841 limit_buffers) {
842
843 struct dm_buffer *b = __get_unclaimed_buffer(c);
844
845 if (!b)
846 return;
847
848 __free_buffer_wake(b);
849 dm_bufio_cond_resched();
850 }
851
852 if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
853 __write_dirty_buffers_async(c, 1);
854}
855
856/*
857 * Find a buffer in the hash.
858 */
859static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
860{
861 struct dm_buffer *b;
862 struct hlist_node *hn;
863
864 hlist_for_each_entry(b, hn, &c->cache_hash[DM_BUFIO_HASH(block)],
865 hash_list) {
866 dm_bufio_cond_resched();
867 if (b->block == block)
868 return b;
869 }
870
871 return NULL;
872}
873
874/*----------------------------------------------------------------
875 * Getting a buffer
876 *--------------------------------------------------------------*/
877
878static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
879 enum new_flag nf, int *need_submit)
880{
881 struct dm_buffer *b, *new_b = NULL;
882
883 *need_submit = 0;
884
885 b = __find(c, block);
886 if (b)
887 goto found_buffer;
888
889 if (nf == NF_GET)
890 return NULL;
891
892 new_b = __alloc_buffer_wait(c, nf);
893 if (!new_b)
894 return NULL;
895
896 /*
897 * We've had a period where the mutex was unlocked, so need to
898 * recheck the hash table.
899 */
900 b = __find(c, block);
901 if (b) {
902 __free_buffer_wake(new_b);
903 goto found_buffer;
904 }
905
906 __check_watermark(c);
907
908 b = new_b;
909 b->hold_count = 1;
910 b->read_error = 0;
911 b->write_error = 0;
912 __link_buffer(b, block, LIST_CLEAN);
913
914 if (nf == NF_FRESH) {
915 b->state = 0;
916 return b;
917 }
918
919 b->state = 1 << B_READING;
920 *need_submit = 1;
921
922 return b;
923
924found_buffer:
925 if (nf == NF_PREFETCH)
926 return NULL;
927 /*
928 * Note: it is essential that we don't wait for the buffer to be
929 * read if dm_bufio_get function is used. Both dm_bufio_get and
930 * dm_bufio_prefetch can be used in the driver request routine.
931 * If the user called both dm_bufio_prefetch and dm_bufio_get on
932 * the same buffer, it would deadlock if we waited.
933 */
934 if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
935 return NULL;
936
937 b->hold_count++;
938 __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
939 test_bit(B_WRITING, &b->state));
940 return b;
941}
942
943/*
944 * The endio routine for reading: set the error, clear the bit and wake up
945 * anyone waiting on the buffer.
946 */
947static void read_endio(struct bio *bio, int error)
948{
949 struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
950
951 b->read_error = error;
952
953 BUG_ON(!test_bit(B_READING, &b->state));
954
955 smp_mb__before_clear_bit();
956 clear_bit(B_READING, &b->state);
957 smp_mb__after_clear_bit();
958
959 wake_up_bit(&b->state, B_READING);
960}
961
962/*
963 * A common routine for dm_bufio_new and dm_bufio_read. Operation of these
964 * functions is similar except that dm_bufio_new doesn't read the
965 * buffer from the disk (assuming that the caller overwrites all the data
966 * and uses dm_bufio_mark_buffer_dirty to write new data back).
967 */
968static void *new_read(struct dm_bufio_client *c, sector_t block,
969 enum new_flag nf, struct dm_buffer **bp)
970{
971 int need_submit;
972 struct dm_buffer *b;
973
974 dm_bufio_lock(c);
975 b = __bufio_new(c, block, nf, &need_submit);
976 dm_bufio_unlock(c);
977
978 if (!b)
979 return b;
980
981 if (need_submit)
982 submit_io(b, READ, b->block, read_endio);
983
984 wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
985
986 if (b->read_error) {
987 int error = b->read_error;
988
989 dm_bufio_release(b);
990
991 return ERR_PTR(error);
992 }
993
994 *bp = b;
995
996 return b->data;
997}
998
999void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
1000 struct dm_buffer **bp)
1001{
1002 return new_read(c, block, NF_GET, bp);
1003}
1004EXPORT_SYMBOL_GPL(dm_bufio_get);
1005
1006void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
1007 struct dm_buffer **bp)
1008{
1009 BUG_ON(dm_bufio_in_request());
1010
1011 return new_read(c, block, NF_READ, bp);
1012}
1013EXPORT_SYMBOL_GPL(dm_bufio_read);
1014
1015void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
1016 struct dm_buffer **bp)
1017{
1018 BUG_ON(dm_bufio_in_request());
1019
1020 return new_read(c, block, NF_FRESH, bp);
1021}
1022EXPORT_SYMBOL_GPL(dm_bufio_new);
1023
1024void dm_bufio_prefetch(struct dm_bufio_client *c,
1025 sector_t block, unsigned n_blocks)
1026{
1027 struct blk_plug plug;
1028
1029 blk_start_plug(&plug);
1030 dm_bufio_lock(c);
1031
1032 for (; n_blocks--; block++) {
1033 int need_submit;
1034 struct dm_buffer *b;
1035 b = __bufio_new(c, block, NF_PREFETCH, &need_submit);
1036 if (unlikely(b != NULL)) {
1037 dm_bufio_unlock(c);
1038
1039 if (need_submit)
1040 submit_io(b, READ, b->block, read_endio);
1041 dm_bufio_release(b);
1042
1043 dm_bufio_cond_resched();
1044
1045 if (!n_blocks)
1046 goto flush_plug;
1047 dm_bufio_lock(c);
1048 }
1049
1050 }
1051
1052 dm_bufio_unlock(c);
1053
1054flush_plug:
1055 blk_finish_plug(&plug);
1056}
1057EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
1058
1059void dm_bufio_release(struct dm_buffer *b)
1060{
1061 struct dm_bufio_client *c = b->c;
1062
1063 dm_bufio_lock(c);
1064
1065 BUG_ON(!b->hold_count);
1066
1067 b->hold_count--;
1068 if (!b->hold_count) {
1069 wake_up(&c->free_buffer_wait);
1070
1071 /*
1072 * If there were errors on the buffer, and the buffer is not
1073 * to be written, free the buffer. There is no point in caching
1074 * invalid buffer.
1075 */
1076 if ((b->read_error || b->write_error) &&
1077 !test_bit(B_READING, &b->state) &&
1078 !test_bit(B_WRITING, &b->state) &&
1079 !test_bit(B_DIRTY, &b->state)) {
1080 __unlink_buffer(b);
1081 __free_buffer_wake(b);
1082 }
1083 }
1084
1085 dm_bufio_unlock(c);
1086}
1087EXPORT_SYMBOL_GPL(dm_bufio_release);
1088
1089void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
1090{
1091 struct dm_bufio_client *c = b->c;
1092
1093 dm_bufio_lock(c);
1094
1095 BUG_ON(test_bit(B_READING, &b->state));
1096
1097 if (!test_and_set_bit(B_DIRTY, &b->state))
1098 __relink_lru(b, LIST_DIRTY);
1099
1100 dm_bufio_unlock(c);
1101}
1102EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
1103
1104void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
1105{
1106 BUG_ON(dm_bufio_in_request());
1107
1108 dm_bufio_lock(c);
1109 __write_dirty_buffers_async(c, 0);
1110 dm_bufio_unlock(c);
1111}
1112EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
1113
1114/*
1115 * For performance, it is essential that the buffers are written asynchronously
1116 * and simultaneously (so that the block layer can merge the writes) and then
1117 * waited upon.
1118 *
1119 * Finally, we flush hardware disk cache.
1120 */
1121int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
1122{
1123 int a, f;
1124 unsigned long buffers_processed = 0;
1125 struct dm_buffer *b, *tmp;
1126
1127 dm_bufio_lock(c);
1128 __write_dirty_buffers_async(c, 0);
1129
1130again:
1131 list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
1132 int dropped_lock = 0;
1133
1134 if (buffers_processed < c->n_buffers[LIST_DIRTY])
1135 buffers_processed++;
1136
1137 BUG_ON(test_bit(B_READING, &b->state));
1138
1139 if (test_bit(B_WRITING, &b->state)) {
1140 if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
1141 dropped_lock = 1;
1142 b->hold_count++;
1143 dm_bufio_unlock(c);
1144 wait_on_bit(&b->state, B_WRITING,
1145 do_io_schedule,
1146 TASK_UNINTERRUPTIBLE);
1147 dm_bufio_lock(c);
1148 b->hold_count--;
1149 } else
1150 wait_on_bit(&b->state, B_WRITING,
1151 do_io_schedule,
1152 TASK_UNINTERRUPTIBLE);
1153 }
1154
1155 if (!test_bit(B_DIRTY, &b->state) &&
1156 !test_bit(B_WRITING, &b->state))
1157 __relink_lru(b, LIST_CLEAN);
1158
1159 dm_bufio_cond_resched();
1160
1161 /*
1162 * If we dropped the lock, the list is no longer consistent,
1163 * so we must restart the search.
1164 *
1165 * In the most common case, the buffer just processed is
1166 * relinked to the clean list, so we won't loop scanning the
1167 * same buffer again and again.
1168 *
1169 * This may livelock if there is another thread simultaneously
1170 * dirtying buffers, so we count the number of buffers walked
1171 * and if it exceeds the total number of buffers, it means that
1172 * someone is doing some writes simultaneously with us. In
1173 * this case, stop, dropping the lock.
1174 */
1175 if (dropped_lock)
1176 goto again;
1177 }
1178 wake_up(&c->free_buffer_wait);
1179 dm_bufio_unlock(c);
1180
1181 a = xchg(&c->async_write_error, 0);
1182 f = dm_bufio_issue_flush(c);
1183 if (a)
1184 return a;
1185
1186 return f;
1187}
1188EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
1189
1190/*
1191 * Use dm-io to send an empty barrier to flush the device.
1192 */
1193int dm_bufio_issue_flush(struct dm_bufio_client *c)
1194{
1195 struct dm_io_request io_req = {
1196 .bi_rw = REQ_FLUSH,
1197 .mem.type = DM_IO_KMEM,
1198 .mem.ptr.addr = NULL,
1199 .client = c->dm_io,
1200 };
1201 struct dm_io_region io_reg = {
1202 .bdev = c->bdev,
1203 .sector = 0,
1204 .count = 0,
1205 };
1206
1207 BUG_ON(dm_bufio_in_request());
1208
1209 return dm_io(&io_req, 1, &io_reg, NULL);
1210}
1211EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
1212
1213/*
1214 * We first delete any other buffer that may be at that new location.
1215 *
1216 * Then, we write the buffer to the original location if it was dirty.
1217 *
1218 * Then, if we are the only one who is holding the buffer, relink the buffer
1219 * in the hash queue for the new location.
1220 *
1221 * If there was someone else holding the buffer, we write it to the new
1222 * location but not relink it, because that other user needs to have the buffer
1223 * at the same place.
1224 */
1225void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
1226{
1227 struct dm_bufio_client *c = b->c;
1228 struct dm_buffer *new;
1229
1230 BUG_ON(dm_bufio_in_request());
1231
1232 dm_bufio_lock(c);
1233
1234retry:
1235 new = __find(c, new_block);
1236 if (new) {
1237 if (new->hold_count) {
1238 __wait_for_free_buffer(c);
1239 goto retry;
1240 }
1241
1242 /*
1243 * FIXME: Is there any point waiting for a write that's going
1244 * to be overwritten in a bit?
1245 */
1246 __make_buffer_clean(new);
1247 __unlink_buffer(new);
1248 __free_buffer_wake(new);
1249 }
1250
1251 BUG_ON(!b->hold_count);
1252 BUG_ON(test_bit(B_READING, &b->state));
1253
1254 __write_dirty_buffer(b);
1255 if (b->hold_count == 1) {
1256 wait_on_bit(&b->state, B_WRITING,
1257 do_io_schedule, TASK_UNINTERRUPTIBLE);
1258 set_bit(B_DIRTY, &b->state);
1259 __unlink_buffer(b);
1260 __link_buffer(b, new_block, LIST_DIRTY);
1261 } else {
1262 sector_t old_block;
1263 wait_on_bit_lock(&b->state, B_WRITING,
1264 do_io_schedule, TASK_UNINTERRUPTIBLE);
1265 /*
1266 * Relink buffer to "new_block" so that write_callback
1267 * sees "new_block" as a block number.
1268 * After the write, link the buffer back to old_block.
1269 * All this must be done in bufio lock, so that block number
1270 * change isn't visible to other threads.
1271 */
1272 old_block = b->block;
1273 __unlink_buffer(b);
1274 __link_buffer(b, new_block, b->list_mode);
1275 submit_io(b, WRITE, new_block, write_endio);
1276 wait_on_bit(&b->state, B_WRITING,
1277 do_io_schedule, TASK_UNINTERRUPTIBLE);
1278 __unlink_buffer(b);
1279 __link_buffer(b, old_block, b->list_mode);
1280 }
1281
1282 dm_bufio_unlock(c);
1283 dm_bufio_release(b);
1284}
1285EXPORT_SYMBOL_GPL(dm_bufio_release_move);
1286
1287unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
1288{
1289 return c->block_size;
1290}
1291EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
1292
1293sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
1294{
1295 return i_size_read(c->bdev->bd_inode) >>
1296 (SECTOR_SHIFT + c->sectors_per_block_bits);
1297}
1298EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
1299
1300sector_t dm_bufio_get_block_number(struct dm_buffer *b)
1301{
1302 return b->block;
1303}
1304EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
1305
1306void *dm_bufio_get_block_data(struct dm_buffer *b)
1307{
1308 return b->data;
1309}
1310EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
1311
1312void *dm_bufio_get_aux_data(struct dm_buffer *b)
1313{
1314 return b + 1;
1315}
1316EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
1317
1318struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
1319{
1320 return b->c;
1321}
1322EXPORT_SYMBOL_GPL(dm_bufio_get_client);
1323
1324static void drop_buffers(struct dm_bufio_client *c)
1325{
1326 struct dm_buffer *b;
1327 int i;
1328
1329 BUG_ON(dm_bufio_in_request());
1330
1331 /*
1332 * An optimization so that the buffers are not written one-by-one.
1333 */
1334 dm_bufio_write_dirty_buffers_async(c);
1335
1336 dm_bufio_lock(c);
1337
1338 while ((b = __get_unclaimed_buffer(c)))
1339 __free_buffer_wake(b);
1340
1341 for (i = 0; i < LIST_SIZE; i++)
1342 list_for_each_entry(b, &c->lru[i], lru_list)
1343 DMERR("leaked buffer %llx, hold count %u, list %d",
1344 (unsigned long long)b->block, b->hold_count, i);
1345
1346 for (i = 0; i < LIST_SIZE; i++)
1347 BUG_ON(!list_empty(&c->lru[i]));
1348
1349 dm_bufio_unlock(c);
1350}
1351
1352/*
1353 * Test if the buffer is unused and too old, and commit it.
1354 * If noio is set, we must not do any I/O because we hold
1355 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets rerouted to
1356 * a different bufio client.
1357 */
1358static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
1359 unsigned long max_jiffies)
1360{
1361 if (jiffies - b->last_accessed < max_jiffies)
1362 return 1;
1363
1364 if (!(gfp & __GFP_IO)) {
1365 if (test_bit(B_READING, &b->state) ||
1366 test_bit(B_WRITING, &b->state) ||
1367 test_bit(B_DIRTY, &b->state))
1368 return 1;
1369 }
1370
1371 if (b->hold_count)
1372 return 1;
1373
1374 __make_buffer_clean(b);
1375 __unlink_buffer(b);
1376 __free_buffer_wake(b);
1377
1378 return 0;
1379}
1380
1381static void __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
1382 struct shrink_control *sc)
1383{
1384 int l;
1385 struct dm_buffer *b, *tmp;
1386
1387 for (l = 0; l < LIST_SIZE; l++) {
1388 list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list)
1389 if (!__cleanup_old_buffer(b, sc->gfp_mask, 0) &&
1390 !--nr_to_scan)
1391 return;
1392 dm_bufio_cond_resched();
1393 }
1394}
1395
1396static int shrink(struct shrinker *shrinker, struct shrink_control *sc)
1397{
1398 struct dm_bufio_client *c =
1399 container_of(shrinker, struct dm_bufio_client, shrinker);
1400 unsigned long r;
1401 unsigned long nr_to_scan = sc->nr_to_scan;
1402
1403 if (sc->gfp_mask & __GFP_IO)
1404 dm_bufio_lock(c);
1405 else if (!dm_bufio_trylock(c))
1406 return !nr_to_scan ? 0 : -1;
1407
1408 if (nr_to_scan)
1409 __scan(c, nr_to_scan, sc);
1410
1411 r = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
1412 if (r > INT_MAX)
1413 r = INT_MAX;
1414
1415 dm_bufio_unlock(c);
1416
1417 return r;
1418}
1419
1420/*
1421 * Create the buffering interface
1422 */
1423struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
1424 unsigned reserved_buffers, unsigned aux_size,
1425 void (*alloc_callback)(struct dm_buffer *),
1426 void (*write_callback)(struct dm_buffer *))
1427{
1428 int r;
1429 struct dm_bufio_client *c;
1430 unsigned i;
1431
1432 BUG_ON(block_size < 1 << SECTOR_SHIFT ||
1433 (block_size & (block_size - 1)));
1434
1435 c = kmalloc(sizeof(*c), GFP_KERNEL);
1436 if (!c) {
1437 r = -ENOMEM;
1438 goto bad_client;
1439 }
1440 c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS);
1441 if (!c->cache_hash) {
1442 r = -ENOMEM;
1443 goto bad_hash;
1444 }
1445
1446 c->bdev = bdev;
1447 c->block_size = block_size;
1448 c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT;
1449 c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ?
1450 ffs(block_size) - 1 - PAGE_SHIFT : 0;
1451 c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ?
1452 PAGE_SHIFT - (ffs(block_size) - 1) : 0);
1453
1454 c->aux_size = aux_size;
1455 c->alloc_callback = alloc_callback;
1456 c->write_callback = write_callback;
1457
1458 for (i = 0; i < LIST_SIZE; i++) {
1459 INIT_LIST_HEAD(&c->lru[i]);
1460 c->n_buffers[i] = 0;
1461 }
1462
1463 for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
1464 INIT_HLIST_HEAD(&c->cache_hash[i]);
1465
1466 mutex_init(&c->lock);
1467 INIT_LIST_HEAD(&c->reserved_buffers);
1468 c->need_reserved_buffers = reserved_buffers;
1469
1470 init_waitqueue_head(&c->free_buffer_wait);
1471 c->async_write_error = 0;
1472
1473 c->dm_io = dm_io_client_create();
1474 if (IS_ERR(c->dm_io)) {
1475 r = PTR_ERR(c->dm_io);
1476 goto bad_dm_io;
1477 }
1478
1479 mutex_lock(&dm_bufio_clients_lock);
1480 if (c->blocks_per_page_bits) {
1481 if (!DM_BUFIO_CACHE_NAME(c)) {
1482 DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size);
1483 if (!DM_BUFIO_CACHE_NAME(c)) {
1484 r = -ENOMEM;
1485 mutex_unlock(&dm_bufio_clients_lock);
1486 goto bad_cache;
1487 }
1488 }
1489
1490 if (!DM_BUFIO_CACHE(c)) {
1491 DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c),
1492 c->block_size,
1493 c->block_size, 0, NULL);
1494 if (!DM_BUFIO_CACHE(c)) {
1495 r = -ENOMEM;
1496 mutex_unlock(&dm_bufio_clients_lock);
1497 goto bad_cache;
1498 }
1499 }
1500 }
1501 mutex_unlock(&dm_bufio_clients_lock);
1502
1503 while (c->need_reserved_buffers) {
1504 struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);
1505
1506 if (!b) {
1507 r = -ENOMEM;
1508 goto bad_buffer;
1509 }
1510 __free_buffer_wake(b);
1511 }
1512
1513 mutex_lock(&dm_bufio_clients_lock);
1514 dm_bufio_client_count++;
1515 list_add(&c->client_list, &dm_bufio_all_clients);
1516 __cache_size_refresh();
1517 mutex_unlock(&dm_bufio_clients_lock);
1518
1519 c->shrinker.shrink = shrink;
1520 c->shrinker.seeks = 1;
1521 c->shrinker.batch = 0;
1522 register_shrinker(&c->shrinker);
1523
1524 return c;
1525
1526bad_buffer:
1527bad_cache:
1528 while (!list_empty(&c->reserved_buffers)) {
1529 struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1530 struct dm_buffer, lru_list);
1531 list_del(&b->lru_list);
1532 free_buffer(b);
1533 }
1534 dm_io_client_destroy(c->dm_io);
1535bad_dm_io:
1536 vfree(c->cache_hash);
1537bad_hash:
1538 kfree(c);
1539bad_client:
1540 return ERR_PTR(r);
1541}
1542EXPORT_SYMBOL_GPL(dm_bufio_client_create);
1543
1544/*
1545 * Free the buffering interface.
1546 * It is required that there are no references on any buffers.
1547 */
1548void dm_bufio_client_destroy(struct dm_bufio_client *c)
1549{
1550 unsigned i;
1551
1552 drop_buffers(c);
1553
1554 unregister_shrinker(&c->shrinker);
1555
1556 mutex_lock(&dm_bufio_clients_lock);
1557
1558 list_del(&c->client_list);
1559 dm_bufio_client_count--;
1560 __cache_size_refresh();
1561
1562 mutex_unlock(&dm_bufio_clients_lock);
1563
1564 for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
1565 BUG_ON(!hlist_empty(&c->cache_hash[i]));
1566
1567 BUG_ON(c->need_reserved_buffers);
1568
1569 while (!list_empty(&c->reserved_buffers)) {
1570 struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1571 struct dm_buffer, lru_list);
1572 list_del(&b->lru_list);
1573 free_buffer(b);
1574 }
1575
1576 for (i = 0; i < LIST_SIZE; i++)
1577 if (c->n_buffers[i])
1578 DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);
1579
1580 for (i = 0; i < LIST_SIZE; i++)
1581 BUG_ON(c->n_buffers[i]);
1582
1583 dm_io_client_destroy(c->dm_io);
1584 vfree(c->cache_hash);
1585 kfree(c);
1586}
1587EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
1588
1589static void cleanup_old_buffers(void)
1590{
1591 unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age);
1592 struct dm_bufio_client *c;
1593
1594 if (max_age > ULONG_MAX / HZ)
1595 max_age = ULONG_MAX / HZ;
1596
1597 mutex_lock(&dm_bufio_clients_lock);
1598 list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
1599 if (!dm_bufio_trylock(c))
1600 continue;
1601
1602 while (!list_empty(&c->lru[LIST_CLEAN])) {
1603 struct dm_buffer *b;
1604 b = list_entry(c->lru[LIST_CLEAN].prev,
1605 struct dm_buffer, lru_list);
1606 if (__cleanup_old_buffer(b, 0, max_age * HZ))
1607 break;
1608 dm_bufio_cond_resched();
1609 }
1610
1611 dm_bufio_unlock(c);
1612 dm_bufio_cond_resched();
1613 }
1614 mutex_unlock(&dm_bufio_clients_lock);
1615}
1616
1617static struct workqueue_struct *dm_bufio_wq;
1618static struct delayed_work dm_bufio_work;
1619
1620static void work_fn(struct work_struct *w)
1621{
1622 cleanup_old_buffers();
1623
1624 queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
1625 DM_BUFIO_WORK_TIMER_SECS * HZ);
1626}
1627
1628/*----------------------------------------------------------------
1629 * Module setup
1630 *--------------------------------------------------------------*/
1631
1632/*
1633 * This is called only once for the whole dm_bufio module.
1634 * It initializes memory limit.
1635 */
1636static int __init dm_bufio_init(void)
1637{
1638 __u64 mem;
1639
1640 memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
1641 memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);
1642
1643 mem = (__u64)((totalram_pages - totalhigh_pages) *
1644 DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT;
1645
1646 if (mem > ULONG_MAX)
1647 mem = ULONG_MAX;
1648
1649#ifdef CONFIG_MMU
1650 /*
1651 * Get the size of vmalloc space the same way as VMALLOC_TOTAL
1652 * in fs/proc/internal.h
1653 */
1654 if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100)
1655 mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100;
1656#endif
1657
1658 dm_bufio_default_cache_size = mem;
1659
1660 mutex_lock(&dm_bufio_clients_lock);
1661 __cache_size_refresh();
1662 mutex_unlock(&dm_bufio_clients_lock);
1663
1664 dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache");
1665 if (!dm_bufio_wq)
1666 return -ENOMEM;
1667
1668 INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
1669 queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
1670 DM_BUFIO_WORK_TIMER_SECS * HZ);
1671
1672 return 0;
1673}
1674
1675/*
1676 * This is called once when unloading the dm_bufio module.
1677 */
1678static void __exit dm_bufio_exit(void)
1679{
1680 int bug = 0;
1681 int i;
1682
1683 cancel_delayed_work_sync(&dm_bufio_work);
1684 destroy_workqueue(dm_bufio_wq);
1685
1686 for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) {
1687 struct kmem_cache *kc = dm_bufio_caches[i];
1688
1689 if (kc)
1690 kmem_cache_destroy(kc);
1691 }
1692
1693 for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
1694 kfree(dm_bufio_cache_names[i]);
1695
1696 if (dm_bufio_client_count) {
1697 DMCRIT("%s: dm_bufio_client_count leaked: %d",
1698 __func__, dm_bufio_client_count);
1699 bug = 1;
1700 }
1701
1702 if (dm_bufio_current_allocated) {
1703 DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
1704 __func__, dm_bufio_current_allocated);
1705 bug = 1;
1706 }
1707
1708 if (dm_bufio_allocated_get_free_pages) {
1709 DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
1710 __func__, dm_bufio_allocated_get_free_pages);
1711 bug = 1;
1712 }
1713
1714 if (dm_bufio_allocated_vmalloc) {
1715 DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
1716 __func__, dm_bufio_allocated_vmalloc);
1717 bug = 1;
1718 }
1719
1720 if (bug)
1721 BUG();
1722}
1723
1724module_init(dm_bufio_init)
1725module_exit(dm_bufio_exit)
1726
1727module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
1728MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
1729
1730module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
1731MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
1732
1733module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
1734MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
1735
1736module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
1737MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
1738
1739module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
1740MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
1741
1742module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
1743MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
1744
1745module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
1746MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
1747
1748MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
1749MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
1750MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h
deleted file mode 100644
index b142946a9e3..00000000000
--- a/drivers/md/dm-bufio.h
+++ /dev/null
@@ -1,120 +0,0 @@
1/*
2 * Copyright (C) 2009-2011 Red Hat, Inc.
3 *
4 * Author: Mikulas Patocka <mpatocka@redhat.com>
5 *
6 * This file is released under the GPL.
7 */
8
9#ifndef DM_BUFIO_H
10#define DM_BUFIO_H
11
12#include <linux/blkdev.h>
13#include <linux/types.h>
14
15/*----------------------------------------------------------------*/
16
17struct dm_bufio_client;
18struct dm_buffer;
19
20/*
21 * Create a buffered IO cache on a given device
22 */
23struct dm_bufio_client *
24dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
25 unsigned reserved_buffers, unsigned aux_size,
26 void (*alloc_callback)(struct dm_buffer *),
27 void (*write_callback)(struct dm_buffer *));
28
29/*
30 * Release a buffered IO cache.
31 */
32void dm_bufio_client_destroy(struct dm_bufio_client *c);
33
34/*
35 * WARNING: to avoid deadlocks, these conditions are observed:
36 *
37 * - At most one thread can hold at most "reserved_buffers" simultaneously.
38 * - Each other thread can hold at most one buffer.
39 * - Threads which call only dm_bufio_get can hold an unlimited number of
40 * buffers.
41 */
42
43/*
44 * Read a given block from disk. Returns pointer to data. Returns a
45 * pointer to dm_buffer that can be used to release the buffer or to make
46 * it dirty.
47 */
48void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
49 struct dm_buffer **bp);
50
51/*
52 * Like dm_bufio_read, but return buffer from cache, don't read
53 * it. If the buffer is not in the cache, return NULL.
54 */
55void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
56 struct dm_buffer **bp);
57
58/*
59 * Like dm_bufio_read, but don't read anything from the disk. It is
60 * expected that the caller initializes the buffer and marks it dirty.
61 */
62void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
63 struct dm_buffer **bp);
64
65/*
66 * Prefetch the specified blocks to the cache.
67 * The function starts to read the blocks and returns without waiting for
68 * I/O to finish.
69 */
70void dm_bufio_prefetch(struct dm_bufio_client *c,
71 sector_t block, unsigned n_blocks);
72
73/*
74 * Release a reference obtained with dm_bufio_{read,get,new}. The data
 75 * pointer and dm_buffer pointer are no longer valid after this call.
76 */
77void dm_bufio_release(struct dm_buffer *b);
78
79/*
80 * Mark a buffer dirty. It should be called after the buffer is modified.
81 *
82 * In case of memory pressure, the buffer may be written after
83 * dm_bufio_mark_buffer_dirty, but before dm_bufio_write_dirty_buffers. So
84 * dm_bufio_write_dirty_buffers guarantees that the buffer is on-disk but
85 * the actual writing may occur earlier.
86 */
87void dm_bufio_mark_buffer_dirty(struct dm_buffer *b);
88
89/*
90 * Initiate writing of dirty buffers, without waiting for completion.
91 */
92void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c);
93
94/*
95 * Write all dirty buffers. Guarantees that all dirty buffers created prior
96 * to this call are on disk when this call exits.
97 */
98int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c);
99
100/*
 101 * Send an empty write barrier to the device to flush the hardware disk cache.
102 */
103int dm_bufio_issue_flush(struct dm_bufio_client *c);
104
105/*
106 * Like dm_bufio_release but also move the buffer to the new
107 * block. dm_bufio_write_dirty_buffers is needed to commit the new block.
108 */
109void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block);
110
111unsigned dm_bufio_get_block_size(struct dm_bufio_client *c);
112sector_t dm_bufio_get_device_size(struct dm_bufio_client *c);
113sector_t dm_bufio_get_block_number(struct dm_buffer *b);
114void *dm_bufio_get_block_data(struct dm_buffer *b);
115void *dm_bufio_get_aux_data(struct dm_buffer *b);
116struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b);
117
118/*----------------------------------------------------------------*/
119
120#endif
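
To make the interface above concrete, a minimal usage sketch follows (illustrative only, not part of the patch; it assumes the usual ERR_PTR convention on the create/read error paths, an already-opened block device, and the hypothetical example_update_block() name):

#include <linux/err.h>
#include "dm-bufio.h"

/* Read one block, modify it, and push it to disk. */
static int example_update_block(struct block_device *bdev, sector_t block)
{
	struct dm_bufio_client *c;
	struct dm_buffer *b;
	u8 *data;
	int r;

	/* 4 KiB blocks, one reserved buffer, no aux data, no callbacks */
	c = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL);
	if (IS_ERR(c))
		return PTR_ERR(c);

	data = dm_bufio_read(c, block, &b);	/* read the block and pin the buffer */
	if (IS_ERR(data)) {
		r = PTR_ERR(data);
		goto out;
	}

	data[0] ^= 0xff;			/* modify the cached data */
	dm_bufio_mark_buffer_dirty(b);		/* must follow the modification */
	dm_bufio_release(b);			/* unpin; the data may stay cached */

	r = dm_bufio_write_dirty_buffers(c);	/* prior dirty buffers reach the disk */
out:
	dm_bufio_client_destroy(c);
	return r;
}

Per the rules above, this holds at most one buffer at a time, so it stays well inside the per-thread limits.
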
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index f7369f9d859..1f1d3423d39 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -18,14 +18,10 @@
18#include <linux/crypto.h> 18#include <linux/crypto.h>
19#include <linux/workqueue.h> 19#include <linux/workqueue.h>
20#include <linux/backing-dev.h> 20#include <linux/backing-dev.h>
21#include <linux/percpu.h> 21#include <asm/atomic.h>
22#include <linux/atomic.h>
23#include <linux/scatterlist.h> 22#include <linux/scatterlist.h>
24#include <asm/page.h> 23#include <asm/page.h>
25#include <asm/unaligned.h> 24#include <asm/unaligned.h>
26#include <crypto/hash.h>
27#include <crypto/md5.h>
28#include <crypto/algapi.h>
29 25
30#include <linux/device-mapper.h> 26#include <linux/device-mapper.h>
31 27
@@ -42,21 +38,21 @@ struct convert_context {
42 unsigned int offset_out; 38 unsigned int offset_out;
43 unsigned int idx_in; 39 unsigned int idx_in;
44 unsigned int idx_out; 40 unsigned int idx_out;
45 sector_t cc_sector; 41 sector_t sector;
46 atomic_t cc_pending; 42 atomic_t pending;
47}; 43};
48 44
49/* 45/*
50 * per bio private data 46 * per bio private data
51 */ 47 */
52struct dm_crypt_io { 48struct dm_crypt_io {
53 struct crypt_config *cc; 49 struct dm_target *target;
54 struct bio *base_bio; 50 struct bio *base_bio;
55 struct work_struct work; 51 struct work_struct work;
56 52
57 struct convert_context ctx; 53 struct convert_context ctx;
58 54
59 atomic_t io_pending; 55 atomic_t pending;
60 int error; 56 int error;
61 sector_t sector; 57 sector_t sector;
62 struct dm_crypt_io *base_io; 58 struct dm_crypt_io *base_io;
@@ -66,7 +62,6 @@ struct dm_crypt_request {
66 struct convert_context *ctx; 62 struct convert_context *ctx;
67 struct scatterlist sg_in; 63 struct scatterlist sg_in;
68 struct scatterlist sg_out; 64 struct scatterlist sg_out;
69 sector_t iv_sector;
70}; 65};
71 66
72struct crypt_config; 67struct crypt_config;
@@ -77,13 +72,11 @@ struct crypt_iv_operations {
77 void (*dtr)(struct crypt_config *cc); 72 void (*dtr)(struct crypt_config *cc);
78 int (*init)(struct crypt_config *cc); 73 int (*init)(struct crypt_config *cc);
79 int (*wipe)(struct crypt_config *cc); 74 int (*wipe)(struct crypt_config *cc);
80 int (*generator)(struct crypt_config *cc, u8 *iv, 75 int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
81 struct dm_crypt_request *dmreq);
82 int (*post)(struct crypt_config *cc, u8 *iv,
83 struct dm_crypt_request *dmreq);
84}; 76};
85 77
86struct iv_essiv_private { 78struct iv_essiv_private {
79 struct crypto_cipher *tfm;
87 struct crypto_hash *hash_tfm; 80 struct crypto_hash *hash_tfm;
88 u8 *salt; 81 u8 *salt;
89}; 82};
@@ -92,29 +85,11 @@ struct iv_benbi_private {
92 int shift; 85 int shift;
93}; 86};
94 87
95#define LMK_SEED_SIZE 64 /* hash + 0 */
96struct iv_lmk_private {
97 struct crypto_shash *hash_tfm;
98 u8 *seed;
99};
100
101/* 88/*
102 * Crypt: maps a linear range of a block device 89 * Crypt: maps a linear range of a block device
103 * and encrypts / decrypts at the same time. 90 * and encrypts / decrypts at the same time.
104 */ 91 */
105enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; 92enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
106
107/*
108 * Duplicated per-CPU state for cipher.
109 */
110struct crypt_cpu {
111 struct ablkcipher_request *req;
112};
113
114/*
115 * The fields in here must be read only after initialization,
116 * changing state should be in crypt_cpu.
117 */
118struct crypt_config { 93struct crypt_config {
119 struct dm_dev *dev; 94 struct dm_dev *dev;
120 sector_t start; 95 sector_t start;
@@ -138,23 +113,11 @@ struct crypt_config {
138 union { 113 union {
139 struct iv_essiv_private essiv; 114 struct iv_essiv_private essiv;
140 struct iv_benbi_private benbi; 115 struct iv_benbi_private benbi;
141 struct iv_lmk_private lmk;
142 } iv_gen_private; 116 } iv_gen_private;
143 sector_t iv_offset; 117 sector_t iv_offset;
144 unsigned int iv_size; 118 unsigned int iv_size;
145 119
146 /* 120 /*
147 * Duplicated per cpu state. Access through
148 * per_cpu_ptr() only.
149 */
150 struct crypt_cpu __percpu *cpu;
151
152 /* ESSIV: struct crypto_cipher *essiv_tfm */
153 void *iv_private;
154 struct crypto_ablkcipher **tfms;
155 unsigned tfms_count;
156
157 /*
158 * Layout of each crypto request: 121 * Layout of each crypto request:
159 * 122 *
160 * struct ablkcipher_request 123 * struct ablkcipher_request
@@ -168,34 +131,22 @@ struct crypt_config {
168 * correctly aligned. 131 * correctly aligned.
169 */ 132 */
170 unsigned int dmreq_start; 133 unsigned int dmreq_start;
134 struct ablkcipher_request *req;
171 135
136 struct crypto_ablkcipher *tfm;
172 unsigned long flags; 137 unsigned long flags;
173 unsigned int key_size; 138 unsigned int key_size;
174 unsigned int key_parts;
175 u8 key[0]; 139 u8 key[0];
176}; 140};
177 141
178#define MIN_IOS 16 142#define MIN_IOS 16
179#define MIN_POOL_PAGES 32 143#define MIN_POOL_PAGES 32
144#define MIN_BIO_PAGES 8
180 145
181static struct kmem_cache *_crypt_io_pool; 146static struct kmem_cache *_crypt_io_pool;
182 147
183static void clone_init(struct dm_crypt_io *, struct bio *); 148static void clone_init(struct dm_crypt_io *, struct bio *);
184static void kcryptd_queue_crypt(struct dm_crypt_io *io); 149static void kcryptd_queue_crypt(struct dm_crypt_io *io);
185static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
186
187static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
188{
189 return this_cpu_ptr(cc->cpu);
190}
191
192/*
193 * Use this to access cipher attributes that are the same for each CPU.
194 */
195static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
196{
197 return cc->tfms[0];
198}
199 150
200/* 151/*
201 * Different IV generation algorithms: 152 * Different IV generation algorithms:
@@ -216,38 +167,23 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
216 * null: the initial vector is always zero. Provides compatibility with 167 * null: the initial vector is always zero. Provides compatibility with
217 * obsolete loop_fish2 devices. Do not use for new devices. 168 * obsolete loop_fish2 devices. Do not use for new devices.
218 * 169 *
219 * lmk: Compatible implementation of the block chaining mode used
220 * by the Loop-AES block device encryption system
221 * designed by Jari Ruusu. See http://loop-aes.sourceforge.net/
222 * It operates on full 512 byte sectors and uses CBC
223 * with an IV derived from the sector number, the data and
 224 * optionally an extra IV seed.
 225 * This means that after decryption the first block
 226 * of the sector must be tweaked according to the decrypted data.
227 * Loop-AES can use three encryption schemes:
228 * version 1: is plain aes-cbc mode
229 * version 2: uses 64 multikey scheme with lmk IV generator
230 * version 3: the same as version 2 with additional IV seed
231 * (it uses 65 keys, last key is used as IV seed)
232 *
233 * plumb: unimplemented, see: 170 * plumb: unimplemented, see:
234 * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 171 * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
235 */ 172 */
236 173
237static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, 174static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
238 struct dm_crypt_request *dmreq)
239{ 175{
240 memset(iv, 0, cc->iv_size); 176 memset(iv, 0, cc->iv_size);
241 *(__le32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff); 177 *(u32 *)iv = cpu_to_le32(sector & 0xffffffff);
242 178
243 return 0; 179 return 0;
244} 180}
245 181
246static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, 182static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
247 struct dm_crypt_request *dmreq) 183 sector_t sector)
248{ 184{
249 memset(iv, 0, cc->iv_size); 185 memset(iv, 0, cc->iv_size);
250 *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); 186 *(u64 *)iv = cpu_to_le64(sector);
251 187
252 return 0; 188 return 0;
253} 189}
@@ -258,7 +194,6 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
258 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; 194 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
259 struct hash_desc desc; 195 struct hash_desc desc;
260 struct scatterlist sg; 196 struct scatterlist sg;
261 struct crypto_cipher *essiv_tfm;
262 int err; 197 int err;
263 198
264 sg_init_one(&sg, cc->key, cc->key_size); 199 sg_init_one(&sg, cc->key, cc->key_size);
@@ -269,14 +204,8 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
269 if (err) 204 if (err)
270 return err; 205 return err;
271 206
272 essiv_tfm = cc->iv_private; 207 return crypto_cipher_setkey(essiv->tfm, essiv->salt,
273 208 crypto_hash_digestsize(essiv->hash_tfm));
274 err = crypto_cipher_setkey(essiv_tfm, essiv->salt,
275 crypto_hash_digestsize(essiv->hash_tfm));
276 if (err)
277 return err;
278
279 return 0;
280} 209}
281 210
282/* Wipe salt and reset key derived from volume key */ 211/* Wipe salt and reset key derived from volume key */
@@ -284,69 +213,24 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc)
284{ 213{
285 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; 214 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
286 unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); 215 unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
287 struct crypto_cipher *essiv_tfm;
288 int r, err = 0;
289 216
290 memset(essiv->salt, 0, salt_size); 217 memset(essiv->salt, 0, salt_size);
291 218
292 essiv_tfm = cc->iv_private; 219 return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size);
293 r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size);
294 if (r)
295 err = r;
296
297 return err;
298}
299
300/* Set up per cpu cipher state */
301static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
302 struct dm_target *ti,
303 u8 *salt, unsigned saltsize)
304{
305 struct crypto_cipher *essiv_tfm;
306 int err;
307
308 /* Setup the essiv_tfm with the given salt */
309 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
310 if (IS_ERR(essiv_tfm)) {
311 ti->error = "Error allocating crypto tfm for ESSIV";
312 return essiv_tfm;
313 }
314
315 if (crypto_cipher_blocksize(essiv_tfm) !=
316 crypto_ablkcipher_ivsize(any_tfm(cc))) {
317 ti->error = "Block size of ESSIV cipher does "
318 "not match IV size of block cipher";
319 crypto_free_cipher(essiv_tfm);
320 return ERR_PTR(-EINVAL);
321 }
322
323 err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
324 if (err) {
325 ti->error = "Failed to set key for ESSIV cipher";
326 crypto_free_cipher(essiv_tfm);
327 return ERR_PTR(err);
328 }
329
330 return essiv_tfm;
331} 220}
332 221
333static void crypt_iv_essiv_dtr(struct crypt_config *cc) 222static void crypt_iv_essiv_dtr(struct crypt_config *cc)
334{ 223{
335 struct crypto_cipher *essiv_tfm;
336 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; 224 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
337 225
226 crypto_free_cipher(essiv->tfm);
227 essiv->tfm = NULL;
228
338 crypto_free_hash(essiv->hash_tfm); 229 crypto_free_hash(essiv->hash_tfm);
339 essiv->hash_tfm = NULL; 230 essiv->hash_tfm = NULL;
340 231
341 kzfree(essiv->salt); 232 kzfree(essiv->salt);
342 essiv->salt = NULL; 233 essiv->salt = NULL;
343
344 essiv_tfm = cc->iv_private;
345
346 if (essiv_tfm)
347 crypto_free_cipher(essiv_tfm);
348
349 cc->iv_private = NULL;
350} 234}
351 235
352static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, 236static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
@@ -377,42 +261,48 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
377 goto bad; 261 goto bad;
378 } 262 }
379 263
380 cc->iv_gen_private.essiv.salt = salt; 264 /* Allocate essiv_tfm */
381 cc->iv_gen_private.essiv.hash_tfm = hash_tfm; 265 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
382
383 essiv_tfm = setup_essiv_cpu(cc, ti, salt,
384 crypto_hash_digestsize(hash_tfm));
385 if (IS_ERR(essiv_tfm)) { 266 if (IS_ERR(essiv_tfm)) {
386 crypt_iv_essiv_dtr(cc); 267 ti->error = "Error allocating crypto tfm for ESSIV";
387 return PTR_ERR(essiv_tfm); 268 err = PTR_ERR(essiv_tfm);
269 goto bad;
388 } 270 }
389 cc->iv_private = essiv_tfm; 271 if (crypto_cipher_blocksize(essiv_tfm) !=
272 crypto_ablkcipher_ivsize(cc->tfm)) {
273 ti->error = "Block size of ESSIV cipher does "
274 "not match IV size of block cipher";
275 err = -EINVAL;
276 goto bad;
277 }
278
279 cc->iv_gen_private.essiv.salt = salt;
280 cc->iv_gen_private.essiv.tfm = essiv_tfm;
281 cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
390 282
391 return 0; 283 return 0;
392 284
393bad: 285bad:
286 if (essiv_tfm && !IS_ERR(essiv_tfm))
287 crypto_free_cipher(essiv_tfm);
394 if (hash_tfm && !IS_ERR(hash_tfm)) 288 if (hash_tfm && !IS_ERR(hash_tfm))
395 crypto_free_hash(hash_tfm); 289 crypto_free_hash(hash_tfm);
396 kfree(salt); 290 kfree(salt);
397 return err; 291 return err;
398} 292}
399 293
400static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, 294static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
401 struct dm_crypt_request *dmreq)
402{ 295{
403 struct crypto_cipher *essiv_tfm = cc->iv_private;
404
405 memset(iv, 0, cc->iv_size); 296 memset(iv, 0, cc->iv_size);
406 *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); 297 *(u64 *)iv = cpu_to_le64(sector);
407 crypto_cipher_encrypt_one(essiv_tfm, iv, iv); 298 crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv);
408
409 return 0; 299 return 0;
410} 300}
411 301
412static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, 302static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
413 const char *opts) 303 const char *opts)
414{ 304{
415 unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc)); 305 unsigned bs = crypto_ablkcipher_blocksize(cc->tfm);
416 int log = ilog2(bs); 306 int log = ilog2(bs);
417 307
418 /* we need to calculate how far we must shift the sector count 308 /* we need to calculate how far we must shift the sector count
@@ -437,177 +327,25 @@ static void crypt_iv_benbi_dtr(struct crypt_config *cc)
437{ 327{
438} 328}
439 329
440static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, 330static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
441 struct dm_crypt_request *dmreq)
442{ 331{
443 __be64 val; 332 __be64 val;
444 333
445 memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ 334 memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
446 335
447 val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1); 336 val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1);
448 put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); 337 put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
449 338
450 return 0; 339 return 0;
451} 340}
452 341
453static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, 342static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
454 struct dm_crypt_request *dmreq)
455{ 343{
456 memset(iv, 0, cc->iv_size); 344 memset(iv, 0, cc->iv_size);
457 345
458 return 0; 346 return 0;
459} 347}
460 348
461static void crypt_iv_lmk_dtr(struct crypt_config *cc)
462{
463 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
464
465 if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm))
466 crypto_free_shash(lmk->hash_tfm);
467 lmk->hash_tfm = NULL;
468
469 kzfree(lmk->seed);
470 lmk->seed = NULL;
471}
472
473static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti,
474 const char *opts)
475{
476 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
477
478 lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0);
479 if (IS_ERR(lmk->hash_tfm)) {
480 ti->error = "Error initializing LMK hash";
481 return PTR_ERR(lmk->hash_tfm);
482 }
483
484 /* No seed in LMK version 2 */
485 if (cc->key_parts == cc->tfms_count) {
486 lmk->seed = NULL;
487 return 0;
488 }
489
490 lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL);
491 if (!lmk->seed) {
492 crypt_iv_lmk_dtr(cc);
493 ti->error = "Error kmallocing seed storage in LMK";
494 return -ENOMEM;
495 }
496
497 return 0;
498}
499
500static int crypt_iv_lmk_init(struct crypt_config *cc)
501{
502 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
503 int subkey_size = cc->key_size / cc->key_parts;
504
505 /* LMK seed is on the position of LMK_KEYS + 1 key */
506 if (lmk->seed)
507 memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size),
508 crypto_shash_digestsize(lmk->hash_tfm));
509
510 return 0;
511}
512
513static int crypt_iv_lmk_wipe(struct crypt_config *cc)
514{
515 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
516
517 if (lmk->seed)
518 memset(lmk->seed, 0, LMK_SEED_SIZE);
519
520 return 0;
521}
522
523static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
524 struct dm_crypt_request *dmreq,
525 u8 *data)
526{
527 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
528 struct {
529 struct shash_desc desc;
530 char ctx[crypto_shash_descsize(lmk->hash_tfm)];
531 } sdesc;
532 struct md5_state md5state;
533 u32 buf[4];
534 int i, r;
535
536 sdesc.desc.tfm = lmk->hash_tfm;
537 sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
538
539 r = crypto_shash_init(&sdesc.desc);
540 if (r)
541 return r;
542
543 if (lmk->seed) {
544 r = crypto_shash_update(&sdesc.desc, lmk->seed, LMK_SEED_SIZE);
545 if (r)
546 return r;
547 }
548
549 /* Sector is always 512B, block size 16, add data of blocks 1-31 */
550 r = crypto_shash_update(&sdesc.desc, data + 16, 16 * 31);
551 if (r)
552 return r;
553
554 /* Sector is cropped to 56 bits here */
555 buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF);
556 buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000);
557 buf[2] = cpu_to_le32(4024);
558 buf[3] = 0;
559 r = crypto_shash_update(&sdesc.desc, (u8 *)buf, sizeof(buf));
560 if (r)
561 return r;
562
563 /* No MD5 padding here */
564 r = crypto_shash_export(&sdesc.desc, &md5state);
565 if (r)
566 return r;
567
568 for (i = 0; i < MD5_HASH_WORDS; i++)
569 __cpu_to_le32s(&md5state.hash[i]);
570 memcpy(iv, &md5state.hash, cc->iv_size);
571
572 return 0;
573}
574
575static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv,
576 struct dm_crypt_request *dmreq)
577{
578 u8 *src;
579 int r = 0;
580
581 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
582 src = kmap_atomic(sg_page(&dmreq->sg_in));
583 r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset);
584 kunmap_atomic(src);
585 } else
586 memset(iv, 0, cc->iv_size);
587
588 return r;
589}
590
591static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
592 struct dm_crypt_request *dmreq)
593{
594 u8 *dst;
595 int r;
596
597 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE)
598 return 0;
599
600 dst = kmap_atomic(sg_page(&dmreq->sg_out));
601 r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset);
602
603 /* Tweak the first block of plaintext sector */
604 if (!r)
605 crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size);
606
607 kunmap_atomic(dst);
608 return r;
609}
610
611static struct crypt_iv_operations crypt_iv_plain_ops = { 349static struct crypt_iv_operations crypt_iv_plain_ops = {
612 .generator = crypt_iv_plain_gen 350 .generator = crypt_iv_plain_gen
613}; 351};
@@ -634,15 +372,6 @@ static struct crypt_iv_operations crypt_iv_null_ops = {
634 .generator = crypt_iv_null_gen 372 .generator = crypt_iv_null_gen
635}; 373};
636 374
637static struct crypt_iv_operations crypt_iv_lmk_ops = {
638 .ctr = crypt_iv_lmk_ctr,
639 .dtr = crypt_iv_lmk_dtr,
640 .init = crypt_iv_lmk_init,
641 .wipe = crypt_iv_lmk_wipe,
642 .generator = crypt_iv_lmk_gen,
643 .post = crypt_iv_lmk_post
644};
645
646static void crypt_convert_init(struct crypt_config *cc, 375static void crypt_convert_init(struct crypt_config *cc,
647 struct convert_context *ctx, 376 struct convert_context *ctx,
648 struct bio *bio_out, struct bio *bio_in, 377 struct bio *bio_out, struct bio *bio_in,
@@ -654,7 +383,7 @@ static void crypt_convert_init(struct crypt_config *cc,
654 ctx->offset_out = 0; 383 ctx->offset_out = 0;
655 ctx->idx_in = bio_in ? bio_in->bi_idx : 0; 384 ctx->idx_in = bio_in ? bio_in->bi_idx : 0;
656 ctx->idx_out = bio_out ? bio_out->bi_idx : 0; 385 ctx->idx_out = bio_out ? bio_out->bi_idx : 0;
657 ctx->cc_sector = sector + cc->iv_offset; 386 ctx->sector = sector + cc->iv_offset;
658 init_completion(&ctx->restart); 387 init_completion(&ctx->restart);
659} 388}
660 389
@@ -670,13 +399,6 @@ static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc,
670 return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); 399 return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start);
671} 400}
672 401
673static u8 *iv_of_dmreq(struct crypt_config *cc,
674 struct dm_crypt_request *dmreq)
675{
676 return (u8 *)ALIGN((unsigned long)(dmreq + 1),
677 crypto_ablkcipher_alignmask(any_tfm(cc)) + 1);
678}
679
680static int crypt_convert_block(struct crypt_config *cc, 402static int crypt_convert_block(struct crypt_config *cc,
681 struct convert_context *ctx, 403 struct convert_context *ctx,
682 struct ablkcipher_request *req) 404 struct ablkcipher_request *req)
@@ -685,12 +407,12 @@ static int crypt_convert_block(struct crypt_config *cc,
685 struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out); 407 struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out);
686 struct dm_crypt_request *dmreq; 408 struct dm_crypt_request *dmreq;
687 u8 *iv; 409 u8 *iv;
688 int r; 410 int r = 0;
689 411
690 dmreq = dmreq_of_req(cc, req); 412 dmreq = dmreq_of_req(cc, req);
691 iv = iv_of_dmreq(cc, dmreq); 413 iv = (u8 *)ALIGN((unsigned long)(dmreq + 1),
414 crypto_ablkcipher_alignmask(cc->tfm) + 1);
692 415
693 dmreq->iv_sector = ctx->cc_sector;
694 dmreq->ctx = ctx; 416 dmreq->ctx = ctx;
695 sg_init_table(&dmreq->sg_in, 1); 417 sg_init_table(&dmreq->sg_in, 1);
696 sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, 418 sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT,
@@ -713,7 +435,7 @@ static int crypt_convert_block(struct crypt_config *cc,
713 } 435 }
714 436
715 if (cc->iv_gen_ops) { 437 if (cc->iv_gen_ops) {
716 r = cc->iv_gen_ops->generator(cc, iv, dmreq); 438 r = cc->iv_gen_ops->generator(cc, iv, ctx->sector);
717 if (r < 0) 439 if (r < 0)
718 return r; 440 return r;
719 } 441 }
@@ -726,28 +448,21 @@ static int crypt_convert_block(struct crypt_config *cc,
726 else 448 else
727 r = crypto_ablkcipher_decrypt(req); 449 r = crypto_ablkcipher_decrypt(req);
728 450
729 if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
730 r = cc->iv_gen_ops->post(cc, iv, dmreq);
731
732 return r; 451 return r;
733} 452}
734 453
735static void kcryptd_async_done(struct crypto_async_request *async_req, 454static void kcryptd_async_done(struct crypto_async_request *async_req,
736 int error); 455 int error);
737
738static void crypt_alloc_req(struct crypt_config *cc, 456static void crypt_alloc_req(struct crypt_config *cc,
739 struct convert_context *ctx) 457 struct convert_context *ctx)
740{ 458{
741 struct crypt_cpu *this_cc = this_crypt_config(cc); 459 if (!cc->req)
742 unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1); 460 cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
743 461 ablkcipher_request_set_tfm(cc->req, cc->tfm);
744 if (!this_cc->req) 462 ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG |
745 this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); 463 CRYPTO_TFM_REQ_MAY_SLEEP,
746 464 kcryptd_async_done,
747 ablkcipher_request_set_tfm(this_cc->req, cc->tfms[key_index]); 465 dmreq_of_req(cc, cc->req));
748 ablkcipher_request_set_callback(this_cc->req,
749 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
750 kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
751} 466}
752 467
753/* 468/*
@@ -756,19 +471,18 @@ static void crypt_alloc_req(struct crypt_config *cc,
756static int crypt_convert(struct crypt_config *cc, 471static int crypt_convert(struct crypt_config *cc,
757 struct convert_context *ctx) 472 struct convert_context *ctx)
758{ 473{
759 struct crypt_cpu *this_cc = this_crypt_config(cc);
760 int r; 474 int r;
761 475
762 atomic_set(&ctx->cc_pending, 1); 476 atomic_set(&ctx->pending, 1);
763 477
764 while(ctx->idx_in < ctx->bio_in->bi_vcnt && 478 while(ctx->idx_in < ctx->bio_in->bi_vcnt &&
765 ctx->idx_out < ctx->bio_out->bi_vcnt) { 479 ctx->idx_out < ctx->bio_out->bi_vcnt) {
766 480
767 crypt_alloc_req(cc, ctx); 481 crypt_alloc_req(cc, ctx);
768 482
769 atomic_inc(&ctx->cc_pending); 483 atomic_inc(&ctx->pending);
770 484
771 r = crypt_convert_block(cc, ctx, this_cc->req); 485 r = crypt_convert_block(cc, ctx, cc->req);
772 486
773 switch (r) { 487 switch (r) {
774 /* async */ 488 /* async */
@@ -777,20 +491,20 @@ static int crypt_convert(struct crypt_config *cc,
777 INIT_COMPLETION(ctx->restart); 491 INIT_COMPLETION(ctx->restart);
778 /* fall through*/ 492 /* fall through*/
779 case -EINPROGRESS: 493 case -EINPROGRESS:
780 this_cc->req = NULL; 494 cc->req = NULL;
781 ctx->cc_sector++; 495 ctx->sector++;
782 continue; 496 continue;
783 497
784 /* sync */ 498 /* sync */
785 case 0: 499 case 0:
786 atomic_dec(&ctx->cc_pending); 500 atomic_dec(&ctx->pending);
787 ctx->cc_sector++; 501 ctx->sector++;
788 cond_resched(); 502 cond_resched();
789 continue; 503 continue;
790 504
791 /* error */ 505 /* error */
792 default: 506 default:
793 atomic_dec(&ctx->cc_pending); 507 atomic_dec(&ctx->pending);
794 return r; 508 return r;
795 } 509 }
796 } 510 }
@@ -798,6 +512,14 @@ static int crypt_convert(struct crypt_config *cc,
798 return 0; 512 return 0;
799} 513}
800 514
515static void dm_crypt_bio_destructor(struct bio *bio)
516{
517 struct dm_crypt_io *io = bio->bi_private;
518 struct crypt_config *cc = io->target->private;
519
520 bio_free(bio, cc->bs);
521}
522
801/* 523/*
802 * Generate a new unfragmented bio with the given size 524 * Generate a new unfragmented bio with the given size
803 * This should never violate the device limitations 525 * This should never violate the device limitations
@@ -807,7 +529,7 @@ static int crypt_convert(struct crypt_config *cc,
807static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size, 529static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
808 unsigned *out_of_pages) 530 unsigned *out_of_pages)
809{ 531{
810 struct crypt_config *cc = io->cc; 532 struct crypt_config *cc = io->target->private;
811 struct bio *clone; 533 struct bio *clone;
812 unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 534 unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
813 gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM; 535 gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
@@ -829,11 +551,12 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
829 } 551 }
830 552
831 /* 553 /*
832 * If additional pages cannot be allocated without waiting, 554 * if additional pages cannot be allocated without waiting,
833 * return a partially-allocated bio. The caller will then try 555 * return a partially allocated bio, the caller will then try
834 * to allocate more bios while submitting this partial bio. 556 * to allocate additional bios while submitting this partial bio
835 */ 557 */
836 gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT; 558 if (i == (MIN_BIO_PAGES - 1))
559 gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
837 560
838 len = (size > PAGE_SIZE) ? PAGE_SIZE : size; 561 len = (size > PAGE_SIZE) ? PAGE_SIZE : size;
839 562
@@ -866,25 +589,26 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
866 } 589 }
867} 590}
868 591
869static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc, 592static struct dm_crypt_io *crypt_io_alloc(struct dm_target *ti,
870 struct bio *bio, sector_t sector) 593 struct bio *bio, sector_t sector)
871{ 594{
595 struct crypt_config *cc = ti->private;
872 struct dm_crypt_io *io; 596 struct dm_crypt_io *io;
873 597
874 io = mempool_alloc(cc->io_pool, GFP_NOIO); 598 io = mempool_alloc(cc->io_pool, GFP_NOIO);
875 io->cc = cc; 599 io->target = ti;
876 io->base_bio = bio; 600 io->base_bio = bio;
877 io->sector = sector; 601 io->sector = sector;
878 io->error = 0; 602 io->error = 0;
879 io->base_io = NULL; 603 io->base_io = NULL;
880 atomic_set(&io->io_pending, 0); 604 atomic_set(&io->pending, 0);
881 605
882 return io; 606 return io;
883} 607}
884 608
885static void crypt_inc_pending(struct dm_crypt_io *io) 609static void crypt_inc_pending(struct dm_crypt_io *io)
886{ 610{
887 atomic_inc(&io->io_pending); 611 atomic_inc(&io->pending);
888} 612}
889 613
890/* 614/*
@@ -894,12 +618,12 @@ static void crypt_inc_pending(struct dm_crypt_io *io)
894 */ 618 */
895static void crypt_dec_pending(struct dm_crypt_io *io) 619static void crypt_dec_pending(struct dm_crypt_io *io)
896{ 620{
897 struct crypt_config *cc = io->cc; 621 struct crypt_config *cc = io->target->private;
898 struct bio *base_bio = io->base_bio; 622 struct bio *base_bio = io->base_bio;
899 struct dm_crypt_io *base_io = io->base_io; 623 struct dm_crypt_io *base_io = io->base_io;
900 int error = io->error; 624 int error = io->error;
901 625
902 if (!atomic_dec_and_test(&io->io_pending)) 626 if (!atomic_dec_and_test(&io->pending))
903 return; 627 return;
904 628
905 mempool_free(io, cc->io_pool); 629 mempool_free(io, cc->io_pool);
@@ -926,14 +650,11 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
926 * They must be separated as otherwise the final stages could be 650 * They must be separated as otherwise the final stages could be
927 * starved by new requests which can block in the first stages due 651 * starved by new requests which can block in the first stages due
928 * to memory allocation. 652 * to memory allocation.
929 *
930 * The work is done per CPU global for all dm-crypt instances.
931 * They should not depend on each other and do not block.
932 */ 653 */
933static void crypt_endio(struct bio *clone, int error) 654static void crypt_endio(struct bio *clone, int error)
934{ 655{
935 struct dm_crypt_io *io = clone->bi_private; 656 struct dm_crypt_io *io = clone->bi_private;
936 struct crypt_config *cc = io->cc; 657 struct crypt_config *cc = io->target->private;
937 unsigned rw = bio_data_dir(clone); 658 unsigned rw = bio_data_dir(clone);
938 659
939 if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error)) 660 if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error))
@@ -960,36 +681,44 @@ static void crypt_endio(struct bio *clone, int error)
960 681
961static void clone_init(struct dm_crypt_io *io, struct bio *clone) 682static void clone_init(struct dm_crypt_io *io, struct bio *clone)
962{ 683{
963 struct crypt_config *cc = io->cc; 684 struct crypt_config *cc = io->target->private;
964 685
965 clone->bi_private = io; 686 clone->bi_private = io;
966 clone->bi_end_io = crypt_endio; 687 clone->bi_end_io = crypt_endio;
967 clone->bi_bdev = cc->dev->bdev; 688 clone->bi_bdev = cc->dev->bdev;
968 clone->bi_rw = io->base_bio->bi_rw; 689 clone->bi_rw = io->base_bio->bi_rw;
690 clone->bi_destructor = dm_crypt_bio_destructor;
969} 691}
970 692
971static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) 693static void kcryptd_io_read(struct dm_crypt_io *io)
972{ 694{
973 struct crypt_config *cc = io->cc; 695 struct crypt_config *cc = io->target->private;
974 struct bio *base_bio = io->base_bio; 696 struct bio *base_bio = io->base_bio;
975 struct bio *clone; 697 struct bio *clone;
976 698
699 crypt_inc_pending(io);
700
977 /* 701 /*
978 * The block layer might modify the bvec array, so always 702 * The block layer might modify the bvec array, so always
979 * copy the required bvecs because we need the original 703 * copy the required bvecs because we need the original
980 * one in order to decrypt the whole bio data *afterwards*. 704 * one in order to decrypt the whole bio data *afterwards*.
981 */ 705 */
982 clone = bio_clone_bioset(base_bio, gfp, cc->bs); 706 clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs);
983 if (!clone) 707 if (unlikely(!clone)) {
984 return 1; 708 io->error = -ENOMEM;
985 709 crypt_dec_pending(io);
986 crypt_inc_pending(io); 710 return;
711 }
987 712
988 clone_init(io, clone); 713 clone_init(io, clone);
714 clone->bi_idx = 0;
715 clone->bi_vcnt = bio_segments(base_bio);
716 clone->bi_size = base_bio->bi_size;
989 clone->bi_sector = cc->start + io->sector; 717 clone->bi_sector = cc->start + io->sector;
718 memcpy(clone->bi_io_vec, bio_iovec(base_bio),
719 sizeof(struct bio_vec) * clone->bi_vcnt);
990 720
991 generic_make_request(clone); 721 generic_make_request(clone);
992 return 0;
993} 722}
994 723
995static void kcryptd_io_write(struct dm_crypt_io *io) 724static void kcryptd_io_write(struct dm_crypt_io *io)
@@ -1002,31 +731,30 @@ static void kcryptd_io(struct work_struct *work)
1002{ 731{
1003 struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); 732 struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
1004 733
1005 if (bio_data_dir(io->base_bio) == READ) { 734 if (bio_data_dir(io->base_bio) == READ)
1006 crypt_inc_pending(io); 735 kcryptd_io_read(io);
1007 if (kcryptd_io_read(io, GFP_NOIO)) 736 else
1008 io->error = -ENOMEM;
1009 crypt_dec_pending(io);
1010 } else
1011 kcryptd_io_write(io); 737 kcryptd_io_write(io);
1012} 738}
1013 739
1014static void kcryptd_queue_io(struct dm_crypt_io *io) 740static void kcryptd_queue_io(struct dm_crypt_io *io)
1015{ 741{
1016 struct crypt_config *cc = io->cc; 742 struct crypt_config *cc = io->target->private;
1017 743
1018 INIT_WORK(&io->work, kcryptd_io); 744 INIT_WORK(&io->work, kcryptd_io);
1019 queue_work(cc->io_queue, &io->work); 745 queue_work(cc->io_queue, &io->work);
1020} 746}
1021 747
1022static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) 748static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io,
749 int error, int async)
1023{ 750{
1024 struct bio *clone = io->ctx.bio_out; 751 struct bio *clone = io->ctx.bio_out;
1025 struct crypt_config *cc = io->cc; 752 struct crypt_config *cc = io->target->private;
1026 753
1027 if (unlikely(io->error < 0)) { 754 if (unlikely(error < 0)) {
1028 crypt_free_buffer_pages(cc, clone); 755 crypt_free_buffer_pages(cc, clone);
1029 bio_put(clone); 756 bio_put(clone);
757 io->error = -EIO;
1030 crypt_dec_pending(io); 758 crypt_dec_pending(io);
1031 return; 759 return;
1032 } 760 }
@@ -1044,7 +772,7 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
1044 772
1045static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) 773static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1046{ 774{
1047 struct crypt_config *cc = io->cc; 775 struct crypt_config *cc = io->target->private;
1048 struct bio *clone; 776 struct bio *clone;
1049 struct dm_crypt_io *new_io; 777 struct dm_crypt_io *new_io;
1050 int crypt_finished; 778 int crypt_finished;
@@ -1077,16 +805,12 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1077 sector += bio_sectors(clone); 805 sector += bio_sectors(clone);
1078 806
1079 crypt_inc_pending(io); 807 crypt_inc_pending(io);
1080
1081 r = crypt_convert(cc, &io->ctx); 808 r = crypt_convert(cc, &io->ctx);
1082 if (r < 0) 809 crypt_finished = atomic_dec_and_test(&io->ctx.pending);
1083 io->error = -EIO;
1084
1085 crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
1086 810
1087 /* Encryption was already finished, submit io now */ 811 /* Encryption was already finished, submit io now */
1088 if (crypt_finished) { 812 if (crypt_finished) {
1089 kcryptd_crypt_write_io_submit(io, 0); 813 kcryptd_crypt_write_io_submit(io, r, 0);
1090 814
1091 /* 815 /*
1092 * If there was an error, do not try next fragments. 816 * If there was an error, do not try next fragments.
@@ -1110,7 +834,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1110 * between fragments, so switch to a new dm_crypt_io structure. 834 * between fragments, so switch to a new dm_crypt_io structure.
1111 */ 835 */
1112 if (unlikely(!crypt_finished && remaining)) { 836 if (unlikely(!crypt_finished && remaining)) {
1113 new_io = crypt_io_alloc(io->cc, io->base_bio, 837 new_io = crypt_io_alloc(io->target, io->base_bio,
1114 sector); 838 sector);
1115 crypt_inc_pending(new_io); 839 crypt_inc_pending(new_io);
1116 crypt_convert_init(cc, &new_io->ctx, NULL, 840 crypt_convert_init(cc, &new_io->ctx, NULL,
@@ -1137,14 +861,17 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1137 crypt_dec_pending(io); 861 crypt_dec_pending(io);
1138} 862}
1139 863
1140static void kcryptd_crypt_read_done(struct dm_crypt_io *io) 864static void kcryptd_crypt_read_done(struct dm_crypt_io *io, int error)
1141{ 865{
866 if (unlikely(error < 0))
867 io->error = -EIO;
868
1142 crypt_dec_pending(io); 869 crypt_dec_pending(io);
1143} 870}
1144 871
1145static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) 872static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
1146{ 873{
1147 struct crypt_config *cc = io->cc; 874 struct crypt_config *cc = io->target->private;
1148 int r = 0; 875 int r = 0;
1149 876
1150 crypt_inc_pending(io); 877 crypt_inc_pending(io);
@@ -1153,11 +880,9 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
1153 io->sector); 880 io->sector);
1154 881
1155 r = crypt_convert(cc, &io->ctx); 882 r = crypt_convert(cc, &io->ctx);
1156 if (r < 0)
1157 io->error = -EIO;
1158 883
1159 if (atomic_dec_and_test(&io->ctx.cc_pending)) 884 if (atomic_dec_and_test(&io->ctx.pending))
1160 kcryptd_crypt_read_done(io); 885 kcryptd_crypt_read_done(io, r);
1161 886
1162 crypt_dec_pending(io); 887 crypt_dec_pending(io);
1163} 888}
@@ -1168,28 +893,22 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
1168 struct dm_crypt_request *dmreq = async_req->data; 893 struct dm_crypt_request *dmreq = async_req->data;
1169 struct convert_context *ctx = dmreq->ctx; 894 struct convert_context *ctx = dmreq->ctx;
1170 struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); 895 struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
1171 struct crypt_config *cc = io->cc; 896 struct crypt_config *cc = io->target->private;
1172 897
1173 if (error == -EINPROGRESS) { 898 if (error == -EINPROGRESS) {
1174 complete(&ctx->restart); 899 complete(&ctx->restart);
1175 return; 900 return;
1176 } 901 }
1177 902
1178 if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
1179 error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
1180
1181 if (error < 0)
1182 io->error = -EIO;
1183
1184 mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); 903 mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
1185 904
1186 if (!atomic_dec_and_test(&ctx->cc_pending)) 905 if (!atomic_dec_and_test(&ctx->pending))
1187 return; 906 return;
1188 907
1189 if (bio_data_dir(io->base_bio) == READ) 908 if (bio_data_dir(io->base_bio) == READ)
1190 kcryptd_crypt_read_done(io); 909 kcryptd_crypt_read_done(io, error);
1191 else 910 else
1192 kcryptd_crypt_write_io_submit(io, 1); 911 kcryptd_crypt_write_io_submit(io, error, 1);
1193} 912}
1194 913
1195static void kcryptd_crypt(struct work_struct *work) 914static void kcryptd_crypt(struct work_struct *work)
@@ -1204,7 +923,7 @@ static void kcryptd_crypt(struct work_struct *work)
1204 923
1205static void kcryptd_queue_crypt(struct dm_crypt_io *io) 924static void kcryptd_queue_crypt(struct dm_crypt_io *io)
1206{ 925{
1207 struct crypt_config *cc = io->cc; 926 struct crypt_config *cc = io->target->private;
1208 927
1209 INIT_WORK(&io->work, kcryptd_crypt); 928 INIT_WORK(&io->work, kcryptd_crypt);
1210 queue_work(cc->crypt_queue, &io->work); 929 queue_work(cc->crypt_queue, &io->work);
@@ -1216,6 +935,7 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io)
1216static int crypt_decode_key(u8 *key, char *hex, unsigned int size) 935static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
1217{ 936{
1218 char buffer[3]; 937 char buffer[3];
938 char *endp;
1219 unsigned int i; 939 unsigned int i;
1220 940
1221 buffer[2] = '\0'; 941 buffer[2] = '\0';
@@ -1224,7 +944,9 @@ static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
1224 buffer[0] = *hex++; 944 buffer[0] = *hex++;
1225 buffer[1] = *hex++; 945 buffer[1] = *hex++;
1226 946
1227 if (kstrtou8(buffer, 16, &key[i])) 947 key[i] = (u8)simple_strtoul(buffer, &endp, 16);
948
949 if (endp != &buffer[2])
1228 return -EINVAL; 950 return -EINVAL;
1229 } 951 }
1230 952
@@ -1248,101 +970,34 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
1248 } 970 }
1249} 971}
1250 972
1251static void crypt_free_tfms(struct crypt_config *cc)
1252{
1253 unsigned i;
1254
1255 if (!cc->tfms)
1256 return;
1257
1258 for (i = 0; i < cc->tfms_count; i++)
1259 if (cc->tfms[i] && !IS_ERR(cc->tfms[i])) {
1260 crypto_free_ablkcipher(cc->tfms[i]);
1261 cc->tfms[i] = NULL;
1262 }
1263
1264 kfree(cc->tfms);
1265 cc->tfms = NULL;
1266}
1267
1268static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
1269{
1270 unsigned i;
1271 int err;
1272
1273 cc->tfms = kmalloc(cc->tfms_count * sizeof(struct crypto_ablkcipher *),
1274 GFP_KERNEL);
1275 if (!cc->tfms)
1276 return -ENOMEM;
1277
1278 for (i = 0; i < cc->tfms_count; i++) {
1279 cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0);
1280 if (IS_ERR(cc->tfms[i])) {
1281 err = PTR_ERR(cc->tfms[i]);
1282 crypt_free_tfms(cc);
1283 return err;
1284 }
1285 }
1286
1287 return 0;
1288}
1289
1290static int crypt_setkey_allcpus(struct crypt_config *cc)
1291{
1292 unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count);
1293 int err = 0, i, r;
1294
1295 for (i = 0; i < cc->tfms_count; i++) {
1296 r = crypto_ablkcipher_setkey(cc->tfms[i],
1297 cc->key + (i * subkey_size),
1298 subkey_size);
1299 if (r)
1300 err = r;
1301 }
1302
1303 return err;
1304}
1305
1306static int crypt_set_key(struct crypt_config *cc, char *key) 973static int crypt_set_key(struct crypt_config *cc, char *key)
1307{ 974{
1308 int r = -EINVAL;
1309 int key_string_len = strlen(key);
1310
1311 /* The key size may not be changed. */ 975 /* The key size may not be changed. */
1312 if (cc->key_size != (key_string_len >> 1)) 976 if (cc->key_size != (strlen(key) >> 1))
1313 goto out; 977 return -EINVAL;
1314 978
1315 /* Hyphen (which gives a key_size of zero) means there is no key. */ 979 /* Hyphen (which gives a key_size of zero) means there is no key. */
1316 if (!cc->key_size && strcmp(key, "-")) 980 if (!cc->key_size && strcmp(key, "-"))
1317 goto out; 981 return -EINVAL;
1318 982
1319 if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0) 983 if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0)
1320 goto out; 984 return -EINVAL;
1321 985
1322 set_bit(DM_CRYPT_KEY_VALID, &cc->flags); 986 set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
1323 987
1324 r = crypt_setkey_allcpus(cc); 988 return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
1325
1326out:
1327 /* Hex key string not needed after here, so wipe it. */
1328 memset(key, '0', key_string_len);
1329
1330 return r;
1331} 989}
1332 990
1333static int crypt_wipe_key(struct crypt_config *cc) 991static int crypt_wipe_key(struct crypt_config *cc)
1334{ 992{
1335 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); 993 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
1336 memset(&cc->key, 0, cc->key_size * sizeof(u8)); 994 memset(&cc->key, 0, cc->key_size * sizeof(u8));
1337 995 return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
1338 return crypt_setkey_allcpus(cc);
1339} 996}
1340 997
1341static void crypt_dtr(struct dm_target *ti) 998static void crypt_dtr(struct dm_target *ti)
1342{ 999{
1343 struct crypt_config *cc = ti->private; 1000 struct crypt_config *cc = ti->private;
1344 struct crypt_cpu *cpu_cc;
1345 int cpu;
1346 1001
1347 ti->private = NULL; 1002 ti->private = NULL;
1348 1003
@@ -1354,15 +1009,6 @@ static void crypt_dtr(struct dm_target *ti)
1354 if (cc->crypt_queue) 1009 if (cc->crypt_queue)
1355 destroy_workqueue(cc->crypt_queue); 1010 destroy_workqueue(cc->crypt_queue);
1356 1011
1357 if (cc->cpu)
1358 for_each_possible_cpu(cpu) {
1359 cpu_cc = per_cpu_ptr(cc->cpu, cpu);
1360 if (cpu_cc->req)
1361 mempool_free(cpu_cc->req, cc->req_pool);
1362 }
1363
1364 crypt_free_tfms(cc);
1365
1366 if (cc->bs) 1012 if (cc->bs)
1367 bioset_free(cc->bs); 1013 bioset_free(cc->bs);
1368 1014
@@ -1376,12 +1022,12 @@ static void crypt_dtr(struct dm_target *ti)
1376 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) 1022 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
1377 cc->iv_gen_ops->dtr(cc); 1023 cc->iv_gen_ops->dtr(cc);
1378 1024
1025 if (cc->tfm && !IS_ERR(cc->tfm))
1026 crypto_free_ablkcipher(cc->tfm);
1027
1379 if (cc->dev) 1028 if (cc->dev)
1380 dm_put_device(ti, cc->dev); 1029 dm_put_device(ti, cc->dev);
1381 1030
1382 if (cc->cpu)
1383 free_percpu(cc->cpu);
1384
1385 kzfree(cc->cipher); 1031 kzfree(cc->cipher);
1386 kzfree(cc->cipher_string); 1032 kzfree(cc->cipher_string);
1387 1033
@@ -1393,10 +1039,9 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1393 char *cipher_in, char *key) 1039 char *cipher_in, char *key)
1394{ 1040{
1395 struct crypt_config *cc = ti->private; 1041 struct crypt_config *cc = ti->private;
1396 char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; 1042 char *tmp, *cipher, *chainmode, *ivmode, *ivopts;
1397 char *cipher_api = NULL; 1043 char *cipher_api = NULL;
1398 int ret = -EINVAL; 1044 int ret = -EINVAL;
1399 char dummy;
1400 1045
1401 /* Convert to crypto api definition? */ 1046 /* Convert to crypto api definition? */
1402 if (strchr(cipher_in, '(')) { 1047 if (strchr(cipher_in, '(')) {
@@ -1410,20 +1055,10 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1410 1055
1411 /* 1056 /*
1412 * Legacy dm-crypt cipher specification 1057 * Legacy dm-crypt cipher specification
1413 * cipher[:keycount]-mode-iv:ivopts 1058 * cipher-mode-iv:ivopts
1414 */ 1059 */
1415 tmp = cipher_in; 1060 tmp = cipher_in;
1416 keycount = strsep(&tmp, "-"); 1061 cipher = strsep(&tmp, "-");
1417 cipher = strsep(&keycount, ":");
1418
1419 if (!keycount)
1420 cc->tfms_count = 1;
1421 else if (sscanf(keycount, "%u%c", &cc->tfms_count, &dummy) != 1 ||
1422 !is_power_of_2(cc->tfms_count)) {
1423 ti->error = "Bad cipher key count specification";
1424 return -EINVAL;
1425 }
1426 cc->key_parts = cc->tfms_count;
1427 1062
1428 cc->cipher = kstrdup(cipher, GFP_KERNEL); 1063 cc->cipher = kstrdup(cipher, GFP_KERNEL);
1429 if (!cc->cipher) 1064 if (!cc->cipher)
@@ -1436,13 +1071,6 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1436 if (tmp) 1071 if (tmp)
1437 DMWARN("Ignoring unexpected additional cipher options"); 1072 DMWARN("Ignoring unexpected additional cipher options");
1438 1073
1439 cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)),
1440 __alignof__(struct crypt_cpu));
1441 if (!cc->cpu) {
1442 ti->error = "Cannot allocate per cpu state";
1443 goto bad_mem;
1444 }
1445
1446 /* 1074 /*
1447 * For compatibility with the original dm-crypt mapping format, if 1075 * For compatibility with the original dm-crypt mapping format, if
1448 * only the cipher name is supplied, use cbc-plain. 1076 * only the cipher name is supplied, use cbc-plain.
@@ -1469,8 +1097,9 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1469 } 1097 }
1470 1098
1471 /* Allocate cipher */ 1099 /* Allocate cipher */
1472 ret = crypt_alloc_tfms(cc, cipher_api); 1100 cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0);
1473 if (ret < 0) { 1101 if (IS_ERR(cc->tfm)) {
1102 ret = PTR_ERR(cc->tfm);
1474 ti->error = "Error allocating crypto tfm"; 1103 ti->error = "Error allocating crypto tfm";
1475 goto bad; 1104 goto bad;
1476 } 1105 }
@@ -1483,7 +1112,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1483 } 1112 }
1484 1113
1485 /* Initialize IV */ 1114 /* Initialize IV */
1486 cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc)); 1115 cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm);
1487 if (cc->iv_size) 1116 if (cc->iv_size)
1488 /* at least a 64 bit sector number should fit in our buffer */ 1117 /* at least a 64 bit sector number should fit in our buffer */
1489 cc->iv_size = max(cc->iv_size, 1118 cc->iv_size = max(cc->iv_size,
@@ -1506,15 +1135,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1506 cc->iv_gen_ops = &crypt_iv_benbi_ops; 1135 cc->iv_gen_ops = &crypt_iv_benbi_ops;
1507 else if (strcmp(ivmode, "null") == 0) 1136 else if (strcmp(ivmode, "null") == 0)
1508 cc->iv_gen_ops = &crypt_iv_null_ops; 1137 cc->iv_gen_ops = &crypt_iv_null_ops;
1509 else if (strcmp(ivmode, "lmk") == 0) { 1138 else {
1510 cc->iv_gen_ops = &crypt_iv_lmk_ops;
1511 /* Version 2 and 3 is recognised according
1512 * to length of provided multi-key string.
1513 * If present (version 3), last key is used as IV seed.
1514 */
1515 if (cc->key_size % cc->key_parts)
1516 cc->key_parts++;
1517 } else {
1518 ret = -EINVAL; 1139 ret = -EINVAL;
1519 ti->error = "Invalid IV mode"; 1140 ti->error = "Invalid IV mode";
1520 goto bad; 1141 goto bad;
@@ -1560,7 +1181,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1560 int ret; 1181 int ret;
1561 struct dm_arg_set as; 1182 struct dm_arg_set as;
1562 const char *opt_string; 1183 const char *opt_string;
1563 char dummy;
1564 1184
1565 static struct dm_arg _args[] = { 1185 static struct dm_arg _args[] = {
1566 {0, 1, "Invalid number of feature args"}, 1186 {0, 1, "Invalid number of feature args"},
@@ -1593,9 +1213,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1593 } 1213 }
1594 1214
1595 cc->dmreq_start = sizeof(struct ablkcipher_request); 1215 cc->dmreq_start = sizeof(struct ablkcipher_request);
1596 cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc)); 1216 cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm);
1597 cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); 1217 cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment());
1598 cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) & 1218 cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) &
1599 ~(crypto_tfm_ctx_alignment() - 1); 1219 ~(crypto_tfm_ctx_alignment() - 1);
1600 1220
1601 cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + 1221 cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
@@ -1604,6 +1224,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1604 ti->error = "Cannot allocate crypt request mempool"; 1224 ti->error = "Cannot allocate crypt request mempool";
1605 goto bad; 1225 goto bad;
1606 } 1226 }
1227 cc->req = NULL;
1607 1228
1608 cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); 1229 cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
1609 if (!cc->page_pool) { 1230 if (!cc->page_pool) {
@@ -1618,7 +1239,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1618 } 1239 }
1619 1240
1620 ret = -EINVAL; 1241 ret = -EINVAL;
1621 if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) { 1242 if (sscanf(argv[2], "%llu", &tmpll) != 1) {
1622 ti->error = "Invalid iv_offset sector"; 1243 ti->error = "Invalid iv_offset sector";
1623 goto bad; 1244 goto bad;
1624 } 1245 }
@@ -1629,7 +1250,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1629 goto bad; 1250 goto bad;
1630 } 1251 }
1631 1252
1632 if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) { 1253 if (sscanf(argv[4], "%llu", &tmpll) != 1) {
1633 ti->error = "Invalid device sector"; 1254 ti->error = "Invalid device sector";
1634 goto bad; 1255 goto bad;
1635 } 1256 }
@@ -1660,27 +1281,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1660 } 1281 }
1661 1282
1662 ret = -ENOMEM; 1283 ret = -ENOMEM;
1663 cc->io_queue = alloc_workqueue("kcryptd_io", 1284 cc->io_queue = create_singlethread_workqueue("kcryptd_io");
1664 WQ_NON_REENTRANT|
1665 WQ_MEM_RECLAIM,
1666 1);
1667 if (!cc->io_queue) { 1285 if (!cc->io_queue) {
1668 ti->error = "Couldn't create kcryptd io queue"; 1286 ti->error = "Couldn't create kcryptd io queue";
1669 goto bad; 1287 goto bad;
1670 } 1288 }
1671 1289
1672 cc->crypt_queue = alloc_workqueue("kcryptd", 1290 cc->crypt_queue = create_singlethread_workqueue("kcryptd");
1673 WQ_NON_REENTRANT|
1674 WQ_CPU_INTENSIVE|
1675 WQ_MEM_RECLAIM,
1676 1);
1677 if (!cc->crypt_queue) { 1291 if (!cc->crypt_queue) {
1678 ti->error = "Couldn't create kcryptd queue"; 1292 ti->error = "Couldn't create kcryptd queue";
1679 goto bad; 1293 goto bad;
1680 } 1294 }
1681 1295
1682 ti->num_flush_requests = 1; 1296 ti->num_flush_requests = 1;
1683 ti->discard_zeroes_data_unsupported = true; 1297 ti->discard_zeroes_data_unsupported = 1;
1684 1298
1685 return 0; 1299 return 0;
1686 1300
@@ -1689,10 +1303,11 @@ bad:
1689 return ret; 1303 return ret;
1690} 1304}
1691 1305
1692static int crypt_map(struct dm_target *ti, struct bio *bio) 1306static int crypt_map(struct dm_target *ti, struct bio *bio,
1307 union map_info *map_context)
1693{ 1308{
1694 struct dm_crypt_io *io; 1309 struct dm_crypt_io *io;
1695 struct crypt_config *cc = ti->private; 1310 struct crypt_config *cc;
1696 1311
1697 /* 1312 /*
1698 * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues. 1313 * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues.
@@ -1700,25 +1315,25 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
1700 * - for REQ_DISCARD caller must use flush if IO ordering matters 1315 * - for REQ_DISCARD caller must use flush if IO ordering matters
1701 */ 1316 */
1702 if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) { 1317 if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) {
1318 cc = ti->private;
1703 bio->bi_bdev = cc->dev->bdev; 1319 bio->bi_bdev = cc->dev->bdev;
1704 if (bio_sectors(bio)) 1320 if (bio_sectors(bio))
1705 bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector); 1321 bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector);
1706 return DM_MAPIO_REMAPPED; 1322 return DM_MAPIO_REMAPPED;
1707 } 1323 }
1708 1324
1709 io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_sector)); 1325 io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector));
1710 1326
1711 if (bio_data_dir(io->base_bio) == READ) { 1327 if (bio_data_dir(io->base_bio) == READ)
1712 if (kcryptd_io_read(io, GFP_NOWAIT)) 1328 kcryptd_queue_io(io);
1713 kcryptd_queue_io(io); 1329 else
1714 } else
1715 kcryptd_queue_crypt(io); 1330 kcryptd_queue_crypt(io);
1716 1331
1717 return DM_MAPIO_SUBMITTED; 1332 return DM_MAPIO_SUBMITTED;
1718} 1333}
1719 1334
1720static int crypt_status(struct dm_target *ti, status_type_t type, 1335static int crypt_status(struct dm_target *ti, status_type_t type,
1721 unsigned status_flags, char *result, unsigned maxlen) 1336 char *result, unsigned int maxlen)
1722{ 1337{
1723 struct crypt_config *cc = ti->private; 1338 struct crypt_config *cc = ti->private;
1724 unsigned int sz = 0; 1339 unsigned int sz = 0;
@@ -1845,7 +1460,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
1845 1460
1846static struct target_type crypt_target = { 1461static struct target_type crypt_target = {
1847 .name = "crypt", 1462 .name = "crypt",
1848 .version = {1, 12, 0}, 1463 .version = {1, 8, 0},
1849 .module = THIS_MODULE, 1464 .module = THIS_MODULE,
1850 .ctr = crypt_ctr, 1465 .ctr = crypt_ctr,
1851 .dtr = crypt_dtr, 1466 .dtr = crypt_dtr,
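
The constructor hunks above (and the matching ones in dm-delay, dm-flakey, dm-linear, dm-log, dm-mpath and dm-queue-length further down) differ only in how they parse numeric table arguments: one variant uses a bare "%llu" conversion, the other appends "%c" and a dummy char so that any trailing garbage makes sscanf() return 2 instead of 1 and the argument is rejected. A minimal userspace sketch of that idiom follows; it is a standalone illustration, not the kernel code, and parse_sector is an invented helper name.

#include <stdio.h>

/*
 * Return 0 and store the value on success, -1 if the string is not a
 * clean unsigned number (empty, non-numeric, or followed by junk).
 */
static int parse_sector(const char *arg, unsigned long long *out)
{
        unsigned long long tmpll;
        char dummy;

        /* A second successful conversion means trailing characters. */
        if (sscanf(arg, "%llu%c", &tmpll, &dummy) != 1)
                return -1;

        *out = tmpll;
        return 0;
}

int main(void)
{
        unsigned long long s;

        printf("\"1024\"  -> %d\n", parse_sector("1024", &s));  /* 0, accepted  */
        printf("\"1024x\" -> %d\n", parse_sector("1024x", &s)); /* -1, rejected */
        printf("\"\"      -> %d\n", parse_sector("", &s));      /* -1, rejected */
        return 0;
}
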
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index cc1bd048acb..f18375dcedd 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -131,7 +131,6 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
131{ 131{
132 struct delay_c *dc; 132 struct delay_c *dc;
133 unsigned long long tmpll; 133 unsigned long long tmpll;
134 char dummy;
135 134
136 if (argc != 3 && argc != 6) { 135 if (argc != 3 && argc != 6) {
137 ti->error = "requires exactly 3 or 6 arguments"; 136 ti->error = "requires exactly 3 or 6 arguments";
@@ -146,13 +145,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
146 145
147 dc->reads = dc->writes = 0; 146 dc->reads = dc->writes = 0;
148 147
149 if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) { 148 if (sscanf(argv[1], "%llu", &tmpll) != 1) {
150 ti->error = "Invalid device sector"; 149 ti->error = "Invalid device sector";
151 goto bad; 150 goto bad;
152 } 151 }
153 dc->start_read = tmpll; 152 dc->start_read = tmpll;
154 153
155 if (sscanf(argv[2], "%u%c", &dc->read_delay, &dummy) != 1) { 154 if (sscanf(argv[2], "%u", &dc->read_delay) != 1) {
156 ti->error = "Invalid delay"; 155 ti->error = "Invalid delay";
157 goto bad; 156 goto bad;
158 } 157 }
@@ -167,13 +166,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
167 if (argc == 3) 166 if (argc == 3)
168 goto out; 167 goto out;
169 168
170 if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) { 169 if (sscanf(argv[4], "%llu", &tmpll) != 1) {
171 ti->error = "Invalid write device sector"; 170 ti->error = "Invalid write device sector";
172 goto bad_dev_read; 171 goto bad_dev_read;
173 } 172 }
174 dc->start_write = tmpll; 173 dc->start_write = tmpll;
175 174
176 if (sscanf(argv[5], "%u%c", &dc->write_delay, &dummy) != 1) { 175 if (sscanf(argv[5], "%u", &dc->write_delay) != 1) {
177 ti->error = "Invalid write delay"; 176 ti->error = "Invalid write delay";
178 goto bad_dev_read; 177 goto bad_dev_read;
179 } 178 }
@@ -274,7 +273,8 @@ static void delay_resume(struct dm_target *ti)
274 atomic_set(&dc->may_delay, 1); 273 atomic_set(&dc->may_delay, 1);
275} 274}
276 275
277static int delay_map(struct dm_target *ti, struct bio *bio) 276static int delay_map(struct dm_target *ti, struct bio *bio,
277 union map_info *map_context)
278{ 278{
279 struct delay_c *dc = ti->private; 279 struct delay_c *dc = ti->private;
280 280
@@ -294,7 +294,7 @@ static int delay_map(struct dm_target *ti, struct bio *bio)
294} 294}
295 295
296static int delay_status(struct dm_target *ti, status_type_t type, 296static int delay_status(struct dm_target *ti, status_type_t type,
297 unsigned status_flags, char *result, unsigned maxlen) 297 char *result, unsigned maxlen)
298{ 298{
299 struct delay_c *dc = ti->private; 299 struct delay_c *dc = ti->private;
300 int sz = 0; 300 int sz = 0;
@@ -337,7 +337,7 @@ out:
337 337
338static struct target_type delay_target = { 338static struct target_type delay_target = {
339 .name = "delay", 339 .name = "delay",
340 .version = {1, 2, 0}, 340 .version = {1, 1, 0},
341 .module = THIS_MODULE, 341 .module = THIS_MODULE,
342 .ctr = delay_ctr, 342 .ctr = delay_ctr,
343 .dtr = delay_dtr, 343 .dtr = delay_dtr,
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index ebaa4f803ee..0bdb201c2c2 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -11,7 +11,6 @@
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/vmalloc.h> 13#include <linux/vmalloc.h>
14#include <linux/module.h>
15#include <linux/slab.h> 14#include <linux/slab.h>
16 15
17#define DM_MSG_PREFIX "snapshot exception stores" 16#define DM_MSG_PREFIX "snapshot exception stores"
@@ -142,19 +141,24 @@ EXPORT_SYMBOL(dm_exception_store_type_unregister);
142static int set_chunk_size(struct dm_exception_store *store, 141static int set_chunk_size(struct dm_exception_store *store,
143 const char *chunk_size_arg, char **error) 142 const char *chunk_size_arg, char **error)
144{ 143{
145 unsigned chunk_size; 144 unsigned long chunk_size_ulong;
145 char *value;
146 146
147 if (kstrtouint(chunk_size_arg, 10, &chunk_size)) { 147 chunk_size_ulong = simple_strtoul(chunk_size_arg, &value, 10);
148 if (*chunk_size_arg == '\0' || *value != '\0' ||
149 chunk_size_ulong > UINT_MAX) {
148 *error = "Invalid chunk size"; 150 *error = "Invalid chunk size";
149 return -EINVAL; 151 return -EINVAL;
150 } 152 }
151 153
152 if (!chunk_size) { 154 if (!chunk_size_ulong) {
153 store->chunk_size = store->chunk_mask = store->chunk_shift = 0; 155 store->chunk_size = store->chunk_mask = store->chunk_shift = 0;
154 return 0; 156 return 0;
155 } 157 }
156 158
157 return dm_exception_store_set_chunk_size(store, chunk_size, error); 159 return dm_exception_store_set_chunk_size(store,
160 (unsigned) chunk_size_ulong,
161 error);
158} 162}
159 163
160int dm_exception_store_set_chunk_size(struct dm_exception_store *store, 164int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
@@ -278,7 +282,7 @@ int dm_exception_store_init(void)
278 return 0; 282 return 0;
279 283
280persistent_fail: 284persistent_fail:
281 dm_transient_snapshot_exit(); 285 dm_persistent_snapshot_exit();
282transient_fail: 286transient_fail:
283 return r; 287 return r;
284} 288}
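
The set_chunk_size() hunk shows two ways of turning the chunk-size string into an unsigned int: kstrtouint() on one side, and simple_strtoul() plus explicit checks for an empty string, trailing characters and overflow on the other. A rough userspace equivalent of the second pattern, built on strtoul(), is sketched below; the kernel helpers differ in detail (kstrtouint, for instance, rejects leading whitespace that strtoul accepts).

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Parse a decimal unsigned int, rejecting empty input, trailing junk and
 * values that do not fit. Returns 0 on success, -EINVAL otherwise.
 */
static int parse_chunk_size(const char *arg, unsigned int *out)
{
        char *end;
        unsigned long val;

        errno = 0;
        val = strtoul(arg, &end, 10);

        if (arg[0] == '\0' || *end != '\0')     /* empty or trailing characters */
                return -EINVAL;
        if (errno == ERANGE || val > UINT_MAX)  /* out of range for unsigned int */
                return -EINVAL;

        *out = (unsigned int)val;
        return 0;
}

int main(void)
{
        unsigned int cs;

        printf("\"512\"  -> %d\n", parse_chunk_size("512", &cs));   /* 0 */
        printf("\"512k\" -> %d\n", parse_chunk_size("512k", &cs));  /* -EINVAL */
        return 0;
}
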
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 9721f2ffb1a..f84c08029b2 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -39,10 +39,6 @@ enum feature_flag_bits {
39 DROP_WRITES 39 DROP_WRITES
40}; 40};
41 41
42struct per_bio_data {
43 bool bio_submitted;
44};
45
46static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, 42static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
47 struct dm_target *ti) 43 struct dm_target *ti)
48{ 44{
@@ -164,7 +160,6 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
164 unsigned long long tmpll; 160 unsigned long long tmpll;
165 struct dm_arg_set as; 161 struct dm_arg_set as;
166 const char *devname; 162 const char *devname;
167 char dummy;
168 163
169 as.argc = argc; 164 as.argc = argc;
170 as.argv = argv; 165 as.argv = argv;
@@ -183,7 +178,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
183 178
184 devname = dm_shift_arg(&as); 179 devname = dm_shift_arg(&as);
185 180
186 if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1) { 181 if (sscanf(dm_shift_arg(&as), "%llu", &tmpll) != 1) {
187 ti->error = "Invalid device sector"; 182 ti->error = "Invalid device sector";
188 goto bad; 183 goto bad;
189 } 184 }
@@ -218,7 +213,6 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
218 213
219 ti->num_flush_requests = 1; 214 ti->num_flush_requests = 1;
220 ti->num_discard_requests = 1; 215 ti->num_discard_requests = 1;
221 ti->per_bio_data_size = sizeof(struct per_bio_data);
222 ti->private = fc; 216 ti->private = fc;
223 return 0; 217 return 0;
224 218
@@ -270,12 +264,11 @@ static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
270 } 264 }
271} 265}
272 266
273static int flakey_map(struct dm_target *ti, struct bio *bio) 267static int flakey_map(struct dm_target *ti, struct bio *bio,
268 union map_info *map_context)
274{ 269{
275 struct flakey_c *fc = ti->private; 270 struct flakey_c *fc = ti->private;
276 unsigned elapsed; 271 unsigned elapsed;
277 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
278 pb->bio_submitted = false;
279 272
280 /* Are we alive ? */ 273 /* Are we alive ? */
281 elapsed = (jiffies - fc->start_time) / HZ; 274 elapsed = (jiffies - fc->start_time) / HZ;
@@ -283,7 +276,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
283 /* 276 /*
284 * Flag this bio as submitted while down. 277 * Flag this bio as submitted while down.
285 */ 278 */
286 pb->bio_submitted = true; 279 map_context->ll = 1;
287 280
288 /* 281 /*
289 * Map reads as normal. 282 * Map reads as normal.
@@ -320,16 +313,17 @@ map_bio:
320 return DM_MAPIO_REMAPPED; 313 return DM_MAPIO_REMAPPED;
321} 314}
322 315
323static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error) 316static int flakey_end_io(struct dm_target *ti, struct bio *bio,
317 int error, union map_info *map_context)
324{ 318{
325 struct flakey_c *fc = ti->private; 319 struct flakey_c *fc = ti->private;
326 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); 320 unsigned bio_submitted_while_down = map_context->ll;
327 321
328 /* 322 /*
329 * Corrupt successful READs while in down state. 323 * Corrupt successful READs while in down state.
330 * If flags were specified, only corrupt those that match. 324 * If flags were specified, only corrupt those that match.
331 */ 325 */
332 if (fc->corrupt_bio_byte && !error && pb->bio_submitted && 326 if (!error && bio_submitted_while_down &&
333 (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) && 327 (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) &&
334 all_corrupt_bio_flags_match(bio, fc)) 328 all_corrupt_bio_flags_match(bio, fc))
335 corrupt_bio_data(bio, fc); 329 corrupt_bio_data(bio, fc);
@@ -338,7 +332,7 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error)
338} 332}
339 333
340static int flakey_status(struct dm_target *ti, status_type_t type, 334static int flakey_status(struct dm_target *ti, status_type_t type,
341 unsigned status_flags, char *result, unsigned maxlen) 335 char *result, unsigned int maxlen)
342{ 336{
343 unsigned sz = 0; 337 unsigned sz = 0;
344 struct flakey_c *fc = ti->private; 338 struct flakey_c *fc = ti->private;
@@ -374,17 +368,8 @@ static int flakey_status(struct dm_target *ti, status_type_t type,
374static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg) 368static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg)
375{ 369{
376 struct flakey_c *fc = ti->private; 370 struct flakey_c *fc = ti->private;
377 struct dm_dev *dev = fc->dev;
378 int r = 0;
379
380 /*
381 * Only pass ioctls through if the device sizes match exactly.
382 */
383 if (fc->start ||
384 ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
385 r = scsi_verify_blk_ioctl(NULL, cmd);
386 371
387 return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg); 372 return __blkdev_driver_ioctl(fc->dev->bdev, fc->dev->mode, cmd, arg);
388} 373}
389 374
390static int flakey_merge(struct dm_target *ti, struct bvec_merge_data *bvm, 375static int flakey_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
@@ -411,7 +396,7 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_
411 396
412static struct target_type flakey_target = { 397static struct target_type flakey_target = {
413 .name = "flakey", 398 .name = "flakey",
414 .version = {1, 3, 0}, 399 .version = {1, 2, 0},
415 .module = THIS_MODULE, 400 .module = THIS_MODULE,
416 .ctr = flakey_ctr, 401 .ctr = flakey_ctr,
417 .dtr = flakey_dtr, 402 .dtr = flakey_dtr,
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index ea49834377c..ad2eba40e31 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -249,6 +249,16 @@ static void vm_dp_init(struct dpages *dp, void *data)
249 dp->context_ptr = data; 249 dp->context_ptr = data;
250} 250}
251 251
252static void dm_bio_destructor(struct bio *bio)
253{
254 unsigned region;
255 struct io *io;
256
257 retrieve_io_and_region_from_bio(bio, &io, &region);
258
259 bio_free(bio, io->client->bios);
260}
261
252/* 262/*
253 * Functions for getting the pages from kernel memory. 263 * Functions for getting the pages from kernel memory.
254 */ 264 */
@@ -286,9 +296,6 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
286 unsigned offset; 296 unsigned offset;
287 unsigned num_bvecs; 297 unsigned num_bvecs;
288 sector_t remaining = where->count; 298 sector_t remaining = where->count;
289 struct request_queue *q = bdev_get_queue(where->bdev);
290 unsigned short logical_block_size = queue_logical_block_size(q);
291 sector_t num_sectors;
292 299
293 /* 300 /*
294 * where->count may be zero if rw holds a flush and we need to 301 * where->count may be zero if rw holds a flush and we need to
@@ -298,38 +305,20 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
298 /* 305 /*
299 * Allocate a suitably sized-bio. 306 * Allocate a suitably sized-bio.
300 */ 307 */
301 if ((rw & REQ_DISCARD) || (rw & REQ_WRITE_SAME)) 308 num_bvecs = dm_sector_div_up(remaining,
302 num_bvecs = 1; 309 (PAGE_SIZE >> SECTOR_SHIFT));
303 else 310 num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs);
304 num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev),
305 dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
306
307 bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); 311 bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
308 bio->bi_sector = where->sector + (where->count - remaining); 312 bio->bi_sector = where->sector + (where->count - remaining);
309 bio->bi_bdev = where->bdev; 313 bio->bi_bdev = where->bdev;
310 bio->bi_end_io = endio; 314 bio->bi_end_io = endio;
315 bio->bi_destructor = dm_bio_destructor;
311 store_io_and_region_in_bio(bio, io, region); 316 store_io_and_region_in_bio(bio, io, region);
312 317
313 if (rw & REQ_DISCARD) { 318 /*
314 num_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining); 319 * Try and add as many pages as possible.
315 bio->bi_size = num_sectors << SECTOR_SHIFT; 320 */
316 remaining -= num_sectors; 321 while (remaining) {
317 } else if (rw & REQ_WRITE_SAME) {
318 /*
319 * WRITE SAME only uses a single page.
320 */
321 dp->get_page(dp, &page, &len, &offset);
322 bio_add_page(bio, page, logical_block_size, offset);
323 num_sectors = min_t(sector_t, q->limits.max_write_same_sectors, remaining);
324 bio->bi_size = num_sectors << SECTOR_SHIFT;
325
326 offset = 0;
327 remaining -= num_sectors;
328 dp->next_page(dp);
329 } else while (remaining) {
330 /*
331 * Try and add as many pages as possible.
332 */
333 dp->get_page(dp, &page, &len, &offset); 322 dp->get_page(dp, &page, &len, &offset);
334 len = min(len, to_bytes(remaining)); 323 len = min(len, to_bytes(remaining));
335 if (!bio_add_page(bio, page, len, offset)) 324 if (!bio_add_page(bio, page, len, offset))
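
The do_region() hunk sizes each bio by how many bio_vecs the remaining transfer needs: the outstanding sector count is divided, rounding up, by the number of sectors per page and then clamped to what bio_get_nr_vecs() reports, with one variant special-casing REQ_DISCARD and REQ_WRITE_SAME to a single vec. The arithmetic itself is easy to show standalone; the sketch below assumes 4 KiB pages and 512-byte sectors, and max_vecs stands in for the queue limit.

#include <stdio.h>

#define SECTOR_SHIFT     9                               /* 512-byte sectors */
#define PAGE_SIZE        4096UL
#define SECTORS_PER_PAGE (PAGE_SIZE >> SECTOR_SHIFT)     /* 8 */

/* Round-up division, like dm_sector_div_up(). */
static unsigned long div_up(unsigned long n, unsigned long d)
{
        return (n + d - 1) / d;
}

/* Vecs needed for 'remaining' sectors, clamped to the queue's limit. */
static unsigned num_bvecs(unsigned long remaining, unsigned max_vecs)
{
        unsigned long want = div_up(remaining, SECTORS_PER_PAGE);

        return want < max_vecs ? (unsigned)want : max_vecs;
}

int main(void)
{
        printf("%u\n", num_bvecs(2048, 128));   /* 1 MiB: 256 pages, clamped to 128 */
        printf("%u\n", num_bvecs(20, 128));     /* 20 sectors: 3 pages */
        return 0;
}
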
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 0666b5d14b8..2e9a3ca37bd 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -880,7 +880,6 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
880 struct hd_geometry geometry; 880 struct hd_geometry geometry;
881 unsigned long indata[4]; 881 unsigned long indata[4];
882 char *geostr = (char *) param + param->data_start; 882 char *geostr = (char *) param + param->data_start;
883 char dummy;
884 883
885 md = find_device(param); 884 md = find_device(param);
886 if (!md) 885 if (!md)
@@ -892,8 +891,8 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
892 goto out; 891 goto out;
893 } 892 }
894 893
895 x = sscanf(geostr, "%lu %lu %lu %lu%c", indata, 894 x = sscanf(geostr, "%lu %lu %lu %lu", indata,
896 indata + 1, indata + 2, indata + 3, &dummy); 895 indata + 1, indata + 2, indata + 3);
897 896
898 if (x != 4) { 897 if (x != 4) {
899 DMWARN("Unable to interpret geometry settings."); 898 DMWARN("Unable to interpret geometry settings.");
@@ -1054,7 +1053,6 @@ static void retrieve_status(struct dm_table *table,
1054 char *outbuf, *outptr; 1053 char *outbuf, *outptr;
1055 status_type_t type; 1054 status_type_t type;
1056 size_t remaining, len, used = 0; 1055 size_t remaining, len, used = 0;
1057 unsigned status_flags = 0;
1058 1056
1059 outptr = outbuf = get_result_buffer(param, param_size, &len); 1057 outptr = outbuf = get_result_buffer(param, param_size, &len);
1060 1058
@@ -1091,9 +1089,7 @@ static void retrieve_status(struct dm_table *table,
1091 1089
1092 /* Get the status/table string from the target driver */ 1090 /* Get the status/table string from the target driver */
1093 if (ti->type->status) { 1091 if (ti->type->status) {
1094 if (param->flags & DM_NOFLUSH_FLAG) 1092 if (ti->type->status(ti, type, outptr, remaining)) {
1095 status_flags |= DM_STATUS_NOFLUSH_FLAG;
1096 if (ti->type->status(ti, type, status_flags, outptr, remaining)) {
1097 param->flags |= DM_BUFFER_FULL_FLAG; 1093 param->flags |= DM_BUFFER_FULL_FLAG;
1098 break; 1094 break;
1099 } 1095 }
@@ -1219,7 +1215,6 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
1219 struct hash_cell *hc; 1215 struct hash_cell *hc;
1220 struct dm_table *t; 1216 struct dm_table *t;
1221 struct mapped_device *md; 1217 struct mapped_device *md;
1222 struct target_type *immutable_target_type;
1223 1218
1224 md = find_device(param); 1219 md = find_device(param);
1225 if (!md) 1220 if (!md)
@@ -1235,16 +1230,6 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
1235 goto out; 1230 goto out;
1236 } 1231 }
1237 1232
1238 immutable_target_type = dm_get_immutable_target_type(md);
1239 if (immutable_target_type &&
1240 (immutable_target_type != dm_table_get_immutable_target_type(t))) {
1241 DMWARN("can't replace immutable target type %s",
1242 immutable_target_type->name);
1243 dm_table_destroy(t);
1244 r = -EINVAL;
1245 goto out;
1246 }
1247
1248 /* Protect md->type and md->queue against concurrent table loads. */ 1233 /* Protect md->type and md->queue against concurrent table loads. */
1249 dm_lock_md_type(md); 1234 dm_lock_md_type(md);
1250 if (dm_get_md_type(md) == DM_TYPE_NONE) 1235 if (dm_get_md_type(md) == DM_TYPE_NONE)
@@ -1441,7 +1426,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
1441 1426
1442 if (!argc) { 1427 if (!argc) {
1443 DMWARN("Empty message received."); 1428 DMWARN("Empty message received.");
1444 goto out_argv; 1429 goto out;
1445 } 1430 }
1446 1431
1447 table = dm_get_live_table(md); 1432 table = dm_get_live_table(md);
@@ -1543,21 +1528,7 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user)
1543 return r; 1528 return r;
1544} 1529}
1545 1530
1546#define DM_PARAMS_VMALLOC 0x0001 /* Params alloced with vmalloc not kmalloc */ 1531static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param)
1547#define DM_WIPE_BUFFER 0x0010 /* Wipe input buffer before returning from ioctl */
1548
1549static void free_params(struct dm_ioctl *param, size_t param_size, int param_flags)
1550{
1551 if (param_flags & DM_WIPE_BUFFER)
1552 memset(param, 0, param_size);
1553
1554 if (param_flags & DM_PARAMS_VMALLOC)
1555 vfree(param);
1556 else
1557 kfree(param);
1558}
1559
1560static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param, int *param_flags)
1561{ 1532{
1562 struct dm_ioctl tmp, *dmi; 1533 struct dm_ioctl tmp, *dmi;
1563 int secure_data; 1534 int secure_data;
@@ -1570,21 +1541,7 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param, in
1570 1541
1571 secure_data = tmp.flags & DM_SECURE_DATA_FLAG; 1542 secure_data = tmp.flags & DM_SECURE_DATA_FLAG;
1572 1543
1573 *param_flags = secure_data ? DM_WIPE_BUFFER : 0; 1544 dmi = vmalloc(tmp.data_size);
1574
1575 /*
1576 * Try to avoid low memory issues when a device is suspended.
1577 * Use kmalloc() rather than vmalloc() when we can.
1578 */
1579 dmi = NULL;
1580 if (tmp.data_size <= KMALLOC_MAX_SIZE)
1581 dmi = kmalloc(tmp.data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
1582
1583 if (!dmi) {
1584 dmi = __vmalloc(tmp.data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH, PAGE_KERNEL);
1585 *param_flags |= DM_PARAMS_VMALLOC;
1586 }
1587
1588 if (!dmi) { 1545 if (!dmi) {
1589 if (secure_data && clear_user(user, tmp.data_size)) 1546 if (secure_data && clear_user(user, tmp.data_size))
1590 return -EFAULT; 1547 return -EFAULT;
@@ -1594,14 +1551,6 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param, in
1594 if (copy_from_user(dmi, user, tmp.data_size)) 1551 if (copy_from_user(dmi, user, tmp.data_size))
1595 goto bad; 1552 goto bad;
1596 1553
1597 /*
1598 * Abort if something changed the ioctl data while it was being copied.
1599 */
1600 if (dmi->data_size != tmp.data_size) {
1601 DMERR("rejecting ioctl: data size modified while processing parameters");
1602 goto bad;
1603 }
1604
1605 /* Wipe the user buffer so we do not return it to userspace */ 1554 /* Wipe the user buffer so we do not return it to userspace */
1606 if (secure_data && clear_user(user, tmp.data_size)) 1555 if (secure_data && clear_user(user, tmp.data_size))
1607 goto bad; 1556 goto bad;
@@ -1610,8 +1559,9 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param, in
1610 return 0; 1559 return 0;
1611 1560
1612bad: 1561bad:
1613 free_params(dmi, tmp.data_size, *param_flags); 1562 if (secure_data)
1614 1563 memset(dmi, 0, tmp.data_size);
1564 vfree(dmi);
1615 return -EFAULT; 1565 return -EFAULT;
1616} 1566}
1617 1567
@@ -1648,7 +1598,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
1648static int ctl_ioctl(uint command, struct dm_ioctl __user *user) 1598static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
1649{ 1599{
1650 int r = 0; 1600 int r = 0;
1651 int param_flags; 1601 int wipe_buffer;
1652 unsigned int cmd; 1602 unsigned int cmd;
1653 struct dm_ioctl *uninitialized_var(param); 1603 struct dm_ioctl *uninitialized_var(param);
1654 ioctl_fn fn = NULL; 1604 ioctl_fn fn = NULL;
@@ -1684,14 +1634,24 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
1684 } 1634 }
1685 1635
1686 /* 1636 /*
1637 * Trying to avoid low memory issues when a device is
1638 * suspended.
1639 */
1640 current->flags |= PF_MEMALLOC;
1641
1642 /*
1687 * Copy the parameters into kernel space. 1643 * Copy the parameters into kernel space.
1688 */ 1644 */
1689 r = copy_params(user, &param, &param_flags); 1645 r = copy_params(user, &param);
1646
1647 current->flags &= ~PF_MEMALLOC;
1690 1648
1691 if (r) 1649 if (r)
1692 return r; 1650 return r;
1693 1651
1694 input_param_size = param->data_size; 1652 input_param_size = param->data_size;
1653 wipe_buffer = param->flags & DM_SECURE_DATA_FLAG;
1654
1695 r = validate_params(cmd, param); 1655 r = validate_params(cmd, param);
1696 if (r) 1656 if (r)
1697 goto out; 1657 goto out;
@@ -1706,7 +1666,10 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
1706 r = -EFAULT; 1666 r = -EFAULT;
1707 1667
1708out: 1668out:
1709 free_params(param, input_param_size, param_flags); 1669 if (wipe_buffer)
1670 memset(param, 0, input_param_size);
1671
1672 vfree(param);
1710 return r; 1673 return r;
1711} 1674}
1712 1675
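
Both versions of the ioctl path above honour DM_SECURE_DATA_FLAG by zeroing the parameter buffer before it is released, so that key material passed in a table line does not linger in freed memory; they differ mainly in where that logic lives (a free_params() helper keyed on flag bits versus open-coded memset() and vfree()). A tiny userspace illustration of the same wipe-before-free pattern follows; it is only a sketch, and production code would use a scrubbing call the compiler cannot elide, such as explicit_bzero().

#include <stdlib.h>
#include <string.h>

#define PARAMS_SECURE 0x1       /* caller asked for the buffer to be wiped */

static void free_params(void *buf, size_t size, int flags)
{
        if (flags & PARAMS_SECURE)
                memset(buf, 0, size);   /* scrub secrets before releasing */
        free(buf);
}

int main(void)
{
        size_t size = 256;
        char *params = malloc(size);

        if (!params)
                return 1;
        strcpy(params, "aes-cbc-essiv:sha256 0123...");        /* pretend key data */
        free_params(params, size, PARAMS_SECURE);
        return 0;
}
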
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 68c02673263..32ac70861d6 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -66,8 +66,6 @@ struct dm_kcopyd_client {
66 struct list_head pages_jobs; 66 struct list_head pages_jobs;
67}; 67};
68 68
69static struct page_list zero_page_list;
70
71static void wake(struct dm_kcopyd_client *kc) 69static void wake(struct dm_kcopyd_client *kc)
72{ 70{
73 queue_work(kc->kcopyd_wq, &kc->kcopyd_work); 71 queue_work(kc->kcopyd_wq, &kc->kcopyd_work);
@@ -256,9 +254,6 @@ int __init dm_kcopyd_init(void)
256 if (!_job_cache) 254 if (!_job_cache)
257 return -ENOMEM; 255 return -ENOMEM;
258 256
259 zero_page_list.next = &zero_page_list;
260 zero_page_list.page = ZERO_PAGE(0);
261
262 return 0; 257 return 0;
263} 258}
264 259
@@ -327,7 +322,7 @@ static int run_complete_job(struct kcopyd_job *job)
327 dm_kcopyd_notify_fn fn = job->fn; 322 dm_kcopyd_notify_fn fn = job->fn;
328 struct dm_kcopyd_client *kc = job->kc; 323 struct dm_kcopyd_client *kc = job->kc;
329 324
330 if (job->pages && job->pages != &zero_page_list) 325 if (job->pages)
331 kcopyd_put_pages(kc, job->pages); 326 kcopyd_put_pages(kc, job->pages);
332 /* 327 /*
333 * If this is the master job, the sub jobs have already 328 * If this is the master job, the sub jobs have already
@@ -349,7 +344,7 @@ static void complete_io(unsigned long error, void *context)
349 struct dm_kcopyd_client *kc = job->kc; 344 struct dm_kcopyd_client *kc = job->kc;
350 345
351 if (error) { 346 if (error) {
352 if (job->rw & WRITE) 347 if (job->rw == WRITE)
353 job->write_err |= error; 348 job->write_err |= error;
354 else 349 else
355 job->read_err = 1; 350 job->read_err = 1;
@@ -361,7 +356,7 @@ static void complete_io(unsigned long error, void *context)
361 } 356 }
362 } 357 }
363 358
364 if (job->rw & WRITE) 359 if (job->rw == WRITE)
365 push(&kc->complete_jobs, job); 360 push(&kc->complete_jobs, job);
366 361
367 else { 362 else {
@@ -432,7 +427,7 @@ static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc,
432 427
433 if (r < 0) { 428 if (r < 0) {
434 /* error this rogue job */ 429 /* error this rogue job */
435 if (job->rw & WRITE) 430 if (job->rw == WRITE)
436 job->write_err = (unsigned long) -1L; 431 job->write_err = (unsigned long) -1L;
437 else 432 else
438 job->read_err = 1; 433 job->read_err = 1;
@@ -489,8 +484,6 @@ static void dispatch_job(struct kcopyd_job *job)
489 atomic_inc(&kc->nr_jobs); 484 atomic_inc(&kc->nr_jobs);
490 if (unlikely(!job->source.count)) 485 if (unlikely(!job->source.count))
491 push(&kc->complete_jobs, job); 486 push(&kc->complete_jobs, job);
492 else if (job->pages == &zero_page_list)
493 push(&kc->io_jobs, job);
494 else 487 else
495 push(&kc->pages_jobs, job); 488 push(&kc->pages_jobs, job);
496 wake(kc); 489 wake(kc);
@@ -585,7 +578,6 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
585 unsigned int flags, dm_kcopyd_notify_fn fn, void *context) 578 unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
586{ 579{
587 struct kcopyd_job *job; 580 struct kcopyd_job *job;
588 int i;
589 581
590 /* 582 /*
591 * Allocate an array of jobs consisting of one master job 583 * Allocate an array of jobs consisting of one master job
@@ -600,29 +592,14 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
600 job->flags = flags; 592 job->flags = flags;
601 job->read_err = 0; 593 job->read_err = 0;
602 job->write_err = 0; 594 job->write_err = 0;
595 job->rw = READ;
596
597 job->source = *from;
603 598
604 job->num_dests = num_dests; 599 job->num_dests = num_dests;
605 memcpy(&job->dests, dests, sizeof(*dests) * num_dests); 600 memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
606 601
607 if (from) { 602 job->pages = NULL;
608 job->source = *from;
609 job->pages = NULL;
610 job->rw = READ;
611 } else {
612 memset(&job->source, 0, sizeof job->source);
613 job->source.count = job->dests[0].count;
614 job->pages = &zero_page_list;
615
616 /*
617 * Use WRITE SAME to optimize zeroing if all dests support it.
618 */
619 job->rw = WRITE | REQ_WRITE_SAME;
620 for (i = 0; i < job->num_dests; i++)
621 if (!bdev_write_same(job->dests[i].bdev)) {
622 job->rw = WRITE;
623 break;
624 }
625 }
626 603
627 job->fn = fn; 604 job->fn = fn;
628 job->context = context; 605 job->context = context;
@@ -640,14 +617,6 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
640} 617}
641EXPORT_SYMBOL(dm_kcopyd_copy); 618EXPORT_SYMBOL(dm_kcopyd_copy);
642 619
643int dm_kcopyd_zero(struct dm_kcopyd_client *kc,
644 unsigned num_dests, struct dm_io_region *dests,
645 unsigned flags, dm_kcopyd_notify_fn fn, void *context)
646{
647 return dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context);
648}
649EXPORT_SYMBOL(dm_kcopyd_zero);
650
651void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc, 620void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
652 dm_kcopyd_notify_fn fn, void *context) 621 dm_kcopyd_notify_fn fn, void *context)
653{ 622{
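
In the complete_io() and process_jobs() hunks one variant tests job->rw == WRITE while the other tests job->rw & WRITE; the bitwise form matters once extra flags such as REQ_WRITE_SAME can be OR'd into rw, because the equality test then no longer recognises writes. A minimal demonstration with made-up flag values (these are not the kernel's request-flag encoding):

#include <stdio.h>

/* Illustrative values only. */
#define READ            0x0
#define WRITE           0x1
#define REQ_WRITE_SAME  0x2

static const char *classify_eq(unsigned rw)
{
        return rw == WRITE ? "write" : "read";  /* breaks when flags are OR'd in */
}

static const char *classify_bit(unsigned rw)
{
        return rw & WRITE ? "write" : "read";   /* still correct with extra flags */
}

int main(void)
{
        unsigned rw = WRITE | REQ_WRITE_SAME;

        printf("== : %s\n", classify_eq(rw));   /* prints "read", which is wrong */
        printf("&  : %s\n", classify_bit(rw));  /* prints "write" */
        return 0;
}
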
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 328cad5617a..3921e3bb43c 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -29,7 +29,6 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
29{ 29{
30 struct linear_c *lc; 30 struct linear_c *lc;
31 unsigned long long tmp; 31 unsigned long long tmp;
32 char dummy;
33 32
34 if (argc != 2) { 33 if (argc != 2) {
35 ti->error = "Invalid argument count"; 34 ti->error = "Invalid argument count";
@@ -42,7 +41,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
42 return -ENOMEM; 41 return -ENOMEM;
43 } 42 }
44 43
45 if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) { 44 if (sscanf(argv[1], "%llu", &tmp) != 1) {
46 ti->error = "dm-linear: Invalid device sector"; 45 ti->error = "dm-linear: Invalid device sector";
47 goto bad; 46 goto bad;
48 } 47 }
@@ -55,7 +54,6 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
55 54
56 ti->num_flush_requests = 1; 55 ti->num_flush_requests = 1;
57 ti->num_discard_requests = 1; 56 ti->num_discard_requests = 1;
58 ti->num_write_same_requests = 1;
59 ti->private = lc; 57 ti->private = lc;
60 return 0; 58 return 0;
61 59
@@ -88,7 +86,8 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio)
88 bio->bi_sector = linear_map_sector(ti, bio->bi_sector); 86 bio->bi_sector = linear_map_sector(ti, bio->bi_sector);
89} 87}
90 88
91static int linear_map(struct dm_target *ti, struct bio *bio) 89static int linear_map(struct dm_target *ti, struct bio *bio,
90 union map_info *map_context)
92{ 91{
93 linear_map_bio(ti, bio); 92 linear_map_bio(ti, bio);
94 93
@@ -96,7 +95,7 @@ static int linear_map(struct dm_target *ti, struct bio *bio)
96} 95}
97 96
98static int linear_status(struct dm_target *ti, status_type_t type, 97static int linear_status(struct dm_target *ti, status_type_t type,
99 unsigned status_flags, char *result, unsigned maxlen) 98 char *result, unsigned int maxlen)
100{ 99{
101 struct linear_c *lc = (struct linear_c *) ti->private; 100 struct linear_c *lc = (struct linear_c *) ti->private;
102 101
@@ -117,17 +116,7 @@ static int linear_ioctl(struct dm_target *ti, unsigned int cmd,
117 unsigned long arg) 116 unsigned long arg)
118{ 117{
119 struct linear_c *lc = (struct linear_c *) ti->private; 118 struct linear_c *lc = (struct linear_c *) ti->private;
120 struct dm_dev *dev = lc->dev; 119 return __blkdev_driver_ioctl(lc->dev->bdev, lc->dev->mode, cmd, arg);
121 int r = 0;
122
123 /*
124 * Only pass ioctls through if the device sizes match exactly.
125 */
126 if (lc->start ||
127 ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
128 r = scsi_verify_blk_ioctl(NULL, cmd);
129
130 return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
131} 120}
132 121
133static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, 122static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
@@ -155,7 +144,7 @@ static int linear_iterate_devices(struct dm_target *ti,
155 144
156static struct target_type linear_target = { 145static struct target_type linear_target = {
157 .name = "linear", 146 .name = "linear",
158 .version = {1, 2, 0}, 147 .version = {1, 1, 0},
159 .module = THIS_MODULE, 148 .module = THIS_MODULE,
160 .ctr = linear_ctr, 149 .ctr = linear_ctr,
161 .dtr = linear_dtr, 150 .dtr = linear_dtr,
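
The linear_ioctl() hunk differs in whether ioctls are forwarded to the underlying device unconditionally or whether the target first notices that it does not cover the whole device (a non-zero start, or a length different from the device size expressed in 512-byte sectors) and then restricts which commands may pass through. The size comparison is just a byte-to-sector conversion; a small sketch of that check follows, with covers_whole_device and device_size_bytes as invented names.

#include <stdbool.h>
#include <stdio.h>

#define SECTOR_SHIFT 9  /* 512-byte sectors */

/* True only when the target starts at sector 0 and its length matches
 * the size of the underlying device. */
static bool covers_whole_device(unsigned long long start_sector,
                                unsigned long long target_len_sectors,
                                unsigned long long device_size_bytes)
{
        return start_sector == 0 &&
               target_len_sectors == (device_size_bytes >> SECTOR_SHIFT);
}

int main(void)
{
        unsigned long long dev_bytes = 1ULL << 30;      /* 1 GiB device */
        unsigned long long dev_sectors = dev_bytes >> SECTOR_SHIFT;

        printf("%d\n", covers_whole_device(0, dev_sectors, dev_bytes));                 /* 1 */
        printf("%d\n", covers_whole_device(2048, dev_sectors - 2048, dev_bytes));       /* 0 */
        return 0;
}
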
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 9429159d9ee..1021c898601 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -9,7 +9,6 @@
9#include <linux/dm-dirty-log.h> 9#include <linux/dm-dirty-log.h>
10#include <linux/device-mapper.h> 10#include <linux/device-mapper.h>
11#include <linux/dm-log-userspace.h> 11#include <linux/dm-log-userspace.h>
12#include <linux/module.h>
13 12
14#include "dm-log-userspace-transfer.h" 13#include "dm-log-userspace-transfer.h"
15 14
@@ -31,7 +30,6 @@ struct flush_entry {
31 30
32struct log_c { 31struct log_c {
33 struct dm_target *ti; 32 struct dm_target *ti;
34 struct dm_dev *log_dev;
35 uint32_t region_size; 33 uint32_t region_size;
36 region_t region_count; 34 region_t region_count;
37 uint64_t luid; 35 uint64_t luid;
@@ -148,7 +146,7 @@ static int build_constructor_string(struct dm_target *ti,
148 * <UUID> <other args> 146 * <UUID> <other args>
149 * Where 'other args' is the userspace implementation specific log 147 * Where 'other args' is the userspace implementation specific log
150 * arguments. An example might be: 148 * arguments. An example might be:
151 * <UUID> clustered-disk <arg count> <log dev> <region_size> [[no]sync] 149 * <UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync]
152 * 150 *
153 * So, this module will strip off the <UUID> for identification purposes 151 * So, this module will strip off the <UUID> for identification purposes
154 * when communicating with userspace about a log; but will pass on everything 152 * when communicating with userspace about a log; but will pass on everything
@@ -163,15 +161,13 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
163 struct log_c *lc = NULL; 161 struct log_c *lc = NULL;
164 uint64_t rdata; 162 uint64_t rdata;
165 size_t rdata_size = sizeof(rdata); 163 size_t rdata_size = sizeof(rdata);
166 char *devices_rdata = NULL;
167 size_t devices_rdata_size = DM_NAME_LEN;
168 164
169 if (argc < 3) { 165 if (argc < 3) {
170 DMWARN("Too few arguments to userspace dirty log"); 166 DMWARN("Too few arguments to userspace dirty log");
171 return -EINVAL; 167 return -EINVAL;
172 } 168 }
173 169
174 lc = kzalloc(sizeof(*lc), GFP_KERNEL); 170 lc = kmalloc(sizeof(*lc), GFP_KERNEL);
175 if (!lc) { 171 if (!lc) {
176 DMWARN("Unable to allocate userspace log context."); 172 DMWARN("Unable to allocate userspace log context.");
177 return -ENOMEM; 173 return -ENOMEM;
@@ -199,19 +195,9 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
199 return str_size; 195 return str_size;
200 } 196 }
201 197
202 devices_rdata = kzalloc(devices_rdata_size, GFP_KERNEL); 198 /* Send table string */
203 if (!devices_rdata) {
204 DMERR("Failed to allocate memory for device information");
205 r = -ENOMEM;
206 goto out;
207 }
208
209 /*
210 * Send table string and get back any opened device.
211 */
212 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, 199 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR,
213 ctr_str, str_size, 200 ctr_str, str_size, NULL, NULL);
214 devices_rdata, &devices_rdata_size);
215 201
216 if (r < 0) { 202 if (r < 0) {
217 if (r == -ESRCH) 203 if (r == -ESRCH)
@@ -234,20 +220,7 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
234 lc->region_size = (uint32_t)rdata; 220 lc->region_size = (uint32_t)rdata;
235 lc->region_count = dm_sector_div_up(ti->len, lc->region_size); 221 lc->region_count = dm_sector_div_up(ti->len, lc->region_size);
236 222
237 if (devices_rdata_size) {
238 if (devices_rdata[devices_rdata_size - 1] != '\0') {
239 DMERR("DM_ULOG_CTR device return string not properly terminated");
240 r = -EINVAL;
241 goto out;
242 }
243 r = dm_get_device(ti, devices_rdata,
244 dm_table_get_mode(ti->table), &lc->log_dev);
245 if (r)
246 DMERR("Failed to register %s with device-mapper",
247 devices_rdata);
248 }
249out: 223out:
250 kfree(devices_rdata);
251 if (r) { 224 if (r) {
252 kfree(lc); 225 kfree(lc);
253 kfree(ctr_str); 226 kfree(ctr_str);
@@ -268,9 +241,6 @@ static void userspace_dtr(struct dm_dirty_log *log)
268 NULL, 0, 241 NULL, 0,
269 NULL, NULL); 242 NULL, NULL);
270 243
271 if (lc->log_dev)
272 dm_put_device(lc->ti, lc->log_dev);
273
274 kfree(lc->usr_argv_str); 244 kfree(lc->usr_argv_str);
275 kfree(lc); 245 kfree(lc);
276 246
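
One side of the userspace_ctr() hunk asks the userspace log daemon for the name of any device it opened and, before passing the reply to dm_get_device(), checks that the returned buffer really is NUL-terminated. Treating an unterminated buffer as a C string is a classic overread; the standalone sketch below shows the same defensive check, with as_string an invented helper.

#include <stdio.h>

/*
 * Accept 'buf' as a C string only if the reply is non-empty and the last
 * byte inside the reported length is the terminating NUL.
 */
static const char *as_string(const char *buf, size_t len)
{
        if (len == 0 || buf[len - 1] != '\0')
                return NULL;    /* not properly terminated: reject it */
        return buf;
}

int main(void)
{
        char good[8] = "sda";                   /* zero padded, ends in NUL */
        char bad[3]  = { 's', 'd', 'a' };       /* no terminator */

        printf("%s\n", as_string(good, sizeof(good)) ? "ok" : "rejected");
        printf("%s\n", as_string(bad, sizeof(bad))   ? "ok" : "rejected");
        return 0;
}
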
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 08d9a207259..1f23e048f07 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -134,7 +134,7 @@ static void cn_ulog_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
134{ 134{
135 struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1); 135 struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1);
136 136
137 if (!capable(CAP_SYS_ADMIN)) 137 if (!cap_raised(current_cap(), CAP_SYS_ADMIN))
138 return; 138 return;
139 139
140 spin_lock(&receiving_list_lock); 140 spin_lock(&receiving_list_lock);
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 627d19186d5..3b52bb72bd1 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -369,7 +369,6 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
369 unsigned int region_count; 369 unsigned int region_count;
370 size_t bitset_size, buf_size; 370 size_t bitset_size, buf_size;
371 int r; 371 int r;
372 char dummy;
373 372
374 if (argc < 1 || argc > 2) { 373 if (argc < 1 || argc > 2) {
375 DMWARN("wrong number of arguments to dirty region log"); 374 DMWARN("wrong number of arguments to dirty region log");
@@ -388,7 +387,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
388 } 387 }
389 } 388 }
390 389
391 if (sscanf(argv[0], "%u%c", &region_size, &dummy) != 1 || 390 if (sscanf(argv[0], "%u", &region_size) != 1 ||
392 !_check_region_size(ti, region_size)) { 391 !_check_region_size(ti, region_size)) {
393 DMWARN("invalid region size %s", argv[0]); 392 DMWARN("invalid region size %s", argv[0]);
394 return -EINVAL; 393 return -EINVAL;
@@ -571,6 +570,16 @@ static void disk_dtr(struct dm_dirty_log *log)
571 destroy_log_context(lc); 570 destroy_log_context(lc);
572} 571}
573 572
573static int count_bits32(uint32_t *addr, unsigned size)
574{
575 int count = 0, i;
576
577 for (i = 0; i < size; i++) {
578 count += hweight32(*(addr+i));
579 }
580 return count;
581}
582
574static void fail_log_device(struct log_c *lc) 583static void fail_log_device(struct log_c *lc)
575{ 584{
576 if (lc->log_dev_failed) 585 if (lc->log_dev_failed)
@@ -619,8 +628,7 @@ static int disk_resume(struct dm_dirty_log *log)
619 628
620 /* copy clean across to sync */ 629 /* copy clean across to sync */
621 memcpy(lc->sync_bits, lc->clean_bits, size); 630 memcpy(lc->sync_bits, lc->clean_bits, size);
622 lc->sync_count = memweight(lc->clean_bits, 631 lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count);
623 lc->bitset_uint32_count * sizeof(uint32_t));
624 lc->sync_search = 0; 632 lc->sync_search = 0;
625 633
626 /* set the correct number of regions in the header */ 634 /* set the correct number of regions in the header */
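
The disk_resume() hunk recomputes sync_count by counting the set bits in the clean bitmap, either with memweight() over the raw bytes or with a count_bits32() loop that sums hweight32() per 32-bit word. A standalone equivalent using the GCC/Clang popcount builtin is sketched below; the kernel helpers differ in signature and availability.

#include <stdint.h>
#include <stdio.h>

/* Sum the set bits across an array of 32-bit words, like a loop over
 * hweight32(). __builtin_popcount() is a GCC/Clang extension. */
static unsigned count_bits32(const uint32_t *addr, unsigned nwords)
{
        unsigned i, count = 0;

        for (i = 0; i < nwords; i++)
                count += (unsigned)__builtin_popcount(addr[i]);
        return count;
}

int main(void)
{
        uint32_t bitmap[] = { 0xFFFFFFFFu, 0x0000000Fu, 0x0 };

        printf("%u\n", count_bits32(bitmap, 3));        /* 32 + 4 + 0 = 36 */
        return 0;
}
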
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 573bd04591b..5e0090ef418 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -18,7 +18,6 @@
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/time.h> 19#include <linux/time.h>
20#include <linux/workqueue.h> 20#include <linux/workqueue.h>
21#include <linux/delay.h>
22#include <scsi/scsi_dh.h> 21#include <scsi/scsi_dh.h>
23#include <linux/atomic.h> 22#include <linux/atomic.h>
24 23
@@ -62,11 +61,11 @@ struct multipath {
62 struct list_head list; 61 struct list_head list;
63 struct dm_target *ti; 62 struct dm_target *ti;
64 63
64 spinlock_t lock;
65
65 const char *hw_handler_name; 66 const char *hw_handler_name;
66 char *hw_handler_params; 67 char *hw_handler_params;
67 68
68 spinlock_t lock;
69
70 unsigned nr_priority_groups; 69 unsigned nr_priority_groups;
71 struct list_head priority_groups; 70 struct list_head priority_groups;
72 71
@@ -82,18 +81,16 @@ struct multipath {
82 struct priority_group *next_pg; /* Switch to this PG if set */ 81 struct priority_group *next_pg; /* Switch to this PG if set */
83 unsigned repeat_count; /* I/Os left before calling PS again */ 82 unsigned repeat_count; /* I/Os left before calling PS again */
84 83
85 unsigned queue_io:1; /* Must we queue all I/O? */ 84 unsigned queue_io; /* Must we queue all I/O? */
86 unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */ 85 unsigned queue_if_no_path; /* Queue I/O if last path fails? */
87 unsigned saved_queue_if_no_path:1; /* Saved state during suspension */ 86 unsigned saved_queue_if_no_path;/* Saved state during suspension */
88 unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */
89
90 unsigned pg_init_retries; /* Number of times to retry pg_init */ 87 unsigned pg_init_retries; /* Number of times to retry pg_init */
91 unsigned pg_init_count; /* Number of times pg_init called */ 88 unsigned pg_init_count; /* Number of times pg_init called */
92 unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ 89 unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */
93 90
94 unsigned queue_size;
95 struct work_struct process_queued_ios; 91 struct work_struct process_queued_ios;
96 struct list_head queued_ios; 92 struct list_head queued_ios;
93 unsigned queue_size;
97 94
98 struct work_struct trigger_event; 95 struct work_struct trigger_event;
99 96
@@ -229,27 +226,6 @@ static void free_multipath(struct multipath *m)
229 kfree(m); 226 kfree(m);
230} 227}
231 228
232static int set_mapinfo(struct multipath *m, union map_info *info)
233{
234 struct dm_mpath_io *mpio;
235
236 mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
237 if (!mpio)
238 return -ENOMEM;
239
240 memset(mpio, 0, sizeof(*mpio));
241 info->ptr = mpio;
242
243 return 0;
244}
245
246static void clear_mapinfo(struct multipath *m, union map_info *info)
247{
248 struct dm_mpath_io *mpio = info->ptr;
249
250 info->ptr = NULL;
251 mempool_free(mpio, m->mpio_pool);
252}
253 229
254/*----------------------------------------------- 230/*-----------------------------------------------
255 * Path selection 231 * Path selection
@@ -331,18 +307,14 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
331 /* 307 /*
332 * Loop through priority groups until we find a valid path. 308 * Loop through priority groups until we find a valid path.
333 * First time we skip PGs marked 'bypassed'. 309 * First time we skip PGs marked 'bypassed'.
334 * Second time we only try the ones we skipped, but set 310 * Second time we only try the ones we skipped.
335 * pg_init_delay_retry so we do not hammer controllers.
336 */ 311 */
337 do { 312 do {
338 list_for_each_entry(pg, &m->priority_groups, list) { 313 list_for_each_entry(pg, &m->priority_groups, list) {
339 if (pg->bypassed == bypassed) 314 if (pg->bypassed == bypassed)
340 continue; 315 continue;
341 if (!__choose_path_in_pg(m, pg, nr_bytes)) { 316 if (!__choose_path_in_pg(m, pg, nr_bytes))
342 if (!bypassed)
343 m->pg_init_delay_retry = 1;
344 return; 317 return;
345 }
346 } 318 }
347 } while (bypassed--); 319 } while (bypassed--);
348 320
@@ -369,14 +341,13 @@ static int __must_push_back(struct multipath *m)
369} 341}
370 342
371static int map_io(struct multipath *m, struct request *clone, 343static int map_io(struct multipath *m, struct request *clone,
372 union map_info *map_context, unsigned was_queued) 344 struct dm_mpath_io *mpio, unsigned was_queued)
373{ 345{
374 int r = DM_MAPIO_REMAPPED; 346 int r = DM_MAPIO_REMAPPED;
375 size_t nr_bytes = blk_rq_bytes(clone); 347 size_t nr_bytes = blk_rq_bytes(clone);
376 unsigned long flags; 348 unsigned long flags;
377 struct pgpath *pgpath; 349 struct pgpath *pgpath;
378 struct block_device *bdev; 350 struct block_device *bdev;
379 struct dm_mpath_io *mpio = map_context->ptr;
380 351
381 spin_lock_irqsave(&m->lock, flags); 352 spin_lock_irqsave(&m->lock, flags);
382 353
@@ -452,6 +423,7 @@ static void dispatch_queued_ios(struct multipath *m)
452{ 423{
453 int r; 424 int r;
454 unsigned long flags; 425 unsigned long flags;
426 struct dm_mpath_io *mpio;
455 union map_info *info; 427 union map_info *info;
456 struct request *clone, *n; 428 struct request *clone, *n;
457 LIST_HEAD(cl); 429 LIST_HEAD(cl);
@@ -464,15 +436,16 @@ static void dispatch_queued_ios(struct multipath *m)
464 list_del_init(&clone->queuelist); 436 list_del_init(&clone->queuelist);
465 437
466 info = dm_get_rq_mapinfo(clone); 438 info = dm_get_rq_mapinfo(clone);
439 mpio = info->ptr;
467 440
468 r = map_io(m, clone, info, 1); 441 r = map_io(m, clone, mpio, 1);
469 if (r < 0) { 442 if (r < 0) {
470 clear_mapinfo(m, info); 443 mempool_free(mpio, m->mpio_pool);
471 dm_kill_unmapped_request(clone, r); 444 dm_kill_unmapped_request(clone, r);
472 } else if (r == DM_MAPIO_REMAPPED) 445 } else if (r == DM_MAPIO_REMAPPED)
473 dm_dispatch_request(clone); 446 dm_dispatch_request(clone);
474 else if (r == DM_MAPIO_REQUEUE) { 447 else if (r == DM_MAPIO_REQUEUE) {
475 clear_mapinfo(m, info); 448 mempool_free(mpio, m->mpio_pool);
476 dm_requeue_unmapped_request(clone); 449 dm_requeue_unmapped_request(clone);
477 } 450 }
478 } 451 }
@@ -488,6 +461,9 @@ static void process_queued_ios(struct work_struct *work)
488 461
489 spin_lock_irqsave(&m->lock, flags); 462 spin_lock_irqsave(&m->lock, flags);
490 463
464 if (!m->queue_size)
465 goto out;
466
491 if (!m->current_pgpath) 467 if (!m->current_pgpath)
492 __choose_pgpath(m, 0); 468 __choose_pgpath(m, 0);
493 469
@@ -500,6 +476,7 @@ static void process_queued_ios(struct work_struct *work)
500 if (m->pg_init_required && !m->pg_init_in_progress && pgpath) 476 if (m->pg_init_required && !m->pg_init_in_progress && pgpath)
501 __pg_init_all_paths(m); 477 __pg_init_all_paths(m);
502 478
479out:
503 spin_unlock_irqrestore(&m->lock, flags); 480 spin_unlock_irqrestore(&m->lock, flags);
504 if (!must_queue) 481 if (!must_queue)
505 dispatch_queued_ios(m); 482 dispatch_queued_ios(m);
@@ -569,8 +546,6 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
569 int r; 546 int r;
570 struct pgpath *p; 547 struct pgpath *p;
571 struct multipath *m = ti->private; 548 struct multipath *m = ti->private;
572 struct request_queue *q = NULL;
573 const char *attached_handler_name;
574 549
575 /* we need at least a path arg */ 550 /* we need at least a path arg */
576 if (as->argc < 1) { 551 if (as->argc < 1) {
@@ -589,37 +564,13 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
589 goto bad; 564 goto bad;
590 } 565 }
591 566
592 if (m->retain_attached_hw_handler || m->hw_handler_name)
593 q = bdev_get_queue(p->path.dev->bdev);
594
595 if (m->retain_attached_hw_handler) {
596 attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
597 if (attached_handler_name) {
598 /*
599 * Reset hw_handler_name to match the attached handler
600 * and clear any hw_handler_params associated with the
601 * ignored handler.
602 *
603 * NB. This modifies the table line to show the actual
604 * handler instead of the original table passed in.
605 */
606 kfree(m->hw_handler_name);
607 m->hw_handler_name = attached_handler_name;
608
609 kfree(m->hw_handler_params);
610 m->hw_handler_params = NULL;
611 }
612 }
613
614 if (m->hw_handler_name) { 567 if (m->hw_handler_name) {
615 /* 568 struct request_queue *q = bdev_get_queue(p->path.dev->bdev);
616 * Increments scsi_dh reference, even when using an 569
617 * already-attached handler.
618 */
619 r = scsi_dh_attach(q, m->hw_handler_name); 570 r = scsi_dh_attach(q, m->hw_handler_name);
620 if (r == -EBUSY) { 571 if (r == -EBUSY) {
621 /* 572 /*
622 * Already attached to different hw_handler: 573 * Already attached to different hw_handler,
623 * try to reattach with correct one. 574 * try to reattach with correct one.
624 */ 575 */
625 scsi_dh_detach(q); 576 scsi_dh_detach(q);
@@ -747,8 +698,8 @@ static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
747 return 0; 698 return 0;
748 699
749 m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL); 700 m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
750 if (!try_then_request_module(scsi_dh_handler_exist(m->hw_handler_name), 701 request_module("scsi_dh_%s", m->hw_handler_name);
751 "scsi_dh_%s", m->hw_handler_name)) { 702 if (scsi_dh_handler_exist(m->hw_handler_name) == 0) {
752 ti->error = "unknown hardware handler type"; 703 ti->error = "unknown hardware handler type";
753 ret = -EINVAL; 704 ret = -EINVAL;
754 goto fail; 705 goto fail;
@@ -787,7 +738,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
787 const char *arg_name; 738 const char *arg_name;
788 739
789 static struct dm_arg _args[] = { 740 static struct dm_arg _args[] = {
790 {0, 6, "invalid number of feature args"}, 741 {0, 5, "invalid number of feature args"},
791 {1, 50, "pg_init_retries must be between 1 and 50"}, 742 {1, 50, "pg_init_retries must be between 1 and 50"},
792 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, 743 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
793 }; 744 };
@@ -808,11 +759,6 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
808 continue; 759 continue;
809 } 760 }
810 761
811 if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
812 m->retain_attached_hw_handler = 1;
813 continue;
814 }
815
816 if (!strcasecmp(arg_name, "pg_init_retries") && 762 if (!strcasecmp(arg_name, "pg_init_retries") &&
817 (argc >= 1)) { 763 (argc >= 1)) {
818 r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error); 764 r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
@@ -944,7 +890,7 @@ static void flush_multipath_work(struct multipath *m)
944 flush_workqueue(kmpath_handlerd); 890 flush_workqueue(kmpath_handlerd);
945 multipath_wait_for_pg_init_completion(m); 891 multipath_wait_for_pg_init_completion(m);
946 flush_workqueue(kmultipathd); 892 flush_workqueue(kmultipathd);
947 flush_work(&m->trigger_event); 893 flush_work_sync(&m->trigger_event);
948} 894}
949 895
950static void multipath_dtr(struct dm_target *ti) 896static void multipath_dtr(struct dm_target *ti)
@@ -962,16 +908,20 @@ static int multipath_map(struct dm_target *ti, struct request *clone,
962 union map_info *map_context) 908 union map_info *map_context)
963{ 909{
964 int r; 910 int r;
911 struct dm_mpath_io *mpio;
965 struct multipath *m = (struct multipath *) ti->private; 912 struct multipath *m = (struct multipath *) ti->private;
966 913
967 if (set_mapinfo(m, map_context) < 0) 914 mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
915 if (!mpio)
968 /* ENOMEM, requeue */ 916 /* ENOMEM, requeue */
969 return DM_MAPIO_REQUEUE; 917 return DM_MAPIO_REQUEUE;
918 memset(mpio, 0, sizeof(*mpio));
970 919
920 map_context->ptr = mpio;
971 clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; 921 clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
972 r = map_io(m, clone, map_context, 0); 922 r = map_io(m, clone, mpio, 0);
973 if (r < 0 || r == DM_MAPIO_REQUEUE) 923 if (r < 0 || r == DM_MAPIO_REQUEUE)
974 clear_mapinfo(m, map_context); 924 mempool_free(mpio, m->mpio_pool);
975 925
976 return r; 926 return r;
977} 927}
@@ -1104,9 +1054,8 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
1104 struct priority_group *pg; 1054 struct priority_group *pg;
1105 unsigned pgnum; 1055 unsigned pgnum;
1106 unsigned long flags; 1056 unsigned long flags;
1107 char dummy;
1108 1057
1109 if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || 1058 if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
1110 (pgnum > m->nr_priority_groups)) { 1059 (pgnum > m->nr_priority_groups)) {
1111 DMWARN("invalid PG number supplied to switch_pg_num"); 1060 DMWARN("invalid PG number supplied to switch_pg_num");
1112 return -EINVAL; 1061 return -EINVAL;
@@ -1136,9 +1085,8 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
1136{ 1085{
1137 struct priority_group *pg; 1086 struct priority_group *pg;
1138 unsigned pgnum; 1087 unsigned pgnum;
1139 char dummy;
1140 1088
1141 if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || 1089 if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
1142 (pgnum > m->nr_priority_groups)) { 1090 (pgnum > m->nr_priority_groups)) {
1143 DMWARN("invalid PG number supplied to bypass_pg"); 1091 DMWARN("invalid PG number supplied to bypass_pg");
1144 return -EINVAL; 1092 return -EINVAL;
@@ -1309,20 +1257,17 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
1309{ 1257{
1310 struct multipath *m = ti->private; 1258 struct multipath *m = ti->private;
1311 struct dm_mpath_io *mpio = map_context->ptr; 1259 struct dm_mpath_io *mpio = map_context->ptr;
1312 struct pgpath *pgpath; 1260 struct pgpath *pgpath = mpio->pgpath;
1313 struct path_selector *ps; 1261 struct path_selector *ps;
1314 int r; 1262 int r;
1315 1263
1316 BUG_ON(!mpio);
1317
1318 r = do_end_io(m, clone, error, mpio); 1264 r = do_end_io(m, clone, error, mpio);
1319 pgpath = mpio->pgpath;
1320 if (pgpath) { 1265 if (pgpath) {
1321 ps = &pgpath->pg->ps; 1266 ps = &pgpath->pg->ps;
1322 if (ps->type->end_io) 1267 if (ps->type->end_io)
1323 ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); 1268 ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
1324 } 1269 }
1325 clear_mapinfo(m, map_context); 1270 mempool_free(mpio, m->mpio_pool);
1326 1271
1327 return r; 1272 return r;
1328} 1273}
@@ -1379,7 +1324,7 @@ static void multipath_resume(struct dm_target *ti)
1379 * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+ 1324 * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
1380 */ 1325 */
1381static int multipath_status(struct dm_target *ti, status_type_t type, 1326static int multipath_status(struct dm_target *ti, status_type_t type,
1382 unsigned status_flags, char *result, unsigned maxlen) 1327 char *result, unsigned int maxlen)
1383{ 1328{
1384 int sz = 0; 1329 int sz = 0;
1385 unsigned long flags; 1330 unsigned long flags;
@@ -1397,16 +1342,13 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
1397 else { 1342 else {
1398 DMEMIT("%u ", m->queue_if_no_path + 1343 DMEMIT("%u ", m->queue_if_no_path +
1399 (m->pg_init_retries > 0) * 2 + 1344 (m->pg_init_retries > 0) * 2 +
1400 (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 + 1345 (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2);
1401 m->retain_attached_hw_handler);
1402 if (m->queue_if_no_path) 1346 if (m->queue_if_no_path)
1403 DMEMIT("queue_if_no_path "); 1347 DMEMIT("queue_if_no_path ");
1404 if (m->pg_init_retries) 1348 if (m->pg_init_retries)
1405 DMEMIT("pg_init_retries %u ", m->pg_init_retries); 1349 DMEMIT("pg_init_retries %u ", m->pg_init_retries);
1406 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) 1350 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
1407 DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); 1351 DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
1408 if (m->retain_attached_hw_handler)
1409 DMEMIT("retain_attached_hw_handler ");
1410 } 1352 }
1411 1353
1412 if (!m->hw_handler_name || type == STATUSTYPE_INFO) 1354 if (!m->hw_handler_name || type == STATUSTYPE_INFO)
@@ -1555,49 +1497,29 @@ out:
1555static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, 1497static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
1556 unsigned long arg) 1498 unsigned long arg)
1557{ 1499{
1558 struct multipath *m = ti->private; 1500 struct multipath *m = (struct multipath *) ti->private;
1559 struct pgpath *pgpath; 1501 struct block_device *bdev = NULL;
1560 struct block_device *bdev; 1502 fmode_t mode = 0;
1561 fmode_t mode;
1562 unsigned long flags; 1503 unsigned long flags;
1563 int r; 1504 int r = 0;
1564
1565again:
1566 bdev = NULL;
1567 mode = 0;
1568 r = 0;
1569 1505
1570 spin_lock_irqsave(&m->lock, flags); 1506 spin_lock_irqsave(&m->lock, flags);
1571 1507
1572 if (!m->current_pgpath) 1508 if (!m->current_pgpath)
1573 __choose_pgpath(m, 0); 1509 __choose_pgpath(m, 0);
1574 1510
1575 pgpath = m->current_pgpath; 1511 if (m->current_pgpath) {
1576 1512 bdev = m->current_pgpath->path.dev->bdev;
1577 if (pgpath) { 1513 mode = m->current_pgpath->path.dev->mode;
1578 bdev = pgpath->path.dev->bdev;
1579 mode = pgpath->path.dev->mode;
1580 } 1514 }
1581 1515
1582 if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path)) 1516 if (m->queue_io)
1583 r = -EAGAIN; 1517 r = -EAGAIN;
1584 else if (!bdev) 1518 else if (!bdev)
1585 r = -EIO; 1519 r = -EIO;
1586 1520
1587 spin_unlock_irqrestore(&m->lock, flags); 1521 spin_unlock_irqrestore(&m->lock, flags);
1588 1522
1589 /*
1590 * Only pass ioctls through if the device sizes match exactly.
1591 */
1592 if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
1593 r = scsi_verify_blk_ioctl(NULL, cmd);
1594
1595 if (r == -EAGAIN && !fatal_signal_pending(current)) {
1596 queue_work(kmultipathd, &m->process_queued_ios);
1597 msleep(10);
1598 goto again;
1599 }
1600
1601 return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); 1523 return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
1602} 1524}
1603 1525
@@ -1695,7 +1617,7 @@ out:
1695 *---------------------------------------------------------------*/ 1617 *---------------------------------------------------------------*/
1696static struct target_type multipath_target = { 1618static struct target_type multipath_target = {
1697 .name = "multipath", 1619 .name = "multipath",
1698 .version = {1, 5, 0}, 1620 .version = {1, 3, 0},
1699 .module = THIS_MODULE, 1621 .module = THIS_MODULE,
1700 .ctr = multipath_ctr, 1622 .ctr = multipath_ctr,
1701 .dtr = multipath_dtr, 1623 .dtr = multipath_dtr,
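
Note on the multipath_status() hunk above: the feature list it emits is self-describing. The leading count is the number of words that follow, with each bare flag contributing one word and each key/value option two, which is why the left-hand (newer) code adds m->retain_attached_hw_handler to the sum when it also prints that flag. Below is a small user-space sketch of the same accounting; the struct and its field names only mirror the diff and are not the kernel's struct multipath.

/* Illustrative sketch of how the multipath status line sizes its feature
 * list: one word per bare flag, two per "key value" pair. User-space model,
 * not kernel code; the struct is hypothetical. */
#include <stdio.h>

#define PG_INIT_DELAY_DEFAULT (-1)

struct mp_features {
	int queue_if_no_path;           /* bare flag: 1 word  */
	unsigned pg_init_retries;       /* key+value: 2 words */
	int pg_init_delay_msecs;        /* key+value: 2 words */
	int retain_attached_hw_handler; /* bare flag: 1 word (newer code only) */
};

static void emit_features(const struct mp_features *m)
{
	unsigned count = m->queue_if_no_path +
			 (m->pg_init_retries > 0) * 2 +
			 (m->pg_init_delay_msecs != PG_INIT_DELAY_DEFAULT) * 2 +
			 m->retain_attached_hw_handler;

	printf("%u ", count);
	if (m->queue_if_no_path)
		printf("queue_if_no_path ");
	if (m->pg_init_retries)
		printf("pg_init_retries %u ", m->pg_init_retries);
	if (m->pg_init_delay_msecs != PG_INIT_DELAY_DEFAULT)
		printf("pg_init_delay_msecs %d ", m->pg_init_delay_msecs);
	if (m->retain_attached_hw_handler)
		printf("retain_attached_hw_handler ");
	printf("\n");
}

int main(void)
{
	struct mp_features m = { 1, 3, PG_INIT_DELAY_DEFAULT, 1 };
	emit_features(&m); /* "4 queue_if_no_path pg_init_retries 3 retain_attached_hw_handler " */
	return 0;
}
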
diff --git a/drivers/md/dm-path-selector.c b/drivers/md/dm-path-selector.c
index fa0ccc585cb..42c04f04a0c 100644
--- a/drivers/md/dm-path-selector.c
+++ b/drivers/md/dm-path-selector.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/device-mapper.h> 12#include <linux/device-mapper.h>
13#include <linux/module.h>
14 13
15#include "dm-path-selector.h" 14#include "dm-path-selector.h"
16 15
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
index 3941fae0de9..03a837aa5ce 100644
--- a/drivers/md/dm-queue-length.c
+++ b/drivers/md/dm-queue-length.c
@@ -112,7 +112,6 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
112 struct selector *s = ps->context; 112 struct selector *s = ps->context;
113 struct path_info *pi; 113 struct path_info *pi;
114 unsigned repeat_count = QL_MIN_IO; 114 unsigned repeat_count = QL_MIN_IO;
115 char dummy;
116 115
117 /* 116 /*
118 * Arguments: [<repeat_count>] 117 * Arguments: [<repeat_count>]
@@ -124,7 +123,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
124 return -EINVAL; 123 return -EINVAL;
125 } 124 }
126 125
127 if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { 126 if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
128 *error = "queue-length ps: invalid repeat count"; 127 *error = "queue-length ps: invalid repeat count";
129 return -EINVAL; 128 return -EINVAL;
130 } 129 }
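
Several hunks in this patch (queue-length above, and raid1, round-robin and service-time further down) drop the extra %c conversion and dummy character from argument parsing. The dummy exists only to detect trailing junk: if sscanf returns exactly 1, the whole token was a clean number. A standalone sketch of the difference, illustrative only and not kernel code:

#include <stdio.h>

static int parse_strict(const char *arg, unsigned *out)
{
	char dummy;
	/* Exactly one conversion means nothing followed the number. */
	return sscanf(arg, "%u%c", out, &dummy) == 1 ? 0 : -1;
}

static int parse_loose(const char *arg, unsigned *out)
{
	return sscanf(arg, "%u", out) == 1 ? 0 : -1;
}

int main(void)
{
	unsigned v;
	printf("strict \"10\":   %d\n", parse_strict("10", &v));   /* 0: accepted */
	printf("strict \"10xy\": %d\n", parse_strict("10xy", &v)); /* -1: rejected */
	printf("loose  \"10xy\": %d\n", parse_loose("10xy", &v));  /* 0: accepted anyway */
	return 0;
}
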
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 3d8984edeff..86df8b2cf92 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -6,12 +6,10 @@
6 */ 6 */
7 7
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/module.h>
10 9
11#include "md.h" 10#include "md.h"
12#include "raid1.h" 11#include "raid1.h"
13#include "raid5.h" 12#include "raid5.h"
14#include "raid10.h"
15#include "bitmap.h" 13#include "bitmap.h"
16 14
17#include <linux/device-mapper.h> 15#include <linux/device-mapper.h>
@@ -39,7 +37,7 @@ struct raid_dev {
39 */ 37 */
40 struct dm_dev *meta_dev; 38 struct dm_dev *meta_dev;
41 struct dm_dev *data_dev; 39 struct dm_dev *data_dev;
42 struct md_rdev rdev; 40 struct mdk_rdev_s rdev;
43}; 41};
44 42
45/* 43/*
@@ -53,17 +51,13 @@ struct raid_dev {
53#define DMPF_MAX_RECOVERY_RATE 0x20 51#define DMPF_MAX_RECOVERY_RATE 0x20
54#define DMPF_MAX_WRITE_BEHIND 0x40 52#define DMPF_MAX_WRITE_BEHIND 0x40
55#define DMPF_STRIPE_CACHE 0x80 53#define DMPF_STRIPE_CACHE 0x80
56#define DMPF_REGION_SIZE 0x100 54#define DMPF_REGION_SIZE 0X100
57#define DMPF_RAID10_COPIES 0x200
58#define DMPF_RAID10_FORMAT 0x400
59
60struct raid_set { 55struct raid_set {
61 struct dm_target *ti; 56 struct dm_target *ti;
62 57
63 uint32_t bitmap_loaded; 58 uint64_t print_flags;
64 uint32_t print_flags;
65 59
66 struct mddev md; 60 struct mddev_s md;
67 struct raid_type *raid_type; 61 struct raid_type *raid_type;
68 struct dm_target_callbacks callbacks; 62 struct dm_target_callbacks callbacks;
69 63
@@ -80,7 +74,6 @@ static struct raid_type {
80 const unsigned algorithm; /* RAID algorithm. */ 74 const unsigned algorithm; /* RAID algorithm. */
81} raid_types[] = { 75} raid_types[] = {
82 {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */}, 76 {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */},
83 {"raid10", "RAID10 (striped mirrors)", 0, 2, 10, UINT_MAX /* Varies */},
84 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, 77 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
85 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, 78 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
86 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, 79 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
@@ -91,17 +84,6 @@ static struct raid_type {
91 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} 84 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
92}; 85};
93 86
94static unsigned raid10_md_layout_to_copies(int layout)
95{
96 return layout & 0xFF;
97}
98
99static int raid10_format_to_md_layout(char *format, unsigned copies)
100{
101 /* 1 "far" copy, and 'copies' "near" copies */
102 return (1 << 8) | (copies & 0xFF);
103}
104
105static struct raid_type *get_raid_type(char *name) 87static struct raid_type *get_raid_type(char *name)
106{ 88{
107 int i; 89 int i;
@@ -117,12 +99,20 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
117{ 99{
118 unsigned i; 100 unsigned i;
119 struct raid_set *rs; 101 struct raid_set *rs;
102 sector_t sectors_per_dev;
120 103
121 if (raid_devs <= raid_type->parity_devs) { 104 if (raid_devs <= raid_type->parity_devs) {
122 ti->error = "Insufficient number of devices"; 105 ti->error = "Insufficient number of devices";
123 return ERR_PTR(-EINVAL); 106 return ERR_PTR(-EINVAL);
124 } 107 }
125 108
109 sectors_per_dev = ti->len;
110 if ((raid_type->level > 1) &&
111 sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
112 ti->error = "Target length not divisible by number of data devices";
113 return ERR_PTR(-EINVAL);
114 }
115
126 rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL); 116 rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
127 if (!rs) { 117 if (!rs) {
128 ti->error = "Cannot allocate raid context"; 118 ti->error = "Cannot allocate raid context";
@@ -136,6 +126,7 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
136 rs->md.raid_disks = raid_devs; 126 rs->md.raid_disks = raid_devs;
137 rs->md.level = raid_type->level; 127 rs->md.level = raid_type->level;
138 rs->md.new_level = rs->md.level; 128 rs->md.new_level = rs->md.level;
129 rs->md.dev_sectors = sectors_per_dev;
139 rs->md.layout = raid_type->algorithm; 130 rs->md.layout = raid_type->algorithm;
140 rs->md.new_layout = rs->md.layout; 131 rs->md.new_layout = rs->md.layout;
141 rs->md.delta_disks = 0; 132 rs->md.delta_disks = 0;
@@ -150,7 +141,6 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
150 * rs->md.external 141 * rs->md.external
151 * rs->md.chunk_sectors 142 * rs->md.chunk_sectors
152 * rs->md.new_chunk_sectors 143 * rs->md.new_chunk_sectors
153 * rs->md.dev_sectors
154 */ 144 */
155 145
156 return rs; 146 return rs;
@@ -163,7 +153,10 @@ static void context_free(struct raid_set *rs)
163 for (i = 0; i < rs->md.raid_disks; i++) { 153 for (i = 0; i < rs->md.raid_disks; i++) {
164 if (rs->dev[i].meta_dev) 154 if (rs->dev[i].meta_dev)
165 dm_put_device(rs->ti, rs->dev[i].meta_dev); 155 dm_put_device(rs->ti, rs->dev[i].meta_dev);
166 md_rdev_clear(&rs->dev[i].rdev); 156 if (rs->dev[i].rdev.sb_page)
157 put_page(rs->dev[i].rdev.sb_page);
158 rs->dev[i].rdev.sb_page = NULL;
159 rs->dev[i].rdev.sb_loaded = 0;
167 if (rs->dev[i].data_dev) 160 if (rs->dev[i].data_dev)
168 dm_put_device(rs->ti, rs->dev[i].data_dev); 161 dm_put_device(rs->ti, rs->dev[i].data_dev);
169 } 162 }
@@ -295,11 +288,9 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
295 * Choose a reasonable default. All figures in sectors. 288 * Choose a reasonable default. All figures in sectors.
296 */ 289 */
297 if (min_region_size > (1 << 13)) { 290 if (min_region_size > (1 << 13)) {
298 /* If not a power of 2, make it the next power of 2 */
299 if (min_region_size & (min_region_size - 1))
300 region_size = 1 << fls(region_size);
301 DMINFO("Choosing default region size of %lu sectors", 291 DMINFO("Choosing default region size of %lu sectors",
302 region_size); 292 region_size);
293 region_size = min_region_size;
303 } else { 294 } else {
304 DMINFO("Choosing default region size of 4MiB"); 295 DMINFO("Choosing default region size of 4MiB");
305 region_size = 1 << 13; /* sectors */ 296 region_size = 1 << 13; /* sectors */
@@ -340,84 +331,6 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
340} 331}
341 332
342/* 333/*
343 * validate_rebuild_devices
344 * @rs
345 *
346 * Determine if the devices specified for rebuild can result in a valid
347 * usable array that is capable of rebuilding the given devices.
348 *
349 * Returns: 0 on success, -EINVAL on failure.
350 */
351static int validate_rebuild_devices(struct raid_set *rs)
352{
353 unsigned i, rebuild_cnt = 0;
354 unsigned rebuilds_per_group, copies, d;
355
356 if (!(rs->print_flags & DMPF_REBUILD))
357 return 0;
358
359 for (i = 0; i < rs->md.raid_disks; i++)
360 if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
361 rebuild_cnt++;
362
363 switch (rs->raid_type->level) {
364 case 1:
365 if (rebuild_cnt >= rs->md.raid_disks)
366 goto too_many;
367 break;
368 case 4:
369 case 5:
370 case 6:
371 if (rebuild_cnt > rs->raid_type->parity_devs)
372 goto too_many;
373 break;
374 case 10:
375 copies = raid10_md_layout_to_copies(rs->md.layout);
376 if (rebuild_cnt < copies)
377 break;
378
379 /*
380 * It is possible to have a higher rebuild count for RAID10,
381 * as long as the failed devices occur in different mirror
382 * groups (i.e. different stripes).
383 *
384 * Right now, we only allow for "near" copies. When other
385 * formats are added, we will have to check those too.
386 *
387 * When checking "near" format, make sure no adjacent devices
388 * have failed beyond what can be handled. In addition to the
389 * simple case where the number of devices is a multiple of the
390 * number of copies, we must also handle cases where the number
391 * of devices is not a multiple of the number of copies.
392 * E.g. dev1 dev2 dev3 dev4 dev5
393 * A A B B C
394 * C D D E E
395 */
396 rebuilds_per_group = 0;
397 for (i = 0; i < rs->md.raid_disks * copies; i++) {
398 d = i % rs->md.raid_disks;
399 if (!test_bit(In_sync, &rs->dev[d].rdev.flags) &&
400 (++rebuilds_per_group >= copies))
401 goto too_many;
402 if (!((i + 1) % copies))
403 rebuilds_per_group = 0;
404 }
405 break;
406 default:
407 DMERR("The rebuild parameter is not supported for %s",
408 rs->raid_type->name);
409 rs->ti->error = "Rebuild not supported for this RAID type";
410 return -EINVAL;
411 }
412
413 return 0;
414
415too_many:
416 rs->ti->error = "Too many rebuild devices specified";
417 return -EINVAL;
418}
419
420/*
421 * Possible arguments are... 334 * Possible arguments are...
422 * <chunk_size> [optional_args] 335 * <chunk_size> [optional_args]
423 * 336 *
@@ -435,20 +348,12 @@ too_many:
435 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) 348 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
436 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs 349 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs
437 * [region_size <sectors>] Defines granularity of bitmap 350 * [region_size <sectors>] Defines granularity of bitmap
438 *
439 * RAID10-only options:
440 * [raid10_copies <# copies>] Number of copies. (Default: 2)
441 * [raid10_format <near>] Layout algorithm. (Default: near)
442 */ 351 */
443static int parse_raid_params(struct raid_set *rs, char **argv, 352static int parse_raid_params(struct raid_set *rs, char **argv,
444 unsigned num_raid_params) 353 unsigned num_raid_params)
445{ 354{
446 char *raid10_format = "near"; 355 unsigned i, rebuild_cnt = 0;
447 unsigned raid10_copies = 2;
448 unsigned i;
449 unsigned long value, region_size = 0; 356 unsigned long value, region_size = 0;
450 sector_t sectors_per_dev = rs->ti->len;
451 sector_t max_io_len;
452 char *key; 357 char *key;
453 358
454 /* 359 /*
@@ -518,30 +423,21 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
518 } 423 }
519 424
520 key = argv[i++]; 425 key = argv[i++];
521
522 /* Parameters that take a string value are checked here. */
523 if (!strcasecmp(key, "raid10_format")) {
524 if (rs->raid_type->level != 10) {
525 rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
526 return -EINVAL;
527 }
528 if (strcmp("near", argv[i])) {
529 rs->ti->error = "Invalid 'raid10_format' value given";
530 return -EINVAL;
531 }
532 raid10_format = argv[i];
533 rs->print_flags |= DMPF_RAID10_FORMAT;
534 continue;
535 }
536
537 if (strict_strtoul(argv[i], 10, &value) < 0) { 426 if (strict_strtoul(argv[i], 10, &value) < 0) {
538 rs->ti->error = "Bad numerical argument given in raid params"; 427 rs->ti->error = "Bad numerical argument given in raid params";
539 return -EINVAL; 428 return -EINVAL;
540 } 429 }
541 430
542 /* Parameters that take a numeric value are checked here */
543 if (!strcasecmp(key, "rebuild")) { 431 if (!strcasecmp(key, "rebuild")) {
544 if (value >= rs->md.raid_disks) { 432 rebuild_cnt++;
433 if (((rs->raid_type->level != 1) &&
434 (rebuild_cnt > rs->raid_type->parity_devs)) ||
435 ((rs->raid_type->level == 1) &&
436 (rebuild_cnt > (rs->md.raid_disks - 1)))) {
437 rs->ti->error = "Too many rebuild devices specified for given RAID type";
438 return -EINVAL;
439 }
440 if (value > rs->md.raid_disks) {
545 rs->ti->error = "Invalid rebuild index given"; 441 rs->ti->error = "Invalid rebuild index given";
546 return -EINVAL; 442 return -EINVAL;
547 } 443 }
@@ -591,8 +487,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
591 */ 487 */
592 value /= 2; 488 value /= 2;
593 489
594 if ((rs->raid_type->level != 5) && 490 if (rs->raid_type->level < 5) {
595 (rs->raid_type->level != 6)) {
596 rs->ti->error = "Inappropriate argument: stripe_cache"; 491 rs->ti->error = "Inappropriate argument: stripe_cache";
597 return -EINVAL; 492 return -EINVAL;
598 } 493 }
@@ -617,14 +512,6 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
617 } else if (!strcasecmp(key, "region_size")) { 512 } else if (!strcasecmp(key, "region_size")) {
618 rs->print_flags |= DMPF_REGION_SIZE; 513 rs->print_flags |= DMPF_REGION_SIZE;
619 region_size = value; 514 region_size = value;
620 } else if (!strcasecmp(key, "raid10_copies") &&
621 (rs->raid_type->level == 10)) {
622 if ((value < 2) || (value > 0xFF)) {
623 rs->ti->error = "Bad value for 'raid10_copies'";
624 return -EINVAL;
625 }
626 rs->print_flags |= DMPF_RAID10_COPIES;
627 raid10_copies = value;
628 } else { 515 } else {
629 DMERR("Unable to parse RAID parameter: %s", key); 516 DMERR("Unable to parse RAID parameter: %s", key);
630 rs->ti->error = "Unable to parse RAID parameters"; 517 rs->ti->error = "Unable to parse RAID parameters";
@@ -636,36 +523,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
636 return -EINVAL; 523 return -EINVAL;
637 524
638 if (rs->md.chunk_sectors) 525 if (rs->md.chunk_sectors)
639 max_io_len = rs->md.chunk_sectors; 526 rs->ti->split_io = rs->md.chunk_sectors;
640 else 527 else
641 max_io_len = region_size; 528 rs->ti->split_io = region_size;
642
643 if (dm_set_target_max_io_len(rs->ti, max_io_len))
644 return -EINVAL;
645
646 if (rs->raid_type->level == 10) {
647 if (raid10_copies > rs->md.raid_disks) {
648 rs->ti->error = "Not enough devices to satisfy specification";
649 return -EINVAL;
650 }
651
652 /* (Len * #mirrors) / #devices */
653 sectors_per_dev = rs->ti->len * raid10_copies;
654 sector_div(sectors_per_dev, rs->md.raid_disks);
655
656 rs->md.layout = raid10_format_to_md_layout(raid10_format,
657 raid10_copies);
658 rs->md.new_layout = rs->md.layout;
659 } else if ((rs->raid_type->level > 1) &&
660 sector_div(sectors_per_dev,
661 (rs->md.raid_disks - rs->raid_type->parity_devs))) {
662 rs->ti->error = "Target length not divisible by number of data devices";
663 return -EINVAL;
664 }
665 rs->md.dev_sectors = sectors_per_dev;
666 529
667 if (validate_rebuild_devices(rs)) 530 if (rs->md.chunk_sectors)
668 return -EINVAL; 531 rs->ti->split_io = rs->md.chunk_sectors;
532 else
533 rs->ti->split_io = region_size;
669 534
670 /* Assume there are no metadata devices until the drives are parsed */ 535 /* Assume there are no metadata devices until the drives are parsed */
671 rs->md.persistent = 0; 536 rs->md.persistent = 0;
@@ -688,9 +553,6 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
688 if (rs->raid_type->level == 1) 553 if (rs->raid_type->level == 1)
689 return md_raid1_congested(&rs->md, bits); 554 return md_raid1_congested(&rs->md, bits);
690 555
691 if (rs->raid_type->level == 10)
692 return md_raid10_congested(&rs->md, bits);
693
694 return md_raid5_congested(&rs->md, bits); 556 return md_raid5_congested(&rs->md, bits);
695} 557}
696 558
@@ -732,7 +594,7 @@ struct dm_raid_superblock {
732 /* Always set to 0 when writing. */ 594 /* Always set to 0 when writing. */
733} __packed; 595} __packed;
734 596
735static int read_disk_sb(struct md_rdev *rdev, int size) 597static int read_disk_sb(mdk_rdev_t *rdev, int size)
736{ 598{
737 BUG_ON(!rdev->sb_page); 599 BUG_ON(!rdev->sb_page);
738 600
@@ -740,9 +602,7 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
740 return 0; 602 return 0;
741 603
742 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { 604 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
743 DMERR("Failed to read superblock of device at position %d", 605 DMERR("Failed to read device superblock");
744 rdev->raid_disk);
745 md_error(rdev->mddev, rdev);
746 return -EINVAL; 606 return -EINVAL;
747 } 607 }
748 608
@@ -751,20 +611,18 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
751 return 0; 611 return 0;
752} 612}
753 613
754static void super_sync(struct mddev *mddev, struct md_rdev *rdev) 614static void super_sync(mddev_t *mddev, mdk_rdev_t *rdev)
755{ 615{
756 int i; 616 mdk_rdev_t *r, *t;
757 uint64_t failed_devices; 617 uint64_t failed_devices;
758 struct dm_raid_superblock *sb; 618 struct dm_raid_superblock *sb;
759 struct raid_set *rs = container_of(mddev, struct raid_set, md);
760 619
761 sb = page_address(rdev->sb_page); 620 sb = page_address(rdev->sb_page);
762 failed_devices = le64_to_cpu(sb->failed_devices); 621 failed_devices = le64_to_cpu(sb->failed_devices);
763 622
764 for (i = 0; i < mddev->raid_disks; i++) 623 rdev_for_each(r, t, mddev)
765 if (!rs->dev[i].data_dev || 624 if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
766 test_bit(Faulty, &(rs->dev[i].rdev.flags))) 625 failed_devices |= (1ULL << r->raid_disk);
767 failed_devices |= (1ULL << i);
768 626
769 memset(sb, 0, sizeof(*sb)); 627 memset(sb, 0, sizeof(*sb));
770 628
@@ -793,7 +651,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
793 * 651 *
794 * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise 652 * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
795 */ 653 */
796static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) 654static int super_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev)
797{ 655{
798 int ret; 656 int ret;
799 struct dm_raid_superblock *sb; 657 struct dm_raid_superblock *sb;
@@ -808,14 +666,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
808 return ret; 666 return ret;
809 667
810 sb = page_address(rdev->sb_page); 668 sb = page_address(rdev->sb_page);
811 669 if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) {
812 /*
813 * Two cases that we want to write new superblocks and rebuild:
814 * 1) New device (no matching magic number)
815 * 2) Device specified for rebuild (!In_sync w/ offset == 0)
816 */
817 if ((sb->magic != cpu_to_le32(DM_RAID_MAGIC)) ||
818 (!test_bit(In_sync, &rdev->flags) && !rdev->recovery_offset)) {
819 super_sync(rdev->mddev, rdev); 670 super_sync(rdev->mddev, rdev);
820 671
821 set_bit(FirstUse, &rdev->flags); 672 set_bit(FirstUse, &rdev->flags);
@@ -838,7 +689,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
838 return (events_sb > events_refsb) ? 1 : 0; 689 return (events_sb > events_refsb) ? 1 : 0;
839} 690}
840 691
841static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) 692static int super_init_validation(mddev_t *mddev, mdk_rdev_t *rdev)
842{ 693{
843 int role; 694 int role;
844 struct raid_set *rs = container_of(mddev, struct raid_set, md); 695 struct raid_set *rs = container_of(mddev, struct raid_set, md);
@@ -847,7 +698,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
847 struct dm_raid_superblock *sb; 698 struct dm_raid_superblock *sb;
848 uint32_t new_devs = 0; 699 uint32_t new_devs = 0;
849 uint32_t rebuilds = 0; 700 uint32_t rebuilds = 0;
850 struct md_rdev *r; 701 mdk_rdev_t *r, *t;
851 struct dm_raid_superblock *sb2; 702 struct dm_raid_superblock *sb2;
852 703
853 sb = page_address(rdev->sb_page); 704 sb = page_address(rdev->sb_page);
@@ -890,10 +741,13 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
890 * case the In_sync bit will /not/ be set and 741 * case the In_sync bit will /not/ be set and
891 * recovery_cp must be MaxSector. 742 * recovery_cp must be MaxSector.
892 */ 743 */
893 rdev_for_each(r, mddev) { 744 rdev_for_each(r, t, mddev) {
894 if (!test_bit(In_sync, &r->flags)) { 745 if (!test_bit(In_sync, &r->flags)) {
895 DMINFO("Device %d specified for rebuild: " 746 if (!test_bit(FirstUse, &r->flags))
896 "Clearing superblock", r->raid_disk); 747 DMERR("Superblock area of "
748 "rebuild device %d should have been "
749 "cleared.", r->raid_disk);
750 set_bit(FirstUse, &r->flags);
897 rebuilds++; 751 rebuilds++;
898 } else if (test_bit(FirstUse, &r->flags)) 752 } else if (test_bit(FirstUse, &r->flags))
899 new_devs++; 753 new_devs++;
@@ -922,7 +776,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
922 * Now we set the Faulty bit for those devices that are 776 * Now we set the Faulty bit for those devices that are
923 * recorded in the superblock as failed. 777 * recorded in the superblock as failed.
924 */ 778 */
925 rdev_for_each(r, mddev) { 779 rdev_for_each(r, t, mddev) {
926 if (!r->sb_page) 780 if (!r->sb_page)
927 continue; 781 continue;
928 sb2 = page_address(r->sb_page); 782 sb2 = page_address(r->sb_page);
@@ -955,7 +809,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
955 return 0; 809 return 0;
956} 810}
957 811
958static int super_validate(struct mddev *mddev, struct md_rdev *rdev) 812static int super_validate(mddev_t *mddev, mdk_rdev_t *rdev)
959{ 813{
960 struct dm_raid_superblock *sb = page_address(rdev->sb_page); 814 struct dm_raid_superblock *sb = page_address(rdev->sb_page);
961 815
@@ -995,43 +849,11 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
995static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) 849static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
996{ 850{
997 int ret; 851 int ret;
998 unsigned redundancy = 0; 852 mdk_rdev_t *rdev, *freshest, *tmp;
999 struct raid_dev *dev; 853 mddev_t *mddev = &rs->md;
1000 struct md_rdev *rdev, *tmp, *freshest;
1001 struct mddev *mddev = &rs->md;
1002
1003 switch (rs->raid_type->level) {
1004 case 1:
1005 redundancy = rs->md.raid_disks - 1;
1006 break;
1007 case 4:
1008 case 5:
1009 case 6:
1010 redundancy = rs->raid_type->parity_devs;
1011 break;
1012 case 10:
1013 redundancy = raid10_md_layout_to_copies(mddev->layout) - 1;
1014 break;
1015 default:
1016 ti->error = "Unknown RAID type";
1017 return -EINVAL;
1018 }
1019 854
1020 freshest = NULL; 855 freshest = NULL;
1021 rdev_for_each_safe(rdev, tmp, mddev) { 856 rdev_for_each(rdev, tmp, mddev) {
1022 /*
1023 * Skipping super_load due to DMPF_SYNC will cause
1024 * the array to undergo initialization again as
1025 * though it were new. This is the intended effect
1026 * of the "sync" directive.
1027 *
1028 * When reshaping capability is added, we must ensure
1029 * that the "sync" directive is disallowed during the
1030 * reshape.
1031 */
1032 if (rs->print_flags & DMPF_SYNC)
1033 continue;
1034
1035 if (!rdev->meta_bdev) 857 if (!rdev->meta_bdev)
1036 continue; 858 continue;
1037 859
@@ -1044,37 +866,6 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
1044 case 0: 866 case 0:
1045 break; 867 break;
1046 default: 868 default:
1047 dev = container_of(rdev, struct raid_dev, rdev);
1048 if (redundancy--) {
1049 if (dev->meta_dev)
1050 dm_put_device(ti, dev->meta_dev);
1051
1052 dev->meta_dev = NULL;
1053 rdev->meta_bdev = NULL;
1054
1055 if (rdev->sb_page)
1056 put_page(rdev->sb_page);
1057
1058 rdev->sb_page = NULL;
1059
1060 rdev->sb_loaded = 0;
1061
1062 /*
1063 * We might be able to salvage the data device
1064 * even though the meta device has failed. For
1065 * now, we behave as though '- -' had been
1066 * set for this device in the table.
1067 */
1068 if (dev->data_dev)
1069 dm_put_device(ti, dev->data_dev);
1070
1071 dev->data_dev = NULL;
1072 rdev->bdev = NULL;
1073
1074 list_del(&rdev->same_set);
1075
1076 continue;
1077 }
1078 ti->error = "Failed to load superblock"; 869 ti->error = "Failed to load superblock";
1079 return ret; 870 return ret;
1080 } 871 }
@@ -1091,7 +882,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
1091 if (super_validate(mddev, freshest)) 882 if (super_validate(mddev, freshest))
1092 return -EINVAL; 883 return -EINVAL;
1093 884
1094 rdev_for_each(rdev, mddev) 885 rdev_for_each(rdev, tmp, mddev)
1095 if ((rdev != freshest) && super_validate(mddev, rdev)) 886 if ((rdev != freshest) && super_validate(mddev, rdev))
1096 return -EINVAL; 887 return -EINVAL;
1097 888
@@ -1178,7 +969,6 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
1178 969
1179 INIT_WORK(&rs->md.event_work, do_table_event); 970 INIT_WORK(&rs->md.event_work, do_table_event);
1180 ti->private = rs; 971 ti->private = rs;
1181 ti->num_flush_requests = 1;
1182 972
1183 mutex_lock(&rs->md.reconfig_mutex); 973 mutex_lock(&rs->md.reconfig_mutex);
1184 ret = md_run(&rs->md); 974 ret = md_run(&rs->md);
@@ -1190,19 +980,12 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
1190 goto bad; 980 goto bad;
1191 } 981 }
1192 982
1193 if (ti->len != rs->md.array_sectors) {
1194 ti->error = "Array size does not match requested target length";
1195 ret = -EINVAL;
1196 goto size_mismatch;
1197 }
1198 rs->callbacks.congested_fn = raid_is_congested; 983 rs->callbacks.congested_fn = raid_is_congested;
1199 dm_table_add_target_callbacks(ti->table, &rs->callbacks); 984 dm_table_add_target_callbacks(ti->table, &rs->callbacks);
1200 985
1201 mddev_suspend(&rs->md); 986 mddev_suspend(&rs->md);
1202 return 0; 987 return 0;
1203 988
1204size_mismatch:
1205 md_stop(&rs->md);
1206bad: 989bad:
1207 context_free(rs); 990 context_free(rs);
1208 991
@@ -1218,10 +1001,10 @@ static void raid_dtr(struct dm_target *ti)
1218 context_free(rs); 1001 context_free(rs);
1219} 1002}
1220 1003
1221static int raid_map(struct dm_target *ti, struct bio *bio) 1004static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context)
1222{ 1005{
1223 struct raid_set *rs = ti->private; 1006 struct raid_set *rs = ti->private;
1224 struct mddev *mddev = &rs->md; 1007 mddev_t *mddev = &rs->md;
1225 1008
1226 mddev->pers->make_request(mddev, bio); 1009 mddev->pers->make_request(mddev, bio);
1227 1010
@@ -1229,61 +1012,35 @@ static int raid_map(struct dm_target *ti, struct bio *bio)
1229} 1012}
1230 1013
1231static int raid_status(struct dm_target *ti, status_type_t type, 1014static int raid_status(struct dm_target *ti, status_type_t type,
1232 unsigned status_flags, char *result, unsigned maxlen) 1015 char *result, unsigned maxlen)
1233{ 1016{
1234 struct raid_set *rs = ti->private; 1017 struct raid_set *rs = ti->private;
1235 unsigned raid_param_cnt = 1; /* at least 1 for chunksize */ 1018 unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
1236 unsigned sz = 0; 1019 unsigned sz = 0;
1237 int i, array_in_sync = 0; 1020 int i;
1238 sector_t sync; 1021 sector_t sync;
1239 1022
1240 switch (type) { 1023 switch (type) {
1241 case STATUSTYPE_INFO: 1024 case STATUSTYPE_INFO:
1242 DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks); 1025 DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
1243 1026
1027 for (i = 0; i < rs->md.raid_disks; i++) {
1028 if (test_bit(Faulty, &rs->dev[i].rdev.flags))
1029 DMEMIT("D");
1030 else if (test_bit(In_sync, &rs->dev[i].rdev.flags))
1031 DMEMIT("A");
1032 else
1033 DMEMIT("a");
1034 }
1035
1244 if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery)) 1036 if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
1245 sync = rs->md.curr_resync_completed; 1037 sync = rs->md.curr_resync_completed;
1246 else 1038 else
1247 sync = rs->md.recovery_cp; 1039 sync = rs->md.recovery_cp;
1248 1040
1249 if (sync >= rs->md.resync_max_sectors) { 1041 if (sync > rs->md.resync_max_sectors)
1250 array_in_sync = 1;
1251 sync = rs->md.resync_max_sectors; 1042 sync = rs->md.resync_max_sectors;
1252 } else {
1253 /*
1254 * The array may be doing an initial sync, or it may
1255 * be rebuilding individual components. If all the
1256 * devices are In_sync, then it is the array that is
1257 * being initialized.
1258 */
1259 for (i = 0; i < rs->md.raid_disks; i++)
1260 if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
1261 array_in_sync = 1;
1262 }
1263 /*
1264 * Status characters:
1265 * 'D' = Dead/Failed device
1266 * 'a' = Alive but not in-sync
1267 * 'A' = Alive and in-sync
1268 */
1269 for (i = 0; i < rs->md.raid_disks; i++) {
1270 if (test_bit(Faulty, &rs->dev[i].rdev.flags))
1271 DMEMIT("D");
1272 else if (!array_in_sync ||
1273 !test_bit(In_sync, &rs->dev[i].rdev.flags))
1274 DMEMIT("a");
1275 else
1276 DMEMIT("A");
1277 }
1278 1043
1279 /*
1280 * In-sync ratio:
1281 * The in-sync ratio shows the progress of:
1282 * - Initializing the array
1283 * - Rebuilding a subset of devices of the array
1284 * The user can distinguish between the two by referring
1285 * to the status characters.
1286 */
1287 DMEMIT(" %llu/%llu", 1044 DMEMIT(" %llu/%llu",
1288 (unsigned long long) sync, 1045 (unsigned long long) sync,
1289 (unsigned long long) rs->md.resync_max_sectors); 1046 (unsigned long long) rs->md.resync_max_sectors);
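
In the newer (left-hand) STATUSTYPE_INFO code above, the per-device character depends on both the device flags and whether the array as a whole is in sync: 'D' for a failed device, 'a' for alive but not yet in sync, 'A' for alive and in sync. A rough user-space sketch of that selection logic follows; the struct and field names are illustrative, not the kernel's.

#include <stdio.h>

struct dev_state {
	int faulty;
	int in_sync;
};

static char status_char(const struct dev_state *d, int array_in_sync)
{
	if (d->faulty)
		return 'D';
	if (!array_in_sync || !d->in_sync)
		return 'a';
	return 'A';
}

int main(void)
{
	struct dev_state devs[] = { {0, 1}, {0, 0}, {1, 0} };
	int array_in_sync = 0;  /* e.g. resync still running */

	for (unsigned i = 0; i < sizeof(devs) / sizeof(devs[0]); i++)
		putchar(status_char(&devs[i], array_in_sync));
	putchar('\n');          /* "aaD" while syncing, "AaD" once array_in_sync */
	return 0;
}
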
@@ -1301,7 +1058,7 @@ static int raid_status(struct dm_target *ti, status_type_t type,
1301 raid_param_cnt += 2; 1058 raid_param_cnt += 2;
1302 } 1059 }
1303 1060
1304 raid_param_cnt += (hweight32(rs->print_flags & ~DMPF_REBUILD) * 2); 1061 raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2);
1305 if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)) 1062 if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
1306 raid_param_cnt--; 1063 raid_param_cnt--;
1307 1064
@@ -1340,7 +1097,7 @@ static int raid_status(struct dm_target *ti, status_type_t type,
1340 rs->md.bitmap_info.max_write_behind); 1097 rs->md.bitmap_info.max_write_behind);
1341 1098
1342 if (rs->print_flags & DMPF_STRIPE_CACHE) { 1099 if (rs->print_flags & DMPF_STRIPE_CACHE) {
1343 struct r5conf *conf = rs->md.private; 1100 raid5_conf_t *conf = rs->md.private;
1344 1101
1345 /* convert from kiB to sectors */ 1102 /* convert from kiB to sectors */
1346 DMEMIT(" stripe_cache %d", 1103 DMEMIT(" stripe_cache %d",
@@ -1351,13 +1108,6 @@ static int raid_status(struct dm_target *ti, status_type_t type,
1351 DMEMIT(" region_size %lu", 1108 DMEMIT(" region_size %lu",
1352 rs->md.bitmap_info.chunksize >> 9); 1109 rs->md.bitmap_info.chunksize >> 9);
1353 1110
1354 if (rs->print_flags & DMPF_RAID10_COPIES)
1355 DMEMIT(" raid10_copies %u",
1356 raid10_md_layout_to_copies(rs->md.layout));
1357
1358 if (rs->print_flags & DMPF_RAID10_FORMAT)
1359 DMEMIT(" raid10_format near");
1360
1361 DMEMIT(" %d", rs->md.raid_disks); 1111 DMEMIT(" %d", rs->md.raid_disks);
1362 for (i = 0; i < rs->md.raid_disks; i++) { 1112 for (i = 0; i < rs->md.raid_disks; i++) {
1363 if (rs->dev[i].meta_dev) 1113 if (rs->dev[i].meta_dev)
@@ -1396,7 +1146,7 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
1396{ 1146{
1397 struct raid_set *rs = ti->private; 1147 struct raid_set *rs = ti->private;
1398 unsigned chunk_size = rs->md.chunk_sectors << 9; 1148 unsigned chunk_size = rs->md.chunk_sectors << 9;
1399 struct r5conf *conf = rs->md.private; 1149 raid5_conf_t *conf = rs->md.private;
1400 1150
1401 blk_limits_io_min(limits, chunk_size); 1151 blk_limits_io_min(limits, chunk_size);
1402 blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded)); 1152 blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded));
@@ -1420,19 +1170,13 @@ static void raid_resume(struct dm_target *ti)
1420{ 1170{
1421 struct raid_set *rs = ti->private; 1171 struct raid_set *rs = ti->private;
1422 1172
1423 set_bit(MD_CHANGE_DEVS, &rs->md.flags); 1173 bitmap_load(&rs->md);
1424 if (!rs->bitmap_loaded) {
1425 bitmap_load(&rs->md);
1426 rs->bitmap_loaded = 1;
1427 }
1428
1429 clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
1430 mddev_resume(&rs->md); 1174 mddev_resume(&rs->md);
1431} 1175}
1432 1176
1433static struct target_type raid_target = { 1177static struct target_type raid_target = {
1434 .name = "raid", 1178 .name = "raid",
1435 .version = {1, 4, 0}, 1179 .version = {1, 1, 0},
1436 .module = THIS_MODULE, 1180 .module = THIS_MODULE,
1437 .ctr = raid_ctr, 1181 .ctr = raid_ctr,
1438 .dtr = raid_dtr, 1182 .dtr = raid_dtr,
@@ -1459,8 +1203,6 @@ module_init(dm_raid_init);
1459module_exit(dm_raid_exit); 1203module_exit(dm_raid_exit);
1460 1204
1461MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); 1205MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
1462MODULE_ALIAS("dm-raid1");
1463MODULE_ALIAS("dm-raid10");
1464MODULE_ALIAS("dm-raid4"); 1206MODULE_ALIAS("dm-raid4");
1465MODULE_ALIAS("dm-raid5"); 1207MODULE_ALIAS("dm-raid5");
1466MODULE_ALIAS("dm-raid6"); 1208MODULE_ALIAS("dm-raid6");
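
The right-hand (older) context_alloc() above moves the divisibility check up front: the target length must split evenly across the data devices (raid_devs minus parity_devs) before each member's dev_sectors can be set. A rough user-space model of that check; in the kernel sector_div() divides in place and returns the remainder, plain 64-bit division stands in for it here.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* Returns 0 and writes the per-device size on success, -1 on failure. */
static int split_target(sector_t target_len, unsigned raid_devs,
			unsigned parity_devs, sector_t *sectors_per_dev)
{
	unsigned data_devs = raid_devs - parity_devs;

	if (data_devs == 0 || target_len % data_devs) {
		fprintf(stderr, "Target length not divisible by number of data devices\n");
		return -1;
	}
	*sectors_per_dev = target_len / data_devs;
	return 0;
}

int main(void)
{
	sector_t per_dev;

	/* 6-device raid6 (2 parity devices): 1 GiB target, 4 data devices. */
	if (!split_target(2097152, 6, 2, &per_dev))
		printf("each data device holds %llu sectors\n",
		       (unsigned long long)per_dev);

	/* A length that does not divide evenly is rejected, as in the diff. */
	split_target(2097153, 6, 2, &per_dev);
	return 0;
}
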
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index fa519185ebb..9bfd057be68 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -61,6 +61,7 @@ struct mirror_set {
61 struct dm_region_hash *rh; 61 struct dm_region_hash *rh;
62 struct dm_kcopyd_client *kcopyd_client; 62 struct dm_kcopyd_client *kcopyd_client;
63 struct dm_io_client *io_client; 63 struct dm_io_client *io_client;
64 mempool_t *read_record_pool;
64 65
65 /* recovery */ 66 /* recovery */
66 region_t nr_regions; 67 region_t nr_regions;
@@ -138,13 +139,14 @@ static void dispatch_bios(void *context, struct bio_list *bio_list)
138 queue_bio(ms, bio, WRITE); 139 queue_bio(ms, bio, WRITE);
139} 140}
140 141
141struct dm_raid1_bio_record { 142#define MIN_READ_RECORDS 20
143struct dm_raid1_read_record {
142 struct mirror *m; 144 struct mirror *m;
143 /* if details->bi_bdev == NULL, details were not saved */
144 struct dm_bio_details details; 145 struct dm_bio_details details;
145 region_t write_region;
146}; 146};
147 147
148static struct kmem_cache *_dm_raid1_read_record_cache;
149
148/* 150/*
149 * Every mirror should look like this one. 151 * Every mirror should look like this one.
150 */ 152 */
@@ -874,9 +876,19 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
874 atomic_set(&ms->suspend, 0); 876 atomic_set(&ms->suspend, 0);
875 atomic_set(&ms->default_mirror, DEFAULT_MIRROR); 877 atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
876 878
879 ms->read_record_pool = mempool_create_slab_pool(MIN_READ_RECORDS,
880 _dm_raid1_read_record_cache);
881
882 if (!ms->read_record_pool) {
883 ti->error = "Error creating mirror read_record_pool";
884 kfree(ms);
885 return NULL;
886 }
887
877 ms->io_client = dm_io_client_create(); 888 ms->io_client = dm_io_client_create();
878 if (IS_ERR(ms->io_client)) { 889 if (IS_ERR(ms->io_client)) {
879 ti->error = "Error creating dm_io client"; 890 ti->error = "Error creating dm_io client";
891 mempool_destroy(ms->read_record_pool);
880 kfree(ms); 892 kfree(ms);
881 return NULL; 893 return NULL;
882 } 894 }
@@ -888,6 +900,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
888 if (IS_ERR(ms->rh)) { 900 if (IS_ERR(ms->rh)) {
889 ti->error = "Error creating dirty region hash"; 901 ti->error = "Error creating dirty region hash";
890 dm_io_client_destroy(ms->io_client); 902 dm_io_client_destroy(ms->io_client);
903 mempool_destroy(ms->read_record_pool);
891 kfree(ms); 904 kfree(ms);
892 return NULL; 905 return NULL;
893 } 906 }
@@ -903,6 +916,7 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti,
903 916
904 dm_io_client_destroy(ms->io_client); 917 dm_io_client_destroy(ms->io_client);
905 dm_region_hash_destroy(ms->rh); 918 dm_region_hash_destroy(ms->rh);
919 mempool_destroy(ms->read_record_pool);
906 kfree(ms); 920 kfree(ms);
907} 921}
908 922
@@ -910,9 +924,8 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
910 unsigned int mirror, char **argv) 924 unsigned int mirror, char **argv)
911{ 925{
912 unsigned long long offset; 926 unsigned long long offset;
913 char dummy;
914 927
915 if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1) { 928 if (sscanf(argv[1], "%llu", &offset) != 1) {
916 ti->error = "Invalid offset"; 929 ti->error = "Invalid offset";
917 return -EINVAL; 930 return -EINVAL;
918 } 931 }
@@ -940,14 +953,13 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
940{ 953{
941 unsigned param_count; 954 unsigned param_count;
942 struct dm_dirty_log *dl; 955 struct dm_dirty_log *dl;
943 char dummy;
944 956
945 if (argc < 2) { 957 if (argc < 2) {
946 ti->error = "Insufficient mirror log arguments"; 958 ti->error = "Insufficient mirror log arguments";
947 return NULL; 959 return NULL;
948 } 960 }
949 961
950 if (sscanf(argv[1], "%u%c", &param_count, &dummy) != 1) { 962 if (sscanf(argv[1], "%u", &param_count) != 1) {
951 ti->error = "Invalid mirror log argument count"; 963 ti->error = "Invalid mirror log argument count";
952 return NULL; 964 return NULL;
953 } 965 }
@@ -974,14 +986,13 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
974{ 986{
975 unsigned num_features; 987 unsigned num_features;
976 struct dm_target *ti = ms->ti; 988 struct dm_target *ti = ms->ti;
977 char dummy;
978 989
979 *args_used = 0; 990 *args_used = 0;
980 991
981 if (!argc) 992 if (!argc)
982 return 0; 993 return 0;
983 994
984 if (sscanf(argv[0], "%u%c", &num_features, &dummy) != 1) { 995 if (sscanf(argv[0], "%u", &num_features) != 1) {
985 ti->error = "Invalid number of features"; 996 ti->error = "Invalid number of features";
986 return -EINVAL; 997 return -EINVAL;
987 } 998 }
@@ -1025,7 +1036,6 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1025 unsigned int nr_mirrors, m, args_used; 1036 unsigned int nr_mirrors, m, args_used;
1026 struct mirror_set *ms; 1037 struct mirror_set *ms;
1027 struct dm_dirty_log *dl; 1038 struct dm_dirty_log *dl;
1028 char dummy;
1029 1039
1030 dl = create_dirty_log(ti, argc, argv, &args_used); 1040 dl = create_dirty_log(ti, argc, argv, &args_used);
1031 if (!dl) 1041 if (!dl)
@@ -1034,7 +1044,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1034 argv += args_used; 1044 argv += args_used;
1035 argc -= args_used; 1045 argc -= args_used;
1036 1046
1037 if (!argc || sscanf(argv[0], "%u%c", &nr_mirrors, &dummy) != 1 || 1047 if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
1038 nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) { 1048 nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) {
1039 ti->error = "Invalid number of mirrors"; 1049 ti->error = "Invalid number of mirrors";
1040 dm_dirty_log_destroy(dl); 1050 dm_dirty_log_destroy(dl);
@@ -1067,15 +1077,9 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1067 } 1077 }
1068 1078
1069 ti->private = ms; 1079 ti->private = ms;
1070 1080 ti->split_io = dm_rh_get_region_size(ms->rh);
1071 r = dm_set_target_max_io_len(ti, dm_rh_get_region_size(ms->rh));
1072 if (r)
1073 goto err_free_context;
1074
1075 ti->num_flush_requests = 1; 1081 ti->num_flush_requests = 1;
1076 ti->num_discard_requests = 1; 1082 ti->num_discard_requests = 1;
1077 ti->per_bio_data_size = sizeof(struct dm_raid1_bio_record);
1078 ti->discard_zeroes_data_unsupported = true;
1079 1083
1080 ms->kmirrord_wq = alloc_workqueue("kmirrord", 1084 ms->kmirrord_wq = alloc_workqueue("kmirrord",
1081 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); 1085 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
@@ -1133,7 +1137,7 @@ static void mirror_dtr(struct dm_target *ti)
1133 1137
1134 del_timer_sync(&ms->timer); 1138 del_timer_sync(&ms->timer);
1135 flush_workqueue(ms->kmirrord_wq); 1139 flush_workqueue(ms->kmirrord_wq);
1136 flush_work(&ms->trigger_event); 1140 flush_work_sync(&ms->trigger_event);
1137 dm_kcopyd_client_destroy(ms->kcopyd_client); 1141 dm_kcopyd_client_destroy(ms->kcopyd_client);
1138 destroy_workqueue(ms->kmirrord_wq); 1142 destroy_workqueue(ms->kmirrord_wq);
1139 free_context(ms, ti, ms->nr_mirrors); 1143 free_context(ms, ti, ms->nr_mirrors);
@@ -1142,20 +1146,18 @@ static void mirror_dtr(struct dm_target *ti)
1142/* 1146/*
1143 * Mirror mapping function 1147 * Mirror mapping function
1144 */ 1148 */
1145static int mirror_map(struct dm_target *ti, struct bio *bio) 1149static int mirror_map(struct dm_target *ti, struct bio *bio,
1150 union map_info *map_context)
1146{ 1151{
1147 int r, rw = bio_rw(bio); 1152 int r, rw = bio_rw(bio);
1148 struct mirror *m; 1153 struct mirror *m;
1149 struct mirror_set *ms = ti->private; 1154 struct mirror_set *ms = ti->private;
1155 struct dm_raid1_read_record *read_record = NULL;
1150 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); 1156 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
1151 struct dm_raid1_bio_record *bio_record =
1152 dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record));
1153
1154 bio_record->details.bi_bdev = NULL;
1155 1157
1156 if (rw == WRITE) { 1158 if (rw == WRITE) {
1157 /* Save region for mirror_end_io() handler */ 1159 /* Save region for mirror_end_io() handler */
1158 bio_record->write_region = dm_rh_bio_to_region(ms->rh, bio); 1160 map_context->ll = dm_rh_bio_to_region(ms->rh, bio);
1159 queue_bio(ms, bio, rw); 1161 queue_bio(ms, bio, rw);
1160 return DM_MAPIO_SUBMITTED; 1162 return DM_MAPIO_SUBMITTED;
1161 } 1163 }
@@ -1183,29 +1185,33 @@ static int mirror_map(struct dm_target *ti, struct bio *bio)
1183 if (unlikely(!m)) 1185 if (unlikely(!m))
1184 return -EIO; 1186 return -EIO;
1185 1187
1186 dm_bio_record(&bio_record->details, bio); 1188 read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO);
1187 bio_record->m = m; 1189 if (likely(read_record)) {
1190 dm_bio_record(&read_record->details, bio);
1191 map_context->ptr = read_record;
1192 read_record->m = m;
1193 }
1188 1194
1189 map_bio(m, bio); 1195 map_bio(m, bio);
1190 1196
1191 return DM_MAPIO_REMAPPED; 1197 return DM_MAPIO_REMAPPED;
1192} 1198}
1193 1199
1194static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) 1200static int mirror_end_io(struct dm_target *ti, struct bio *bio,
1201 int error, union map_info *map_context)
1195{ 1202{
1196 int rw = bio_rw(bio); 1203 int rw = bio_rw(bio);
1197 struct mirror_set *ms = (struct mirror_set *) ti->private; 1204 struct mirror_set *ms = (struct mirror_set *) ti->private;
1198 struct mirror *m = NULL; 1205 struct mirror *m = NULL;
1199 struct dm_bio_details *bd = NULL; 1206 struct dm_bio_details *bd = NULL;
1200 struct dm_raid1_bio_record *bio_record = 1207 struct dm_raid1_read_record *read_record = map_context->ptr;
1201 dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record));
1202 1208
1203 /* 1209 /*
1204 * We need to dec pending if this was a write. 1210 * We need to dec pending if this was a write.
1205 */ 1211 */
1206 if (rw == WRITE) { 1212 if (rw == WRITE) {
1207 if (!(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) 1213 if (!(bio->bi_rw & REQ_FLUSH))
1208 dm_rh_dec(ms->rh, bio_record->write_region); 1214 dm_rh_dec(ms->rh, map_context->ll);
1209 return error; 1215 return error;
1210 } 1216 }
1211 1217
@@ -1216,7 +1222,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
1216 goto out; 1222 goto out;
1217 1223
1218 if (unlikely(error)) { 1224 if (unlikely(error)) {
1219 if (!bio_record->details.bi_bdev) { 1225 if (!read_record) {
1220 /* 1226 /*
1221 * There wasn't enough memory to record necessary 1227 * There wasn't enough memory to record necessary
1222 * information for a retry or there was no other 1228 * information for a retry or there was no other
@@ -1226,7 +1232,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
1226 return -EIO; 1232 return -EIO;
1227 } 1233 }
1228 1234
1229 m = bio_record->m; 1235 m = read_record->m;
1230 1236
1231 DMERR("Mirror read failed from %s. Trying alternative device.", 1237 DMERR("Mirror read failed from %s. Trying alternative device.",
1232 m->dev->name); 1238 m->dev->name);
@@ -1238,18 +1244,22 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
1238 * mirror. 1244 * mirror.
1239 */ 1245 */
1240 if (default_ok(m) || mirror_available(ms, bio)) { 1246 if (default_ok(m) || mirror_available(ms, bio)) {
1241 bd = &bio_record->details; 1247 bd = &read_record->details;
1242 1248
1243 dm_bio_restore(bd, bio); 1249 dm_bio_restore(bd, bio);
1244 bio_record->details.bi_bdev = NULL; 1250 mempool_free(read_record, ms->read_record_pool);
1251 map_context->ptr = NULL;
1245 queue_bio(ms, bio, rw); 1252 queue_bio(ms, bio, rw);
1246 return DM_ENDIO_INCOMPLETE; 1253 return 1;
1247 } 1254 }
1248 DMERR("All replicated volumes dead, failing I/O"); 1255 DMERR("All replicated volumes dead, failing I/O");
1249 } 1256 }
1250 1257
1251out: 1258out:
1252 bio_record->details.bi_bdev = NULL; 1259 if (read_record) {
1260 mempool_free(read_record, ms->read_record_pool);
1261 map_context->ptr = NULL;
1262 }
1253 1263
1254 return error; 1264 return error;
1255} 1265}
@@ -1348,7 +1358,7 @@ static char device_status_char(struct mirror *m)
1348 1358
1349 1359
1350static int mirror_status(struct dm_target *ti, status_type_t type, 1360static int mirror_status(struct dm_target *ti, status_type_t type,
1351 unsigned status_flags, char *result, unsigned maxlen) 1361 char *result, unsigned int maxlen)
1352{ 1362{
1353 unsigned int m, sz = 0; 1363 unsigned int m, sz = 0;
1354 struct mirror_set *ms = (struct mirror_set *) ti->private; 1364 struct mirror_set *ms = (struct mirror_set *) ti->private;
@@ -1403,7 +1413,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
1403 1413
1404static struct target_type mirror_target = { 1414static struct target_type mirror_target = {
1405 .name = "mirror", 1415 .name = "mirror",
1406 .version = {1, 13, 1}, 1416 .version = {1, 12, 1},
1407 .module = THIS_MODULE, 1417 .module = THIS_MODULE,
1408 .ctr = mirror_ctr, 1418 .ctr = mirror_ctr,
1409 .dtr = mirror_dtr, 1419 .dtr = mirror_dtr,
@@ -1420,6 +1430,13 @@ static int __init dm_mirror_init(void)
1420{ 1430{
1421 int r; 1431 int r;
1422 1432
1433 _dm_raid1_read_record_cache = KMEM_CACHE(dm_raid1_read_record, 0);
1434 if (!_dm_raid1_read_record_cache) {
1435 DMERR("Can't allocate dm_raid1_read_record cache");
1436 r = -ENOMEM;
1437 goto bad_cache;
1438 }
1439
1423 r = dm_register_target(&mirror_target); 1440 r = dm_register_target(&mirror_target);
1424 if (r < 0) { 1441 if (r < 0) {
1425 DMERR("Failed to register mirror target"); 1442 DMERR("Failed to register mirror target");
@@ -1429,12 +1446,15 @@ static int __init dm_mirror_init(void)
1429 return 0; 1446 return 0;
1430 1447
1431bad_target: 1448bad_target:
1449 kmem_cache_destroy(_dm_raid1_read_record_cache);
1450bad_cache:
1432 return r; 1451 return r;
1433} 1452}
1434 1453
1435static void __exit dm_mirror_exit(void) 1454static void __exit dm_mirror_exit(void)
1436{ 1455{
1437 dm_unregister_target(&mirror_target); 1456 dm_unregister_target(&mirror_target);
1457 kmem_cache_destroy(_dm_raid1_read_record_cache);
1438} 1458}
1439 1459
1440/* Module hooks */ 1460/* Module hooks */
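
The older (right-hand) mirror code above keeps read retry state in a dedicated slab cache plus mempool (MIN_READ_RECORDS entries) rather than in per-bio data, so a bounded number of records stays available even under memory pressure. A minimal user-space sketch of the reserve-pool idea; this models only the guaranteed free list, not the kernel mempool API, and all names are illustrative.

#include <stdlib.h>

struct read_record {
	int mirror;               /* which mirror served the read */
	struct read_record *next; /* free-list linkage */
};

struct record_pool {
	struct read_record *free;
};

static int pool_init(struct record_pool *p, unsigned nr)
{
	p->free = NULL;
	while (nr--) {
		struct read_record *r = malloc(sizeof(*r));
		if (!r)
			return -1;
		r->next = p->free;
		p->free = r;
	}
	return 0;
}

static struct read_record *pool_alloc(struct record_pool *p)
{
	struct read_record *r = p->free;
	if (r)
		p->free = r->next;
	return r;                 /* NULL once the reserve is exhausted */
}

static void pool_free(struct record_pool *p, struct read_record *r)
{
	r->next = p->free;
	p->free = r;
}

int main(void)
{
	struct record_pool pool;
	struct read_record *r;

	if (pool_init(&pool, 20))     /* like MIN_READ_RECORDS */
		return 1;
	r = pool_alloc(&pool);
	if (r) {
		r->mirror = 0;
		pool_free(&pool, r);  /* returned to the reserve for reuse */
	}
	return 0;
}
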
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 69732e03eb3..7771ed21218 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -404,9 +404,6 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
404 return; 404 return;
405 } 405 }
406 406
407 if (bio->bi_rw & REQ_DISCARD)
408 return;
409
410 /* We must inform the log that the sync count has changed. */ 407 /* We must inform the log that the sync count has changed. */
411 log->type->set_region_sync(log, region, 0); 408 log->type->set_region_sync(log, region, 0);
412 409
@@ -527,7 +524,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
527 struct bio *bio; 524 struct bio *bio;
528 525
529 for (bio = bios->head; bio; bio = bio->bi_next) { 526 for (bio = bios->head; bio; bio = bio->bi_next) {
530 if (bio->bi_rw & (REQ_FLUSH | REQ_DISCARD)) 527 if (bio->bi_rw & REQ_FLUSH)
531 continue; 528 continue;
532 rh_inc(rh, dm_rh_bio_to_region(rh, bio)); 529 rh_inc(rh, dm_rh_bio_to_region(rh, bio));
533 } 530 }
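
dm_rh_inc_pending() walks the queued bios and bumps a per-region pending count, skipping requests that carry no data payload; the newer left-hand code skips discards as well as flushes. A toy user-space model of that loop, assuming made-up flag bits and region arithmetic:

#include <stdio.h>

#define REQ_FLUSH    (1u << 0)  /* toy values, not the kernel's */
#define REQ_DISCARD  (1u << 1)
#define REGION_SHIFT 3          /* 8 sectors per region in this toy */

struct bio { unsigned long rw; unsigned long sector; struct bio *next; };

static unsigned pending[16];

static void inc_pending(struct bio *head, unsigned skip_mask)
{
	for (struct bio *b = head; b; b = b->next) {
		if (b->rw & skip_mask)
			continue;
		pending[b->sector >> REGION_SHIFT]++;
	}
}

int main(void)
{
	struct bio flush = { REQ_FLUSH, 0, NULL };
	struct bio write = { 0, 17, &flush };

	/* Newer code skips flushes and discards; older code skips only flushes. */
	inc_pending(&write, REQ_FLUSH | REQ_DISCARD);
	printf("region 2 pending: %u\n", pending[17 >> REGION_SHIFT]); /* 1 */
	return 0;
}
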
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index 6ab1192cdd5..24752f449be 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -14,7 +14,6 @@
14#include "dm-path-selector.h" 14#include "dm-path-selector.h"
15 15
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/module.h>
18 17
19#define DM_MSG_PREFIX "multipath round-robin" 18#define DM_MSG_PREFIX "multipath round-robin"
20 19
@@ -114,7 +113,6 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path,
114 struct selector *s = (struct selector *) ps->context; 113 struct selector *s = (struct selector *) ps->context;
115 struct path_info *pi; 114 struct path_info *pi;
116 unsigned repeat_count = RR_MIN_IO; 115 unsigned repeat_count = RR_MIN_IO;
117 char dummy;
118 116
119 if (argc > 1) { 117 if (argc > 1) {
120 *error = "round-robin ps: incorrect number of arguments"; 118 *error = "round-robin ps: incorrect number of arguments";
@@ -122,7 +120,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path,
122 } 120 }
123 121
124 /* First path argument is number of I/Os before switching path */ 122 /* First path argument is number of I/Os before switching path */
125 if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { 123 if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
126 *error = "round-robin ps: invalid repeat count"; 124 *error = "round-robin ps: invalid repeat count";
127 return -EINVAL; 125 return -EINVAL;
128 } 126 }
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
index 9df8f6bd641..9c6c2e47ad6 100644
--- a/drivers/md/dm-service-time.c
+++ b/drivers/md/dm-service-time.c
@@ -12,7 +12,6 @@
12#include "dm-path-selector.h" 12#include "dm-path-selector.h"
13 13
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/module.h>
16 15
17#define DM_MSG_PREFIX "multipath service-time" 16#define DM_MSG_PREFIX "multipath service-time"
18#define ST_MIN_IO 1 17#define ST_MIN_IO 1
@@ -110,7 +109,6 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path,
110 struct path_info *pi; 109 struct path_info *pi;
111 unsigned repeat_count = ST_MIN_IO; 110 unsigned repeat_count = ST_MIN_IO;
112 unsigned relative_throughput = 1; 111 unsigned relative_throughput = 1;
113 char dummy;
114 112
115 /* 113 /*
116 * Arguments: [<repeat_count> [<relative_throughput>]] 114 * Arguments: [<repeat_count> [<relative_throughput>]]
@@ -129,13 +127,13 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path,
129 return -EINVAL; 127 return -EINVAL;
130 } 128 }
131 129
132 if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { 130 if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
133 *error = "service-time ps: invalid repeat count"; 131 *error = "service-time ps: invalid repeat count";
134 return -EINVAL; 132 return -EINVAL;
135 } 133 }
136 134
137 if ((argc == 2) && 135 if ((argc == 2) &&
138 (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 || 136 (sscanf(argv[1], "%u", &relative_throughput) != 1 ||
139 relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { 137 relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
140 *error = "service-time ps: invalid relative_throughput value"; 138 *error = "service-time ps: invalid relative_throughput value";
141 return -EINVAL; 139 return -EINVAL;
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 3ac415675b6..d1f1d701710 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -10,7 +10,6 @@
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/vmalloc.h> 12#include <linux/vmalloc.h>
13#include <linux/export.h>
14#include <linux/slab.h> 13#include <linux/slab.h>
15#include <linux/dm-io.h> 14#include <linux/dm-io.h>
16 15
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c
index 1ce9a2586e4..a0898a66a2f 100644
--- a/drivers/md/dm-snap-transient.c
+++ b/drivers/md/dm-snap-transient.c
@@ -10,7 +10,6 @@
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/vmalloc.h> 12#include <linux/vmalloc.h>
13#include <linux/export.h>
14#include <linux/slab.h> 13#include <linux/slab.h>
15#include <linux/dm-io.h> 14#include <linux/dm-io.h>
16 15
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 59fc18ae52c..6f758870fc1 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -79,6 +79,7 @@ struct dm_snapshot {
79 79
80 /* Chunks with outstanding reads */ 80 /* Chunks with outstanding reads */
81 spinlock_t tracked_chunk_lock; 81 spinlock_t tracked_chunk_lock;
82 mempool_t *tracked_chunk_pool;
82 struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; 83 struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
83 84
84 /* The on disk metadata handler */ 85 /* The on disk metadata handler */
@@ -190,38 +191,35 @@ struct dm_snap_tracked_chunk {
190 chunk_t chunk; 191 chunk_t chunk;
191}; 192};
192 193
193static void init_tracked_chunk(struct bio *bio) 194static struct kmem_cache *tracked_chunk_cache;
194{
195 struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
196 INIT_HLIST_NODE(&c->node);
197}
198
199static bool is_bio_tracked(struct bio *bio)
200{
201 struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
202 return !hlist_unhashed(&c->node);
203}
204 195
205static void track_chunk(struct dm_snapshot *s, struct bio *bio, chunk_t chunk) 196static struct dm_snap_tracked_chunk *track_chunk(struct dm_snapshot *s,
197 chunk_t chunk)
206{ 198{
207 struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk)); 199 struct dm_snap_tracked_chunk *c = mempool_alloc(s->tracked_chunk_pool,
200 GFP_NOIO);
201 unsigned long flags;
208 202
209 c->chunk = chunk; 203 c->chunk = chunk;
210 204
211 spin_lock_irq(&s->tracked_chunk_lock); 205 spin_lock_irqsave(&s->tracked_chunk_lock, flags);
212 hlist_add_head(&c->node, 206 hlist_add_head(&c->node,
213 &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]); 207 &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]);
214 spin_unlock_irq(&s->tracked_chunk_lock); 208 spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);
209
210 return c;
215} 211}
216 212
217static void stop_tracking_chunk(struct dm_snapshot *s, struct bio *bio) 213static void stop_tracking_chunk(struct dm_snapshot *s,
214 struct dm_snap_tracked_chunk *c)
218{ 215{
219 struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
220 unsigned long flags; 216 unsigned long flags;
221 217
222 spin_lock_irqsave(&s->tracked_chunk_lock, flags); 218 spin_lock_irqsave(&s->tracked_chunk_lock, flags);
223 hlist_del(&c->node); 219 hlist_del(&c->node);
224 spin_unlock_irqrestore(&s->tracked_chunk_lock, flags); 220 spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);
221
222 mempool_free(c, s->tracked_chunk_pool);
225} 223}
226 224
227static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk) 225static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk)
@@ -693,7 +691,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
693 * Return a minimum chunk size of all snapshots that have the specified origin. 691 * Return a minimum chunk size of all snapshots that have the specified origin.
694 * Return zero if the origin has no snapshots. 692 * Return zero if the origin has no snapshots.
695 */ 693 */
696static uint32_t __minimum_chunk_size(struct origin *o) 694static sector_t __minimum_chunk_size(struct origin *o)
697{ 695{
698 struct dm_snapshot *snap; 696 struct dm_snapshot *snap;
699 unsigned chunk_size = 0; 697 unsigned chunk_size = 0;
@@ -703,7 +701,7 @@ static uint32_t __minimum_chunk_size(struct origin *o)
703 chunk_size = min_not_zero(chunk_size, 701 chunk_size = min_not_zero(chunk_size,
704 snap->store->chunk_size); 702 snap->store->chunk_size);
705 703
706 return (uint32_t) chunk_size; 704 return chunk_size;
707} 705}
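__minimum_chunk_size() above folds the chunk sizes of all snapshots on an origin together with min_not_zero(), so an unset (zero) chunk size never wins and zero is returned only when the origin has no snapshots at all. A small standalone sketch of that helper's semantics (a local re-implementation for illustration, not the kernel macro):

#include <stdio.h>

/* Smaller of a and b, except that 0 means "no value" and is ignored. */
static unsigned int min_not_zero(unsigned int a, unsigned int b)
{
    if (a == 0)
        return b;
    if (b == 0)
        return a;
    return a < b ? a : b;
}

int main(void)
{
    unsigned int chunk_sizes[] = { 0, 16, 0, 8, 64 };  /* sectors; 0 = unset */
    unsigned int min = 0;

    for (unsigned int i = 0; i < sizeof(chunk_sizes) / sizeof(chunk_sizes[0]); i++)
        min = min_not_zero(min, chunk_sizes[i]);

    printf("minimum chunk size: %u\n", min);  /* prints 8 */
    return 0;
}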
708 706
709/* 707/*
@@ -1122,6 +1120,14 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1122 goto bad_pending_pool; 1120 goto bad_pending_pool;
1123 } 1121 }
1124 1122
1123 s->tracked_chunk_pool = mempool_create_slab_pool(MIN_IOS,
1124 tracked_chunk_cache);
1125 if (!s->tracked_chunk_pool) {
1126 ti->error = "Could not allocate tracked_chunk mempool for "
1127 "tracking reads";
1128 goto bad_tracked_chunk_pool;
1129 }
1130
1125 for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++) 1131 for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
1126 INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]); 1132 INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]);
1127 1133
@@ -1129,7 +1135,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1129 1135
1130 ti->private = s; 1136 ti->private = s;
1131 ti->num_flush_requests = num_flush_requests; 1137 ti->num_flush_requests = num_flush_requests;
1132 ti->per_bio_data_size = sizeof(struct dm_snap_tracked_chunk);
1133 1138
1134 /* Add snapshot to the list of snapshots for this origin */ 1139 /* Add snapshot to the list of snapshots for this origin */
1135 /* Exceptions aren't triggered till snapshot_resume() is called */ 1140 /* Exceptions aren't triggered till snapshot_resume() is called */
@@ -1167,10 +1172,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1167 ti->error = "Chunk size not set"; 1172 ti->error = "Chunk size not set";
1168 goto bad_read_metadata; 1173 goto bad_read_metadata;
1169 } 1174 }
1170 1175 ti->split_io = s->store->chunk_size;
1171 r = dm_set_target_max_io_len(ti, s->store->chunk_size);
1172 if (r)
1173 goto bad_read_metadata;
1174 1176
1175 return 0; 1177 return 0;
1176 1178
@@ -1178,6 +1180,9 @@ bad_read_metadata:
1178 unregister_snapshot(s); 1180 unregister_snapshot(s);
1179 1181
1180bad_load_and_register: 1182bad_load_and_register:
1183 mempool_destroy(s->tracked_chunk_pool);
1184
1185bad_tracked_chunk_pool:
1181 mempool_destroy(s->pending_pool); 1186 mempool_destroy(s->pending_pool);
1182 1187
1183bad_pending_pool: 1188bad_pending_pool:
@@ -1234,7 +1239,7 @@ static void __handover_exceptions(struct dm_snapshot *snap_src,
1234 snap_dest->store->snap = snap_dest; 1239 snap_dest->store->snap = snap_dest;
1235 snap_src->store->snap = snap_src; 1240 snap_src->store->snap = snap_src;
1236 1241
1237 snap_dest->ti->max_io_len = snap_dest->store->chunk_size; 1242 snap_dest->ti->split_io = snap_dest->store->chunk_size;
1238 snap_dest->valid = snap_src->valid; 1243 snap_dest->valid = snap_src->valid;
1239 1244
1240 /* 1245 /*
@@ -1282,6 +1287,8 @@ static void snapshot_dtr(struct dm_target *ti)
1282 BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i])); 1287 BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i]));
1283#endif 1288#endif
1284 1289
1290 mempool_destroy(s->tracked_chunk_pool);
1291
1285 __free_exceptions(s); 1292 __free_exceptions(s);
1286 1293
1287 mempool_destroy(s->pending_pool); 1294 mempool_destroy(s->pending_pool);
@@ -1567,7 +1574,8 @@ static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
1567 s->store->chunk_mask); 1574 s->store->chunk_mask);
1568} 1575}
1569 1576
1570static int snapshot_map(struct dm_target *ti, struct bio *bio) 1577static int snapshot_map(struct dm_target *ti, struct bio *bio,
1578 union map_info *map_context)
1571{ 1579{
1572 struct dm_exception *e; 1580 struct dm_exception *e;
1573 struct dm_snapshot *s = ti->private; 1581 struct dm_snapshot *s = ti->private;
@@ -1575,8 +1583,6 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
1575 chunk_t chunk; 1583 chunk_t chunk;
1576 struct dm_snap_pending_exception *pe = NULL; 1584 struct dm_snap_pending_exception *pe = NULL;
1577 1585
1578 init_tracked_chunk(bio);
1579
1580 if (bio->bi_rw & REQ_FLUSH) { 1586 if (bio->bi_rw & REQ_FLUSH) {
1581 bio->bi_bdev = s->cow->bdev; 1587 bio->bi_bdev = s->cow->bdev;
1582 return DM_MAPIO_REMAPPED; 1588 return DM_MAPIO_REMAPPED;
@@ -1661,7 +1667,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
1661 } 1667 }
1662 } else { 1668 } else {
1663 bio->bi_bdev = s->origin->bdev; 1669 bio->bi_bdev = s->origin->bdev;
1664 track_chunk(s, bio, chunk); 1670 map_context->ptr = track_chunk(s, chunk);
1665 } 1671 }
1666 1672
1667out_unlock: 1673out_unlock:
@@ -1682,20 +1688,20 @@ out:
1682 * If merging is currently taking place on the chunk in question, the 1688 * If merging is currently taking place on the chunk in question, the
1683 * I/O is deferred by adding it to s->bios_queued_during_merge. 1689 * I/O is deferred by adding it to s->bios_queued_during_merge.
1684 */ 1690 */
1685static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) 1691static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
1692 union map_info *map_context)
1686{ 1693{
1687 struct dm_exception *e; 1694 struct dm_exception *e;
1688 struct dm_snapshot *s = ti->private; 1695 struct dm_snapshot *s = ti->private;
1689 int r = DM_MAPIO_REMAPPED; 1696 int r = DM_MAPIO_REMAPPED;
1690 chunk_t chunk; 1697 chunk_t chunk;
1691 1698
1692 init_tracked_chunk(bio);
1693
1694 if (bio->bi_rw & REQ_FLUSH) { 1699 if (bio->bi_rw & REQ_FLUSH) {
1695 if (!dm_bio_get_target_request_nr(bio)) 1700 if (!map_context->target_request_nr)
1696 bio->bi_bdev = s->origin->bdev; 1701 bio->bi_bdev = s->origin->bdev;
1697 else 1702 else
1698 bio->bi_bdev = s->cow->bdev; 1703 bio->bi_bdev = s->cow->bdev;
1704 map_context->ptr = NULL;
1699 return DM_MAPIO_REMAPPED; 1705 return DM_MAPIO_REMAPPED;
1700 } 1706 }
1701 1707
@@ -1724,7 +1730,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
1724 remap_exception(s, e, bio, chunk); 1730 remap_exception(s, e, bio, chunk);
1725 1731
1726 if (bio_rw(bio) == WRITE) 1732 if (bio_rw(bio) == WRITE)
1727 track_chunk(s, bio, chunk); 1733 map_context->ptr = track_chunk(s, chunk);
1728 goto out_unlock; 1734 goto out_unlock;
1729 } 1735 }
1730 1736
@@ -1742,12 +1748,14 @@ out_unlock:
1742 return r; 1748 return r;
1743} 1749}
1744 1750
1745static int snapshot_end_io(struct dm_target *ti, struct bio *bio, int error) 1751static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
1752 int error, union map_info *map_context)
1746{ 1753{
1747 struct dm_snapshot *s = ti->private; 1754 struct dm_snapshot *s = ti->private;
1755 struct dm_snap_tracked_chunk *c = map_context->ptr;
1748 1756
1749 if (is_bio_tracked(bio)) 1757 if (c)
1750 stop_tracking_chunk(s, bio); 1758 stop_tracking_chunk(s, c);
1751 1759
1752 return 0; 1760 return 0;
1753} 1761}
@@ -1809,9 +1817,9 @@ static void snapshot_resume(struct dm_target *ti)
1809 up_write(&s->lock); 1817 up_write(&s->lock);
1810} 1818}
1811 1819
1812static uint32_t get_origin_minimum_chunksize(struct block_device *bdev) 1820static sector_t get_origin_minimum_chunksize(struct block_device *bdev)
1813{ 1821{
1814 uint32_t min_chunksize; 1822 sector_t min_chunksize;
1815 1823
1816 down_read(&_origins_lock); 1824 down_read(&_origins_lock);
1817 min_chunksize = __minimum_chunk_size(__lookup_origin(bdev)); 1825 min_chunksize = __minimum_chunk_size(__lookup_origin(bdev));
@@ -1830,15 +1838,15 @@ static void snapshot_merge_resume(struct dm_target *ti)
1830 snapshot_resume(ti); 1838 snapshot_resume(ti);
1831 1839
1832 /* 1840 /*
1833 * snapshot-merge acts as an origin, so set ti->max_io_len 1841 * snapshot-merge acts as an origin, so set ti->split_io
1834 */ 1842 */
1835 ti->max_io_len = get_origin_minimum_chunksize(s->origin->bdev); 1843 ti->split_io = get_origin_minimum_chunksize(s->origin->bdev);
1836 1844
1837 start_merge(s); 1845 start_merge(s);
1838} 1846}
1839 1847
1840static int snapshot_status(struct dm_target *ti, status_type_t type, 1848static int snapshot_status(struct dm_target *ti, status_type_t type,
1841 unsigned status_flags, char *result, unsigned maxlen) 1849 char *result, unsigned int maxlen)
1842{ 1850{
1843 unsigned sz = 0; 1851 unsigned sz = 0;
1844 struct dm_snapshot *snap = ti->private; 1852 struct dm_snapshot *snap = ti->private;
@@ -2065,12 +2073,12 @@ static int origin_write_extent(struct dm_snapshot *merging_snap,
2065 struct origin *o; 2073 struct origin *o;
2066 2074
2067 /* 2075 /*
2068 * The origin's __minimum_chunk_size() got stored in max_io_len 2076 * The origin's __minimum_chunk_size() got stored in split_io
2069 * by snapshot_merge_resume(). 2077 * by snapshot_merge_resume().
2070 */ 2078 */
2071 down_read(&_origins_lock); 2079 down_read(&_origins_lock);
2072 o = __lookup_origin(merging_snap->origin->bdev); 2080 o = __lookup_origin(merging_snap->origin->bdev);
2073 for (n = 0; n < size; n += merging_snap->ti->max_io_len) 2081 for (n = 0; n < size; n += merging_snap->ti->split_io)
2074 if (__origin_write(&o->snapshots, sector + n, NULL) == 2082 if (__origin_write(&o->snapshots, sector + n, NULL) ==
2075 DM_MAPIO_SUBMITTED) 2083 DM_MAPIO_SUBMITTED)
2076 must_wait = 1; 2084 must_wait = 1;
@@ -2116,7 +2124,8 @@ static void origin_dtr(struct dm_target *ti)
2116 dm_put_device(ti, dev); 2124 dm_put_device(ti, dev);
2117} 2125}
2118 2126
2119static int origin_map(struct dm_target *ti, struct bio *bio) 2127static int origin_map(struct dm_target *ti, struct bio *bio,
2128 union map_info *map_context)
2120{ 2129{
2121 struct dm_dev *dev = ti->private; 2130 struct dm_dev *dev = ti->private;
2122 bio->bi_bdev = dev->bdev; 2131 bio->bi_bdev = dev->bdev;
@@ -2129,18 +2138,18 @@ static int origin_map(struct dm_target *ti, struct bio *bio)
2129} 2138}
2130 2139
2131/* 2140/*
2132 * Set the target "max_io_len" field to the minimum of all the snapshots' 2141 * Set the target "split_io" field to the minimum of all the snapshots'
2133 * chunk sizes. 2142 * chunk sizes.
2134 */ 2143 */
2135static void origin_resume(struct dm_target *ti) 2144static void origin_resume(struct dm_target *ti)
2136{ 2145{
2137 struct dm_dev *dev = ti->private; 2146 struct dm_dev *dev = ti->private;
2138 2147
2139 ti->max_io_len = get_origin_minimum_chunksize(dev->bdev); 2148 ti->split_io = get_origin_minimum_chunksize(dev->bdev);
2140} 2149}
2141 2150
2142static int origin_status(struct dm_target *ti, status_type_t type, 2151static int origin_status(struct dm_target *ti, status_type_t type, char *result,
2143 unsigned status_flags, char *result, unsigned maxlen) 2152 unsigned int maxlen)
2144{ 2153{
2145 struct dm_dev *dev = ti->private; 2154 struct dm_dev *dev = ti->private;
2146 2155
@@ -2167,6 +2176,7 @@ static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2167 return max_size; 2176 return max_size;
2168 2177
2169 bvm->bi_bdev = dev->bdev; 2178 bvm->bi_bdev = dev->bdev;
2179 bvm->bi_sector = bvm->bi_sector;
2170 2180
2171 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 2181 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2172} 2182}
@@ -2181,7 +2191,7 @@ static int origin_iterate_devices(struct dm_target *ti,
2181 2191
2182static struct target_type origin_target = { 2192static struct target_type origin_target = {
2183 .name = "snapshot-origin", 2193 .name = "snapshot-origin",
2184 .version = {1, 8, 0}, 2194 .version = {1, 7, 1},
2185 .module = THIS_MODULE, 2195 .module = THIS_MODULE,
2186 .ctr = origin_ctr, 2196 .ctr = origin_ctr,
2187 .dtr = origin_dtr, 2197 .dtr = origin_dtr,
@@ -2194,7 +2204,7 @@ static struct target_type origin_target = {
2194 2204
2195static struct target_type snapshot_target = { 2205static struct target_type snapshot_target = {
2196 .name = "snapshot", 2206 .name = "snapshot",
2197 .version = {1, 11, 0}, 2207 .version = {1, 10, 0},
2198 .module = THIS_MODULE, 2208 .module = THIS_MODULE,
2199 .ctr = snapshot_ctr, 2209 .ctr = snapshot_ctr,
2200 .dtr = snapshot_dtr, 2210 .dtr = snapshot_dtr,
@@ -2208,7 +2218,7 @@ static struct target_type snapshot_target = {
2208 2218
2209static struct target_type merge_target = { 2219static struct target_type merge_target = {
2210 .name = dm_snapshot_merge_target_name, 2220 .name = dm_snapshot_merge_target_name,
2211 .version = {1, 2, 0}, 2221 .version = {1, 1, 0},
2212 .module = THIS_MODULE, 2222 .module = THIS_MODULE,
2213 .ctr = snapshot_ctr, 2223 .ctr = snapshot_ctr,
2214 .dtr = snapshot_dtr, 2224 .dtr = snapshot_dtr,
@@ -2269,8 +2279,17 @@ static int __init dm_snapshot_init(void)
2269 goto bad_pending_cache; 2279 goto bad_pending_cache;
2270 } 2280 }
2271 2281
2282 tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0);
2283 if (!tracked_chunk_cache) {
2284 DMERR("Couldn't create cache to track chunks in use.");
2285 r = -ENOMEM;
2286 goto bad_tracked_chunk_cache;
2287 }
2288
2272 return 0; 2289 return 0;
2273 2290
2291bad_tracked_chunk_cache:
2292 kmem_cache_destroy(pending_cache);
2274bad_pending_cache: 2293bad_pending_cache:
2275 kmem_cache_destroy(exception_cache); 2294 kmem_cache_destroy(exception_cache);
2276bad_exception_cache: 2295bad_exception_cache:
@@ -2296,6 +2315,7 @@ static void __exit dm_snapshot_exit(void)
2296 exit_origin_hash(); 2315 exit_origin_hash();
2297 kmem_cache_destroy(pending_cache); 2316 kmem_cache_destroy(pending_cache);
2298 kmem_cache_destroy(exception_cache); 2317 kmem_cache_destroy(exception_cache);
2318 kmem_cache_destroy(tracked_chunk_cache);
2299 2319
2300 dm_exception_store_exit(); 2320 dm_exception_store_exit();
2301} 2321}
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index c89cde86d40..3d80cf0c152 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -26,12 +26,14 @@ struct stripe {
26struct stripe_c { 26struct stripe_c {
27 uint32_t stripes; 27 uint32_t stripes;
28 int stripes_shift; 28 int stripes_shift;
29 sector_t stripes_mask;
29 30
30 /* The size of this target / num. stripes */ 31 /* The size of this target / num. stripes */
31 sector_t stripe_width; 32 sector_t stripe_width;
32 33
33 uint32_t chunk_size; 34 /* stripe chunk size */
34 int chunk_size_shift; 35 uint32_t chunk_shift;
36 sector_t chunk_mask;
35 37
36 /* Needed for handling events */ 38 /* Needed for handling events */
37 struct dm_target *ti; 39 struct dm_target *ti;
@@ -73,9 +75,8 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
73 unsigned int stripe, char **argv) 75 unsigned int stripe, char **argv)
74{ 76{
75 unsigned long long start; 77 unsigned long long start;
76 char dummy;
77 78
78 if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1) 79 if (sscanf(argv[1], "%llu", &start) != 1)
79 return -EINVAL; 80 return -EINVAL;
80 81
81 if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), 82 if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
@@ -89,7 +90,7 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
89 90
90/* 91/*
91 * Construct a striped mapping. 92 * Construct a striped mapping.
92 * <number of stripes> <chunk size> [<dev_path> <offset>]+ 93 * <number of stripes> <chunk size (2^^n)> [<dev_path> <offset>]+
93 */ 94 */
94static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) 95static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
95{ 96{
@@ -97,6 +98,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
97 sector_t width; 98 sector_t width;
98 uint32_t stripes; 99 uint32_t stripes;
99 uint32_t chunk_size; 100 uint32_t chunk_size;
101 char *end;
100 int r; 102 int r;
101 unsigned int i; 103 unsigned int i;
102 104
@@ -105,23 +107,34 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
105 return -EINVAL; 107 return -EINVAL;
106 } 108 }
107 109
108 if (kstrtouint(argv[0], 10, &stripes) || !stripes) { 110 stripes = simple_strtoul(argv[0], &end, 10);
111 if (!stripes || *end) {
109 ti->error = "Invalid stripe count"; 112 ti->error = "Invalid stripe count";
110 return -EINVAL; 113 return -EINVAL;
111 } 114 }
112 115
113 if (kstrtouint(argv[1], 10, &chunk_size) || !chunk_size) { 116 chunk_size = simple_strtoul(argv[1], &end, 10);
117 if (*end) {
114 ti->error = "Invalid chunk_size"; 118 ti->error = "Invalid chunk_size";
115 return -EINVAL; 119 return -EINVAL;
116 } 120 }
117 121
118 width = ti->len; 122 /*
119 if (sector_div(width, chunk_size)) { 123 * chunk_size is a power of two
124 */
125 if (!is_power_of_2(chunk_size) ||
126 (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) {
127 ti->error = "Invalid chunk size";
128 return -EINVAL;
129 }
130
131 if (ti->len & (chunk_size - 1)) {
120 ti->error = "Target length not divisible by " 132 ti->error = "Target length not divisible by "
121 "chunk size"; 133 "chunk size";
122 return -EINVAL; 134 return -EINVAL;
123 } 135 }
124 136
137 width = ti->len;
125 if (sector_div(width, stripes)) { 138 if (sector_div(width, stripes)) {
126 ti->error = "Target length not divisible by " 139 ti->error = "Target length not divisible by "
127 "number of stripes"; 140 "number of stripes";
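With this patch the constructor above only accepts power-of-two chunk sizes of at least one page, which is what lets the following hunk replace division with ffs()-derived shift and mask values. A standalone sketch of that validation and derivation, assuming PAGE_SIZE of 4096 and SECTOR_SHIFT of 9 (names and values are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <strings.h>        /* ffs() */

#define PAGE_SIZE    4096u
#define SECTOR_SHIFT 9

static int is_power_of_2(uint32_t n)
{
    return n != 0 && (n & (n - 1)) == 0;
}

int main(void)
{
    uint32_t chunk_size = 64;    /* sectors, i.e. 32 KiB */

    if (!is_power_of_2(chunk_size) ||
        chunk_size < (PAGE_SIZE >> SECTOR_SHIFT)) {
        fprintf(stderr, "Invalid chunk size\n");
        return 1;
    }

    int      chunk_shift = ffs(chunk_size) - 1;       /* log2(chunk_size) = 6 */
    uint64_t chunk_mask  = (uint64_t)chunk_size - 1;  /* offset bits within a chunk */

    printf("chunk_shift=%d chunk_mask=%llu\n",
           chunk_shift, (unsigned long long)chunk_mask);
    return 0;
}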
@@ -153,22 +166,17 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
153 166
154 if (stripes & (stripes - 1)) 167 if (stripes & (stripes - 1))
155 sc->stripes_shift = -1; 168 sc->stripes_shift = -1;
156 else 169 else {
157 sc->stripes_shift = __ffs(stripes); 170 sc->stripes_shift = ffs(stripes) - 1;
158 171 sc->stripes_mask = ((sector_t) stripes) - 1;
159 r = dm_set_target_max_io_len(ti, chunk_size); 172 }
160 if (r)
161 return r;
162 173
174 ti->split_io = chunk_size;
163 ti->num_flush_requests = stripes; 175 ti->num_flush_requests = stripes;
164 ti->num_discard_requests = stripes; 176 ti->num_discard_requests = stripes;
165 ti->num_write_same_requests = stripes;
166 177
167 sc->chunk_size = chunk_size; 178 sc->chunk_shift = ffs(chunk_size) - 1;
168 if (chunk_size & (chunk_size - 1)) 179 sc->chunk_mask = ((sector_t) chunk_size) - 1;
169 sc->chunk_size_shift = -1;
170 else
171 sc->chunk_size_shift = __ffs(chunk_size);
172 180
173 /* 181 /*
174 * Get the stripe destinations. 182 * Get the stripe destinations.
@@ -200,36 +208,24 @@ static void stripe_dtr(struct dm_target *ti)
200 for (i = 0; i < sc->stripes; i++) 208 for (i = 0; i < sc->stripes; i++)
201 dm_put_device(ti, sc->stripe[i].dev); 209 dm_put_device(ti, sc->stripe[i].dev);
202 210
203 flush_work(&sc->trigger_event); 211 flush_work_sync(&sc->trigger_event);
204 kfree(sc); 212 kfree(sc);
205} 213}
206 214
207static void stripe_map_sector(struct stripe_c *sc, sector_t sector, 215static void stripe_map_sector(struct stripe_c *sc, sector_t sector,
208 uint32_t *stripe, sector_t *result) 216 uint32_t *stripe, sector_t *result)
209{ 217{
210 sector_t chunk = dm_target_offset(sc->ti, sector); 218 sector_t offset = dm_target_offset(sc->ti, sector);
211 sector_t chunk_offset; 219 sector_t chunk = offset >> sc->chunk_shift;
212
213 if (sc->chunk_size_shift < 0)
214 chunk_offset = sector_div(chunk, sc->chunk_size);
215 else {
216 chunk_offset = chunk & (sc->chunk_size - 1);
217 chunk >>= sc->chunk_size_shift;
218 }
219 220
220 if (sc->stripes_shift < 0) 221 if (sc->stripes_shift < 0)
221 *stripe = sector_div(chunk, sc->stripes); 222 *stripe = sector_div(chunk, sc->stripes);
222 else { 223 else {
223 *stripe = chunk & (sc->stripes - 1); 224 *stripe = chunk & sc->stripes_mask;
224 chunk >>= sc->stripes_shift; 225 chunk >>= sc->stripes_shift;
225 } 226 }
226 227
227 if (sc->chunk_size_shift < 0) 228 *result = (chunk << sc->chunk_shift) | (offset & sc->chunk_mask);
228 chunk *= sc->chunk_size;
229 else
230 chunk <<= sc->chunk_size_shift;
231
232 *result = chunk + chunk_offset;
233} 229}
234 230
235static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector, 231static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector,
@@ -240,20 +236,13 @@ static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector,
240 stripe_map_sector(sc, sector, &stripe, result); 236 stripe_map_sector(sc, sector, &stripe, result);
241 if (stripe == target_stripe) 237 if (stripe == target_stripe)
242 return; 238 return;
243 239 *result &= ~sc->chunk_mask; /* round down */
244 /* round down */
245 sector = *result;
246 if (sc->chunk_size_shift < 0)
247 *result -= sector_div(sector, sc->chunk_size);
248 else
249 *result = sector & ~(sector_t)(sc->chunk_size - 1);
250
251 if (target_stripe < stripe) 240 if (target_stripe < stripe)
252 *result += sc->chunk_size; /* next chunk */ 241 *result += sc->chunk_mask + 1; /* next chunk */
253} 242}
254 243
255static int stripe_map_range(struct stripe_c *sc, struct bio *bio, 244static int stripe_map_discard(struct stripe_c *sc, struct bio *bio,
256 uint32_t target_stripe) 245 uint32_t target_stripe)
257{ 246{
258 sector_t begin, end; 247 sector_t begin, end;
259 248
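With both the chunk size and, optionally, the stripe count being powers of two, stripe_map_sector() above reduces to shifts and masks: the chunk index modulo the stripe count selects the device, and the quotient combined with the offset within the chunk gives the sector on that device. A worked standalone example of that arithmetic (the values are made up for illustration):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t chunk_size    = 64;  /* sectors per chunk (power of 2) */
    const int      chunk_shift   = 6;   /* log2(chunk_size) */
    const uint64_t chunk_mask    = chunk_size - 1;
    const uint64_t stripes       = 4;   /* number of stripes (power of 2) */
    const int      stripes_shift = 2;   /* log2(stripes) */
    const uint64_t stripes_mask  = stripes - 1;

    uint64_t offset = 1000;                  /* sector offset into the target */
    uint64_t chunk  = offset >> chunk_shift; /* chunk index: 15 */
    uint64_t stripe = chunk & stripes_mask;  /* device number: 15 % 4 = 3 */
    uint64_t result = ((chunk >> stripes_shift) << chunk_shift)
                    | (offset & chunk_mask); /* (3 << 6) | 40 = 232 */

    printf("offset %llu -> stripe %llu, device sector %llu\n",
           (unsigned long long)offset,
           (unsigned long long)stripe,
           (unsigned long long)result);
    return 0;
}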
@@ -272,23 +261,23 @@ static int stripe_map_range(struct stripe_c *sc, struct bio *bio,
272 } 261 }
273} 262}
274 263
275static int stripe_map(struct dm_target *ti, struct bio *bio) 264static int stripe_map(struct dm_target *ti, struct bio *bio,
265 union map_info *map_context)
276{ 266{
277 struct stripe_c *sc = ti->private; 267 struct stripe_c *sc = ti->private;
278 uint32_t stripe; 268 uint32_t stripe;
279 unsigned target_request_nr; 269 unsigned target_request_nr;
280 270
281 if (bio->bi_rw & REQ_FLUSH) { 271 if (bio->bi_rw & REQ_FLUSH) {
282 target_request_nr = dm_bio_get_target_request_nr(bio); 272 target_request_nr = map_context->target_request_nr;
283 BUG_ON(target_request_nr >= sc->stripes); 273 BUG_ON(target_request_nr >= sc->stripes);
284 bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev; 274 bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev;
285 return DM_MAPIO_REMAPPED; 275 return DM_MAPIO_REMAPPED;
286 } 276 }
287 if (unlikely(bio->bi_rw & REQ_DISCARD) || 277 if (unlikely(bio->bi_rw & REQ_DISCARD)) {
288 unlikely(bio->bi_rw & REQ_WRITE_SAME)) { 278 target_request_nr = map_context->target_request_nr;
289 target_request_nr = dm_bio_get_target_request_nr(bio);
290 BUG_ON(target_request_nr >= sc->stripes); 279 BUG_ON(target_request_nr >= sc->stripes);
291 return stripe_map_range(sc, bio, target_request_nr); 280 return stripe_map_discard(sc, bio, target_request_nr);
292 } 281 }
293 282
294 stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector); 283 stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector);
@@ -312,8 +301,8 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
312 * 301 *
313 */ 302 */
314 303
315static int stripe_status(struct dm_target *ti, status_type_t type, 304static int stripe_status(struct dm_target *ti,
316 unsigned status_flags, char *result, unsigned maxlen) 305 status_type_t type, char *result, unsigned int maxlen)
317{ 306{
318 struct stripe_c *sc = (struct stripe_c *) ti->private; 307 struct stripe_c *sc = (struct stripe_c *) ti->private;
319 char buffer[sc->stripes + 1]; 308 char buffer[sc->stripes + 1];
@@ -334,7 +323,7 @@ static int stripe_status(struct dm_target *ti, status_type_t type,
334 323
335 case STATUSTYPE_TABLE: 324 case STATUSTYPE_TABLE:
336 DMEMIT("%d %llu", sc->stripes, 325 DMEMIT("%d %llu", sc->stripes,
337 (unsigned long long)sc->chunk_size); 326 (unsigned long long)sc->chunk_mask + 1);
338 for (i = 0; i < sc->stripes; i++) 327 for (i = 0; i < sc->stripes; i++)
339 DMEMIT(" %s %llu", sc->stripe[i].dev->name, 328 DMEMIT(" %s %llu", sc->stripe[i].dev->name,
340 (unsigned long long)sc->stripe[i].physical_start); 329 (unsigned long long)sc->stripe[i].physical_start);
@@ -343,7 +332,8 @@ static int stripe_status(struct dm_target *ti, status_type_t type,
343 return 0; 332 return 0;
344} 333}
345 334
346static int stripe_end_io(struct dm_target *ti, struct bio *bio, int error) 335static int stripe_end_io(struct dm_target *ti, struct bio *bio,
336 int error, union map_info *map_context)
347{ 337{
348 unsigned i; 338 unsigned i;
349 char major_minor[16]; 339 char major_minor[16];
@@ -400,7 +390,7 @@ static void stripe_io_hints(struct dm_target *ti,
400 struct queue_limits *limits) 390 struct queue_limits *limits)
401{ 391{
402 struct stripe_c *sc = ti->private; 392 struct stripe_c *sc = ti->private;
403 unsigned chunk_size = sc->chunk_size << SECTOR_SHIFT; 393 unsigned chunk_size = (sc->chunk_mask + 1) << 9;
404 394
405 blk_limits_io_min(limits, chunk_size); 395 blk_limits_io_min(limits, chunk_size);
406 blk_limits_io_opt(limits, chunk_size * sc->stripes); 396 blk_limits_io_opt(limits, chunk_size * sc->stripes);
@@ -428,7 +418,7 @@ static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
428 418
429static struct target_type stripe_target = { 419static struct target_type stripe_target = {
430 .name = "striped", 420 .name = "striped",
431 .version = {1, 5, 0}, 421 .version = {1, 4, 0},
432 .module = THIS_MODULE, 422 .module = THIS_MODULE,
433 .ctr = stripe_ctr, 423 .ctr = stripe_ctr,
434 .dtr = stripe_dtr, 424 .dtr = stripe_dtr,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index daf25d0890b..bc04518e9d8 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -54,9 +54,7 @@ struct dm_table {
54 sector_t *highs; 54 sector_t *highs;
55 struct dm_target *targets; 55 struct dm_target *targets;
56 56
57 struct target_type *immutable_target_type;
58 unsigned integrity_supported:1; 57 unsigned integrity_supported:1;
59 unsigned singleton:1;
60 58
61 /* 59 /*
62 * Indicates the rw permissions for the new logical 60 * Indicates the rw permissions for the new logical
@@ -268,7 +266,8 @@ void dm_table_destroy(struct dm_table *t)
268 vfree(t->highs); 266 vfree(t->highs);
269 267
270 /* free the device list */ 268 /* free the device list */
271 free_devices(&t->devices); 269 if (t->devices.next != &t->devices)
270 free_devices(&t->devices);
272 271
273 dm_free_md_mempools(t->mempools); 272 dm_free_md_mempools(t->mempools);
274 273
@@ -463,11 +462,10 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
463 struct dm_dev_internal *dd; 462 struct dm_dev_internal *dd;
464 unsigned int major, minor; 463 unsigned int major, minor;
465 struct dm_table *t = ti->table; 464 struct dm_table *t = ti->table;
466 char dummy;
467 465
468 BUG_ON(!t); 466 BUG_ON(!t);
469 467
470 if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) { 468 if (sscanf(path, "%u:%u", &major, &minor) == 2) {
471 /* Extract the major/minor numbers */ 469 /* Extract the major/minor numbers */
472 dev = MKDEV(major, minor); 470 dev = MKDEV(major, minor);
473 if (MAJOR(dev) != major || MINOR(dev) != minor) 471 if (MAJOR(dev) != major || MINOR(dev) != minor)
@@ -699,7 +697,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table,
699 while (i < dm_table_get_num_targets(table)) { 697 while (i < dm_table_get_num_targets(table)) {
700 ti = dm_table_get_target(table, i++); 698 ti = dm_table_get_target(table, i++);
701 699
702 blk_set_stacking_limits(&ti_limits); 700 blk_set_default_limits(&ti_limits);
703 701
704 /* combine all target devices' limits */ 702 /* combine all target devices' limits */
705 if (ti->type->iterate_devices) 703 if (ti->type->iterate_devices)
@@ -742,12 +740,6 @@ int dm_table_add_target(struct dm_table *t, const char *type,
742 char **argv; 740 char **argv;
743 struct dm_target *tgt; 741 struct dm_target *tgt;
744 742
745 if (t->singleton) {
746 DMERR("%s: target type %s must appear alone in table",
747 dm_device_name(t->md), t->targets->type->name);
748 return -EINVAL;
749 }
750
751 if ((r = check_space(t))) 743 if ((r = check_space(t)))
752 return r; 744 return r;
753 745
@@ -766,36 +758,6 @@ int dm_table_add_target(struct dm_table *t, const char *type,
766 return -EINVAL; 758 return -EINVAL;
767 } 759 }
768 760
769 if (dm_target_needs_singleton(tgt->type)) {
770 if (t->num_targets) {
771 DMERR("%s: target type %s must appear alone in table",
772 dm_device_name(t->md), type);
773 return -EINVAL;
774 }
775 t->singleton = 1;
776 }
777
778 if (dm_target_always_writeable(tgt->type) && !(t->mode & FMODE_WRITE)) {
779 DMERR("%s: target type %s may not be included in read-only tables",
780 dm_device_name(t->md), type);
781 return -EINVAL;
782 }
783
784 if (t->immutable_target_type) {
785 if (t->immutable_target_type != tgt->type) {
786 DMERR("%s: immutable target type %s cannot be mixed with other target types",
787 dm_device_name(t->md), t->immutable_target_type->name);
788 return -EINVAL;
789 }
790 } else if (dm_target_is_immutable(tgt->type)) {
791 if (t->num_targets) {
792 DMERR("%s: immutable target type %s cannot be mixed with other target types",
793 dm_device_name(t->md), tgt->type->name);
794 return -EINVAL;
795 }
796 t->immutable_target_type = tgt->type;
797 }
798
799 tgt->table = t; 761 tgt->table = t;
800 tgt->begin = start; 762 tgt->begin = start;
801 tgt->len = len; 763 tgt->len = len;
@@ -842,10 +804,9 @@ static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
842 unsigned *value, char **error, unsigned grouped) 804 unsigned *value, char **error, unsigned grouped)
843{ 805{
844 const char *arg_str = dm_shift_arg(arg_set); 806 const char *arg_str = dm_shift_arg(arg_set);
845 char dummy;
846 807
847 if (!arg_str || 808 if (!arg_str ||
848 (sscanf(arg_str, "%u%c", value, &dummy) != 1) || 809 (sscanf(arg_str, "%u", value) != 1) ||
849 (*value < arg->min) || 810 (*value < arg->min) ||
850 (*value > arg->max) || 811 (*value > arg->max) ||
851 (grouped && arg_set->argc < *value)) { 812 (grouped && arg_set->argc < *value)) {
@@ -954,11 +915,6 @@ unsigned dm_table_get_type(struct dm_table *t)
954 return t->type; 915 return t->type;
955} 916}
956 917
957struct target_type *dm_table_get_immutable_target_type(struct dm_table *t)
958{
959 return t->immutable_target_type;
960}
961
962bool dm_table_request_based(struct dm_table *t) 918bool dm_table_request_based(struct dm_table *t)
963{ 919{
964 return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED; 920 return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
@@ -967,22 +923,13 @@ bool dm_table_request_based(struct dm_table *t)
967int dm_table_alloc_md_mempools(struct dm_table *t) 923int dm_table_alloc_md_mempools(struct dm_table *t)
968{ 924{
969 unsigned type = dm_table_get_type(t); 925 unsigned type = dm_table_get_type(t);
970 unsigned per_bio_data_size = 0;
971 struct dm_target *tgt;
972 unsigned i;
973 926
974 if (unlikely(type == DM_TYPE_NONE)) { 927 if (unlikely(type == DM_TYPE_NONE)) {
975 DMWARN("no table type is set, can't allocate mempools"); 928 DMWARN("no table type is set, can't allocate mempools");
976 return -EINVAL; 929 return -EINVAL;
977 } 930 }
978 931
979 if (type == DM_TYPE_BIO_BASED) 932 t->mempools = dm_alloc_md_mempools(type, t->integrity_supported);
980 for (i = 0; i < t->num_targets; i++) {
981 tgt = t->targets + i;
982 per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size);
983 }
984
985 t->mempools = dm_alloc_md_mempools(type, t->integrity_supported, per_bio_data_size);
986 if (!t->mempools) 933 if (!t->mempools)
987 return -ENOMEM; 934 return -ENOMEM;
988 935
@@ -1221,41 +1168,6 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
1221 return &t->targets[(KEYS_PER_NODE * n) + k]; 1168 return &t->targets[(KEYS_PER_NODE * n) + k];
1222} 1169}
1223 1170
1224static int count_device(struct dm_target *ti, struct dm_dev *dev,
1225 sector_t start, sector_t len, void *data)
1226{
1227 unsigned *num_devices = data;
1228
1229 (*num_devices)++;
1230
1231 return 0;
1232}
1233
1234/*
1235 * Check whether a table has no data devices attached using each
1236 * target's iterate_devices method.
1237 * Returns false if the result is unknown because a target doesn't
1238 * support iterate_devices.
1239 */
1240bool dm_table_has_no_data_devices(struct dm_table *table)
1241{
1242 struct dm_target *uninitialized_var(ti);
1243 unsigned i = 0, num_devices = 0;
1244
1245 while (i < dm_table_get_num_targets(table)) {
1246 ti = dm_table_get_target(table, i++);
1247
1248 if (!ti->type->iterate_devices)
1249 return false;
1250
1251 ti->type->iterate_devices(ti, count_device, &num_devices);
1252 if (num_devices)
1253 return false;
1254 }
1255
1256 return true;
1257}
1258
1259/* 1171/*
1260 * Establish the new table's queue_limits and validate them. 1172 * Establish the new table's queue_limits and validate them.
1261 */ 1173 */
@@ -1266,10 +1178,10 @@ int dm_calculate_queue_limits(struct dm_table *table,
1266 struct queue_limits ti_limits; 1178 struct queue_limits ti_limits;
1267 unsigned i = 0; 1179 unsigned i = 0;
1268 1180
1269 blk_set_stacking_limits(limits); 1181 blk_set_default_limits(limits);
1270 1182
1271 while (i < dm_table_get_num_targets(table)) { 1183 while (i < dm_table_get_num_targets(table)) {
1272 blk_set_stacking_limits(&ti_limits); 1184 blk_set_default_limits(&ti_limits);
1273 1185
1274 ti = dm_table_get_target(table, i++); 1186 ti = dm_table_get_target(table, i++);
1275 1187
@@ -1363,9 +1275,6 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
1363 if (!ti->num_flush_requests) 1275 if (!ti->num_flush_requests)
1364 continue; 1276 continue;
1365 1277
1366 if (ti->flush_supported)
1367 return 1;
1368
1369 if (ti->type->iterate_devices && 1278 if (ti->type->iterate_devices &&
1370 ti->type->iterate_devices(ti, device_flush_capable, &flush)) 1279 ti->type->iterate_devices(ti, device_flush_capable, &flush))
1371 return 1; 1280 return 1;
@@ -1390,66 +1299,6 @@ static bool dm_table_discard_zeroes_data(struct dm_table *t)
1390 return 1; 1299 return 1;
1391} 1300}
1392 1301
1393static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
1394 sector_t start, sector_t len, void *data)
1395{
1396 struct request_queue *q = bdev_get_queue(dev->bdev);
1397
1398 return q && blk_queue_nonrot(q);
1399}
1400
1401static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
1402 sector_t start, sector_t len, void *data)
1403{
1404 struct request_queue *q = bdev_get_queue(dev->bdev);
1405
1406 return q && !blk_queue_add_random(q);
1407}
1408
1409static bool dm_table_all_devices_attribute(struct dm_table *t,
1410 iterate_devices_callout_fn func)
1411{
1412 struct dm_target *ti;
1413 unsigned i = 0;
1414
1415 while (i < dm_table_get_num_targets(t)) {
1416 ti = dm_table_get_target(t, i++);
1417
1418 if (!ti->type->iterate_devices ||
1419 !ti->type->iterate_devices(ti, func, NULL))
1420 return 0;
1421 }
1422
1423 return 1;
1424}
1425
1426static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev,
1427 sector_t start, sector_t len, void *data)
1428{
1429 struct request_queue *q = bdev_get_queue(dev->bdev);
1430
1431 return q && !q->limits.max_write_same_sectors;
1432}
1433
1434static bool dm_table_supports_write_same(struct dm_table *t)
1435{
1436 struct dm_target *ti;
1437 unsigned i = 0;
1438
1439 while (i < dm_table_get_num_targets(t)) {
1440 ti = dm_table_get_target(t, i++);
1441
1442 if (!ti->num_write_same_requests)
1443 return false;
1444
1445 if (!ti->type->iterate_devices ||
1446 !ti->type->iterate_devices(ti, device_not_write_same_capable, NULL))
1447 return false;
1448 }
1449
1450 return true;
1451}
1452
1453void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, 1302void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
1454 struct queue_limits *limits) 1303 struct queue_limits *limits)
1455{ 1304{
@@ -1475,27 +1324,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
1475 if (!dm_table_discard_zeroes_data(t)) 1324 if (!dm_table_discard_zeroes_data(t))
1476 q->limits.discard_zeroes_data = 0; 1325 q->limits.discard_zeroes_data = 0;
1477 1326
1478 /* Ensure that all underlying devices are non-rotational. */
1479 if (dm_table_all_devices_attribute(t, device_is_nonrot))
1480 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
1481 else
1482 queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q);
1483
1484 if (!dm_table_supports_write_same(t))
1485 q->limits.max_write_same_sectors = 0;
1486
1487 dm_table_set_integrity(t); 1327 dm_table_set_integrity(t);
1488 1328
1489 /* 1329 /*
1490 * Determine whether or not this queue's I/O timings contribute
1491 * to the entropy pool, Only request-based targets use this.
1492 * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not
1493 * have it set.
1494 */
1495 if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random))
1496 queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q);
1497
1498 /*
1499 * QUEUE_FLAG_STACKABLE must be set after all queue settings are 1330 * QUEUE_FLAG_STACKABLE must be set after all queue settings are
1500 * visible to other CPUs because, once the flag is set, incoming bios 1331 * visible to other CPUs because, once the flag is set, incoming bios
1501 * are processed by request-based dm, which refers to the queue 1332 * are processed by request-based dm, which refers to the queue
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 617d21a7725..8da366cf381 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -126,14 +126,15 @@ static void io_err_dtr(struct dm_target *tt)
126 /* empty */ 126 /* empty */
127} 127}
128 128
129static int io_err_map(struct dm_target *tt, struct bio *bio) 129static int io_err_map(struct dm_target *tt, struct bio *bio,
130 union map_info *map_context)
130{ 131{
131 return -EIO; 132 return -EIO;
132} 133}
133 134
134static struct target_type error_target = { 135static struct target_type error_target = {
135 .name = "error", 136 .name = "error",
136 .version = {1, 1, 0}, 137 .version = {1, 0, 1},
137 .ctr = io_err_ctr, 138 .ctr = io_err_ctr,
138 .dtr = io_err_dtr, 139 .dtr = io_err_dtr,
139 .map = io_err_map, 140 .map = io_err_map,
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
deleted file mode 100644
index 4d6e85367b8..00000000000
--- a/drivers/md/dm-thin-metadata.c
+++ /dev/null
@@ -1,1686 +0,0 @@
1/*
2 * Copyright (C) 2011-2012 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-thin-metadata.h"
8#include "persistent-data/dm-btree.h"
9#include "persistent-data/dm-space-map.h"
10#include "persistent-data/dm-space-map-disk.h"
11#include "persistent-data/dm-transaction-manager.h"
12
13#include <linux/list.h>
14#include <linux/device-mapper.h>
15#include <linux/workqueue.h>
16
17/*--------------------------------------------------------------------------
18 * As far as the metadata goes, there is:
19 *
20 * - A superblock in block zero, taking up fewer than 512 bytes for
21 * atomic writes.
22 *
23 * - A space map managing the metadata blocks.
24 *
25 * - A space map managing the data blocks.
26 *
27 * - A btree mapping our internal thin dev ids onto struct disk_device_details.
28 *
29 * - A hierarchical btree, with 2 levels which effectively maps (thin
30 * dev id, virtual block) -> block_time. Block time is a 64-bit
31 * field holding the time in the low 24 bits, and block in the top 48
32 * bits.
33 *
34 * BTrees consist solely of btree_nodes, that fill a block. Some are
35 * internal nodes, as such their values are a __le64 pointing to other
36 * nodes. Leaf nodes can store data of any reasonable size (ie. much
37 * smaller than the block size). The nodes consist of the header,
38 * followed by an array of keys, followed by an array of values. We have
39 * to binary search on the keys so they're all held together to help the
40 * cpu cache.
41 *
42 * Space maps have 2 btrees:
43 *
44 * - One maps a uint64_t onto a struct index_entry. Which points to a
45 * bitmap block, and has some details about how many free entries there
46 * are etc.
47 *
48 * - The bitmap blocks have a header (for the checksum). Then the rest
49 * of the block is pairs of bits. With the meaning being:
50 *
51 * 0 - ref count is 0
52 * 1 - ref count is 1
53 * 2 - ref count is 2
54 * 3 - ref count is higher than 2
55 *
56 * - If the count is higher than 2 then the ref count is entered in a
57 * second btree that directly maps the block_address to a uint32_t ref
58 * count.
59 *
60 * The space map metadata variant doesn't have a bitmaps btree. Instead
61 * it has a single block's worth of index_entries. This avoids
62 * recursive issues with the bitmap btree needing to allocate space in
63 * order to insert. With a small data block size such as 64k the
64 * metadata supports data devices that are hundreds of terabytes.
65 *
66 * The space maps allocate space linearly from front to back. Space that
67 * is freed in a transaction is never recycled within that transaction.
68 * To try and avoid fragmenting _free_ space the allocator always goes
69 * back and fills in gaps.
70 *
71 * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks
72 * from the block manager.
73 *--------------------------------------------------------------------------*/
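As the comment above describes, each mapping entry in the two-level btree is a single 64-bit "block_time" with the timestamp in the low 24 bits and the data block number in the bits above; pack_block_time() and unpack_block_time() further down in this file implement exactly this. A standalone sketch of the encoding (the helper names mirror those functions, the sample values are arbitrary):

#include <stdint.h>
#include <stdio.h>

/* Same packing as pack_block_time()/unpack_block_time() below:
 * time in bits 0..23 (so it must be < 2^24), block number above. */
static uint64_t pack_block_time(uint64_t block, uint32_t time)
{
    return (block << 24) | time;
}

static void unpack_block_time(uint64_t v, uint64_t *block, uint32_t *time)
{
    *block = v >> 24;
    *time  = v & ((1u << 24) - 1);
}

int main(void)
{
    uint64_t packed = pack_block_time(123456, 42);
    uint64_t block;
    uint32_t time;

    unpack_block_time(packed, &block, &time);
    printf("packed=0x%llx block=%llu time=%u\n",
           (unsigned long long)packed, (unsigned long long)block, time);
    return 0;
}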
74
75#define DM_MSG_PREFIX "thin metadata"
76
77#define THIN_SUPERBLOCK_MAGIC 27022010
78#define THIN_SUPERBLOCK_LOCATION 0
79#define THIN_VERSION 1
80#define THIN_METADATA_CACHE_SIZE 64
81#define SECTOR_TO_BLOCK_SHIFT 3
82
83/*
84 * 3 for btree insert +
85 * 2 for btree lookup used within space map
86 */
87#define THIN_MAX_CONCURRENT_LOCKS 5
88
89/* This should be plenty */
90#define SPACE_MAP_ROOT_SIZE 128
91
92/*
93 * Little endian on-disk superblock and device details.
94 */
95struct thin_disk_superblock {
96 __le32 csum; /* Checksum of superblock except for this field. */
97 __le32 flags;
98 __le64 blocknr; /* This block number, dm_block_t. */
99
100 __u8 uuid[16];
101 __le64 magic;
102 __le32 version;
103 __le32 time;
104
105 __le64 trans_id;
106
107 /*
108 * Root held by userspace transactions.
109 */
110 __le64 held_root;
111
112 __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
113 __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
114
115 /*
116 * 2-level btree mapping (dev_id, (dev block, time)) -> data block
117 */
118 __le64 data_mapping_root;
119
120 /*
121 * Device detail root mapping dev_id -> device_details
122 */
123 __le64 device_details_root;
124
125 __le32 data_block_size; /* In 512-byte sectors. */
126
127 __le32 metadata_block_size; /* In 512-byte sectors. */
128 __le64 metadata_nr_blocks;
129
130 __le32 compat_flags;
131 __le32 compat_ro_flags;
132 __le32 incompat_flags;
133} __packed;
134
135struct disk_device_details {
136 __le64 mapped_blocks;
137 __le64 transaction_id; /* When created. */
138 __le32 creation_time;
139 __le32 snapshotted_time;
140} __packed;
141
142struct dm_pool_metadata {
143 struct hlist_node hash;
144
145 struct block_device *bdev;
146 struct dm_block_manager *bm;
147 struct dm_space_map *metadata_sm;
148 struct dm_space_map *data_sm;
149 struct dm_transaction_manager *tm;
150 struct dm_transaction_manager *nb_tm;
151
152 /*
153 * Two-level btree.
154 * First level holds thin_dev_t.
155 * Second level holds mappings.
156 */
157 struct dm_btree_info info;
158
159 /*
160 * Non-blocking version of the above.
161 */
162 struct dm_btree_info nb_info;
163
164 /*
165 * Just the top level for deleting whole devices.
166 */
167 struct dm_btree_info tl_info;
168
169 /*
170 * Just the bottom level for creating new devices.
171 */
172 struct dm_btree_info bl_info;
173
174 /*
175 * Describes the device details btree.
176 */
177 struct dm_btree_info details_info;
178
179 struct rw_semaphore root_lock;
180 uint32_t time;
181 dm_block_t root;
182 dm_block_t details_root;
183 struct list_head thin_devices;
184 uint64_t trans_id;
185 unsigned long flags;
186 sector_t data_block_size;
187 bool read_only:1;
188
189 /*
190 * Set if a transaction has to be aborted but the attempt to roll back
191 * to the previous (good) transaction failed. The only pool metadata
192 * operation possible in this state is the closing of the device.
193 */
194 bool fail_io:1;
195};
196
197struct dm_thin_device {
198 struct list_head list;
199 struct dm_pool_metadata *pmd;
200 dm_thin_id id;
201
202 int open_count;
203 bool changed:1;
204 bool aborted_with_changes:1;
205 uint64_t mapped_blocks;
206 uint64_t transaction_id;
207 uint32_t creation_time;
208 uint32_t snapshotted_time;
209};
210
211/*----------------------------------------------------------------
212 * superblock validator
213 *--------------------------------------------------------------*/
214
215#define SUPERBLOCK_CSUM_XOR 160774
216
217static void sb_prepare_for_write(struct dm_block_validator *v,
218 struct dm_block *b,
219 size_t block_size)
220{
221 struct thin_disk_superblock *disk_super = dm_block_data(b);
222
223 disk_super->blocknr = cpu_to_le64(dm_block_location(b));
224 disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
225 block_size - sizeof(__le32),
226 SUPERBLOCK_CSUM_XOR));
227}
228
229static int sb_check(struct dm_block_validator *v,
230 struct dm_block *b,
231 size_t block_size)
232{
233 struct thin_disk_superblock *disk_super = dm_block_data(b);
234 __le32 csum_le;
235
236 if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) {
237 DMERR("sb_check failed: blocknr %llu: "
238 "wanted %llu", le64_to_cpu(disk_super->blocknr),
239 (unsigned long long)dm_block_location(b));
240 return -ENOTBLK;
241 }
242
243 if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) {
244 DMERR("sb_check failed: magic %llu: "
245 "wanted %llu", le64_to_cpu(disk_super->magic),
246 (unsigned long long)THIN_SUPERBLOCK_MAGIC);
247 return -EILSEQ;
248 }
249
250 csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
251 block_size - sizeof(__le32),
252 SUPERBLOCK_CSUM_XOR));
253 if (csum_le != disk_super->csum) {
254 DMERR("sb_check failed: csum %u: wanted %u",
255 le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
256 return -EILSEQ;
257 }
258
259 return 0;
260}
261
262static struct dm_block_validator sb_validator = {
263 .name = "superblock",
264 .prepare_for_write = sb_prepare_for_write,
265 .check = sb_check
266};
267
268/*----------------------------------------------------------------
269 * Methods for the btree value types
270 *--------------------------------------------------------------*/
271
272static uint64_t pack_block_time(dm_block_t b, uint32_t t)
273{
274 return (b << 24) | t;
275}
276
277static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
278{
279 *b = v >> 24;
280 *t = v & ((1 << 24) - 1);
281}
282
283static void data_block_inc(void *context, void *value_le)
284{
285 struct dm_space_map *sm = context;
286 __le64 v_le;
287 uint64_t b;
288 uint32_t t;
289
290 memcpy(&v_le, value_le, sizeof(v_le));
291 unpack_block_time(le64_to_cpu(v_le), &b, &t);
292 dm_sm_inc_block(sm, b);
293}
294
295static void data_block_dec(void *context, void *value_le)
296{
297 struct dm_space_map *sm = context;
298 __le64 v_le;
299 uint64_t b;
300 uint32_t t;
301
302 memcpy(&v_le, value_le, sizeof(v_le));
303 unpack_block_time(le64_to_cpu(v_le), &b, &t);
304 dm_sm_dec_block(sm, b);
305}
306
307static int data_block_equal(void *context, void *value1_le, void *value2_le)
308{
309 __le64 v1_le, v2_le;
310 uint64_t b1, b2;
311 uint32_t t;
312
313 memcpy(&v1_le, value1_le, sizeof(v1_le));
314 memcpy(&v2_le, value2_le, sizeof(v2_le));
315 unpack_block_time(le64_to_cpu(v1_le), &b1, &t);
316 unpack_block_time(le64_to_cpu(v2_le), &b2, &t);
317
318 return b1 == b2;
319}
320
321static void subtree_inc(void *context, void *value)
322{
323 struct dm_btree_info *info = context;
324 __le64 root_le;
325 uint64_t root;
326
327 memcpy(&root_le, value, sizeof(root_le));
328 root = le64_to_cpu(root_le);
329 dm_tm_inc(info->tm, root);
330}
331
332static void subtree_dec(void *context, void *value)
333{
334 struct dm_btree_info *info = context;
335 __le64 root_le;
336 uint64_t root;
337
338 memcpy(&root_le, value, sizeof(root_le));
339 root = le64_to_cpu(root_le);
340 if (dm_btree_del(info, root))
341 DMERR("btree delete failed\n");
342}
343
344static int subtree_equal(void *context, void *value1_le, void *value2_le)
345{
346 __le64 v1_le, v2_le;
347 memcpy(&v1_le, value1_le, sizeof(v1_le));
348 memcpy(&v2_le, value2_le, sizeof(v2_le));
349
350 return v1_le == v2_le;
351}
352
353/*----------------------------------------------------------------*/
354
355static int superblock_lock_zero(struct dm_pool_metadata *pmd,
356 struct dm_block **sblock)
357{
358 return dm_bm_write_lock_zero(pmd->bm, THIN_SUPERBLOCK_LOCATION,
359 &sb_validator, sblock);
360}
361
362static int superblock_lock(struct dm_pool_metadata *pmd,
363 struct dm_block **sblock)
364{
365 return dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
366 &sb_validator, sblock);
367}
368
369static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
370{
371 int r;
372 unsigned i;
373 struct dm_block *b;
374 __le64 *data_le, zero = cpu_to_le64(0);
375 unsigned block_size = dm_bm_block_size(bm) / sizeof(__le64);
376
377 /*
378 * We can't use a validator here - it may be all zeroes.
379 */
380 r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b);
381 if (r)
382 return r;
383
384 data_le = dm_block_data(b);
385 *result = 1;
386 for (i = 0; i < block_size; i++) {
387 if (data_le[i] != zero) {
388 *result = 0;
389 break;
390 }
391 }
392
393 return dm_bm_unlock(b);
394}
395
396static void __setup_btree_details(struct dm_pool_metadata *pmd)
397{
398 pmd->info.tm = pmd->tm;
399 pmd->info.levels = 2;
400 pmd->info.value_type.context = pmd->data_sm;
401 pmd->info.value_type.size = sizeof(__le64);
402 pmd->info.value_type.inc = data_block_inc;
403 pmd->info.value_type.dec = data_block_dec;
404 pmd->info.value_type.equal = data_block_equal;
405
406 memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info));
407 pmd->nb_info.tm = pmd->nb_tm;
408
409 pmd->tl_info.tm = pmd->tm;
410 pmd->tl_info.levels = 1;
411 pmd->tl_info.value_type.context = &pmd->bl_info;
412 pmd->tl_info.value_type.size = sizeof(__le64);
413 pmd->tl_info.value_type.inc = subtree_inc;
414 pmd->tl_info.value_type.dec = subtree_dec;
415 pmd->tl_info.value_type.equal = subtree_equal;
416
417 pmd->bl_info.tm = pmd->tm;
418 pmd->bl_info.levels = 1;
419 pmd->bl_info.value_type.context = pmd->data_sm;
420 pmd->bl_info.value_type.size = sizeof(__le64);
421 pmd->bl_info.value_type.inc = data_block_inc;
422 pmd->bl_info.value_type.dec = data_block_dec;
423 pmd->bl_info.value_type.equal = data_block_equal;
424
425 pmd->details_info.tm = pmd->tm;
426 pmd->details_info.levels = 1;
427 pmd->details_info.value_type.context = NULL;
428 pmd->details_info.value_type.size = sizeof(struct disk_device_details);
429 pmd->details_info.value_type.inc = NULL;
430 pmd->details_info.value_type.dec = NULL;
431 pmd->details_info.value_type.equal = NULL;
432}
433
434static int __write_initial_superblock(struct dm_pool_metadata *pmd)
435{
436 int r;
437 struct dm_block *sblock;
438 size_t metadata_len, data_len;
439 struct thin_disk_superblock *disk_super;
440 sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT;
441
442 if (bdev_size > THIN_METADATA_MAX_SECTORS)
443 bdev_size = THIN_METADATA_MAX_SECTORS;
444
445 r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
446 if (r < 0)
447 return r;
448
449 r = dm_sm_root_size(pmd->data_sm, &data_len);
450 if (r < 0)
451 return r;
452
453 r = dm_sm_commit(pmd->data_sm);
454 if (r < 0)
455 return r;
456
457 r = dm_tm_pre_commit(pmd->tm);
458 if (r < 0)
459 return r;
460
461 r = superblock_lock_zero(pmd, &sblock);
462 if (r)
463 return r;
464
465 disk_super = dm_block_data(sblock);
466 disk_super->flags = 0;
467 memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
468 disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
469 disk_super->version = cpu_to_le32(THIN_VERSION);
470 disk_super->time = 0;
471 disk_super->trans_id = 0;
472 disk_super->held_root = 0;
473
474 r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root,
475 metadata_len);
476 if (r < 0)
477 goto bad_locked;
478
479 r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root,
480 data_len);
481 if (r < 0)
482 goto bad_locked;
483
484 disk_super->data_mapping_root = cpu_to_le64(pmd->root);
485 disk_super->device_details_root = cpu_to_le64(pmd->details_root);
486 disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
487 disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
488 disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);
489
490 return dm_tm_commit(pmd->tm, sblock);
491
492bad_locked:
493 dm_bm_unlock(sblock);
494 return r;
495}
496
497static int __format_metadata(struct dm_pool_metadata *pmd)
498{
499 int r;
500
501 r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
502 &pmd->tm, &pmd->metadata_sm);
503 if (r < 0) {
504 DMERR("tm_create_with_sm failed");
505 return r;
506 }
507
508 pmd->data_sm = dm_sm_disk_create(pmd->tm, 0);
509 if (IS_ERR(pmd->data_sm)) {
510 DMERR("sm_disk_create failed");
511 r = PTR_ERR(pmd->data_sm);
512 goto bad_cleanup_tm;
513 }
514
515 pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
516 if (!pmd->nb_tm) {
517 DMERR("could not create non-blocking clone tm");
518 r = -ENOMEM;
519 goto bad_cleanup_data_sm;
520 }
521
522 __setup_btree_details(pmd);
523
524 r = dm_btree_empty(&pmd->info, &pmd->root);
525 if (r < 0)
526 goto bad_cleanup_nb_tm;
527
528 r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
529 if (r < 0) {
530 DMERR("couldn't create devices root");
531 goto bad_cleanup_nb_tm;
532 }
533
534 r = __write_initial_superblock(pmd);
535 if (r)
536 goto bad_cleanup_nb_tm;
537
538 return 0;
539
540bad_cleanup_nb_tm:
541 dm_tm_destroy(pmd->nb_tm);
542bad_cleanup_data_sm:
543 dm_sm_destroy(pmd->data_sm);
544bad_cleanup_tm:
545 dm_tm_destroy(pmd->tm);
546 dm_sm_destroy(pmd->metadata_sm);
547
548 return r;
549}
550
551static int __check_incompat_features(struct thin_disk_superblock *disk_super,
552 struct dm_pool_metadata *pmd)
553{
554 uint32_t features;
555
556 features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP;
557 if (features) {
558 DMERR("could not access metadata due to unsupported optional features (%lx).",
559 (unsigned long)features);
560 return -EINVAL;
561 }
562
563 /*
564 * Check for read-only metadata to skip the following RDWR checks.
565 */
566 if (get_disk_ro(pmd->bdev->bd_disk))
567 return 0;
568
569 features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
570 if (features) {
571 DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
572 (unsigned long)features);
573 return -EINVAL;
574 }
575
576 return 0;
577}
578
579static int __open_metadata(struct dm_pool_metadata *pmd)
580{
581 int r;
582 struct dm_block *sblock;
583 struct thin_disk_superblock *disk_super;
584
585 r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
586 &sb_validator, &sblock);
587 if (r < 0) {
588 DMERR("couldn't read superblock");
589 return r;
590 }
591
592 disk_super = dm_block_data(sblock);
593
594 r = __check_incompat_features(disk_super, pmd);
595 if (r < 0)
596 goto bad_unlock_sblock;
597
598 r = dm_tm_open_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
599 disk_super->metadata_space_map_root,
600 sizeof(disk_super->metadata_space_map_root),
601 &pmd->tm, &pmd->metadata_sm);
602 if (r < 0) {
603 DMERR("tm_open_with_sm failed");
604 goto bad_unlock_sblock;
605 }
606
607 pmd->data_sm = dm_sm_disk_open(pmd->tm, disk_super->data_space_map_root,
608 sizeof(disk_super->data_space_map_root));
609 if (IS_ERR(pmd->data_sm)) {
610 DMERR("sm_disk_open failed");
611 r = PTR_ERR(pmd->data_sm);
612 goto bad_cleanup_tm;
613 }
614
615 pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
616 if (!pmd->nb_tm) {
617 DMERR("could not create non-blocking clone tm");
618 r = -ENOMEM;
619 goto bad_cleanup_data_sm;
620 }
621
622 __setup_btree_details(pmd);
623 return dm_bm_unlock(sblock);
624
625bad_cleanup_data_sm:
626 dm_sm_destroy(pmd->data_sm);
627bad_cleanup_tm:
628 dm_tm_destroy(pmd->tm);
629 dm_sm_destroy(pmd->metadata_sm);
630bad_unlock_sblock:
631 dm_bm_unlock(sblock);
632
633 return r;
634}
635
636static int __open_or_format_metadata(struct dm_pool_metadata *pmd, bool format_device)
637{
638 int r, unformatted;
639
640 r = __superblock_all_zeroes(pmd->bm, &unformatted);
641 if (r)
642 return r;
643
644 if (unformatted)
645 return format_device ? __format_metadata(pmd) : -EPERM;
646
647 return __open_metadata(pmd);
648}
649
650static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool format_device)
651{
652 int r;
653
654 pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE,
655 THIN_METADATA_CACHE_SIZE,
656 THIN_MAX_CONCURRENT_LOCKS);
657 if (IS_ERR(pmd->bm)) {
658 DMERR("could not create block manager");
659 return PTR_ERR(pmd->bm);
660 }
661
662 r = __open_or_format_metadata(pmd, format_device);
663 if (r)
664 dm_block_manager_destroy(pmd->bm);
665
666 return r;
667}
668
669static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd)
670{
671 dm_sm_destroy(pmd->data_sm);
672 dm_sm_destroy(pmd->metadata_sm);
673 dm_tm_destroy(pmd->nb_tm);
674 dm_tm_destroy(pmd->tm);
675 dm_block_manager_destroy(pmd->bm);
676}
677
678static int __begin_transaction(struct dm_pool_metadata *pmd)
679{
680 int r;
681 struct thin_disk_superblock *disk_super;
682 struct dm_block *sblock;
683
684 /*
685 * We re-read the superblock every time. Shouldn't need to do this
686 * really.
687 */
688 r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
689 &sb_validator, &sblock);
690 if (r)
691 return r;
692
693 disk_super = dm_block_data(sblock);
694 pmd->time = le32_to_cpu(disk_super->time);
695 pmd->root = le64_to_cpu(disk_super->data_mapping_root);
696 pmd->details_root = le64_to_cpu(disk_super->device_details_root);
697 pmd->trans_id = le64_to_cpu(disk_super->trans_id);
698 pmd->flags = le32_to_cpu(disk_super->flags);
699 pmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
700
701 dm_bm_unlock(sblock);
702 return 0;
703}
704
705static int __write_changed_details(struct dm_pool_metadata *pmd)
706{
707 int r;
708 struct dm_thin_device *td, *tmp;
709 struct disk_device_details details;
710 uint64_t key;
711
712 list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
713 if (!td->changed)
714 continue;
715
716 key = td->id;
717
718 details.mapped_blocks = cpu_to_le64(td->mapped_blocks);
719 details.transaction_id = cpu_to_le64(td->transaction_id);
720 details.creation_time = cpu_to_le32(td->creation_time);
721 details.snapshotted_time = cpu_to_le32(td->snapshotted_time);
722 __dm_bless_for_disk(&details);
723
724 r = dm_btree_insert(&pmd->details_info, pmd->details_root,
725 &key, &details, &pmd->details_root);
726 if (r)
727 return r;
728
729 if (td->open_count)
730 td->changed = 0;
731 else {
732 list_del(&td->list);
733 kfree(td);
734 }
735 }
736
737 return 0;
738}
739
740static int __commit_transaction(struct dm_pool_metadata *pmd)
741{
742 int r;
743 size_t metadata_len, data_len;
744 struct thin_disk_superblock *disk_super;
745 struct dm_block *sblock;
746
747 /*
748 * We need to know if the thin_disk_superblock exceeds a 512-byte sector.
749 */
750 BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
751
752 r = __write_changed_details(pmd);
753 if (r < 0)
754 return r;
755
756 r = dm_sm_commit(pmd->data_sm);
757 if (r < 0)
758 return r;
759
760 r = dm_tm_pre_commit(pmd->tm);
761 if (r < 0)
762 return r;
763
764 r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
765 if (r < 0)
766 return r;
767
768 r = dm_sm_root_size(pmd->data_sm, &data_len);
769 if (r < 0)
770 return r;
771
772 r = superblock_lock(pmd, &sblock);
773 if (r)
774 return r;
775
776 disk_super = dm_block_data(sblock);
777 disk_super->time = cpu_to_le32(pmd->time);
778 disk_super->data_mapping_root = cpu_to_le64(pmd->root);
779 disk_super->device_details_root = cpu_to_le64(pmd->details_root);
780 disk_super->trans_id = cpu_to_le64(pmd->trans_id);
781 disk_super->flags = cpu_to_le32(pmd->flags);
782
783 r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root,
784 metadata_len);
785 if (r < 0)
786 goto out_locked;
787
788 r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root,
789 data_len);
790 if (r < 0)
791 goto out_locked;
792
793 return dm_tm_commit(pmd->tm, sblock);
794
795out_locked:
796 dm_bm_unlock(sblock);
797 return r;
798}
799
800struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
801 sector_t data_block_size,
802 bool format_device)
803{
804 int r;
805 struct dm_pool_metadata *pmd;
806
807 pmd = kmalloc(sizeof(*pmd), GFP_KERNEL);
808 if (!pmd) {
809 DMERR("could not allocate metadata struct");
810 return ERR_PTR(-ENOMEM);
811 }
812
813 init_rwsem(&pmd->root_lock);
814 pmd->time = 0;
815 INIT_LIST_HEAD(&pmd->thin_devices);
816 pmd->read_only = false;
817 pmd->fail_io = false;
818 pmd->bdev = bdev;
819 pmd->data_block_size = data_block_size;
820
821 r = __create_persistent_data_objects(pmd, format_device);
822 if (r) {
823 kfree(pmd);
824 return ERR_PTR(r);
825 }
826
827 r = __begin_transaction(pmd);
828 if (r < 0) {
829 if (dm_pool_metadata_close(pmd) < 0)
830 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
831 return ERR_PTR(r);
832 }
833
834 return pmd;
835}
836
837int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
838{
839 int r;
840 unsigned open_devices = 0;
841 struct dm_thin_device *td, *tmp;
842
843 down_read(&pmd->root_lock);
844 list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
845 if (td->open_count)
846 open_devices++;
847 else {
848 list_del(&td->list);
849 kfree(td);
850 }
851 }
852 up_read(&pmd->root_lock);
853
854 if (open_devices) {
855 DMERR("attempt to close pmd when %u device(s) are still open",
856 open_devices);
857 return -EBUSY;
858 }
859
860 if (!pmd->read_only && !pmd->fail_io) {
861 r = __commit_transaction(pmd);
862 if (r < 0)
863 DMWARN("%s: __commit_transaction() failed, error = %d",
864 __func__, r);
865 }
866
867 if (!pmd->fail_io)
868 __destroy_persistent_data_objects(pmd);
869
870 kfree(pmd);
871 return 0;
872}
873
874/*
875 * __open_device: Returns @td corresponding to device with id @dev,
876 * creating it if @create is set and incrementing @td->open_count.
877 * On failure, @td is undefined.
878 */
879static int __open_device(struct dm_pool_metadata *pmd,
880 dm_thin_id dev, int create,
881 struct dm_thin_device **td)
882{
883 int r, changed = 0;
884 struct dm_thin_device *td2;
885 uint64_t key = dev;
886 struct disk_device_details details_le;
887
888 /*
889 * If the device is already open, return it.
890 */
891 list_for_each_entry(td2, &pmd->thin_devices, list)
892 if (td2->id == dev) {
893 /*
894 * May not create an already-open device.
895 */
896 if (create)
897 return -EEXIST;
898
899 td2->open_count++;
900 *td = td2;
901 return 0;
902 }
903
904 /*
905 * Check the device exists.
906 */
907 r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
908 &key, &details_le);
909 if (r) {
910 if (r != -ENODATA || !create)
911 return r;
912
913 /*
914 * Create new device.
915 */
916 changed = 1;
917 details_le.mapped_blocks = 0;
918 details_le.transaction_id = cpu_to_le64(pmd->trans_id);
919 details_le.creation_time = cpu_to_le32(pmd->time);
920 details_le.snapshotted_time = cpu_to_le32(pmd->time);
921 }
922
923 *td = kmalloc(sizeof(**td), GFP_NOIO);
924 if (!*td)
925 return -ENOMEM;
926
927 (*td)->pmd = pmd;
928 (*td)->id = dev;
929 (*td)->open_count = 1;
930 (*td)->changed = changed;
931 (*td)->aborted_with_changes = false;
932 (*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks);
933 (*td)->transaction_id = le64_to_cpu(details_le.transaction_id);
934 (*td)->creation_time = le32_to_cpu(details_le.creation_time);
935 (*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time);
936
937 list_add(&(*td)->list, &pmd->thin_devices);
938
939 return 0;
940}
941
942static void __close_device(struct dm_thin_device *td)
943{
944 --td->open_count;
945}
946
947static int __create_thin(struct dm_pool_metadata *pmd,
948 dm_thin_id dev)
949{
950 int r;
951 dm_block_t dev_root;
952 uint64_t key = dev;
953 struct disk_device_details details_le;
954 struct dm_thin_device *td;
955 __le64 value;
956
957 r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
958 &key, &details_le);
959 if (!r)
960 return -EEXIST;
961
962 /*
963 * Create an empty btree for the mappings.
964 */
965 r = dm_btree_empty(&pmd->bl_info, &dev_root);
966 if (r)
967 return r;
968
969 /*
970 * Insert it into the main mapping tree.
971 */
972 value = cpu_to_le64(dev_root);
973 __dm_bless_for_disk(&value);
974 r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
975 if (r) {
976 dm_btree_del(&pmd->bl_info, dev_root);
977 return r;
978 }
979
980 r = __open_device(pmd, dev, 1, &td);
981 if (r) {
982 dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
983 dm_btree_del(&pmd->bl_info, dev_root);
984 return r;
985 }
986 __close_device(td);
987
988 return r;
989}
990
991int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
992{
993 int r = -EINVAL;
994
995 down_write(&pmd->root_lock);
996 if (!pmd->fail_io)
997 r = __create_thin(pmd, dev);
998 up_write(&pmd->root_lock);
999
1000 return r;
1001}
1002
1003static int __set_snapshot_details(struct dm_pool_metadata *pmd,
1004 struct dm_thin_device *snap,
1005 dm_thin_id origin, uint32_t time)
1006{
1007 int r;
1008 struct dm_thin_device *td;
1009
1010 r = __open_device(pmd, origin, 0, &td);
1011 if (r)
1012 return r;
1013
1014 td->changed = 1;
1015 td->snapshotted_time = time;
1016
1017 snap->mapped_blocks = td->mapped_blocks;
1018 snap->snapshotted_time = time;
1019 __close_device(td);
1020
1021 return 0;
1022}
1023
1024static int __create_snap(struct dm_pool_metadata *pmd,
1025 dm_thin_id dev, dm_thin_id origin)
1026{
1027 int r;
1028 dm_block_t origin_root;
1029 uint64_t key = origin, dev_key = dev;
1030 struct dm_thin_device *td;
1031 struct disk_device_details details_le;
1032 __le64 value;
1033
1034 /* check this device is unused */
1035 r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
1036 &dev_key, &details_le);
1037 if (!r)
1038 return -EEXIST;
1039
1040 /* find the mapping tree for the origin */
1041 r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value);
1042 if (r)
1043 return r;
1044 origin_root = le64_to_cpu(value);
1045
1046 /* clone the origin, an inc will do */
1047 dm_tm_inc(pmd->tm, origin_root);
1048
1049 /* insert into the main mapping tree */
1050 value = cpu_to_le64(origin_root);
1051 __dm_bless_for_disk(&value);
1052 key = dev;
1053 r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
1054 if (r) {
1055 dm_tm_dec(pmd->tm, origin_root);
1056 return r;
1057 }
1058
1059 pmd->time++;
1060
1061 r = __open_device(pmd, dev, 1, &td);
1062 if (r)
1063 goto bad;
1064
1065 r = __set_snapshot_details(pmd, td, origin, pmd->time);
1066 __close_device(td);
1067
1068 if (r)
1069 goto bad;
1070
1071 return 0;
1072
1073bad:
1074 dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
1075 dm_btree_remove(&pmd->details_info, pmd->details_root,
1076 &key, &pmd->details_root);
1077 return r;
1078}
1079
1080int dm_pool_create_snap(struct dm_pool_metadata *pmd,
1081 dm_thin_id dev,
1082 dm_thin_id origin)
1083{
1084 int r = -EINVAL;
1085
1086 down_write(&pmd->root_lock);
1087 if (!pmd->fail_io)
1088 r = __create_snap(pmd, dev, origin);
1089 up_write(&pmd->root_lock);
1090
1091 return r;
1092}
1093
1094static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev)
1095{
1096 int r;
1097 uint64_t key = dev;
1098 struct dm_thin_device *td;
1099
1100 /* TODO: failure should mark the transaction invalid */
1101 r = __open_device(pmd, dev, 0, &td);
1102 if (r)
1103 return r;
1104
1105 if (td->open_count > 1) {
1106 __close_device(td);
1107 return -EBUSY;
1108 }
1109
1110 list_del(&td->list);
1111 kfree(td);
1112 r = dm_btree_remove(&pmd->details_info, pmd->details_root,
1113 &key, &pmd->details_root);
1114 if (r)
1115 return r;
1116
1117 r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
1118 if (r)
1119 return r;
1120
1121 return 0;
1122}
1123
1124int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
1125 dm_thin_id dev)
1126{
1127 int r = -EINVAL;
1128
1129 down_write(&pmd->root_lock);
1130 if (!pmd->fail_io)
1131 r = __delete_device(pmd, dev);
1132 up_write(&pmd->root_lock);
1133
1134 return r;
1135}
1136
1137int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
1138 uint64_t current_id,
1139 uint64_t new_id)
1140{
1141 int r = -EINVAL;
1142
1143 down_write(&pmd->root_lock);
1144
1145 if (pmd->fail_io)
1146 goto out;
1147
1148 if (pmd->trans_id != current_id) {
1149 DMERR("mismatched transaction id");
1150 goto out;
1151 }
1152
1153 pmd->trans_id = new_id;
1154 r = 0;
1155
1156out:
1157 up_write(&pmd->root_lock);
1158
1159 return r;
1160}
1161
1162int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
1163 uint64_t *result)
1164{
1165 int r = -EINVAL;
1166
1167 down_read(&pmd->root_lock);
1168 if (!pmd->fail_io) {
1169 *result = pmd->trans_id;
1170 r = 0;
1171 }
1172 up_read(&pmd->root_lock);
1173
1174 return r;
1175}
1176
1177static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
1178{
1179 int r, inc;
1180 struct thin_disk_superblock *disk_super;
1181 struct dm_block *copy, *sblock;
1182 dm_block_t held_root;
1183
1184 /*
1185 * Copy the superblock.
1186 */
1187 dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION);
1188 r = dm_tm_shadow_block(pmd->tm, THIN_SUPERBLOCK_LOCATION,
1189 &sb_validator, &copy, &inc);
1190 if (r)
1191 return r;
1192
1193 BUG_ON(!inc);
1194
1195 held_root = dm_block_location(copy);
1196 disk_super = dm_block_data(copy);
1197
1198 if (le64_to_cpu(disk_super->held_root)) {
1199 DMWARN("Pool metadata snapshot already exists: release this before taking another.");
1200
1201 dm_tm_dec(pmd->tm, held_root);
1202 dm_tm_unlock(pmd->tm, copy);
1203 return -EBUSY;
1204 }
1205
1206 /*
1207	 * Wipe the space maps since we're not publishing this.
1208 */
1209 memset(&disk_super->data_space_map_root, 0,
1210 sizeof(disk_super->data_space_map_root));
1211 memset(&disk_super->metadata_space_map_root, 0,
1212 sizeof(disk_super->metadata_space_map_root));
1213
1214 /*
1215 * Increment the data structures that need to be preserved.
1216 */
1217 dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->data_mapping_root));
1218 dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->device_details_root));
1219 dm_tm_unlock(pmd->tm, copy);
1220
1221 /*
1222 * Write the held root into the superblock.
1223 */
1224 r = superblock_lock(pmd, &sblock);
1225 if (r) {
1226 dm_tm_dec(pmd->tm, held_root);
1227 return r;
1228 }
1229
1230 disk_super = dm_block_data(sblock);
1231 disk_super->held_root = cpu_to_le64(held_root);
1232 dm_bm_unlock(sblock);
1233 return 0;
1234}
1235
1236int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
1237{
1238 int r = -EINVAL;
1239
1240 down_write(&pmd->root_lock);
1241 if (!pmd->fail_io)
1242 r = __reserve_metadata_snap(pmd);
1243 up_write(&pmd->root_lock);
1244
1245 return r;
1246}
1247
1248static int __release_metadata_snap(struct dm_pool_metadata *pmd)
1249{
1250 int r;
1251 struct thin_disk_superblock *disk_super;
1252 struct dm_block *sblock, *copy;
1253 dm_block_t held_root;
1254
1255 r = superblock_lock(pmd, &sblock);
1256 if (r)
1257 return r;
1258
1259 disk_super = dm_block_data(sblock);
1260 held_root = le64_to_cpu(disk_super->held_root);
1261 disk_super->held_root = cpu_to_le64(0);
1262
1263 dm_bm_unlock(sblock);
1264
1265 if (!held_root) {
1266 DMWARN("No pool metadata snapshot found: nothing to release.");
1267 return -EINVAL;
1268 }
1269
1270 r = dm_tm_read_lock(pmd->tm, held_root, &sb_validator, &copy);
1271 if (r)
1272 return r;
1273
1274 disk_super = dm_block_data(copy);
1275 dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->data_mapping_root));
1276 dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->device_details_root));
1277 dm_sm_dec_block(pmd->metadata_sm, held_root);
1278
1279 return dm_tm_unlock(pmd->tm, copy);
1280}
1281
1282int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
1283{
1284 int r = -EINVAL;
1285
1286 down_write(&pmd->root_lock);
1287 if (!pmd->fail_io)
1288 r = __release_metadata_snap(pmd);
1289 up_write(&pmd->root_lock);
1290
1291 return r;
1292}
1293
1294static int __get_metadata_snap(struct dm_pool_metadata *pmd,
1295 dm_block_t *result)
1296{
1297 int r;
1298 struct thin_disk_superblock *disk_super;
1299 struct dm_block *sblock;
1300
1301 r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
1302 &sb_validator, &sblock);
1303 if (r)
1304 return r;
1305
1306 disk_super = dm_block_data(sblock);
1307 *result = le64_to_cpu(disk_super->held_root);
1308
1309 return dm_bm_unlock(sblock);
1310}
1311
1312int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
1313 dm_block_t *result)
1314{
1315 int r = -EINVAL;
1316
1317 down_read(&pmd->root_lock);
1318 if (!pmd->fail_io)
1319 r = __get_metadata_snap(pmd, result);
1320 up_read(&pmd->root_lock);
1321
1322 return r;
1323}
1324
1325int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
1326 struct dm_thin_device **td)
1327{
1328 int r = -EINVAL;
1329
1330 down_write(&pmd->root_lock);
1331 if (!pmd->fail_io)
1332 r = __open_device(pmd, dev, 0, td);
1333 up_write(&pmd->root_lock);
1334
1335 return r;
1336}
1337
1338int dm_pool_close_thin_device(struct dm_thin_device *td)
1339{
1340 down_write(&td->pmd->root_lock);
1341 __close_device(td);
1342 up_write(&td->pmd->root_lock);
1343
1344 return 0;
1345}
1346
1347dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
1348{
1349 return td->id;
1350}
1351
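/*
 * Sharing test used by dm_thin_find_block() below: each mapping packs both
 * the data block and the time the mapping was created.  A block is reported
 * as shared when that creation time predates the device's last snapshot,
 * i.e. when __snapshotted_since() returns true.
 */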
1352static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
1353{
1354 return td->snapshotted_time > time;
1355}
1356
1357int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
1358 int can_block, struct dm_thin_lookup_result *result)
1359{
1360 int r = -EINVAL;
1361 uint64_t block_time = 0;
1362 __le64 value;
1363 struct dm_pool_metadata *pmd = td->pmd;
1364 dm_block_t keys[2] = { td->id, block };
1365 struct dm_btree_info *info;
1366
1367 if (can_block) {
1368 down_read(&pmd->root_lock);
1369 info = &pmd->info;
1370 } else if (down_read_trylock(&pmd->root_lock))
1371 info = &pmd->nb_info;
1372 else
1373 return -EWOULDBLOCK;
1374
1375 if (pmd->fail_io)
1376 goto out;
1377
1378 r = dm_btree_lookup(info, pmd->root, keys, &value);
1379 if (!r)
1380 block_time = le64_to_cpu(value);
1381
1382out:
1383 up_read(&pmd->root_lock);
1384
1385 if (!r) {
1386 dm_block_t exception_block;
1387 uint32_t exception_time;
1388 unpack_block_time(block_time, &exception_block,
1389 &exception_time);
1390 result->block = exception_block;
1391 result->shared = __snapshotted_since(td, exception_time);
1392 }
1393
1394 return r;
1395}
1396
1397static int __insert(struct dm_thin_device *td, dm_block_t block,
1398 dm_block_t data_block)
1399{
1400 int r, inserted;
1401 __le64 value;
1402 struct dm_pool_metadata *pmd = td->pmd;
1403 dm_block_t keys[2] = { td->id, block };
1404
1405 value = cpu_to_le64(pack_block_time(data_block, pmd->time));
1406 __dm_bless_for_disk(&value);
1407
1408 r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &value,
1409 &pmd->root, &inserted);
1410 if (r)
1411 return r;
1412
1413 td->changed = 1;
1414 if (inserted)
1415 td->mapped_blocks++;
1416
1417 return 0;
1418}
1419
1420int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
1421 dm_block_t data_block)
1422{
1423 int r = -EINVAL;
1424
1425 down_write(&td->pmd->root_lock);
1426 if (!td->pmd->fail_io)
1427 r = __insert(td, block, data_block);
1428 up_write(&td->pmd->root_lock);
1429
1430 return r;
1431}
1432
1433static int __remove(struct dm_thin_device *td, dm_block_t block)
1434{
1435 int r;
1436 struct dm_pool_metadata *pmd = td->pmd;
1437 dm_block_t keys[2] = { td->id, block };
1438
1439 r = dm_btree_remove(&pmd->info, pmd->root, keys, &pmd->root);
1440 if (r)
1441 return r;
1442
1443 td->mapped_blocks--;
1444 td->changed = 1;
1445
1446 return 0;
1447}
1448
1449int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
1450{
1451 int r = -EINVAL;
1452
1453 down_write(&td->pmd->root_lock);
1454 if (!td->pmd->fail_io)
1455 r = __remove(td, block);
1456 up_write(&td->pmd->root_lock);
1457
1458 return r;
1459}
1460
1461bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
1462{
1463 int r;
1464
1465 down_read(&td->pmd->root_lock);
1466 r = td->changed;
1467 up_read(&td->pmd->root_lock);
1468
1469 return r;
1470}
1471
1472bool dm_thin_aborted_changes(struct dm_thin_device *td)
1473{
1474 bool r;
1475
1476 down_read(&td->pmd->root_lock);
1477 r = td->aborted_with_changes;
1478 up_read(&td->pmd->root_lock);
1479
1480 return r;
1481}
1482
1483int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
1484{
1485 int r = -EINVAL;
1486
1487 down_write(&pmd->root_lock);
1488 if (!pmd->fail_io)
1489 r = dm_sm_new_block(pmd->data_sm, result);
1490 up_write(&pmd->root_lock);
1491
1492 return r;
1493}
1494
1495int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
1496{
1497 int r = -EINVAL;
1498
1499 down_write(&pmd->root_lock);
1500 if (pmd->fail_io)
1501 goto out;
1502
1503 r = __commit_transaction(pmd);
1504 if (r <= 0)
1505 goto out;
1506
1507 /*
1508 * Open the next transaction.
1509 */
1510 r = __begin_transaction(pmd);
1511out:
1512 up_write(&pmd->root_lock);
1513 return r;
1514}
1515
1516static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd)
1517{
1518 struct dm_thin_device *td;
1519
1520 list_for_each_entry(td, &pmd->thin_devices, list)
1521 td->aborted_with_changes = td->changed;
1522}
1523
1524int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
1525{
1526 int r = -EINVAL;
1527
1528 down_write(&pmd->root_lock);
1529 if (pmd->fail_io)
1530 goto out;
1531
1532 __set_abort_with_changes_flags(pmd);
1533 __destroy_persistent_data_objects(pmd);
1534 r = __create_persistent_data_objects(pmd, false);
1535 if (r)
1536 pmd->fail_io = true;
1537
1538out:
1539 up_write(&pmd->root_lock);
1540
1541 return r;
1542}
1543
1544int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result)
1545{
1546 int r = -EINVAL;
1547
1548 down_read(&pmd->root_lock);
1549 if (!pmd->fail_io)
1550 r = dm_sm_get_nr_free(pmd->data_sm, result);
1551 up_read(&pmd->root_lock);
1552
1553 return r;
1554}
1555
1556int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
1557 dm_block_t *result)
1558{
1559 int r = -EINVAL;
1560
1561 down_read(&pmd->root_lock);
1562 if (!pmd->fail_io)
1563 r = dm_sm_get_nr_free(pmd->metadata_sm, result);
1564 up_read(&pmd->root_lock);
1565
1566 return r;
1567}
1568
1569int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
1570 dm_block_t *result)
1571{
1572 int r = -EINVAL;
1573
1574 down_read(&pmd->root_lock);
1575 if (!pmd->fail_io)
1576 r = dm_sm_get_nr_blocks(pmd->metadata_sm, result);
1577 up_read(&pmd->root_lock);
1578
1579 return r;
1580}
1581
1582int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result)
1583{
1584 down_read(&pmd->root_lock);
1585 *result = pmd->data_block_size;
1586 up_read(&pmd->root_lock);
1587
1588 return 0;
1589}
1590
1591int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
1592{
1593 int r = -EINVAL;
1594
1595 down_read(&pmd->root_lock);
1596 if (!pmd->fail_io)
1597 r = dm_sm_get_nr_blocks(pmd->data_sm, result);
1598 up_read(&pmd->root_lock);
1599
1600 return r;
1601}
1602
1603int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result)
1604{
1605 int r = -EINVAL;
1606 struct dm_pool_metadata *pmd = td->pmd;
1607
1608 down_read(&pmd->root_lock);
1609 if (!pmd->fail_io) {
1610 *result = td->mapped_blocks;
1611 r = 0;
1612 }
1613 up_read(&pmd->root_lock);
1614
1615 return r;
1616}
1617
1618static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
1619{
1620 int r;
1621 __le64 value_le;
1622 dm_block_t thin_root;
1623 struct dm_pool_metadata *pmd = td->pmd;
1624
1625 r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &value_le);
1626 if (r)
1627 return r;
1628
1629 thin_root = le64_to_cpu(value_le);
1630
1631 return dm_btree_find_highest_key(&pmd->bl_info, thin_root, result);
1632}
1633
1634int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
1635 dm_block_t *result)
1636{
1637 int r = -EINVAL;
1638 struct dm_pool_metadata *pmd = td->pmd;
1639
1640 down_read(&pmd->root_lock);
1641 if (!pmd->fail_io)
1642 r = __highest_block(td, result);
1643 up_read(&pmd->root_lock);
1644
1645 return r;
1646}
1647
1648static int __resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
1649{
1650 int r;
1651 dm_block_t old_count;
1652
1653 r = dm_sm_get_nr_blocks(pmd->data_sm, &old_count);
1654 if (r)
1655 return r;
1656
1657 if (new_count == old_count)
1658 return 0;
1659
1660 if (new_count < old_count) {
1661 DMERR("cannot reduce size of data device");
1662 return -EINVAL;
1663 }
1664
1665 return dm_sm_extend(pmd->data_sm, new_count - old_count);
1666}
1667
1668int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
1669{
1670 int r = -EINVAL;
1671
1672 down_write(&pmd->root_lock);
1673 if (!pmd->fail_io)
1674 r = __resize_data_dev(pmd, new_count);
1675 up_write(&pmd->root_lock);
1676
1677 return r;
1678}
1679
1680void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
1681{
1682 down_write(&pmd->root_lock);
1683 pmd->read_only = true;
1684 dm_bm_set_read_only(pmd->bm);
1685 up_write(&pmd->root_lock);
1686}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
deleted file mode 100644
index 0cecc370288..00000000000
--- a/drivers/md/dm-thin-metadata.h
+++ /dev/null
@@ -1,197 +0,0 @@
1/*
2 * Copyright (C) 2010-2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_THIN_METADATA_H
8#define DM_THIN_METADATA_H
9
10#include "persistent-data/dm-block-manager.h"
11
12#define THIN_METADATA_BLOCK_SIZE 4096
13
14/*
15 * The metadata device is currently limited in size.
16 *
17 * We have one block of index, which can hold 255 index entries. Each
18 * index entry contains allocation info about 16k metadata blocks.
19 */
20#define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
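/*
 * That is 255 * 16384 * (4096 / 512) = 33423360 sectors, i.e. just under
 * 16GB of metadata.
 */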
21
22/*
23 * A metadata device larger than 16GB triggers a warning.
24 */
25#define THIN_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT))
26
27/*----------------------------------------------------------------*/
28
29struct dm_pool_metadata;
30struct dm_thin_device;
31
32/*
33 * Device identifier
34 */
35typedef uint64_t dm_thin_id;
36
37/*
38 * Reopens or creates a new, empty metadata volume.
39 */
40struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
41 sector_t data_block_size,
42 bool format_device);
43
44int dm_pool_metadata_close(struct dm_pool_metadata *pmd);
45
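/*
 * Illustrative call sequence only (error handling omitted); dev_id,
 * virt_block and data_block are placeholder values:
 *
 *	pmd = dm_pool_metadata_open(bdev, data_block_size, true);
 *	dm_pool_create_thin(pmd, dev_id);
 *	dm_pool_open_thin_device(pmd, dev_id, &td);
 *	dm_thin_insert_block(td, virt_block, data_block);
 *	dm_pool_commit_metadata(pmd);
 *	dm_pool_close_thin_device(td);
 *	dm_pool_metadata_close(pmd);
 */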
46/*
47 * Compat feature flags. Any incompat flags beyond the ones
48 * specified below will prevent use of the thin metadata.
49 */
50#define THIN_FEATURE_COMPAT_SUPP 0UL
51#define THIN_FEATURE_COMPAT_RO_SUPP 0UL
52#define THIN_FEATURE_INCOMPAT_SUPP 0UL
53
54/*
55 * Device creation/deletion.
56 */
57int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev);
58
59/*
60 * An internal snapshot.
61 *
62 * You can only snapshot a quiesced origin, i.e. one that is either
63 * suspended or not instanced at all.
64 */
65int dm_pool_create_snap(struct dm_pool_metadata *pmd, dm_thin_id dev,
66 dm_thin_id origin);
67
68/*
69 * Deletes a virtual device from the metadata. It _is_ safe to call this
70 * when that device is open. Operations on that device will just start
71 * failing. You still need to call close() on the device.
72 */
73int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
74 dm_thin_id dev);
75
76/*
77 * Commits _all_ metadata changes: device creation, deletion, mapping
78 * updates.
79 */
80int dm_pool_commit_metadata(struct dm_pool_metadata *pmd);
81
82/*
83 * Discards all uncommitted changes. Rereads the superblock, rolling back
84 * to the last good transaction. Thin devices remain open.
85 * dm_thin_aborted_changes() tells you if they had uncommitted changes.
86 *
87 * If this call fails it's only useful to call dm_pool_metadata_close().
88 * All other methods will fail with -EINVAL.
89 */
90int dm_pool_abort_metadata(struct dm_pool_metadata *pmd);
91
92/*
93 * Set/get userspace transaction id.
94 */
95int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
96 uint64_t current_id,
97 uint64_t new_id);
98
99int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
100 uint64_t *result);
101
102/*
103 * Hold/get root for userspace transaction.
104 *
105 * The metadata snapshot is a copy of the current superblock (minus the
106 * space maps). Userland can access the data structures for READ
107 * operations only. A small performance hit is incurred by providing this
108 * copy of the metadata to userland due to extra copy-on-write operations
109 * on the metadata nodes. Release this as soon as you finish with it.
110 */
111int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd);
112int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd);
113
114int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
115 dm_block_t *result);
116
117/*
118 * Actions on a single virtual device.
119 */
120
121/*
122 * Opening the same device more than once will fail with -EBUSY.
123 */
124int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
125 struct dm_thin_device **td);
126
127int dm_pool_close_thin_device(struct dm_thin_device *td);
128
129dm_thin_id dm_thin_dev_id(struct dm_thin_device *td);
130
131struct dm_thin_lookup_result {
132 dm_block_t block;
133 unsigned shared:1;
134};
135
136/*
137 * Returns:
138 * -EWOULDBLOCK iff @can_block is set and would block.
139 * -ENODATA iff that mapping is not present.
140 * 0 success
141 */
142int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
143 int can_block, struct dm_thin_lookup_result *result);
144
145/*
146 * Obtain an unused block.
147 */
148int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result);
149
150/*
151 * Insert or remove block.
152 */
153int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
154 dm_block_t data_block);
155
156int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block);
157
158/*
159 * Queries.
160 */
161bool dm_thin_changed_this_transaction(struct dm_thin_device *td);
162
163bool dm_thin_aborted_changes(struct dm_thin_device *td);
164
165int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
166 dm_block_t *highest_mapped);
167
168int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result);
169
170int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd,
171 dm_block_t *result);
172
173int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
174 dm_block_t *result);
175
176int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
177 dm_block_t *result);
178
179int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result);
180
181int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result);
182
183/*
184 * Returns -EINVAL if the new size is smaller than the current size,
185 * since already allocated blocks would be lost.
186 */
187int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_size);
188
189/*
190 * Flicks the underlying block manager into read only mode, so you know
191 * that nothing is changing.
192 */
193void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd);
194
195/*----------------------------------------------------------------*/
196
197#endif
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
deleted file mode 100644
index 675ae527401..00000000000
--- a/drivers/md/dm-thin.c
+++ /dev/null
@@ -1,2818 +0,0 @@
1/*
2 * Copyright (C) 2011-2012 Red Hat UK.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-thin-metadata.h"
8#include "dm-bio-prison.h"
9#include "dm.h"
10
11#include <linux/device-mapper.h>
12#include <linux/dm-io.h>
13#include <linux/dm-kcopyd.h>
14#include <linux/list.h>
15#include <linux/init.h>
16#include <linux/module.h>
17#include <linux/slab.h>
18
19#define DM_MSG_PREFIX "thin"
20
21/*
22 * Tunable constants
23 */
24#define ENDIO_HOOK_POOL_SIZE 1024
25#define MAPPING_POOL_SIZE 1024
26#define PRISON_CELLS 1024
27#define COMMIT_PERIOD HZ
28
29/*
30 * The block size of the device holding pool data must be
31 * between 64KB and 1GB.
32 */
33#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
34#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
35
36/*
37 * Device id is restricted to 24 bits.
38 */
39#define MAX_DEV_ID ((1 << 24) - 1)
40
41/*
42 * How do we handle breaking sharing of data blocks?
43 * =================================================
44 *
45 * We use a standard copy-on-write btree to store the mappings for the
46 * devices (note I'm talking about copy-on-write of the metadata here, not
47 * the data). When you take an internal snapshot you clone the root node
48 * of the origin btree. After this there is no concept of an origin or a
49 * snapshot. They are just two device trees that happen to point to the
50 * same data blocks.
51 *
52 * When we get a write in we decide if it's to a shared data block using
53 * some timestamp magic. If it is, we have to break sharing.
54 *
55 * Let's say we write to a shared block in what was the origin. The
56 * steps are:
57 *
58 * i) plug further io to this physical block. (see bio_prison code).
59 *
60 * ii) quiesce any read io to that shared data block. Obviously
61 * including all devices that share this block. (see dm_deferred_set code)
62 *
63 * iii) copy the data block to a newly allocated block. This step can be
64 * skipped if the io covers the block. (schedule_copy).
65 *
66 * iv) insert the new mapping into the origin's btree
67 * (process_prepared_mapping). This act of inserting breaks some
68 * sharing of btree nodes between the two devices. Breaking sharing only
69 * affects the btree of that specific device. Btrees for the other
70 * devices that share the block never change. The btree for the origin
71 * device as it was after the last commit is untouched, i.e. we're using
72 * persistent data structures in the functional programming sense.
73 *
74 * v) unplug io to this physical block, including the io that triggered
75 * the breaking of sharing.
76 *
77 * Steps (ii) and (iii) occur in parallel.
78 *
79 * The metadata _doesn't_ need to be committed before the io continues. We
80 * get away with this because the io is always written to a _new_ block.
81 * If there's a crash, then:
82 *
83 * - The origin mapping will point to the old origin block (the shared
84 * one). This will contain the data as it was before the io that triggered
85 * the breaking of sharing came in.
86 *
87 * - The snap mapping still points to the old block. As it would after
88 * the commit.
89 *
90 * The downside of this scheme is that the timestamp magic isn't perfect, and
91 * will continue to think that the data block in the snapshot device is shared
92 * even after the write to the origin has broken sharing. I suspect data
93 * blocks will typically be shared by many different devices, so we're
94 * breaking sharing n + 1 times, rather than n, where n is the number of
95 * devices that reference this data block. At the moment I think the
96 * benefits far, far outweigh the disadvantages.
97 */
98
99/*----------------------------------------------------------------*/
100
101/*
102 * Key building.
103 */
104static void build_data_key(struct dm_thin_device *td,
105 dm_block_t b, struct dm_cell_key *key)
106{
107 key->virtual = 0;
108 key->dev = dm_thin_dev_id(td);
109 key->block = b;
110}
111
112static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
113 struct dm_cell_key *key)
114{
115 key->virtual = 1;
116 key->dev = dm_thin_dev_id(td);
117 key->block = b;
118}
119
120/*----------------------------------------------------------------*/
121
122/*
123 * A pool device ties together a metadata device and a data device. It
124 * also provides the interface for creating and destroying internal
125 * devices.
126 */
127struct dm_thin_new_mapping;
128
129/*
130 * The pool runs in 3 modes. Ordered in degraded order for comparisons.
131 */
132enum pool_mode {
133 PM_WRITE, /* metadata may be changed */
134 PM_READ_ONLY, /* metadata may not be changed */
135 PM_FAIL, /* all I/O fails */
136};
137
138struct pool_features {
139 enum pool_mode mode;
140
141 bool zero_new_blocks:1;
142 bool discard_enabled:1;
143 bool discard_passdown:1;
144};
145
146struct thin_c;
147typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
148typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
149
150struct pool {
151 struct list_head list;
152 struct dm_target *ti; /* Only set if a pool target is bound */
153
154 struct mapped_device *pool_md;
155 struct block_device *md_dev;
156 struct dm_pool_metadata *pmd;
157
158 dm_block_t low_water_blocks;
159 uint32_t sectors_per_block;
160 int sectors_per_block_shift;
161
162 struct pool_features pf;
163 unsigned low_water_triggered:1; /* A dm event has been sent */
164 unsigned no_free_space:1; /* A -ENOSPC warning has been issued */
165
166 struct dm_bio_prison *prison;
167 struct dm_kcopyd_client *copier;
168
169 struct workqueue_struct *wq;
170 struct work_struct worker;
171 struct delayed_work waker;
172
173 unsigned long last_commit_jiffies;
174 unsigned ref_count;
175
176 spinlock_t lock;
177 struct bio_list deferred_bios;
178 struct bio_list deferred_flush_bios;
179 struct list_head prepared_mappings;
180 struct list_head prepared_discards;
181
182 struct bio_list retry_on_resume_list;
183
184 struct dm_deferred_set *shared_read_ds;
185 struct dm_deferred_set *all_io_ds;
186
187 struct dm_thin_new_mapping *next_mapping;
188 mempool_t *mapping_pool;
189
190 process_bio_fn process_bio;
191 process_bio_fn process_discard;
192
193 process_mapping_fn process_prepared_mapping;
194 process_mapping_fn process_prepared_discard;
195};
196
197static enum pool_mode get_pool_mode(struct pool *pool);
198static void set_pool_mode(struct pool *pool, enum pool_mode mode);
199
200/*
201 * Target context for a pool.
202 */
203struct pool_c {
204 struct dm_target *ti;
205 struct pool *pool;
206 struct dm_dev *data_dev;
207 struct dm_dev *metadata_dev;
208 struct dm_target_callbacks callbacks;
209
210 dm_block_t low_water_blocks;
211 struct pool_features requested_pf; /* Features requested during table load */
212 struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */
213};
214
215/*
216 * Target context for a thin.
217 */
218struct thin_c {
219 struct dm_dev *pool_dev;
220 struct dm_dev *origin_dev;
221 dm_thin_id dev_id;
222
223 struct pool *pool;
224 struct dm_thin_device *td;
225};
226
227/*----------------------------------------------------------------*/
228
229/*
230 * A global list of pools that uses a struct mapped_device as a key.
231 */
232static struct dm_thin_pool_table {
233 struct mutex mutex;
234 struct list_head pools;
235} dm_thin_pool_table;
236
237static void pool_table_init(void)
238{
239 mutex_init(&dm_thin_pool_table.mutex);
240 INIT_LIST_HEAD(&dm_thin_pool_table.pools);
241}
242
243static void __pool_table_insert(struct pool *pool)
244{
245 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
246 list_add(&pool->list, &dm_thin_pool_table.pools);
247}
248
249static void __pool_table_remove(struct pool *pool)
250{
251 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
252 list_del(&pool->list);
253}
254
255static struct pool *__pool_table_lookup(struct mapped_device *md)
256{
257 struct pool *pool = NULL, *tmp;
258
259 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
260
261 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
262 if (tmp->pool_md == md) {
263 pool = tmp;
264 break;
265 }
266 }
267
268 return pool;
269}
270
271static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
272{
273 struct pool *pool = NULL, *tmp;
274
275 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
276
277 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
278 if (tmp->md_dev == md_dev) {
279 pool = tmp;
280 break;
281 }
282 }
283
284 return pool;
285}
286
287/*----------------------------------------------------------------*/
288
289struct dm_thin_endio_hook {
290 struct thin_c *tc;
291 struct dm_deferred_entry *shared_read_entry;
292 struct dm_deferred_entry *all_io_entry;
293 struct dm_thin_new_mapping *overwrite_mapping;
294};
295
296static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
297{
298 struct bio *bio;
299 struct bio_list bios;
300
301 bio_list_init(&bios);
302 bio_list_merge(&bios, master);
303 bio_list_init(master);
304
305 while ((bio = bio_list_pop(&bios))) {
306 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
307
308 if (h->tc == tc)
309 bio_endio(bio, DM_ENDIO_REQUEUE);
310 else
311 bio_list_add(master, bio);
312 }
313}
314
315static void requeue_io(struct thin_c *tc)
316{
317 struct pool *pool = tc->pool;
318 unsigned long flags;
319
320 spin_lock_irqsave(&pool->lock, flags);
321 __requeue_bio_list(tc, &pool->deferred_bios);
322 __requeue_bio_list(tc, &pool->retry_on_resume_list);
323 spin_unlock_irqrestore(&pool->lock, flags);
324}
325
326/*
327 * This section of code contains the logic for processing a thin device's IO.
328 * Much of the code depends on pool object resources (lists, workqueues, etc)
329 * but most is exclusively called from the thin target rather than the thin-pool
330 * target.
331 */
332
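/*
 * Note on the helpers below: sectors_per_block_shift is presumably set to a
 * negative value when the pool's data block size is not a power of two,
 * forcing the slower sector_div() path; otherwise a shift (plus a mask in
 * remap()) is sufficient.
 */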
333static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
334{
335 sector_t block_nr = bio->bi_sector;
336
337 if (tc->pool->sectors_per_block_shift < 0)
338 (void) sector_div(block_nr, tc->pool->sectors_per_block);
339 else
340 block_nr >>= tc->pool->sectors_per_block_shift;
341
342 return block_nr;
343}
344
345static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
346{
347 struct pool *pool = tc->pool;
348 sector_t bi_sector = bio->bi_sector;
349
350 bio->bi_bdev = tc->pool_dev->bdev;
351 if (tc->pool->sectors_per_block_shift < 0)
352 bio->bi_sector = (block * pool->sectors_per_block) +
353 sector_div(bi_sector, pool->sectors_per_block);
354 else
355 bio->bi_sector = (block << pool->sectors_per_block_shift) |
356 (bi_sector & (pool->sectors_per_block - 1));
357}
358
359static void remap_to_origin(struct thin_c *tc, struct bio *bio)
360{
361 bio->bi_bdev = tc->origin_dev->bdev;
362}
363
364static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
365{
366 return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
367 dm_thin_changed_this_transaction(tc->td);
368}
369
370static void inc_all_io_entry(struct pool *pool, struct bio *bio)
371{
372 struct dm_thin_endio_hook *h;
373
374 if (bio->bi_rw & REQ_DISCARD)
375 return;
376
377 h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
378 h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
379}
380
381static void issue(struct thin_c *tc, struct bio *bio)
382{
383 struct pool *pool = tc->pool;
384 unsigned long flags;
385
386 if (!bio_triggers_commit(tc, bio)) {
387 generic_make_request(bio);
388 return;
389 }
390
391 /*
392 * Complete bio with an error if earlier I/O caused changes to
393 * the metadata that can't be committed, e.g. due to I/O errors
394 * on the metadata device.
395 */
396 if (dm_thin_aborted_changes(tc->td)) {
397 bio_io_error(bio);
398 return;
399 }
400
401 /*
402 * Batch together any bios that trigger commits and then issue a
403 * single commit for them in process_deferred_bios().
404 */
405 spin_lock_irqsave(&pool->lock, flags);
406 bio_list_add(&pool->deferred_flush_bios, bio);
407 spin_unlock_irqrestore(&pool->lock, flags);
408}
409
410static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
411{
412 remap_to_origin(tc, bio);
413 issue(tc, bio);
414}
415
416static void remap_and_issue(struct thin_c *tc, struct bio *bio,
417 dm_block_t block)
418{
419 remap(tc, bio, block);
420 issue(tc, bio);
421}
422
423/*
424 * wake_worker() is used when new work is queued and when pool_resume is
425 * ready to continue deferred IO processing.
426 */
427static void wake_worker(struct pool *pool)
428{
429 queue_work(pool->wq, &pool->worker);
430}
431
432/*----------------------------------------------------------------*/
433
434/*
435 * Bio endio functions.
436 */
437struct dm_thin_new_mapping {
438 struct list_head list;
439
440 unsigned quiesced:1;
441 unsigned prepared:1;
442 unsigned pass_discard:1;
443
444 struct thin_c *tc;
445 dm_block_t virt_block;
446 dm_block_t data_block;
447 struct dm_bio_prison_cell *cell, *cell2;
448 int err;
449
450 /*
451 * If the bio covers the whole area of a block then we can avoid
452 * zeroing or copying. Instead this bio is hooked. The bio will
453 * still be in the cell, so care has to be taken to avoid issuing
454 * the bio twice.
455 */
456 struct bio *bio;
457 bio_end_io_t *saved_bi_end_io;
458};
459
460static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
461{
462 struct pool *pool = m->tc->pool;
463
464 if (m->quiesced && m->prepared) {
465 list_add(&m->list, &pool->prepared_mappings);
466 wake_worker(pool);
467 }
468}
469
470static void copy_complete(int read_err, unsigned long write_err, void *context)
471{
472 unsigned long flags;
473 struct dm_thin_new_mapping *m = context;
474 struct pool *pool = m->tc->pool;
475
476 m->err = read_err || write_err ? -EIO : 0;
477
478 spin_lock_irqsave(&pool->lock, flags);
479 m->prepared = 1;
480 __maybe_add_mapping(m);
481 spin_unlock_irqrestore(&pool->lock, flags);
482}
483
484static void overwrite_endio(struct bio *bio, int err)
485{
486 unsigned long flags;
487 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
488 struct dm_thin_new_mapping *m = h->overwrite_mapping;
489 struct pool *pool = m->tc->pool;
490
491 m->err = err;
492
493 spin_lock_irqsave(&pool->lock, flags);
494 m->prepared = 1;
495 __maybe_add_mapping(m);
496 spin_unlock_irqrestore(&pool->lock, flags);
497}
498
499/*----------------------------------------------------------------*/
500
501/*
502 * Workqueue.
503 */
504
505/*
506 * Prepared mapping jobs.
507 */
508
509/*
510 * This sends the bios in the cell back to the deferred_bios list.
511 */
512static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell)
513{
514 struct pool *pool = tc->pool;
515 unsigned long flags;
516
517 spin_lock_irqsave(&pool->lock, flags);
518 dm_cell_release(cell, &pool->deferred_bios);
519 spin_unlock_irqrestore(&pool->lock, flags);
520
521 wake_worker(pool);
522}
523
524/*
525 * Same as cell_defer except it omits the original holder of the cell.
526 */
527static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
528{
529 struct pool *pool = tc->pool;
530 unsigned long flags;
531
532 spin_lock_irqsave(&pool->lock, flags);
533 dm_cell_release_no_holder(cell, &pool->deferred_bios);
534 spin_unlock_irqrestore(&pool->lock, flags);
535
536 wake_worker(pool);
537}
538
539static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
540{
541 if (m->bio)
542 m->bio->bi_end_io = m->saved_bi_end_io;
543 dm_cell_error(m->cell);
544 list_del(&m->list);
545 mempool_free(m, m->tc->pool->mapping_pool);
546}
547static void process_prepared_mapping(struct dm_thin_new_mapping *m)
548{
549 struct thin_c *tc = m->tc;
550 struct bio *bio;
551 int r;
552
553 bio = m->bio;
554 if (bio)
555 bio->bi_end_io = m->saved_bi_end_io;
556
557 if (m->err) {
558 dm_cell_error(m->cell);
559 goto out;
560 }
561
562 /*
563 * Commit the prepared block into the mapping btree.
564 * Any I/O for this block arriving after this point will get
565 * remapped to it directly.
566 */
567 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
568 if (r) {
569 DMERR_LIMIT("dm_thin_insert_block() failed");
570 dm_cell_error(m->cell);
571 goto out;
572 }
573
574 /*
575 * Release any bios held while the block was being provisioned.
576 * If we are processing a write bio that completely covers the block,
577 * we already processed it so can ignore it now when processing
578 * the bios in the cell.
579 */
580 if (bio) {
581 cell_defer_no_holder(tc, m->cell);
582 bio_endio(bio, 0);
583 } else
584 cell_defer(tc, m->cell);
585
586out:
587 list_del(&m->list);
588 mempool_free(m, tc->pool->mapping_pool);
589}
590
591static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
592{
593 struct thin_c *tc = m->tc;
594
595 bio_io_error(m->bio);
596 cell_defer_no_holder(tc, m->cell);
597 cell_defer_no_holder(tc, m->cell2);
598 mempool_free(m, tc->pool->mapping_pool);
599}
600
601static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
602{
603 struct thin_c *tc = m->tc;
604
605 inc_all_io_entry(tc->pool, m->bio);
606 cell_defer_no_holder(tc, m->cell);
607 cell_defer_no_holder(tc, m->cell2);
608
609 if (m->pass_discard)
610 remap_and_issue(tc, m->bio, m->data_block);
611 else
612 bio_endio(m->bio, 0);
613
614 mempool_free(m, tc->pool->mapping_pool);
615}
616
617static void process_prepared_discard(struct dm_thin_new_mapping *m)
618{
619 int r;
620 struct thin_c *tc = m->tc;
621
622 r = dm_thin_remove_block(tc->td, m->virt_block);
623 if (r)
624 DMERR_LIMIT("dm_thin_remove_block() failed");
625
626 process_prepared_discard_passdown(m);
627}
628
629static void process_prepared(struct pool *pool, struct list_head *head,
630 process_mapping_fn *fn)
631{
632 unsigned long flags;
633 struct list_head maps;
634 struct dm_thin_new_mapping *m, *tmp;
635
636 INIT_LIST_HEAD(&maps);
637 spin_lock_irqsave(&pool->lock, flags);
638 list_splice_init(head, &maps);
639 spin_unlock_irqrestore(&pool->lock, flags);
640
641 list_for_each_entry_safe(m, tmp, &maps, list)
642 (*fn)(m);
643}
644
645/*
646 * Deferred bio jobs.
647 */
648static int io_overlaps_block(struct pool *pool, struct bio *bio)
649{
650 return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT);
651}
652
653static int io_overwrites_block(struct pool *pool, struct bio *bio)
654{
655 return (bio_data_dir(bio) == WRITE) &&
656 io_overlaps_block(pool, bio);
657}
658
659static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
660 bio_end_io_t *fn)
661{
662 *save = bio->bi_end_io;
663 bio->bi_end_io = fn;
664}
665
666static int ensure_next_mapping(struct pool *pool)
667{
668 if (pool->next_mapping)
669 return 0;
670
671 pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
672
673 return pool->next_mapping ? 0 : -ENOMEM;
674}
675
676static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
677{
678 struct dm_thin_new_mapping *r = pool->next_mapping;
679
680 BUG_ON(!pool->next_mapping);
681
682 pool->next_mapping = NULL;
683
684 return r;
685}
686
687static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
688 struct dm_dev *origin, dm_block_t data_origin,
689 dm_block_t data_dest,
690 struct dm_bio_prison_cell *cell, struct bio *bio)
691{
692 int r;
693 struct pool *pool = tc->pool;
694 struct dm_thin_new_mapping *m = get_next_mapping(pool);
695
696 INIT_LIST_HEAD(&m->list);
697 m->quiesced = 0;
698 m->prepared = 0;
699 m->tc = tc;
700 m->virt_block = virt_block;
701 m->data_block = data_dest;
702 m->cell = cell;
703 m->err = 0;
704 m->bio = NULL;
705
706 if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
707 m->quiesced = 1;
708
709 /*
710 * IO to pool_dev remaps to the pool target's data_dev.
711 *
712 * If the whole block of data is being overwritten, we can issue the
713 * bio immediately. Otherwise we use kcopyd to clone the data first.
714 */
715 if (io_overwrites_block(pool, bio)) {
716 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
717
718 h->overwrite_mapping = m;
719 m->bio = bio;
720 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
721 inc_all_io_entry(pool, bio);
722 remap_and_issue(tc, bio, data_dest);
723 } else {
724 struct dm_io_region from, to;
725
726 from.bdev = origin->bdev;
727 from.sector = data_origin * pool->sectors_per_block;
728 from.count = pool->sectors_per_block;
729
730 to.bdev = tc->pool_dev->bdev;
731 to.sector = data_dest * pool->sectors_per_block;
732 to.count = pool->sectors_per_block;
733
734 r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
735 0, copy_complete, m);
736 if (r < 0) {
737 mempool_free(m, pool->mapping_pool);
738 DMERR_LIMIT("dm_kcopyd_copy() failed");
739 dm_cell_error(cell);
740 }
741 }
742}
743
744static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
745 dm_block_t data_origin, dm_block_t data_dest,
746 struct dm_bio_prison_cell *cell, struct bio *bio)
747{
748 schedule_copy(tc, virt_block, tc->pool_dev,
749 data_origin, data_dest, cell, bio);
750}
751
752static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
753 dm_block_t data_dest,
754 struct dm_bio_prison_cell *cell, struct bio *bio)
755{
756 schedule_copy(tc, virt_block, tc->origin_dev,
757 virt_block, data_dest, cell, bio);
758}
759
760static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
761 dm_block_t data_block, struct dm_bio_prison_cell *cell,
762 struct bio *bio)
763{
764 struct pool *pool = tc->pool;
765 struct dm_thin_new_mapping *m = get_next_mapping(pool);
766
767 INIT_LIST_HEAD(&m->list);
768 m->quiesced = 1;
769 m->prepared = 0;
770 m->tc = tc;
771 m->virt_block = virt_block;
772 m->data_block = data_block;
773 m->cell = cell;
774 m->err = 0;
775 m->bio = NULL;
776
777 /*
778 * If the whole block of data is being overwritten or we are not
779 * zeroing pre-existing data, we can issue the bio immediately.
780 * Otherwise we use kcopyd to zero the data first.
781 */
782 if (!pool->pf.zero_new_blocks)
783 process_prepared_mapping(m);
784
785 else if (io_overwrites_block(pool, bio)) {
786 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
787
788 h->overwrite_mapping = m;
789 m->bio = bio;
790 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
791 inc_all_io_entry(pool, bio);
792 remap_and_issue(tc, bio, data_block);
793 } else {
794 int r;
795 struct dm_io_region to;
796
797 to.bdev = tc->pool_dev->bdev;
798 to.sector = data_block * pool->sectors_per_block;
799 to.count = pool->sectors_per_block;
800
801 r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
802 if (r < 0) {
803 mempool_free(m, pool->mapping_pool);
804 DMERR_LIMIT("dm_kcopyd_zero() failed");
805 dm_cell_error(cell);
806 }
807 }
808}
809
810static int commit(struct pool *pool)
811{
812 int r;
813
814 r = dm_pool_commit_metadata(pool->pmd);
815 if (r)
816 DMERR_LIMIT("commit failed: error = %d", r);
817
818 return r;
819}
820
821/*
822 * A non-zero return indicates read_only or fail_io mode.
823 * Many callers don't care about the return value.
824 */
825static int commit_or_fallback(struct pool *pool)
826{
827 int r;
828
829 if (get_pool_mode(pool) != PM_WRITE)
830 return -EINVAL;
831
832 r = commit(pool);
833 if (r)
834 set_pool_mode(pool, PM_READ_ONLY);
835
836 return r;
837}
838
839static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
840{
841 int r;
842 dm_block_t free_blocks;
843 unsigned long flags;
844 struct pool *pool = tc->pool;
845
846 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
847 if (r)
848 return r;
849
850 if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
851 DMWARN("%s: reached low water mark, sending event.",
852 dm_device_name(pool->pool_md));
853 spin_lock_irqsave(&pool->lock, flags);
854 pool->low_water_triggered = 1;
855 spin_unlock_irqrestore(&pool->lock, flags);
856 dm_table_event(pool->ti->table);
857 }
858
859 if (!free_blocks) {
860 if (pool->no_free_space)
861 return -ENOSPC;
862 else {
863 /*
864 * Try to commit to see if that will free up some
865 * more space.
866 */
867 (void) commit_or_fallback(pool);
868
869 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
870 if (r)
871 return r;
872
873 /*
874 * If we still have no space we set a flag to avoid
875 * doing all this checking and return -ENOSPC.
876 */
877 if (!free_blocks) {
878 DMWARN("%s: no free space available.",
879 dm_device_name(pool->pool_md));
880 spin_lock_irqsave(&pool->lock, flags);
881 pool->no_free_space = 1;
882 spin_unlock_irqrestore(&pool->lock, flags);
883 return -ENOSPC;
884 }
885 }
886 }
887
888 r = dm_pool_alloc_data_block(pool->pmd, result);
889 if (r)
890 return r;
891
892 return 0;
893}
894
895/*
896 * If we have run out of space, queue bios until the device is
897 * resumed, presumably after having been reloaded with more space.
898 */
899static void retry_on_resume(struct bio *bio)
900{
901 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
902 struct thin_c *tc = h->tc;
903 struct pool *pool = tc->pool;
904 unsigned long flags;
905
906 spin_lock_irqsave(&pool->lock, flags);
907 bio_list_add(&pool->retry_on_resume_list, bio);
908 spin_unlock_irqrestore(&pool->lock, flags);
909}
910
911static void no_space(struct dm_bio_prison_cell *cell)
912{
913 struct bio *bio;
914 struct bio_list bios;
915
916 bio_list_init(&bios);
917 dm_cell_release(cell, &bios);
918
919 while ((bio = bio_list_pop(&bios)))
920 retry_on_resume(bio);
921}
922
923static void process_discard(struct thin_c *tc, struct bio *bio)
924{
925 int r;
926 unsigned long flags;
927 struct pool *pool = tc->pool;
928 struct dm_bio_prison_cell *cell, *cell2;
929 struct dm_cell_key key, key2;
930 dm_block_t block = get_bio_block(tc, bio);
931 struct dm_thin_lookup_result lookup_result;
932 struct dm_thin_new_mapping *m;
933
934 build_virtual_key(tc->td, block, &key);
935 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
936 return;
937
938 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
939 switch (r) {
940 case 0:
941 /*
942 * Check nobody is fiddling with this pool block. This can
943 * happen if someone's in the process of breaking sharing
944 * on this block.
945 */
946 build_data_key(tc->td, lookup_result.block, &key2);
947 if (dm_bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
948 cell_defer_no_holder(tc, cell);
949 break;
950 }
951
952 if (io_overlaps_block(pool, bio)) {
953 /*
954 * IO may still be going to the destination block. We must
955 * quiesce before we can do the removal.
956 */
957 m = get_next_mapping(pool);
958 m->tc = tc;
959 m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown;
960 m->virt_block = block;
961 m->data_block = lookup_result.block;
962 m->cell = cell;
963 m->cell2 = cell2;
964 m->err = 0;
965 m->bio = bio;
966
967 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
968 spin_lock_irqsave(&pool->lock, flags);
969 list_add(&m->list, &pool->prepared_discards);
970 spin_unlock_irqrestore(&pool->lock, flags);
971 wake_worker(pool);
972 }
973 } else {
974 inc_all_io_entry(pool, bio);
975 cell_defer_no_holder(tc, cell);
976 cell_defer_no_holder(tc, cell2);
977
978 /*
979 * The DM core makes sure that the discard doesn't span
980 * a block boundary. So we submit the discard of a
981 * partial block appropriately.
982 */
983 if ((!lookup_result.shared) && pool->pf.discard_passdown)
984 remap_and_issue(tc, bio, lookup_result.block);
985 else
986 bio_endio(bio, 0);
987 }
988 break;
989
990 case -ENODATA:
991 /*
992 * It isn't provisioned, just forget it.
993 */
994 cell_defer_no_holder(tc, cell);
995 bio_endio(bio, 0);
996 break;
997
998 default:
999 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1000 __func__, r);
1001 cell_defer_no_holder(tc, cell);
1002 bio_io_error(bio);
1003 break;
1004 }
1005}
1006
1007static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1008 struct dm_cell_key *key,
1009 struct dm_thin_lookup_result *lookup_result,
1010 struct dm_bio_prison_cell *cell)
1011{
1012 int r;
1013 dm_block_t data_block;
1014
1015 r = alloc_data_block(tc, &data_block);
1016 switch (r) {
1017 case 0:
1018 schedule_internal_copy(tc, block, lookup_result->block,
1019 data_block, cell, bio);
1020 break;
1021
1022 case -ENOSPC:
1023 no_space(cell);
1024 break;
1025
1026 default:
1027 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1028 __func__, r);
1029 dm_cell_error(cell);
1030 break;
1031 }
1032}
1033
1034static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1035 dm_block_t block,
1036 struct dm_thin_lookup_result *lookup_result)
1037{
1038 struct dm_bio_prison_cell *cell;
1039 struct pool *pool = tc->pool;
1040 struct dm_cell_key key;
1041
1042 /*
1043 * If cell is already occupied, then sharing is already in the process
1044 * of being broken so we have nothing further to do here.
1045 */
1046 build_data_key(tc->td, lookup_result->block, &key);
1047 if (dm_bio_detain(pool->prison, &key, bio, &cell))
1048 return;
1049
1050 if (bio_data_dir(bio) == WRITE && bio->bi_size)
1051 break_sharing(tc, bio, block, &key, lookup_result, cell);
1052 else {
1053 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1054
1055 h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
1056 inc_all_io_entry(pool, bio);
1057 cell_defer_no_holder(tc, cell);
1058
1059 remap_and_issue(tc, bio, lookup_result->block);
1060 }
1061}
1062
1063static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
1064 struct dm_bio_prison_cell *cell)
1065{
1066 int r;
1067 dm_block_t data_block;
1068
1069 /*
1070 * Remap empty bios (flushes) immediately, without provisioning.
1071 */
1072 if (!bio->bi_size) {
1073 inc_all_io_entry(tc->pool, bio);
1074 cell_defer_no_holder(tc, cell);
1075
1076 remap_and_issue(tc, bio, 0);
1077 return;
1078 }
1079
1080 /*
1081 * Fill read bios with zeroes and complete them immediately.
1082 */
1083 if (bio_data_dir(bio) == READ) {
1084 zero_fill_bio(bio);
1085 cell_defer_no_holder(tc, cell);
1086 bio_endio(bio, 0);
1087 return;
1088 }
1089
1090 r = alloc_data_block(tc, &data_block);
1091 switch (r) {
1092 case 0:
1093 if (tc->origin_dev)
1094 schedule_external_copy(tc, block, data_block, cell, bio);
1095 else
1096 schedule_zero(tc, block, data_block, cell, bio);
1097 break;
1098
1099 case -ENOSPC:
1100 no_space(cell);
1101 break;
1102
1103 default:
1104 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1105 __func__, r);
1106 set_pool_mode(tc->pool, PM_READ_ONLY);
1107 dm_cell_error(cell);
1108 break;
1109 }
1110}
1111
1112static void process_bio(struct thin_c *tc, struct bio *bio)
1113{
1114 int r;
1115 dm_block_t block = get_bio_block(tc, bio);
1116 struct dm_bio_prison_cell *cell;
1117 struct dm_cell_key key;
1118 struct dm_thin_lookup_result lookup_result;
1119
1120 /*
1121 * If cell is already occupied, then the block is already
1122 * being provisioned so we have nothing further to do here.
1123 */
1124 build_virtual_key(tc->td, block, &key);
1125 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
1126 return;
1127
1128 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1129 switch (r) {
1130 case 0:
1131 if (lookup_result.shared) {
1132 process_shared_bio(tc, bio, block, &lookup_result);
1133 cell_defer_no_holder(tc, cell);
1134 } else {
1135 inc_all_io_entry(tc->pool, bio);
1136 cell_defer_no_holder(tc, cell);
1137
1138 remap_and_issue(tc, bio, lookup_result.block);
1139 }
1140 break;
1141
1142 case -ENODATA:
1143 if (bio_data_dir(bio) == READ && tc->origin_dev) {
1144 inc_all_io_entry(tc->pool, bio);
1145 cell_defer_no_holder(tc, cell);
1146
1147 remap_to_origin_and_issue(tc, bio);
1148 } else
1149 provision_block(tc, bio, block, cell);
1150 break;
1151
1152 default:
1153 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1154 __func__, r);
1155 cell_defer_no_holder(tc, cell);
1156 bio_io_error(bio);
1157 break;
1158 }
1159}
1160
1161static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1162{
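	/*
	 * In read-only mode reads can still be serviced (from mapped blocks,
	 * the external origin, or by zero-filling), but any write that would
	 * need to provision a block or break sharing must be failed.
	 */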
1163 int r;
1164 int rw = bio_data_dir(bio);
1165 dm_block_t block = get_bio_block(tc, bio);
1166 struct dm_thin_lookup_result lookup_result;
1167
1168 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1169 switch (r) {
1170 case 0:
1171 if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
1172 bio_io_error(bio);
1173 else {
1174 inc_all_io_entry(tc->pool, bio);
1175 remap_and_issue(tc, bio, lookup_result.block);
1176 }
1177 break;
1178
1179 case -ENODATA:
1180 if (rw != READ) {
1181 bio_io_error(bio);
1182 break;
1183 }
1184
1185 if (tc->origin_dev) {
1186 inc_all_io_entry(tc->pool, bio);
1187 remap_to_origin_and_issue(tc, bio);
1188 break;
1189 }
1190
1191 zero_fill_bio(bio);
1192 bio_endio(bio, 0);
1193 break;
1194
1195 default:
1196 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1197 __func__, r);
1198 bio_io_error(bio);
1199 break;
1200 }
1201}
1202
1203static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1204{
1205 bio_io_error(bio);
1206}
1207
1208static int need_commit_due_to_time(struct pool *pool)
1209{
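	/*
	 * The first comparison catches jiffies wrapping past
	 * last_commit_jiffies; the second fires once COMMIT_PERIOD has
	 * elapsed since the last commit.
	 */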
1210 return jiffies < pool->last_commit_jiffies ||
1211 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1212}
1213
1214static void process_deferred_bios(struct pool *pool)
1215{
1216 unsigned long flags;
1217 struct bio *bio;
1218 struct bio_list bios;
1219
1220 bio_list_init(&bios);
1221
1222 spin_lock_irqsave(&pool->lock, flags);
1223 bio_list_merge(&bios, &pool->deferred_bios);
1224 bio_list_init(&pool->deferred_bios);
1225 spin_unlock_irqrestore(&pool->lock, flags);
1226
1227 while ((bio = bio_list_pop(&bios))) {
1228 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1229 struct thin_c *tc = h->tc;
1230
1231 /*
1232 * If we've got no free new_mapping structs, and processing
1233 * this bio might require one, we pause until there are some
1234 * prepared mappings to process.
1235 */
1236 if (ensure_next_mapping(pool)) {
1237 spin_lock_irqsave(&pool->lock, flags);
1238 bio_list_merge(&pool->deferred_bios, &bios);
1239 spin_unlock_irqrestore(&pool->lock, flags);
1240
1241 break;
1242 }
1243
1244 if (bio->bi_rw & REQ_DISCARD)
1245 pool->process_discard(tc, bio);
1246 else
1247 pool->process_bio(tc, bio);
1248 }
1249
1250 /*
1251 * If there are any deferred flush bios, we must commit
1252 * the metadata before issuing them.
1253 */
1254 bio_list_init(&bios);
1255 spin_lock_irqsave(&pool->lock, flags);
1256 bio_list_merge(&bios, &pool->deferred_flush_bios);
1257 bio_list_init(&pool->deferred_flush_bios);
1258 spin_unlock_irqrestore(&pool->lock, flags);
1259
1260 if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
1261 return;
1262
1263 if (commit_or_fallback(pool)) {
1264 while ((bio = bio_list_pop(&bios)))
1265 bio_io_error(bio);
1266 return;
1267 }
1268 pool->last_commit_jiffies = jiffies;
1269
1270 while ((bio = bio_list_pop(&bios)))
1271 generic_make_request(bio);
1272}
1273
1274static void do_worker(struct work_struct *ws)
1275{
1276 struct pool *pool = container_of(ws, struct pool, worker);
1277
1278 process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
1279 process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
1280 process_deferred_bios(pool);
1281}
1282
1283/*
1284 * We want to commit periodically so that not too much
1285 * unwritten data builds up.
1286 */
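/*
 * The waker only kicks the worker; the actual periodic commit is issued from
 * process_deferred_bios() once need_commit_due_to_time() fires.
 */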
1287static void do_waker(struct work_struct *ws)
1288{
1289 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
1290 wake_worker(pool);
1291 queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
1292}
1293
1294/*----------------------------------------------------------------*/
1295
1296static enum pool_mode get_pool_mode(struct pool *pool)
1297{
1298 return pool->pf.mode;
1299}
1300
1301static void set_pool_mode(struct pool *pool, enum pool_mode mode)
1302{
1303 int r;
1304
1305 pool->pf.mode = mode;
1306
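	/*
	 * The modes are ordered by increasing severity (write, read-only,
	 * fail); bind_control_target() relies on this ordering so that a
	 * degraded pool is never upgraded again by a table reload.
	 */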
1307 switch (mode) {
1308 case PM_FAIL:
1309 DMERR("switching pool to failure mode");
1310 pool->process_bio = process_bio_fail;
1311 pool->process_discard = process_bio_fail;
1312 pool->process_prepared_mapping = process_prepared_mapping_fail;
1313 pool->process_prepared_discard = process_prepared_discard_fail;
1314 break;
1315
1316 case PM_READ_ONLY:
1317 DMERR("switching pool to read-only mode");
1318 r = dm_pool_abort_metadata(pool->pmd);
1319 if (r) {
1320 DMERR("aborting transaction failed");
1321 set_pool_mode(pool, PM_FAIL);
1322 } else {
1323 dm_pool_metadata_read_only(pool->pmd);
1324 pool->process_bio = process_bio_read_only;
1325 pool->process_discard = process_discard;
1326 pool->process_prepared_mapping = process_prepared_mapping_fail;
1327 pool->process_prepared_discard = process_prepared_discard_passdown;
1328 }
1329 break;
1330
1331 case PM_WRITE:
1332 pool->process_bio = process_bio;
1333 pool->process_discard = process_discard;
1334 pool->process_prepared_mapping = process_prepared_mapping;
1335 pool->process_prepared_discard = process_prepared_discard;
1336 break;
1337 }
1338}
1339
1340/*----------------------------------------------------------------*/
1341
1342/*
1343 * Mapping functions.
1344 */
1345
1346/*
1347 * Called only while mapping a thin bio to hand it over to the workqueue.
1348 */
1349static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1350{
1351 unsigned long flags;
1352 struct pool *pool = tc->pool;
1353
1354 spin_lock_irqsave(&pool->lock, flags);
1355 bio_list_add(&pool->deferred_bios, bio);
1356 spin_unlock_irqrestore(&pool->lock, flags);
1357
1358 wake_worker(pool);
1359}
1360
1361static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
1362{
1363 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1364
1365 h->tc = tc;
1366 h->shared_read_entry = NULL;
1367 h->all_io_entry = NULL;
1368 h->overwrite_mapping = NULL;
1369}
1370
1371/*
1372 * Non-blocking function called from the thin target's map function.
1373 */
1374static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1375{
1376 int r;
1377 struct thin_c *tc = ti->private;
1378 dm_block_t block = get_bio_block(tc, bio);
1379 struct dm_thin_device *td = tc->td;
1380 struct dm_thin_lookup_result result;
1381 struct dm_bio_prison_cell *cell1, *cell2;
1382 struct dm_cell_key key;
1383
1384 thin_hook_bio(tc, bio);
1385
1386 if (get_pool_mode(tc->pool) == PM_FAIL) {
1387 bio_io_error(bio);
1388 return DM_MAPIO_SUBMITTED;
1389 }
1390
1391 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
1392 thin_defer_bio(tc, bio);
1393 return DM_MAPIO_SUBMITTED;
1394 }
1395
1396 r = dm_thin_find_block(td, block, 0, &result);
1397
1398 /*
1399 * Note that we defer readahead too.
1400 */
1401 switch (r) {
1402 case 0:
1403 if (unlikely(result.shared)) {
1404 /*
1405 * We have a race condition here between the
1406 * result.shared value returned by the lookup and
1407 * snapshot creation, which may cause new
1408 * sharing.
1409 *
1410 * To avoid this always quiesce the origin before
1411 * taking the snap. You want to do this anyway to
1412 * ensure a consistent application view
1413 * (i.e. lockfs).
1414 *
1415 * More distant ancestors are irrelevant. The
1416 * shared flag will be set in their case.
1417 */
1418 thin_defer_bio(tc, bio);
1419 return DM_MAPIO_SUBMITTED;
1420 }
1421
1422 build_virtual_key(tc->td, block, &key);
1423 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1))
1424 return DM_MAPIO_SUBMITTED;
1425
1426 build_data_key(tc->td, result.block, &key);
1427 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2)) {
1428 cell_defer_no_holder(tc, cell1);
1429 return DM_MAPIO_SUBMITTED;
1430 }
1431
1432 inc_all_io_entry(tc->pool, bio);
1433 cell_defer_no_holder(tc, cell2);
1434 cell_defer_no_holder(tc, cell1);
1435
1436 remap(tc, bio, result.block);
1437 return DM_MAPIO_REMAPPED;
1438
1439 case -ENODATA:
1440 if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
1441 /*
1442 * This block isn't provisioned, and we have no way
1443 * of doing so. Just error it.
1444 */
1445 bio_io_error(bio);
1446 return DM_MAPIO_SUBMITTED;
1447 }
1448 /* fall through */
1449
1450 case -EWOULDBLOCK:
1451 /*
1452 * In future, the failed dm_thin_find_block above could
1453 * provide the hint to load the metadata into cache.
1454 */
1455 thin_defer_bio(tc, bio);
1456 return DM_MAPIO_SUBMITTED;
1457
1458 default:
1459 /*
1460 * Must always call bio_io_error on failure.
1461 * dm_thin_find_block can fail with -EINVAL if the
1462 * pool is switched to fail-io mode.
1463 */
1464 bio_io_error(bio);
1465 return DM_MAPIO_SUBMITTED;
1466 }
1467}
1468
1469static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1470{
1471 int r;
1472 unsigned long flags;
1473 struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
1474
1475 spin_lock_irqsave(&pt->pool->lock, flags);
1476 r = !bio_list_empty(&pt->pool->retry_on_resume_list);
1477 spin_unlock_irqrestore(&pt->pool->lock, flags);
1478
1479 if (!r) {
1480 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1481 r = bdi_congested(&q->backing_dev_info, bdi_bits);
1482 }
1483
1484 return r;
1485}
1486
1487static void __requeue_bios(struct pool *pool)
1488{
1489 bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
1490 bio_list_init(&pool->retry_on_resume_list);
1491}
1492
1493/*----------------------------------------------------------------
1494 * Binding of control targets to a pool object
1495 *--------------------------------------------------------------*/
1496static bool data_dev_supports_discard(struct pool_c *pt)
1497{
1498 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1499
1500 return q && blk_queue_discard(q);
1501}
1502
1503/*
1504 * If discard_passdown was enabled verify that the data device
1505 * supports discards. Disable discard_passdown if not.
1506 */
1507static void disable_passdown_if_not_supported(struct pool_c *pt)
1508{
1509 struct pool *pool = pt->pool;
1510 struct block_device *data_bdev = pt->data_dev->bdev;
1511 struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
1512 sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
1513 const char *reason = NULL;
1514 char buf[BDEVNAME_SIZE];
1515
1516 if (!pt->adjusted_pf.discard_passdown)
1517 return;
1518
1519 if (!data_dev_supports_discard(pt))
1520 reason = "discard unsupported";
1521
1522 else if (data_limits->max_discard_sectors < pool->sectors_per_block)
1523 reason = "max discard sectors smaller than a block";
1524
1525 else if (data_limits->discard_granularity > block_size)
1526 reason = "discard granularity larger than a block";
1527
1528 else if (block_size & (data_limits->discard_granularity - 1))
1529 reason = "discard granularity not a factor of block size";
1530
1531 if (reason) {
1532 DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
1533 pt->adjusted_pf.discard_passdown = false;
1534 }
1535}
1536
1537static int bind_control_target(struct pool *pool, struct dm_target *ti)
1538{
1539 struct pool_c *pt = ti->private;
1540
1541 /*
1542 * We want to make sure that degraded pools are never upgraded.
1543 */
1544 enum pool_mode old_mode = pool->pf.mode;
1545 enum pool_mode new_mode = pt->adjusted_pf.mode;
1546
1547 if (old_mode > new_mode)
1548 new_mode = old_mode;
1549
1550 pool->ti = ti;
1551 pool->low_water_blocks = pt->low_water_blocks;
1552 pool->pf = pt->adjusted_pf;
1553
1554 set_pool_mode(pool, new_mode);
1555
1556 return 0;
1557}
1558
1559static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1560{
1561 if (pool->ti == ti)
1562 pool->ti = NULL;
1563}
1564
1565/*----------------------------------------------------------------
1566 * Pool creation
1567 *--------------------------------------------------------------*/
1568/* Initialize pool features. */
1569static void pool_features_init(struct pool_features *pf)
1570{
1571 pf->mode = PM_WRITE;
1572 pf->zero_new_blocks = true;
1573 pf->discard_enabled = true;
1574 pf->discard_passdown = true;
1575}
1576
1577static void __pool_destroy(struct pool *pool)
1578{
1579 __pool_table_remove(pool);
1580
1581 if (dm_pool_metadata_close(pool->pmd) < 0)
1582 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1583
1584 dm_bio_prison_destroy(pool->prison);
1585 dm_kcopyd_client_destroy(pool->copier);
1586
1587 if (pool->wq)
1588 destroy_workqueue(pool->wq);
1589
1590 if (pool->next_mapping)
1591 mempool_free(pool->next_mapping, pool->mapping_pool);
1592 mempool_destroy(pool->mapping_pool);
1593 dm_deferred_set_destroy(pool->shared_read_ds);
1594 dm_deferred_set_destroy(pool->all_io_ds);
1595 kfree(pool);
1596}
1597
1598static struct kmem_cache *_new_mapping_cache;
1599
1600static struct pool *pool_create(struct mapped_device *pool_md,
1601 struct block_device *metadata_dev,
1602 unsigned long block_size,
1603 int read_only, char **error)
1604{
1605 int r;
1606 void *err_p;
1607 struct pool *pool;
1608 struct dm_pool_metadata *pmd;
1609 bool format_device = read_only ? false : true;
1610
1611 pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
1612 if (IS_ERR(pmd)) {
1613 *error = "Error creating metadata object";
1614 return (struct pool *)pmd;
1615 }
1616
1617 pool = kmalloc(sizeof(*pool), GFP_KERNEL);
1618 if (!pool) {
1619 *error = "Error allocating memory for pool";
1620 err_p = ERR_PTR(-ENOMEM);
1621 goto bad_pool;
1622 }
1623
1624 pool->pmd = pmd;
1625 pool->sectors_per_block = block_size;
1626 if (block_size & (block_size - 1))
1627 pool->sectors_per_block_shift = -1;
1628 else
1629 pool->sectors_per_block_shift = __ffs(block_size);
1630 pool->low_water_blocks = 0;
1631 pool_features_init(&pool->pf);
1632 pool->prison = dm_bio_prison_create(PRISON_CELLS);
1633 if (!pool->prison) {
1634 *error = "Error creating pool's bio prison";
1635 err_p = ERR_PTR(-ENOMEM);
1636 goto bad_prison;
1637 }
1638
1639 pool->copier = dm_kcopyd_client_create();
1640 if (IS_ERR(pool->copier)) {
1641 r = PTR_ERR(pool->copier);
1642 *error = "Error creating pool's kcopyd client";
1643 err_p = ERR_PTR(r);
1644 goto bad_kcopyd_client;
1645 }
1646
1647 /*
1648 * Create a single-threaded workqueue that will service all devices
1649 * that use this metadata.
1650 */
1651 pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1652 if (!pool->wq) {
1653 *error = "Error creating pool's workqueue";
1654 err_p = ERR_PTR(-ENOMEM);
1655 goto bad_wq;
1656 }
1657
1658 INIT_WORK(&pool->worker, do_worker);
1659 INIT_DELAYED_WORK(&pool->waker, do_waker);
1660 spin_lock_init(&pool->lock);
1661 bio_list_init(&pool->deferred_bios);
1662 bio_list_init(&pool->deferred_flush_bios);
1663 INIT_LIST_HEAD(&pool->prepared_mappings);
1664 INIT_LIST_HEAD(&pool->prepared_discards);
1665 pool->low_water_triggered = 0;
1666 pool->no_free_space = 0;
1667 bio_list_init(&pool->retry_on_resume_list);
1668
1669 pool->shared_read_ds = dm_deferred_set_create();
1670 if (!pool->shared_read_ds) {
1671 *error = "Error creating pool's shared read deferred set";
1672 err_p = ERR_PTR(-ENOMEM);
1673 goto bad_shared_read_ds;
1674 }
1675
1676 pool->all_io_ds = dm_deferred_set_create();
1677 if (!pool->all_io_ds) {
1678 *error = "Error creating pool's all io deferred set";
1679 err_p = ERR_PTR(-ENOMEM);
1680 goto bad_all_io_ds;
1681 }
1682
1683 pool->next_mapping = NULL;
1684 pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
1685 _new_mapping_cache);
1686 if (!pool->mapping_pool) {
1687 *error = "Error creating pool's mapping mempool";
1688 err_p = ERR_PTR(-ENOMEM);
1689 goto bad_mapping_pool;
1690 }
1691
1692 pool->ref_count = 1;
1693 pool->last_commit_jiffies = jiffies;
1694 pool->pool_md = pool_md;
1695 pool->md_dev = metadata_dev;
1696 __pool_table_insert(pool);
1697
1698 return pool;
1699
1700bad_mapping_pool:
1701 dm_deferred_set_destroy(pool->all_io_ds);
1702bad_all_io_ds:
1703 dm_deferred_set_destroy(pool->shared_read_ds);
1704bad_shared_read_ds:
1705 destroy_workqueue(pool->wq);
1706bad_wq:
1707 dm_kcopyd_client_destroy(pool->copier);
1708bad_kcopyd_client:
1709 dm_bio_prison_destroy(pool->prison);
1710bad_prison:
1711 kfree(pool);
1712bad_pool:
1713 if (dm_pool_metadata_close(pmd))
1714 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1715
1716 return err_p;
1717}
1718
1719static void __pool_inc(struct pool *pool)
1720{
1721 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1722 pool->ref_count++;
1723}
1724
1725static void __pool_dec(struct pool *pool)
1726{
1727 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1728 BUG_ON(!pool->ref_count);
1729 if (!--pool->ref_count)
1730 __pool_destroy(pool);
1731}
1732
1733static struct pool *__pool_find(struct mapped_device *pool_md,
1734 struct block_device *metadata_dev,
1735 unsigned long block_size, int read_only,
1736 char **error, int *created)
1737{
1738 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
1739
1740 if (pool) {
1741 if (pool->pool_md != pool_md) {
1742 *error = "metadata device already in use by a pool";
1743 return ERR_PTR(-EBUSY);
1744 }
1745 __pool_inc(pool);
1746
1747 } else {
1748 pool = __pool_table_lookup(pool_md);
1749 if (pool) {
1750 if (pool->md_dev != metadata_dev) {
1751 *error = "different pool cannot replace a pool";
1752 return ERR_PTR(-EINVAL);
1753 }
1754 __pool_inc(pool);
1755
1756 } else {
1757 pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
1758 *created = 1;
1759 }
1760 }
1761
1762 return pool;
1763}
1764
1765/*----------------------------------------------------------------
1766 * Pool target methods
1767 *--------------------------------------------------------------*/
1768static void pool_dtr(struct dm_target *ti)
1769{
1770 struct pool_c *pt = ti->private;
1771
1772 mutex_lock(&dm_thin_pool_table.mutex);
1773
1774 unbind_control_target(pt->pool, ti);
1775 __pool_dec(pt->pool);
1776 dm_put_device(ti, pt->metadata_dev);
1777 dm_put_device(ti, pt->data_dev);
1778 kfree(pt);
1779
1780 mutex_unlock(&dm_thin_pool_table.mutex);
1781}
1782
1783static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1784 struct dm_target *ti)
1785{
1786 int r;
1787 unsigned argc;
1788 const char *arg_name;
1789
1790 static struct dm_arg _args[] = {
1791 {0, 3, "Invalid number of pool feature arguments"},
1792 };
1793
1794 /*
1795 * No feature arguments supplied.
1796 */
1797 if (!as->argc)
1798 return 0;
1799
1800 r = dm_read_arg_group(_args, as, &argc, &ti->error);
1801 if (r)
1802 return -EINVAL;
1803
1804 while (argc && !r) {
1805 arg_name = dm_shift_arg(as);
1806 argc--;
1807
1808 if (!strcasecmp(arg_name, "skip_block_zeroing"))
1809 pf->zero_new_blocks = false;
1810
1811 else if (!strcasecmp(arg_name, "ignore_discard"))
1812 pf->discard_enabled = false;
1813
1814 else if (!strcasecmp(arg_name, "no_discard_passdown"))
1815 pf->discard_passdown = false;
1816
1817 else if (!strcasecmp(arg_name, "read_only"))
1818 pf->mode = PM_READ_ONLY;
1819
1820 else {
1821 ti->error = "Unrecognised pool feature requested";
1822 r = -EINVAL;
1823 break;
1824 }
1825 }
1826
1827 return r;
1828}
1829
1830/*
1831 * thin-pool <metadata dev> <data dev>
1832 * <data block size (sectors)>
1833 * <low water mark (blocks)>
1834 * [<#feature args> [<arg>]*]
1835 *
1836 * Optional feature arguments are:
1837 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
1838 * ignore_discard: disable discard
1839 * no_discard_passdown: don't pass discards down to the data device
1840 */
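/*
 * For example (device names and sizes below are purely illustrative), a pool
 * using 128-sector (64KiB) blocks, a low water mark of 32768 blocks and block
 * zeroing disabled could be loaded with a table line such as:
 *
 *   0 4194304 thin-pool /dev/mapper/meta /dev/mapper/data 128 32768 1 skip_block_zeroing
 */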
1841static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1842{
1843 int r, pool_created = 0;
1844 struct pool_c *pt;
1845 struct pool *pool;
1846 struct pool_features pf;
1847 struct dm_arg_set as;
1848 struct dm_dev *data_dev;
1849 unsigned long block_size;
1850 dm_block_t low_water_blocks;
1851 struct dm_dev *metadata_dev;
1852 sector_t metadata_dev_size;
1853 char b[BDEVNAME_SIZE];
1854
1855 /*
1856 * FIXME Remove validation from scope of lock.
1857 */
1858 mutex_lock(&dm_thin_pool_table.mutex);
1859
1860 if (argc < 4) {
1861 ti->error = "Invalid argument count";
1862 r = -EINVAL;
1863 goto out_unlock;
1864 }
1865 as.argc = argc;
1866 as.argv = argv;
1867
1868 r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
1869 if (r) {
1870 ti->error = "Error opening metadata block device";
1871 goto out_unlock;
1872 }
1873
1874 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
1875 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
1876 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1877 bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
1878
1879 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
1880 if (r) {
1881 ti->error = "Error getting data device";
1882 goto out_metadata;
1883 }
1884
1885 if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
1886 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1887 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
1888 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1889 ti->error = "Invalid block size";
1890 r = -EINVAL;
1891 goto out;
1892 }
1893
1894 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
1895 ti->error = "Invalid low water mark";
1896 r = -EINVAL;
1897 goto out;
1898 }
1899
1900 /*
1901 * Set default pool features.
1902 */
1903 pool_features_init(&pf);
1904
1905 dm_consume_args(&as, 4);
1906 r = parse_pool_features(&as, &pf, ti);
1907 if (r)
1908 goto out;
1909
1910 pt = kzalloc(sizeof(*pt), GFP_KERNEL);
1911 if (!pt) {
1912 r = -ENOMEM;
1913 goto out;
1914 }
1915
1916 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
1917 block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
1918 if (IS_ERR(pool)) {
1919 r = PTR_ERR(pool);
1920 goto out_free_pt;
1921 }
1922
1923 /*
1924 * 'pool_created' reflects whether this is the first table load.
1925 * Top level discard support is not allowed to be changed after
1926 * initial load. This would require a pool reload to trigger thin
1927 * device changes.
1928 */
1929 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
1930 ti->error = "Discard support cannot be disabled once enabled";
1931 r = -EINVAL;
1932 goto out_flags_changed;
1933 }
1934
1935 pt->pool = pool;
1936 pt->ti = ti;
1937 pt->metadata_dev = metadata_dev;
1938 pt->data_dev = data_dev;
1939 pt->low_water_blocks = low_water_blocks;
1940 pt->adjusted_pf = pt->requested_pf = pf;
1941 ti->num_flush_requests = 1;
1942
1943 /*
1944 * Only need to enable discards if the pool should pass
1945 * them down to the data device. The thin device's discard
1946 * processing will cause mappings to be removed from the btree.
1947 */
1948 if (pf.discard_enabled && pf.discard_passdown) {
1949 ti->num_discard_requests = 1;
1950
1951 /*
1952 * Setting 'discards_supported' circumvents the normal
1953 * stacking of discard limits (this keeps the pool and
1954 * thin devices' discard limits consistent).
1955 */
1956 ti->discards_supported = true;
1957 ti->discard_zeroes_data_unsupported = true;
1958 }
1959 ti->private = pt;
1960
1961 pt->callbacks.congested_fn = pool_is_congested;
1962 dm_table_add_target_callbacks(ti->table, &pt->callbacks);
1963
1964 mutex_unlock(&dm_thin_pool_table.mutex);
1965
1966 return 0;
1967
1968out_flags_changed:
1969 __pool_dec(pool);
1970out_free_pt:
1971 kfree(pt);
1972out:
1973 dm_put_device(ti, data_dev);
1974out_metadata:
1975 dm_put_device(ti, metadata_dev);
1976out_unlock:
1977 mutex_unlock(&dm_thin_pool_table.mutex);
1978
1979 return r;
1980}
1981
1982static int pool_map(struct dm_target *ti, struct bio *bio)
1983{
1984 int r;
1985 struct pool_c *pt = ti->private;
1986 struct pool *pool = pt->pool;
1987 unsigned long flags;
1988
1989 /*
1990 * As this is a singleton target, ti->begin is always zero.
1991 */
1992 spin_lock_irqsave(&pool->lock, flags);
1993 bio->bi_bdev = pt->data_dev->bdev;
1994 r = DM_MAPIO_REMAPPED;
1995 spin_unlock_irqrestore(&pool->lock, flags);
1996
1997 return r;
1998}
1999
2000/*
2001 * Retrieves the number of blocks of the data device from
2002 * the superblock and compares it to the actual device size,
2003 * thus resizing the data device in case it has grown.
2004 *
2005 * This copes both with opening preallocated data devices in the ctr,
2006 * followed by a resume,
2007 * -and-
2008 * with calling the resume method on its own after userspace has
2009 * grown the data device in reaction to a table event.
2010 */
2011static int pool_preresume(struct dm_target *ti)
2012{
2013 int r;
2014 struct pool_c *pt = ti->private;
2015 struct pool *pool = pt->pool;
2016 sector_t data_size = ti->len;
2017 dm_block_t sb_data_size;
2018
2019 /*
2020 * Take control of the pool object.
2021 */
2022 r = bind_control_target(pool, ti);
2023 if (r)
2024 return r;
2025
2026 (void) sector_div(data_size, pool->sectors_per_block);
2027
2028 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
2029 if (r) {
2030 DMERR("failed to retrieve data device size");
2031 return r;
2032 }
2033
2034 if (data_size < sb_data_size) {
2035 DMERR("pool target too small, is %llu blocks (expected %llu)",
2036 (unsigned long long)data_size, sb_data_size);
2037 return -EINVAL;
2038
2039 } else if (data_size > sb_data_size) {
2040 r = dm_pool_resize_data_dev(pool->pmd, data_size);
2041 if (r) {
2042 DMERR("failed to resize data device");
2043 /* FIXME Stricter than necessary: Rollback transaction instead here */
2044 set_pool_mode(pool, PM_READ_ONLY);
2045 return r;
2046 }
2047
2048 (void) commit_or_fallback(pool);
2049 }
2050
2051 return 0;
2052}
2053
2054static void pool_resume(struct dm_target *ti)
2055{
2056 struct pool_c *pt = ti->private;
2057 struct pool *pool = pt->pool;
2058 unsigned long flags;
2059
2060 spin_lock_irqsave(&pool->lock, flags);
2061 pool->low_water_triggered = 0;
2062 pool->no_free_space = 0;
2063 __requeue_bios(pool);
2064 spin_unlock_irqrestore(&pool->lock, flags);
2065
2066 do_waker(&pool->waker.work);
2067}
2068
2069static void pool_postsuspend(struct dm_target *ti)
2070{
2071 struct pool_c *pt = ti->private;
2072 struct pool *pool = pt->pool;
2073
2074 cancel_delayed_work(&pool->waker);
2075 flush_workqueue(pool->wq);
2076 (void) commit_or_fallback(pool);
2077}
2078
2079static int check_arg_count(unsigned argc, unsigned args_required)
2080{
2081 if (argc != args_required) {
2082 DMWARN("Message received with %u arguments instead of %u.",
2083 argc, args_required);
2084 return -EINVAL;
2085 }
2086
2087 return 0;
2088}
2089
2090static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
2091{
2092 if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
2093 *dev_id <= MAX_DEV_ID)
2094 return 0;
2095
2096 if (warning)
2097 DMWARN("Message received with invalid device id: %s", arg);
2098
2099 return -EINVAL;
2100}
2101
2102static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
2103{
2104 dm_thin_id dev_id;
2105 int r;
2106
2107 r = check_arg_count(argc, 2);
2108 if (r)
2109 return r;
2110
2111 r = read_dev_id(argv[1], &dev_id, 1);
2112 if (r)
2113 return r;
2114
2115 r = dm_pool_create_thin(pool->pmd, dev_id);
2116 if (r) {
2117 DMWARN("Creation of new thinly-provisioned device with id %s failed.",
2118 argv[1]);
2119 return r;
2120 }
2121
2122 return 0;
2123}
2124
2125static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2126{
2127 dm_thin_id dev_id;
2128 dm_thin_id origin_dev_id;
2129 int r;
2130
2131 r = check_arg_count(argc, 3);
2132 if (r)
2133 return r;
2134
2135 r = read_dev_id(argv[1], &dev_id, 1);
2136 if (r)
2137 return r;
2138
2139 r = read_dev_id(argv[2], &origin_dev_id, 1);
2140 if (r)
2141 return r;
2142
2143 r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
2144 if (r) {
2145 DMWARN("Creation of new snapshot %s of device %s failed.",
2146 argv[1], argv[2]);
2147 return r;
2148 }
2149
2150 return 0;
2151}
2152
2153static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
2154{
2155 dm_thin_id dev_id;
2156 int r;
2157
2158 r = check_arg_count(argc, 2);
2159 if (r)
2160 return r;
2161
2162 r = read_dev_id(argv[1], &dev_id, 1);
2163 if (r)
2164 return r;
2165
2166 r = dm_pool_delete_thin_device(pool->pmd, dev_id);
2167 if (r)
2168 DMWARN("Deletion of thin device %s failed.", argv[1]);
2169
2170 return r;
2171}
2172
2173static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
2174{
2175 dm_thin_id old_id, new_id;
2176 int r;
2177
2178 r = check_arg_count(argc, 3);
2179 if (r)
2180 return r;
2181
2182 if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
2183 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
2184 return -EINVAL;
2185 }
2186
2187 if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
2188 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
2189 return -EINVAL;
2190 }
2191
2192 r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
2193 if (r) {
2194 DMWARN("Failed to change transaction id from %s to %s.",
2195 argv[1], argv[2]);
2196 return r;
2197 }
2198
2199 return 0;
2200}
2201
2202static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2203{
2204 int r;
2205
2206 r = check_arg_count(argc, 1);
2207 if (r)
2208 return r;
2209
2210 (void) commit_or_fallback(pool);
2211
2212 r = dm_pool_reserve_metadata_snap(pool->pmd);
2213 if (r)
2214 DMWARN("reserve_metadata_snap message failed.");
2215
2216 return r;
2217}
2218
2219static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2220{
2221 int r;
2222
2223 r = check_arg_count(argc, 1);
2224 if (r)
2225 return r;
2226
2227 r = dm_pool_release_metadata_snap(pool->pmd);
2228 if (r)
2229 DMWARN("release_metadata_snap message failed.");
2230
2231 return r;
2232}
2233
2234/*
2235 * Messages supported:
2236 * create_thin <dev_id>
2237 * create_snap <dev_id> <origin_id>
2238 * delete <dev_id>
2240 * set_transaction_id <current_trans_id> <new_trans_id>
2241 * reserve_metadata_snap
2242 * release_metadata_snap
2243 */
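/*
 * Messages are sent with "dmsetup message" against the pool device, for
 * example (pool name and device ids are illustrative only):
 *
 *   dmsetup message /dev/mapper/my_pool 0 "create_thin 0"
 *   dmsetup message /dev/mapper/my_pool 0 "create_snap 1 0"
 */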
2244static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2245{
2246 int r = -EINVAL;
2247 struct pool_c *pt = ti->private;
2248 struct pool *pool = pt->pool;
2249
2250 if (!strcasecmp(argv[0], "create_thin"))
2251 r = process_create_thin_mesg(argc, argv, pool);
2252
2253 else if (!strcasecmp(argv[0], "create_snap"))
2254 r = process_create_snap_mesg(argc, argv, pool);
2255
2256 else if (!strcasecmp(argv[0], "delete"))
2257 r = process_delete_mesg(argc, argv, pool);
2258
2259 else if (!strcasecmp(argv[0], "set_transaction_id"))
2260 r = process_set_transaction_id_mesg(argc, argv, pool);
2261
2262 else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
2263 r = process_reserve_metadata_snap_mesg(argc, argv, pool);
2264
2265 else if (!strcasecmp(argv[0], "release_metadata_snap"))
2266 r = process_release_metadata_snap_mesg(argc, argv, pool);
2267
2268 else
2269 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2270
2271 if (!r)
2272 (void) commit_or_fallback(pool);
2273
2274 return r;
2275}
2276
2277static void emit_flags(struct pool_features *pf, char *result,
2278 unsigned sz, unsigned maxlen)
2279{
2280 unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
2281 !pf->discard_passdown + (pf->mode == PM_READ_ONLY);
2282 DMEMIT("%u ", count);
2283
2284 if (!pf->zero_new_blocks)
2285 DMEMIT("skip_block_zeroing ");
2286
2287 if (!pf->discard_enabled)
2288 DMEMIT("ignore_discard ");
2289
2290 if (!pf->discard_passdown)
2291 DMEMIT("no_discard_passdown ");
2292
2293 if (pf->mode == PM_READ_ONLY)
2294 DMEMIT("read_only ");
2295}
2296
2297/*
2298 * Status line is:
2299 * <transaction id> <used metadata blocks>/<total metadata blocks>
2300 * <used data blocks>/<total data blocks> <held metadata root> <ro|rw> <discard config>
2301 */
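/*
 * e.g. (all numbers illustrative only):
 *
 *   0 141/4096 234/32768 - rw discard_passdown
 */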
2302static int pool_status(struct dm_target *ti, status_type_t type,
2303 unsigned status_flags, char *result, unsigned maxlen)
2304{
2305 int r;
2306 unsigned sz = 0;
2307 uint64_t transaction_id;
2308 dm_block_t nr_free_blocks_data;
2309 dm_block_t nr_free_blocks_metadata;
2310 dm_block_t nr_blocks_data;
2311 dm_block_t nr_blocks_metadata;
2312 dm_block_t held_root;
2313 char buf[BDEVNAME_SIZE];
2314 char buf2[BDEVNAME_SIZE];
2315 struct pool_c *pt = ti->private;
2316 struct pool *pool = pt->pool;
2317
2318 switch (type) {
2319 case STATUSTYPE_INFO:
2320 if (get_pool_mode(pool) == PM_FAIL) {
2321 DMEMIT("Fail");
2322 break;
2323 }
2324
2325 /* Commit to ensure statistics aren't out-of-date */
2326 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
2327 (void) commit_or_fallback(pool);
2328
2329 r = dm_pool_get_metadata_transaction_id(pool->pmd,
2330 &transaction_id);
2331 if (r)
2332 return r;
2333
2334 r = dm_pool_get_free_metadata_block_count(pool->pmd,
2335 &nr_free_blocks_metadata);
2336 if (r)
2337 return r;
2338
2339 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
2340 if (r)
2341 return r;
2342
2343 r = dm_pool_get_free_block_count(pool->pmd,
2344 &nr_free_blocks_data);
2345 if (r)
2346 return r;
2347
2348 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
2349 if (r)
2350 return r;
2351
2352 r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
2353 if (r)
2354 return r;
2355
2356 DMEMIT("%llu %llu/%llu %llu/%llu ",
2357 (unsigned long long)transaction_id,
2358 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2359 (unsigned long long)nr_blocks_metadata,
2360 (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
2361 (unsigned long long)nr_blocks_data);
2362
2363 if (held_root)
2364 DMEMIT("%llu ", held_root);
2365 else
2366 DMEMIT("- ");
2367
2368 if (pool->pf.mode == PM_READ_ONLY)
2369 DMEMIT("ro ");
2370 else
2371 DMEMIT("rw ");
2372
2373 if (!pool->pf.discard_enabled)
2374 DMEMIT("ignore_discard");
2375 else if (pool->pf.discard_passdown)
2376 DMEMIT("discard_passdown");
2377 else
2378 DMEMIT("no_discard_passdown");
2379
2380 break;
2381
2382 case STATUSTYPE_TABLE:
2383 DMEMIT("%s %s %lu %llu ",
2384 format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
2385 format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
2386 (unsigned long)pool->sectors_per_block,
2387 (unsigned long long)pt->low_water_blocks);
2388 emit_flags(&pt->requested_pf, result, sz, maxlen);
2389 break;
2390 }
2391
2392 return 0;
2393}
2394
2395static int pool_iterate_devices(struct dm_target *ti,
2396 iterate_devices_callout_fn fn, void *data)
2397{
2398 struct pool_c *pt = ti->private;
2399
2400 return fn(ti, pt->data_dev, 0, ti->len, data);
2401}
2402
2403static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2404 struct bio_vec *biovec, int max_size)
2405{
2406 struct pool_c *pt = ti->private;
2407 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
2408
2409 if (!q->merge_bvec_fn)
2410 return max_size;
2411
2412 bvm->bi_bdev = pt->data_dev->bdev;
2413
2414 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2415}
2416
2417static bool block_size_is_power_of_two(struct pool *pool)
2418{
2419 return pool->sectors_per_block_shift >= 0;
2420}
2421
2422static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
2423{
2424 struct pool *pool = pt->pool;
2425 struct queue_limits *data_limits;
2426
2427 limits->max_discard_sectors = pool->sectors_per_block;
2428
2429 /*
2430 * discard_granularity is just a hint, and not enforced.
2431 */
2432 if (pt->adjusted_pf.discard_passdown) {
2433 data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
2434 limits->discard_granularity = data_limits->discard_granularity;
2435 } else if (block_size_is_power_of_two(pool))
2436 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
2437 else
2438 /*
2439 * Use largest power of 2 that is a factor of sectors_per_block
2440 * but at least DATA_DEV_BLOCK_SIZE_MIN_SECTORS.
2441 */
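		/*
		 * e.g. a 384-sector (192KiB) block yields a granularity of
		 * 64KiB (its largest power-of-2 factor, 128 sectors).
		 */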
2442 limits->discard_granularity = max(1 << (ffs(pool->sectors_per_block) - 1),
2443 DATA_DEV_BLOCK_SIZE_MIN_SECTORS) << SECTOR_SHIFT;
2444}
2445
2446static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2447{
2448 struct pool_c *pt = ti->private;
2449 struct pool *pool = pt->pool;
2450
2451 blk_limits_io_min(limits, 0);
2452 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2453
2454 /*
2455 * pt->adjusted_pf is a staging area for the actual features to use.
2456 * They get transferred to the live pool in bind_control_target()
2457 * called from pool_preresume().
2458 */
2459 if (!pt->adjusted_pf.discard_enabled)
2460 return;
2461
2462 disable_passdown_if_not_supported(pt);
2463
2464 set_discard_limits(pt, limits);
2465}
2466
2467static struct target_type pool_target = {
2468 .name = "thin-pool",
2469 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2470 DM_TARGET_IMMUTABLE,
2471 .version = {1, 6, 0},
2472 .module = THIS_MODULE,
2473 .ctr = pool_ctr,
2474 .dtr = pool_dtr,
2475 .map = pool_map,
2476 .postsuspend = pool_postsuspend,
2477 .preresume = pool_preresume,
2478 .resume = pool_resume,
2479 .message = pool_message,
2480 .status = pool_status,
2481 .merge = pool_merge,
2482 .iterate_devices = pool_iterate_devices,
2483 .io_hints = pool_io_hints,
2484};
2485
2486/*----------------------------------------------------------------
2487 * Thin target methods
2488 *--------------------------------------------------------------*/
2489static void thin_dtr(struct dm_target *ti)
2490{
2491 struct thin_c *tc = ti->private;
2492
2493 mutex_lock(&dm_thin_pool_table.mutex);
2494
2495 __pool_dec(tc->pool);
2496 dm_pool_close_thin_device(tc->td);
2497 dm_put_device(ti, tc->pool_dev);
2498 if (tc->origin_dev)
2499 dm_put_device(ti, tc->origin_dev);
2500 kfree(tc);
2501
2502 mutex_unlock(&dm_thin_pool_table.mutex);
2503}
2504
2505/*
2506 * Thin target parameters:
2507 *
2508 * <pool_dev> <dev_id> [origin_dev]
2509 *
2510 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
2511 * dev_id: the internal device identifier
2512 * origin_dev: a device external to the pool that should act as the origin
2513 *
2514 * If the pool device has discards disabled, they get disabled for the thin
2515 * device as well.
2516 */
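/*
 * Example table lines (lengths and device names are illustrative only):
 *
 *   0 2097152 thin /dev/mapper/my_pool 0
 *   0 2097152 thin /dev/mapper/my_pool 1 /dev/mapper/external-origin
 */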
2517static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2518{
2519 int r;
2520 struct thin_c *tc;
2521 struct dm_dev *pool_dev, *origin_dev;
2522 struct mapped_device *pool_md;
2523
2524 mutex_lock(&dm_thin_pool_table.mutex);
2525
2526 if (argc != 2 && argc != 3) {
2527 ti->error = "Invalid argument count";
2528 r = -EINVAL;
2529 goto out_unlock;
2530 }
2531
2532 tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
2533 if (!tc) {
2534 ti->error = "Out of memory";
2535 r = -ENOMEM;
2536 goto out_unlock;
2537 }
2538
2539 if (argc == 3) {
2540 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
2541 if (r) {
2542 ti->error = "Error opening origin device";
2543 goto bad_origin_dev;
2544 }
2545 tc->origin_dev = origin_dev;
2546 }
2547
2548 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
2549 if (r) {
2550 ti->error = "Error opening pool device";
2551 goto bad_pool_dev;
2552 }
2553 tc->pool_dev = pool_dev;
2554
2555 if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
2556 ti->error = "Invalid device id";
2557 r = -EINVAL;
2558 goto bad_common;
2559 }
2560
2561 pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
2562 if (!pool_md) {
2563 ti->error = "Couldn't get pool mapped device";
2564 r = -EINVAL;
2565 goto bad_common;
2566 }
2567
2568 tc->pool = __pool_table_lookup(pool_md);
2569 if (!tc->pool) {
2570 ti->error = "Couldn't find pool object";
2571 r = -EINVAL;
2572 goto bad_pool_lookup;
2573 }
2574 __pool_inc(tc->pool);
2575
2576 if (get_pool_mode(tc->pool) == PM_FAIL) {
2577 ti->error = "Couldn't open thin device, pool is in fail mode";
	r = -EINVAL;
2578 goto bad_thin_open;
2579 }
2580
2581 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
2582 if (r) {
2583 ti->error = "Couldn't open thin internal device";
2584 goto bad_thin_open;
2585 }
2586
2587 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
2588 if (r)
2589 goto bad_thin_open;
2590
2591 ti->num_flush_requests = 1;
2592 ti->flush_supported = true;
2593 ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook);
2594
2595 /* In case the pool supports discards, pass them on. */
2596 if (tc->pool->pf.discard_enabled) {
2597 ti->discards_supported = true;
2598 ti->num_discard_requests = 1;
2599 ti->discard_zeroes_data_unsupported = true;
2600 /* Discard requests must be split on a block boundary */
2601 ti->split_discard_requests = true;
2602 }
2603
2604 dm_put(pool_md);
2605
2606 mutex_unlock(&dm_thin_pool_table.mutex);
2607
2608 return 0;
2609
2610bad_thin_open:
2611 __pool_dec(tc->pool);
2612bad_pool_lookup:
2613 dm_put(pool_md);
2614bad_common:
2615 dm_put_device(ti, tc->pool_dev);
2616bad_pool_dev:
2617 if (tc->origin_dev)
2618 dm_put_device(ti, tc->origin_dev);
2619bad_origin_dev:
2620 kfree(tc);
2621out_unlock:
2622 mutex_unlock(&dm_thin_pool_table.mutex);
2623
2624 return r;
2625}
2626
2627static int thin_map(struct dm_target *ti, struct bio *bio)
2628{
2629 bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
2630
2631 return thin_bio_map(ti, bio);
2632}
2633
2634static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
2635{
2636 unsigned long flags;
2637 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
2638 struct list_head work;
2639 struct dm_thin_new_mapping *m, *tmp;
2640 struct pool *pool = h->tc->pool;
2641
2642 if (h->shared_read_entry) {
2643 INIT_LIST_HEAD(&work);
2644 dm_deferred_entry_dec(h->shared_read_entry, &work);
2645
2646 spin_lock_irqsave(&pool->lock, flags);
2647 list_for_each_entry_safe(m, tmp, &work, list) {
2648 list_del(&m->list);
2649 m->quiesced = 1;
2650 __maybe_add_mapping(m);
2651 }
2652 spin_unlock_irqrestore(&pool->lock, flags);
2653 }
2654
2655 if (h->all_io_entry) {
2656 INIT_LIST_HEAD(&work);
2657 dm_deferred_entry_dec(h->all_io_entry, &work);
2658 if (!list_empty(&work)) {
2659 spin_lock_irqsave(&pool->lock, flags);
2660 list_for_each_entry_safe(m, tmp, &work, list)
2661 list_add(&m->list, &pool->prepared_discards);
2662 spin_unlock_irqrestore(&pool->lock, flags);
2663 wake_worker(pool);
2664 }
2665 }
2666
2667 return 0;
2668}
2669
2670static void thin_postsuspend(struct dm_target *ti)
2671{
2672 if (dm_noflush_suspending(ti))
2673 requeue_io((struct thin_c *)ti->private);
2674}
2675
2676/*
2677 * <nr mapped sectors> <highest mapped sector>
2678 */
2679static int thin_status(struct dm_target *ti, status_type_t type,
2680 unsigned status_flags, char *result, unsigned maxlen)
2681{
2682 int r;
2683 ssize_t sz = 0;
2684 dm_block_t mapped, highest;
2685 char buf[BDEVNAME_SIZE];
2686 struct thin_c *tc = ti->private;
2687
2688 if (get_pool_mode(tc->pool) == PM_FAIL) {
2689 DMEMIT("Fail");
2690 return 0;
2691 }
2692
2693 if (!tc->td)
2694 DMEMIT("-");
2695 else {
2696 switch (type) {
2697 case STATUSTYPE_INFO:
2698 r = dm_thin_get_mapped_count(tc->td, &mapped);
2699 if (r)
2700 return r;
2701
2702 r = dm_thin_get_highest_mapped_block(tc->td, &highest);
2703 if (r < 0)
2704 return r;
2705
2706 DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
2707 if (r)
2708 DMEMIT("%llu", ((highest + 1) *
2709 tc->pool->sectors_per_block) - 1);
2710 else
2711 DMEMIT("-");
2712 break;
2713
2714 case STATUSTYPE_TABLE:
2715 DMEMIT("%s %lu",
2716 format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
2717 (unsigned long) tc->dev_id);
2718 if (tc->origin_dev)
2719 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
2720 break;
2721 }
2722 }
2723
2724 return 0;
2725}
2726
2727static int thin_iterate_devices(struct dm_target *ti,
2728 iterate_devices_callout_fn fn, void *data)
2729{
2730 sector_t blocks;
2731 struct thin_c *tc = ti->private;
2732 struct pool *pool = tc->pool;
2733
2734 /*
2735 * We can't call dm_pool_get_data_dev_size() since that blocks. So
2736 * we follow a more convoluted path through to the pool's target.
2737 */
2738 if (!pool->ti)
2739 return 0; /* nothing is bound */
2740
2741 blocks = pool->ti->len;
2742 (void) sector_div(blocks, pool->sectors_per_block);
2743 if (blocks)
2744 return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
2745
2746 return 0;
2747}
2748
2749/*
2750 * A thin device always inherits its queue limits from its pool.
2751 */
2752static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
2753{
2754 struct thin_c *tc = ti->private;
2755
2756 *limits = bdev_get_queue(tc->pool_dev->bdev)->limits;
2757}
2758
2759static struct target_type thin_target = {
2760 .name = "thin",
2761 .version = {1, 6, 0},
2762 .module = THIS_MODULE,
2763 .ctr = thin_ctr,
2764 .dtr = thin_dtr,
2765 .map = thin_map,
2766 .end_io = thin_endio,
2767 .postsuspend = thin_postsuspend,
2768 .status = thin_status,
2769 .iterate_devices = thin_iterate_devices,
2770 .io_hints = thin_io_hints,
2771};
2772
2773/*----------------------------------------------------------------*/
2774
2775static int __init dm_thin_init(void)
2776{
2777 int r;
2778
2779 pool_table_init();
2780
2781 r = dm_register_target(&thin_target);
2782 if (r)
2783 return r;
2784
2785 r = dm_register_target(&pool_target);
2786 if (r)
2787 goto bad_pool_target;
2788
2789 r = -ENOMEM;
2790
2791 _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
2792 if (!_new_mapping_cache)
2793 goto bad_new_mapping_cache;
2794
2795 return 0;
2796
2797bad_new_mapping_cache:
2798 dm_unregister_target(&pool_target);
2799bad_pool_target:
2800 dm_unregister_target(&thin_target);
2801
2802 return r;
2803}
2804
2805static void dm_thin_exit(void)
2806{
2807 dm_unregister_target(&thin_target);
2808 dm_unregister_target(&pool_target);
2809
2810 kmem_cache_destroy(_new_mapping_cache);
2811}
2812
2813module_init(dm_thin_init);
2814module_exit(dm_thin_exit);
2815
2816MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
2817MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
2818MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-uevent.c b/drivers/md/dm-uevent.c
index 8efe033bab5..6b1e3b61b25 100644
--- a/drivers/md/dm-uevent.c
+++ b/drivers/md/dm-uevent.c
@@ -22,7 +22,6 @@
 #include <linux/slab.h>
 #include <linux/kobject.h>
 #include <linux/dm-ioctl.h>
-#include <linux/export.h>
 
 #include "dm.h"
 #include "dm-uevent.h"
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
deleted file mode 100644
index 52cde982164..00000000000
--- a/drivers/md/dm-verity.c
+++ /dev/null
@@ -1,898 +0,0 @@
1/*
2 * Copyright (C) 2012 Red Hat, Inc.
3 *
4 * Author: Mikulas Patocka <mpatocka@redhat.com>
5 *
6 * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors
7 *
8 * This file is released under the GPLv2.
9 *
10 * In the file "/sys/module/dm_verity/parameters/prefetch_cluster" you can set
11 * the default prefetch value. Data is read in "prefetch_cluster" chunks from the
12 * hash device. Setting this greatly improves performance when data and hash
13 * are on the same disk on different partitions on devices with poor random
14 * access behavior.
15 */
16
17#include "dm-bufio.h"
18
19#include <linux/module.h>
20#include <linux/device-mapper.h>
21#include <crypto/hash.h>
22
23#define DM_MSG_PREFIX "verity"
24
25#define DM_VERITY_IO_VEC_INLINE 16
26#define DM_VERITY_MEMPOOL_SIZE 4
27#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144
28
29#define DM_VERITY_MAX_LEVELS 63
30
31static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
32
33module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR);
34
35struct dm_verity {
36 struct dm_dev *data_dev;
37 struct dm_dev *hash_dev;
38 struct dm_target *ti;
39 struct dm_bufio_client *bufio;
40 char *alg_name;
41 struct crypto_shash *tfm;
42 u8 *root_digest; /* digest of the root block */
43 u8 *salt; /* salt: its size is salt_size */
44 unsigned salt_size;
45 sector_t data_start; /* data offset in 512-byte sectors */
46 sector_t hash_start; /* hash start in blocks */
47 sector_t data_blocks; /* the number of data blocks */
48 sector_t hash_blocks; /* the number of hash blocks */
49 unsigned char data_dev_block_bits; /* log2(data blocksize) */
50 unsigned char hash_dev_block_bits; /* log2(hash blocksize) */
51 unsigned char hash_per_block_bits; /* log2(hashes in hash block) */
52 unsigned char levels; /* the number of tree levels */
53 unsigned char version;
54 unsigned digest_size; /* digest size for the current hash algorithm */
55 unsigned shash_descsize;/* the size of temporary space for crypto */
56 int hash_failed; /* set to 1 if hash of any block failed */
57
58 mempool_t *vec_mempool; /* mempool of bio vector */
59
60 struct workqueue_struct *verify_wq;
61
62 /* starting blocks for each tree level. 0 is the lowest level. */
63 sector_t hash_level_block[DM_VERITY_MAX_LEVELS];
64};
65
66struct dm_verity_io {
67 struct dm_verity *v;
68
69 /* original values of bio->bi_end_io and bio->bi_private */
70 bio_end_io_t *orig_bi_end_io;
71 void *orig_bi_private;
72
73 sector_t block;
74 unsigned n_blocks;
75
76 /* saved bio vector */
77 struct bio_vec *io_vec;
78 unsigned io_vec_size;
79
80 struct work_struct work;
81
82 /* A space for short vectors; longer vectors are allocated separately. */
83 struct bio_vec io_vec_inline[DM_VERITY_IO_VEC_INLINE];
84
85 /*
86 * Three variably-sized fields follow this struct:
87 *
88 * u8 hash_desc[v->shash_descsize];
89 * u8 real_digest[v->digest_size];
90 * u8 want_digest[v->digest_size];
91 *
92 * To access them use: io_hash_desc(), io_real_digest() and io_want_digest().
93 */
94};
95
96static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io)
97{
98 return (struct shash_desc *)(io + 1);
99}
100
101static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io)
102{
103 return (u8 *)(io + 1) + v->shash_descsize;
104}
105
106static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io)
107{
108 return (u8 *)(io + 1) + v->shash_descsize + v->digest_size;
109}
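For illustration only, a small stand-alone sketch of how the per-io area described above is sized: the fixed structure followed by the shash descriptor and the two digests (the sizes passed in main() are hypothetical, roughly what sha256 would need):

#include <stddef.h>
#include <stdio.h>

struct example_io {			/* simplified stand-in for struct dm_verity_io */
	unsigned long long block;
	unsigned n_blocks;
};

/* hash_desc, real_digest and want_digest follow the struct back to back */
static size_t example_io_size(size_t shash_descsize, size_t digest_size)
{
	return sizeof(struct example_io) + shash_descsize + 2 * digest_size;
}

int main(void)
{
	printf("per-io area: %zu bytes\n", example_io_size(104, 32));
	return 0;
}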
110
111/*
 112 * Auxiliary structure appended to each dm-bufio buffer. If the value
 113 * hash_verified is nonzero, the hash of the block has been verified.
 114 *
 115 * The variable hash_verified is set to 0 when the buffer is allocated; it
 116 * may later be changed to 1 and is never reset to 0 again.
 117 *
 118 * There is no lock around this value. At worst, a race condition can cause
 119 * several processes to verify the hash of the same buffer concurrently and
 120 * all of them to write 1 to hash_verified.
 121 * This condition is harmless, so we don't need locking.
122 */
123struct buffer_aux {
124 int hash_verified;
125};
126
127/*
128 * Initialize struct buffer_aux for a freshly created buffer.
129 */
130static void dm_bufio_alloc_callback(struct dm_buffer *buf)
131{
132 struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
133
134 aux->hash_verified = 0;
135}
136
137/*
138 * Translate input sector number to the sector number on the target device.
139 */
140static sector_t verity_map_sector(struct dm_verity *v, sector_t bi_sector)
141{
142 return v->data_start + dm_target_offset(v->ti, bi_sector);
143}
144
145/*
146 * Return hash position of a specified block at a specified tree level
147 * (0 is the lowest level).
 149 * The lowest "hash_per_block_bits" bits of the result denote the hash position
 150 * inside a hash block. The remaining bits denote the location of the hash block.
150 */
151static sector_t verity_position_at_level(struct dm_verity *v, sector_t block,
152 int level)
153{
154 return block >> (level * v->hash_per_block_bits);
155}
156
157static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level,
158 sector_t *hash_block, unsigned *offset)
159{
160 sector_t position = verity_position_at_level(v, block, level);
161 unsigned idx;
162
163 *hash_block = v->hash_level_block[level] + (position >> v->hash_per_block_bits);
164
165 if (!offset)
166 return;
167
168 idx = position & ((1 << v->hash_per_block_bits) - 1);
169 if (!v->version)
170 *offset = idx * v->digest_size;
171 else
172 *offset = idx << (v->hash_dev_block_bits - v->hash_per_block_bits);
173}
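A worked illustration (not from the original file) of the addressing arithmetic above, for assumed parameters of 4096-byte hash blocks holding 128 sha256 digests (hash_per_block_bits = 7); the printed hash block index is relative to hash_level_block[0]:

#include <stdio.h>

int main(void)
{
	unsigned hash_per_block_bits = 7;	/* assumed: 128 digests per hash block */
	unsigned digest_size = 32;		/* assumed: sha256 */
	unsigned long long block = 1000;	/* arbitrary data block index */

	/* level-0 position of this block's digest */
	unsigned long long position = block >> (0 * hash_per_block_bits);
	unsigned long long hash_block = position >> hash_per_block_bits;
	unsigned idx = (unsigned)(position & ((1u << hash_per_block_bits) - 1));

	printf("hash block %llu, offset %u (version-0 layout)\n",
	       hash_block, idx * digest_size);	/* prints: hash block 7, offset 3328 */
	return 0;
}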
174
175/*
176 * Verify hash of a metadata block pertaining to the specified data block
177 * ("block" argument) at a specified level ("level" argument).
178 *
179 * On successful return, io_want_digest(v, io) contains the hash value for
 180 * a lower tree level or for the data block (if we're at the lowest level).
 181 *
 182 * If "skip_unverified" is true, an unverified buffer is skipped and 1 is returned.
 183 * If "skip_unverified" is false, an unverified buffer is hashed and verified
 184 * against the current value of io_want_digest(v, io).
185 */
186static int verity_verify_level(struct dm_verity_io *io, sector_t block,
187 int level, bool skip_unverified)
188{
189 struct dm_verity *v = io->v;
190 struct dm_buffer *buf;
191 struct buffer_aux *aux;
192 u8 *data;
193 int r;
194 sector_t hash_block;
195 unsigned offset;
196
197 verity_hash_at_level(v, block, level, &hash_block, &offset);
198
199 data = dm_bufio_read(v->bufio, hash_block, &buf);
200 if (unlikely(IS_ERR(data)))
201 return PTR_ERR(data);
202
203 aux = dm_bufio_get_aux_data(buf);
204
205 if (!aux->hash_verified) {
206 struct shash_desc *desc;
207 u8 *result;
208
209 if (skip_unverified) {
210 r = 1;
211 goto release_ret_r;
212 }
213
214 desc = io_hash_desc(v, io);
215 desc->tfm = v->tfm;
216 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
217 r = crypto_shash_init(desc);
218 if (r < 0) {
219 DMERR("crypto_shash_init failed: %d", r);
220 goto release_ret_r;
221 }
222
223 if (likely(v->version >= 1)) {
224 r = crypto_shash_update(desc, v->salt, v->salt_size);
225 if (r < 0) {
226 DMERR("crypto_shash_update failed: %d", r);
227 goto release_ret_r;
228 }
229 }
230
231 r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits);
232 if (r < 0) {
233 DMERR("crypto_shash_update failed: %d", r);
234 goto release_ret_r;
235 }
236
237 if (!v->version) {
238 r = crypto_shash_update(desc, v->salt, v->salt_size);
239 if (r < 0) {
240 DMERR("crypto_shash_update failed: %d", r);
241 goto release_ret_r;
242 }
243 }
244
245 result = io_real_digest(v, io);
246 r = crypto_shash_final(desc, result);
247 if (r < 0) {
248 DMERR("crypto_shash_final failed: %d", r);
249 goto release_ret_r;
250 }
251 if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
252 DMERR_LIMIT("metadata block %llu is corrupted",
253 (unsigned long long)hash_block);
254 v->hash_failed = 1;
255 r = -EIO;
256 goto release_ret_r;
257 } else
258 aux->hash_verified = 1;
259 }
260
261 data += offset;
262
263 memcpy(io_want_digest(v, io), data, v->digest_size);
264
265 dm_bufio_release(buf);
266 return 0;
267
268release_ret_r:
269 dm_bufio_release(buf);
270
271 return r;
272}
273
274/*
275 * Verify one "dm_verity_io" structure.
276 */
277static int verity_verify_io(struct dm_verity_io *io)
278{
279 struct dm_verity *v = io->v;
280 unsigned b;
281 int i;
282 unsigned vector = 0, offset = 0;
283
284 for (b = 0; b < io->n_blocks; b++) {
285 struct shash_desc *desc;
286 u8 *result;
287 int r;
288 unsigned todo;
289
290 if (likely(v->levels)) {
291 /*
292 * First, we try to get the requested hash for
293 * the current block. If the hash block itself is
294 * verified, zero is returned. If it isn't, this
 295 * function returns 1 and we fall back to whole
296 * chain verification.
297 */
298 int r = verity_verify_level(io, io->block + b, 0, true);
299 if (likely(!r))
300 goto test_block_hash;
301 if (r < 0)
302 return r;
303 }
304
305 memcpy(io_want_digest(v, io), v->root_digest, v->digest_size);
306
307 for (i = v->levels - 1; i >= 0; i--) {
308 int r = verity_verify_level(io, io->block + b, i, false);
309 if (unlikely(r))
310 return r;
311 }
312
313test_block_hash:
314 desc = io_hash_desc(v, io);
315 desc->tfm = v->tfm;
316 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
317 r = crypto_shash_init(desc);
318 if (r < 0) {
319 DMERR("crypto_shash_init failed: %d", r);
320 return r;
321 }
322
323 if (likely(v->version >= 1)) {
324 r = crypto_shash_update(desc, v->salt, v->salt_size);
325 if (r < 0) {
326 DMERR("crypto_shash_update failed: %d", r);
327 return r;
328 }
329 }
330
331 todo = 1 << v->data_dev_block_bits;
332 do {
333 struct bio_vec *bv;
334 u8 *page;
335 unsigned len;
336
337 BUG_ON(vector >= io->io_vec_size);
338 bv = &io->io_vec[vector];
339 page = kmap_atomic(bv->bv_page);
340 len = bv->bv_len - offset;
341 if (likely(len >= todo))
342 len = todo;
343 r = crypto_shash_update(desc,
344 page + bv->bv_offset + offset, len);
345 kunmap_atomic(page);
346 if (r < 0) {
347 DMERR("crypto_shash_update failed: %d", r);
348 return r;
349 }
350 offset += len;
351 if (likely(offset == bv->bv_len)) {
352 offset = 0;
353 vector++;
354 }
355 todo -= len;
356 } while (todo);
357
358 if (!v->version) {
359 r = crypto_shash_update(desc, v->salt, v->salt_size);
360 if (r < 0) {
361 DMERR("crypto_shash_update failed: %d", r);
362 return r;
363 }
364 }
365
366 result = io_real_digest(v, io);
367 r = crypto_shash_final(desc, result);
368 if (r < 0) {
369 DMERR("crypto_shash_final failed: %d", r);
370 return r;
371 }
372 if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
373 DMERR_LIMIT("data block %llu is corrupted",
374 (unsigned long long)(io->block + b));
375 v->hash_failed = 1;
376 return -EIO;
377 }
378 }
379 BUG_ON(vector != io->io_vec_size);
380 BUG_ON(offset);
381
382 return 0;
383}
384
385/*
386 * End one "io" structure with a given error.
387 */
388static void verity_finish_io(struct dm_verity_io *io, int error)
389{
390 struct dm_verity *v = io->v;
391 struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size);
392
393 bio->bi_end_io = io->orig_bi_end_io;
394 bio->bi_private = io->orig_bi_private;
395
396 if (io->io_vec != io->io_vec_inline)
397 mempool_free(io->io_vec, v->vec_mempool);
398
399 bio_endio(bio, error);
400}
401
402static void verity_work(struct work_struct *w)
403{
404 struct dm_verity_io *io = container_of(w, struct dm_verity_io, work);
405
406 verity_finish_io(io, verity_verify_io(io));
407}
408
409static void verity_end_io(struct bio *bio, int error)
410{
411 struct dm_verity_io *io = bio->bi_private;
412
413 if (error) {
414 verity_finish_io(io, error);
415 return;
416 }
417
418 INIT_WORK(&io->work, verity_work);
419 queue_work(io->v->verify_wq, &io->work);
420}
421
422/*
423 * Prefetch buffers for the specified io.
 424 * The root buffer is not prefetched; it is assumed to be cached
 425 * all the time.
426 */
427static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io)
428{
429 int i;
430
431 for (i = v->levels - 2; i >= 0; i--) {
432 sector_t hash_block_start;
433 sector_t hash_block_end;
434 verity_hash_at_level(v, io->block, i, &hash_block_start, NULL);
435 verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL);
436 if (!i) {
437 unsigned cluster = ACCESS_ONCE(dm_verity_prefetch_cluster);
438
439 cluster >>= v->data_dev_block_bits;
440 if (unlikely(!cluster))
441 goto no_prefetch_cluster;
442
443 if (unlikely(cluster & (cluster - 1)))
444 cluster = 1 << (fls(cluster) - 1);
445
446 hash_block_start &= ~(sector_t)(cluster - 1);
447 hash_block_end |= cluster - 1;
448 if (unlikely(hash_block_end >= v->hash_blocks))
449 hash_block_end = v->hash_blocks - 1;
450 }
451no_prefetch_cluster:
452 dm_bufio_prefetch(v->bufio, hash_block_start,
453 hash_block_end - hash_block_start + 1);
454 }
455}
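As a hedged aside, a tiny user-space sketch of the power-of-two rounding applied to the prefetch cluster above, using the gcc/clang builtin __builtin_clz() as a stand-in for the kernel's fls():

#include <stdio.h>

int main(void)
{
	unsigned cluster = 100;		/* hypothetical non-power-of-two block count */

	if (cluster & (cluster - 1))
		cluster = 1u << (31 - __builtin_clz(cluster));	/* rounds down to 64 */

	printf("prefetch cluster rounded to %u blocks\n", cluster);
	return 0;
}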
456
457/*
 458 * Bio map function. It allocates a dm_verity_io structure and a bio vector,
 459 * fills them, and then issues the prefetches and the I/O.
460 */
461static int verity_map(struct dm_target *ti, struct bio *bio)
462{
463 struct dm_verity *v = ti->private;
464 struct dm_verity_io *io;
465
466 bio->bi_bdev = v->data_dev->bdev;
467 bio->bi_sector = verity_map_sector(v, bio->bi_sector);
468
469 if (((unsigned)bio->bi_sector | bio_sectors(bio)) &
470 ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) {
471 DMERR_LIMIT("unaligned io");
472 return -EIO;
473 }
474
475 if ((bio->bi_sector + bio_sectors(bio)) >>
476 (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) {
477 DMERR_LIMIT("io out of range");
478 return -EIO;
479 }
480
481 if (bio_data_dir(bio) == WRITE)
482 return -EIO;
483
484 io = dm_per_bio_data(bio, ti->per_bio_data_size);
485 io->v = v;
486 io->orig_bi_end_io = bio->bi_end_io;
487 io->orig_bi_private = bio->bi_private;
488 io->block = bio->bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT);
489 io->n_blocks = bio->bi_size >> v->data_dev_block_bits;
490
491 bio->bi_end_io = verity_end_io;
492 bio->bi_private = io;
493 io->io_vec_size = bio->bi_vcnt - bio->bi_idx;
494 if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE)
495 io->io_vec = io->io_vec_inline;
496 else
497 io->io_vec = mempool_alloc(v->vec_mempool, GFP_NOIO);
498 memcpy(io->io_vec, bio_iovec(bio),
499 io->io_vec_size * sizeof(struct bio_vec));
500
501 verity_prefetch_io(v, io);
502
503 generic_make_request(bio);
504
505 return DM_MAPIO_SUBMITTED;
506}
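For illustration only, a stand-alone sketch of the alignment test performed at the top of verity_map(), with an assumed 4096-byte data block (eight 512-byte sectors); both the start sector and the length must be block aligned:

#include <stdio.h>

int main(void)
{
	unsigned data_dev_block_bits = 12;	/* assumed: 4096-byte data blocks */
	unsigned sector_shift = 9;		/* 512-byte sectors */
	unsigned long long bi_sector = 24;	/* example start sector */
	unsigned nr_sectors = 16;		/* example length in sectors */

	unsigned mask = (1u << (data_dev_block_bits - sector_shift)) - 1;

	if (((unsigned)bi_sector | nr_sectors) & mask)
		printf("unaligned io\n");
	else
		printf("aligned: starts at block %llu, %u blocks long\n",
		       bi_sector >> (data_dev_block_bits - sector_shift),
		       nr_sectors >> (data_dev_block_bits - sector_shift));
	return 0;
}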
507
508/*
509 * Status: V (valid) or C (corruption found)
510 */
511static int verity_status(struct dm_target *ti, status_type_t type,
512 unsigned status_flags, char *result, unsigned maxlen)
513{
514 struct dm_verity *v = ti->private;
515 unsigned sz = 0;
516 unsigned x;
517
518 switch (type) {
519 case STATUSTYPE_INFO:
520 DMEMIT("%c", v->hash_failed ? 'C' : 'V');
521 break;
522 case STATUSTYPE_TABLE:
523 DMEMIT("%u %s %s %u %u %llu %llu %s ",
524 v->version,
525 v->data_dev->name,
526 v->hash_dev->name,
527 1 << v->data_dev_block_bits,
528 1 << v->hash_dev_block_bits,
529 (unsigned long long)v->data_blocks,
530 (unsigned long long)v->hash_start,
531 v->alg_name
532 );
533 for (x = 0; x < v->digest_size; x++)
534 DMEMIT("%02x", v->root_digest[x]);
535 DMEMIT(" ");
536 if (!v->salt_size)
537 DMEMIT("-");
538 else
539 for (x = 0; x < v->salt_size; x++)
540 DMEMIT("%02x", v->salt[x]);
541 break;
542 }
543
544 return 0;
545}
546
547static int verity_ioctl(struct dm_target *ti, unsigned cmd,
548 unsigned long arg)
549{
550 struct dm_verity *v = ti->private;
551 int r = 0;
552
553 if (v->data_start ||
554 ti->len != i_size_read(v->data_dev->bdev->bd_inode) >> SECTOR_SHIFT)
555 r = scsi_verify_blk_ioctl(NULL, cmd);
556
557 return r ? : __blkdev_driver_ioctl(v->data_dev->bdev, v->data_dev->mode,
558 cmd, arg);
559}
560
561static int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
562 struct bio_vec *biovec, int max_size)
563{
564 struct dm_verity *v = ti->private;
565 struct request_queue *q = bdev_get_queue(v->data_dev->bdev);
566
567 if (!q->merge_bvec_fn)
568 return max_size;
569
570 bvm->bi_bdev = v->data_dev->bdev;
571 bvm->bi_sector = verity_map_sector(v, bvm->bi_sector);
572
573 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
574}
575
576static int verity_iterate_devices(struct dm_target *ti,
577 iterate_devices_callout_fn fn, void *data)
578{
579 struct dm_verity *v = ti->private;
580
581 return fn(ti, v->data_dev, v->data_start, ti->len, data);
582}
583
584static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
585{
586 struct dm_verity *v = ti->private;
587
588 if (limits->logical_block_size < 1 << v->data_dev_block_bits)
589 limits->logical_block_size = 1 << v->data_dev_block_bits;
590
591 if (limits->physical_block_size < 1 << v->data_dev_block_bits)
592 limits->physical_block_size = 1 << v->data_dev_block_bits;
593
594 blk_limits_io_min(limits, limits->logical_block_size);
595}
596
597static void verity_dtr(struct dm_target *ti)
598{
599 struct dm_verity *v = ti->private;
600
601 if (v->verify_wq)
602 destroy_workqueue(v->verify_wq);
603
604 if (v->vec_mempool)
605 mempool_destroy(v->vec_mempool);
606
607 if (v->bufio)
608 dm_bufio_client_destroy(v->bufio);
609
610 kfree(v->salt);
611 kfree(v->root_digest);
612
613 if (v->tfm)
614 crypto_free_shash(v->tfm);
615
616 kfree(v->alg_name);
617
618 if (v->hash_dev)
619 dm_put_device(ti, v->hash_dev);
620
621 if (v->data_dev)
622 dm_put_device(ti, v->data_dev);
623
624 kfree(v);
625}
626
627/*
628 * Target parameters:
629 * <version> The current format is version 1.
 630 * Version 0 is compatible with the original Chromium OS releases.
631 * <data device>
632 * <hash device>
633 * <data block size>
634 * <hash block size>
635 * <the number of data blocks>
636 * <hash start block>
637 * <algorithm>
638 * <digest>
639 * <salt> Hex string or "-" if no salt.
640 */
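To make the parameter list concrete, a hypothetical ten-entry table line expressed as the argv[] this constructor would receive; the device paths, block counts and digest are made-up example values, not taken from any real setup:

/* hypothetical example values only; the digest entry is a placeholder */
static const char *example_verity_table[10] = {
	"1",				/* version */
	"/dev/mapper/data",		/* data device (made up) */
	"/dev/mapper/hash",		/* hash device (made up) */
	"4096",				/* data block size */
	"4096",				/* hash block size */
	"262144",			/* number of data blocks */
	"1",				/* hash start block */
	"sha256",			/* algorithm */
	"<64-hex-digit-root-digest>",	/* placeholder, not a real digest */
	"-",				/* no salt */
};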
641static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
642{
643 struct dm_verity *v;
644 unsigned num;
645 unsigned long long num_ll;
646 int r;
647 int i;
648 sector_t hash_position;
649 char dummy;
650
651 v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL);
652 if (!v) {
653 ti->error = "Cannot allocate verity structure";
654 return -ENOMEM;
655 }
656 ti->private = v;
657 v->ti = ti;
658
659 if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) {
660 ti->error = "Device must be readonly";
661 r = -EINVAL;
662 goto bad;
663 }
664
665 if (argc != 10) {
666 ti->error = "Invalid argument count: exactly 10 arguments required";
667 r = -EINVAL;
668 goto bad;
669 }
670
671 if (sscanf(argv[0], "%d%c", &num, &dummy) != 1 ||
672 num < 0 || num > 1) {
673 ti->error = "Invalid version";
674 r = -EINVAL;
675 goto bad;
676 }
677 v->version = num;
678
679 r = dm_get_device(ti, argv[1], FMODE_READ, &v->data_dev);
680 if (r) {
681 ti->error = "Data device lookup failed";
682 goto bad;
683 }
684
685 r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev);
686 if (r) {
 687 ti->error = "Hash device lookup failed";
688 goto bad;
689 }
690
691 if (sscanf(argv[3], "%u%c", &num, &dummy) != 1 ||
692 !num || (num & (num - 1)) ||
693 num < bdev_logical_block_size(v->data_dev->bdev) ||
694 num > PAGE_SIZE) {
695 ti->error = "Invalid data device block size";
696 r = -EINVAL;
697 goto bad;
698 }
699 v->data_dev_block_bits = ffs(num) - 1;
700
701 if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 ||
702 !num || (num & (num - 1)) ||
703 num < bdev_logical_block_size(v->hash_dev->bdev) ||
704 num > INT_MAX) {
705 ti->error = "Invalid hash device block size";
706 r = -EINVAL;
707 goto bad;
708 }
709 v->hash_dev_block_bits = ffs(num) - 1;
710
711 if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 ||
712 (sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT))
713 >> (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll) {
714 ti->error = "Invalid data blocks";
715 r = -EINVAL;
716 goto bad;
717 }
718 v->data_blocks = num_ll;
719
720 if (ti->len > (v->data_blocks << (v->data_dev_block_bits - SECTOR_SHIFT))) {
721 ti->error = "Data device is too small";
722 r = -EINVAL;
723 goto bad;
724 }
725
726 if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 ||
727 (sector_t)(num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT))
728 >> (v->hash_dev_block_bits - SECTOR_SHIFT) != num_ll) {
729 ti->error = "Invalid hash start";
730 r = -EINVAL;
731 goto bad;
732 }
733 v->hash_start = num_ll;
734
735 v->alg_name = kstrdup(argv[7], GFP_KERNEL);
736 if (!v->alg_name) {
737 ti->error = "Cannot allocate algorithm name";
738 r = -ENOMEM;
739 goto bad;
740 }
741
742 v->tfm = crypto_alloc_shash(v->alg_name, 0, 0);
743 if (IS_ERR(v->tfm)) {
744 ti->error = "Cannot initialize hash function";
745 r = PTR_ERR(v->tfm);
746 v->tfm = NULL;
747 goto bad;
748 }
749 v->digest_size = crypto_shash_digestsize(v->tfm);
750 if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) {
751 ti->error = "Digest size too big";
752 r = -EINVAL;
753 goto bad;
754 }
755 v->shash_descsize =
756 sizeof(struct shash_desc) + crypto_shash_descsize(v->tfm);
757
758 v->root_digest = kmalloc(v->digest_size, GFP_KERNEL);
759 if (!v->root_digest) {
760 ti->error = "Cannot allocate root digest";
761 r = -ENOMEM;
762 goto bad;
763 }
764 if (strlen(argv[8]) != v->digest_size * 2 ||
765 hex2bin(v->root_digest, argv[8], v->digest_size)) {
766 ti->error = "Invalid root digest";
767 r = -EINVAL;
768 goto bad;
769 }
770
771 if (strcmp(argv[9], "-")) {
772 v->salt_size = strlen(argv[9]) / 2;
773 v->salt = kmalloc(v->salt_size, GFP_KERNEL);
774 if (!v->salt) {
775 ti->error = "Cannot allocate salt";
776 r = -ENOMEM;
777 goto bad;
778 }
779 if (strlen(argv[9]) != v->salt_size * 2 ||
780 hex2bin(v->salt, argv[9], v->salt_size)) {
781 ti->error = "Invalid salt";
782 r = -EINVAL;
783 goto bad;
784 }
785 }
786
787 v->hash_per_block_bits =
788 fls((1 << v->hash_dev_block_bits) / v->digest_size) - 1;
789
790 v->levels = 0;
791 if (v->data_blocks)
792 while (v->hash_per_block_bits * v->levels < 64 &&
793 (unsigned long long)(v->data_blocks - 1) >>
794 (v->hash_per_block_bits * v->levels))
795 v->levels++;
796
797 if (v->levels > DM_VERITY_MAX_LEVELS) {
798 ti->error = "Too many tree levels";
799 r = -E2BIG;
800 goto bad;
801 }
802
803 hash_position = v->hash_start;
804 for (i = v->levels - 1; i >= 0; i--) {
805 sector_t s;
806 v->hash_level_block[i] = hash_position;
807 s = verity_position_at_level(v, v->data_blocks, i);
808 s = (s >> v->hash_per_block_bits) +
809 !!(s & ((1 << v->hash_per_block_bits) - 1));
810 if (hash_position + s < hash_position) {
811 ti->error = "Hash device offset overflow";
812 r = -E2BIG;
813 goto bad;
814 }
815 hash_position += s;
816 }
817 v->hash_blocks = hash_position;
818
819 v->bufio = dm_bufio_client_create(v->hash_dev->bdev,
820 1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux),
821 dm_bufio_alloc_callback, NULL);
822 if (IS_ERR(v->bufio)) {
823 ti->error = "Cannot initialize dm-bufio";
824 r = PTR_ERR(v->bufio);
825 v->bufio = NULL;
826 goto bad;
827 }
828
829 if (dm_bufio_get_device_size(v->bufio) < v->hash_blocks) {
830 ti->error = "Hash device is too small";
831 r = -E2BIG;
832 goto bad;
833 }
834
835 ti->per_bio_data_size = roundup(sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2, __alignof__(struct dm_verity_io));
836
837 v->vec_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE,
838 BIO_MAX_PAGES * sizeof(struct bio_vec));
839 if (!v->vec_mempool) {
840 ti->error = "Cannot allocate vector mempool";
841 r = -ENOMEM;
842 goto bad;
843 }
844
845 /* WQ_UNBOUND greatly improves performance when running on ramdisk */
846 v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus());
847 if (!v->verify_wq) {
848 ti->error = "Cannot allocate workqueue";
849 r = -ENOMEM;
850 goto bad;
851 }
852
853 return 0;
854
855bad:
856 verity_dtr(ti);
857
858 return r;
859}
860
861static struct target_type verity_target = {
862 .name = "verity",
863 .version = {1, 1, 0},
864 .module = THIS_MODULE,
865 .ctr = verity_ctr,
866 .dtr = verity_dtr,
867 .map = verity_map,
868 .status = verity_status,
869 .ioctl = verity_ioctl,
870 .merge = verity_merge,
871 .iterate_devices = verity_iterate_devices,
872 .io_hints = verity_io_hints,
873};
874
875static int __init dm_verity_init(void)
876{
877 int r;
878
879 r = dm_register_target(&verity_target);
880 if (r < 0)
881 DMERR("register failed %d", r);
882
883 return r;
884}
885
886static void __exit dm_verity_exit(void)
887{
888 dm_unregister_target(&verity_target);
889}
890
891module_init(dm_verity_init);
892module_exit(dm_verity_exit);
893
894MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
895MODULE_AUTHOR("Mandeep Baines <msb@chromium.org>");
896MODULE_AUTHOR("Will Drewry <wad@chromium.org>");
897MODULE_DESCRIPTION(DM_NAME " target for transparent disk integrity checking");
898MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index 69a5c3b3b34..cc2b3cb8194 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -33,7 +33,8 @@ static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv)
33/* 33/*
34 * Return zeros only on reads 34 * Return zeros only on reads
35 */ 35 */
36static int zero_map(struct dm_target *ti, struct bio *bio) 36static int zero_map(struct dm_target *ti, struct bio *bio,
37 union map_info *map_context)
37{ 38{
38 switch(bio_rw(bio)) { 39 switch(bio_rw(bio)) {
39 case READ: 40 case READ:
@@ -55,7 +56,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio)
55 56
56static struct target_type zero_target = { 57static struct target_type zero_target = {
57 .name = "zero", 58 .name = "zero",
58 .version = {1, 1, 0}, 59 .version = {1, 0, 0},
59 .module = THIS_MODULE, 60 .module = THIS_MODULE,
60 .ctr = zero_ctr, 61 .ctr = zero_ctr,
61 .map = zero_map, 62 .map = zero_map,
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index c72e4d5a961..52b39f335bb 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -14,6 +14,7 @@
14#include <linux/moduleparam.h> 14#include <linux/moduleparam.h>
15#include <linux/blkpg.h> 15#include <linux/blkpg.h>
16#include <linux/bio.h> 16#include <linux/bio.h>
17#include <linux/buffer_head.h>
17#include <linux/mempool.h> 18#include <linux/mempool.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
19#include <linux/idr.h> 20#include <linux/idr.h>
@@ -24,16 +25,6 @@
24 25
25#define DM_MSG_PREFIX "core" 26#define DM_MSG_PREFIX "core"
26 27
27#ifdef CONFIG_PRINTK
28/*
29 * ratelimit state to be used in DMXXX_LIMIT().
30 */
31DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
32 DEFAULT_RATELIMIT_INTERVAL,
33 DEFAULT_RATELIMIT_BURST);
34EXPORT_SYMBOL(dm_ratelimit_state);
35#endif
36
37/* 28/*
38 * Cookies are numeric values sent with CHANGE and REMOVE 29 * Cookies are numeric values sent with CHANGE and REMOVE
39 * uevents while resuming, removing or renaming the device. 30 * uevents while resuming, removing or renaming the device.
@@ -63,6 +54,17 @@ struct dm_io {
63}; 54};
64 55
65/* 56/*
57 * For bio-based dm.
58 * One of these is allocated per target within a bio. Hopefully
59 * this will be simplified out one day.
60 */
61struct dm_target_io {
62 struct dm_io *io;
63 struct dm_target *ti;
64 union map_info info;
65};
66
67/*
66 * For request-based dm. 68 * For request-based dm.
67 * One of these is allocated per request. 69 * One of these is allocated per request.
68 */ 70 */
@@ -75,17 +77,12 @@ struct dm_rq_target_io {
75}; 77};
76 78
77/* 79/*
78 * For request-based dm - the bio clones we allocate are embedded in these 80 * For request-based dm.
79 * structs. 81 * One of these is allocated per bio.
80 *
81 * We allocate these with bio_alloc_bioset, using the front_pad parameter when
82 * the bioset is created - this means the bio has to come at the end of the
83 * struct.
84 */ 82 */
85struct dm_rq_clone_bio_info { 83struct dm_rq_clone_bio_info {
86 struct bio *orig; 84 struct bio *orig;
87 struct dm_rq_target_io *tio; 85 struct dm_rq_target_io *tio;
88 struct bio clone;
89}; 86};
90 87
91union map_info *dm_get_mapinfo(struct bio *bio) 88union map_info *dm_get_mapinfo(struct bio *bio)
@@ -133,8 +130,6 @@ struct mapped_device {
133 /* Protect queue and type against concurrent access. */ 130 /* Protect queue and type against concurrent access. */
134 struct mutex type_lock; 131 struct mutex type_lock;
135 132
136 struct target_type *immutable_target_type;
137
138 struct gendisk *disk; 133 struct gendisk *disk;
139 char name[16]; 134 char name[16];
140 135
@@ -185,6 +180,9 @@ struct mapped_device {
185 /* forced geometry settings */ 180 /* forced geometry settings */
186 struct hd_geometry geometry; 181 struct hd_geometry geometry;
187 182
183 /* For saving the address of __make_request for request based dm */
184 make_request_fn *saved_make_request_fn;
185
188 /* sysfs handle */ 186 /* sysfs handle */
189 struct kobject kobj; 187 struct kobject kobj;
190 188
@@ -203,12 +201,8 @@ struct dm_md_mempools {
203 201
204#define MIN_IOS 256 202#define MIN_IOS 256
205static struct kmem_cache *_io_cache; 203static struct kmem_cache *_io_cache;
204static struct kmem_cache *_tio_cache;
206static struct kmem_cache *_rq_tio_cache; 205static struct kmem_cache *_rq_tio_cache;
207
208/*
209 * Unused now, and needs to be deleted. But since io_pool is overloaded and it's
210 * still used for _io_cache, I'm leaving this for a later cleanup
211 */
212static struct kmem_cache *_rq_bio_info_cache; 206static struct kmem_cache *_rq_bio_info_cache;
213 207
214static int __init local_init(void) 208static int __init local_init(void)
@@ -220,9 +214,14 @@ static int __init local_init(void)
220 if (!_io_cache) 214 if (!_io_cache)
221 return r; 215 return r;
222 216
217 /* allocate a slab for the target ios */
218 _tio_cache = KMEM_CACHE(dm_target_io, 0);
219 if (!_tio_cache)
220 goto out_free_io_cache;
221
223 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); 222 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
224 if (!_rq_tio_cache) 223 if (!_rq_tio_cache)
225 goto out_free_io_cache; 224 goto out_free_tio_cache;
226 225
227 _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0); 226 _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
228 if (!_rq_bio_info_cache) 227 if (!_rq_bio_info_cache)
@@ -248,6 +247,8 @@ out_free_rq_bio_info_cache:
248 kmem_cache_destroy(_rq_bio_info_cache); 247 kmem_cache_destroy(_rq_bio_info_cache);
249out_free_rq_tio_cache: 248out_free_rq_tio_cache:
250 kmem_cache_destroy(_rq_tio_cache); 249 kmem_cache_destroy(_rq_tio_cache);
250out_free_tio_cache:
251 kmem_cache_destroy(_tio_cache);
251out_free_io_cache: 252out_free_io_cache:
252 kmem_cache_destroy(_io_cache); 253 kmem_cache_destroy(_io_cache);
253 254
@@ -258,6 +259,7 @@ static void local_exit(void)
258{ 259{
259 kmem_cache_destroy(_rq_bio_info_cache); 260 kmem_cache_destroy(_rq_bio_info_cache);
260 kmem_cache_destroy(_rq_tio_cache); 261 kmem_cache_destroy(_rq_tio_cache);
262 kmem_cache_destroy(_tio_cache);
261 kmem_cache_destroy(_io_cache); 263 kmem_cache_destroy(_io_cache);
262 unregister_blkdev(_major, _name); 264 unregister_blkdev(_major, _name);
263 dm_uevent_exit(); 265 dm_uevent_exit();
@@ -443,7 +445,7 @@ static void free_io(struct mapped_device *md, struct dm_io *io)
443 445
444static void free_tio(struct mapped_device *md, struct dm_target_io *tio) 446static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
445{ 447{
446 bio_put(&tio->clone); 448 mempool_free(tio, md->tio_pool);
447} 449}
448 450
449static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, 451static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
@@ -457,6 +459,16 @@ static void free_rq_tio(struct dm_rq_target_io *tio)
457 mempool_free(tio, tio->md->tio_pool); 459 mempool_free(tio, tio->md->tio_pool);
458} 460}
459 461
462static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
463{
464 return mempool_alloc(md->io_pool, GFP_ATOMIC);
465}
466
467static void free_bio_info(struct dm_rq_clone_bio_info *info)
468{
469 mempool_free(info, info->tio->md->io_pool);
470}
471
460static int md_in_flight(struct mapped_device *md) 472static int md_in_flight(struct mapped_device *md)
461{ 473{
462 return atomic_read(&md->pending[READ]) + 474 return atomic_read(&md->pending[READ]) +
@@ -645,7 +657,7 @@ static void clone_endio(struct bio *bio, int error)
645 error = -EIO; 657 error = -EIO;
646 658
647 if (endio) { 659 if (endio) {
648 r = endio(tio->ti, bio, error); 660 r = endio(tio->ti, bio, error, &tio->info);
649 if (r < 0 || r == DM_ENDIO_REQUEUE) 661 if (r < 0 || r == DM_ENDIO_REQUEUE)
650 /* 662 /*
651 * error and requeue request are handled 663 * error and requeue request are handled
@@ -661,7 +673,13 @@ static void clone_endio(struct bio *bio, int error)
661 } 673 }
662 } 674 }
663 675
676 /*
677 * Store md for cleanup instead of tio which is about to get freed.
678 */
679 bio->bi_private = md->bs;
680
664 free_tio(md, tio); 681 free_tio(md, tio);
682 bio_put(bio);
665 dec_pending(io, error); 683 dec_pending(io, error);
666} 684}
667 685
@@ -728,14 +746,8 @@ static void rq_completed(struct mapped_device *md, int rw, int run_queue)
728 if (!md_in_flight(md)) 746 if (!md_in_flight(md))
729 wake_up(&md->wait); 747 wake_up(&md->wait);
730 748
731 /*
732 * Run this off this callpath, as drivers could invoke end_io while
733 * inside their request_fn (and holding the queue lock). Calling
734 * back into ->request_fn() could deadlock attempting to grab the
735 * queue lock again.
736 */
737 if (run_queue) 749 if (run_queue)
738 blk_run_queue_async(md->queue); 750 blk_run_queue(md->queue);
739 751
740 /* 752 /*
741 * dm_put() must be at the end of this function. See the comment above 753 * dm_put() must be at the end of this function. See the comment above
@@ -845,14 +857,10 @@ static void dm_done(struct request *clone, int error, bool mapped)
845{ 857{
846 int r = error; 858 int r = error;
847 struct dm_rq_target_io *tio = clone->end_io_data; 859 struct dm_rq_target_io *tio = clone->end_io_data;
848 dm_request_endio_fn rq_end_io = NULL; 860 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
849 861
850 if (tio->ti) { 862 if (mapped && rq_end_io)
851 rq_end_io = tio->ti->type->rq_end_io; 863 r = rq_end_io(tio->ti, clone, error, &tio->info);
852
853 if (mapped && rq_end_io)
854 r = rq_end_io(tio->ti, clone, error, &tio->info);
855 }
856 864
857 if (r <= 0) 865 if (r <= 0)
858 /* The target wants to complete the I/O */ 866 /* The target wants to complete the I/O */
@@ -952,47 +960,28 @@ static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti
952static sector_t max_io_len(sector_t sector, struct dm_target *ti) 960static sector_t max_io_len(sector_t sector, struct dm_target *ti)
953{ 961{
954 sector_t len = max_io_len_target_boundary(sector, ti); 962 sector_t len = max_io_len_target_boundary(sector, ti);
955 sector_t offset, max_len;
956 963
957 /* 964 /*
958 * Does the target need to split even further? 965 * Does the target need to split even further ?
959 */ 966 */
960 if (ti->max_io_len) { 967 if (ti->split_io) {
961 offset = dm_target_offset(ti, sector); 968 sector_t boundary;
962 if (unlikely(ti->max_io_len & (ti->max_io_len - 1))) 969 sector_t offset = dm_target_offset(ti, sector);
963 max_len = sector_div(offset, ti->max_io_len); 970 boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
964 else 971 - offset;
965 max_len = offset & (ti->max_io_len - 1); 972 if (len > boundary)
966 max_len = ti->max_io_len - max_len; 973 len = boundary;
967
968 if (len > max_len)
969 len = max_len;
970 } 974 }
971 975
972 return len; 976 return len;
973} 977}
974 978
975int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) 979static void __map_bio(struct dm_target *ti, struct bio *clone,
976{ 980 struct dm_target_io *tio)
977 if (len > UINT_MAX) {
978 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
979 (unsigned long long)len, UINT_MAX);
980 ti->error = "Maximum size of target IO is too large";
981 return -EINVAL;
982 }
983
984 ti->max_io_len = (uint32_t) len;
985
986 return 0;
987}
988EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
989
990static void __map_bio(struct dm_target *ti, struct dm_target_io *tio)
991{ 981{
992 int r; 982 int r;
993 sector_t sector; 983 sector_t sector;
994 struct mapped_device *md; 984 struct mapped_device *md;
995 struct bio *clone = &tio->clone;
996 985
997 clone->bi_end_io = clone_endio; 986 clone->bi_end_io = clone_endio;
998 clone->bi_private = tio; 987 clone->bi_private = tio;
@@ -1004,7 +993,7 @@ static void __map_bio(struct dm_target *ti, struct dm_target_io *tio)
1004 */ 993 */
1005 atomic_inc(&tio->io->io_count); 994 atomic_inc(&tio->io->io_count);
1006 sector = clone->bi_sector; 995 sector = clone->bi_sector;
1007 r = ti->type->map(ti, clone); 996 r = ti->type->map(ti, clone, &tio->info);
1008 if (r == DM_MAPIO_REMAPPED) { 997 if (r == DM_MAPIO_REMAPPED) {
1009 /* the bio has been remapped so dispatch it */ 998 /* the bio has been remapped so dispatch it */
1010 999
@@ -1016,6 +1005,11 @@ static void __map_bio(struct dm_target *ti, struct dm_target_io *tio)
1016 /* error the io and bail out, or requeue it if needed */ 1005 /* error the io and bail out, or requeue it if needed */
1017 md = tio->io->md; 1006 md = tio->io->md;
1018 dec_pending(tio->io, r); 1007 dec_pending(tio->io, r);
1008 /*
1009 * Store bio_set for cleanup.
1010 */
1011 clone->bi_private = md->bs;
1012 bio_put(clone);
1019 free_tio(md, tio); 1013 free_tio(md, tio);
1020 } else if (r) { 1014 } else if (r) {
1021 DMWARN("unimplemented target map return value: %d", r); 1015 DMWARN("unimplemented target map return value: %d", r);
@@ -1033,16 +1027,25 @@ struct clone_info {
1033 unsigned short idx; 1027 unsigned short idx;
1034}; 1028};
1035 1029
1030static void dm_bio_destructor(struct bio *bio)
1031{
1032 struct bio_set *bs = bio->bi_private;
1033
1034 bio_free(bio, bs);
1035}
1036
1036/* 1037/*
1037 * Creates a little bio that just does part of a bvec. 1038 * Creates a little bio that just does part of a bvec.
1038 */ 1039 */
1039static void split_bvec(struct dm_target_io *tio, struct bio *bio, 1040static struct bio *split_bvec(struct bio *bio, sector_t sector,
1040 sector_t sector, unsigned short idx, unsigned int offset, 1041 unsigned short idx, unsigned int offset,
1041 unsigned int len, struct bio_set *bs) 1042 unsigned int len, struct bio_set *bs)
1042{ 1043{
1043 struct bio *clone = &tio->clone; 1044 struct bio *clone;
1044 struct bio_vec *bv = bio->bi_io_vec + idx; 1045 struct bio_vec *bv = bio->bi_io_vec + idx;
1045 1046
1047 clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
1048 clone->bi_destructor = dm_bio_destructor;
1046 *clone->bi_io_vec = *bv; 1049 *clone->bi_io_vec = *bv;
1047 1050
1048 clone->bi_sector = sector; 1051 clone->bi_sector = sector;
@@ -1055,23 +1058,26 @@ static void split_bvec(struct dm_target_io *tio, struct bio *bio,
1055 clone->bi_flags |= 1 << BIO_CLONED; 1058 clone->bi_flags |= 1 << BIO_CLONED;
1056 1059
1057 if (bio_integrity(bio)) { 1060 if (bio_integrity(bio)) {
1058 bio_integrity_clone(clone, bio, GFP_NOIO); 1061 bio_integrity_clone(clone, bio, GFP_NOIO, bs);
1059 bio_integrity_trim(clone, 1062 bio_integrity_trim(clone,
1060 bio_sector_offset(bio, idx, offset), len); 1063 bio_sector_offset(bio, idx, offset), len);
1061 } 1064 }
1065
1066 return clone;
1062} 1067}
1063 1068
1064/* 1069/*
1065 * Creates a bio that consists of range of complete bvecs. 1070 * Creates a bio that consists of range of complete bvecs.
1066 */ 1071 */
1067static void clone_bio(struct dm_target_io *tio, struct bio *bio, 1072static struct bio *clone_bio(struct bio *bio, sector_t sector,
1068 sector_t sector, unsigned short idx, 1073 unsigned short idx, unsigned short bv_count,
1069 unsigned short bv_count, unsigned int len, 1074 unsigned int len, struct bio_set *bs)
1070 struct bio_set *bs)
1071{ 1075{
1072 struct bio *clone = &tio->clone; 1076 struct bio *clone;
1073 1077
1078 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
1074 __bio_clone(clone, bio); 1079 __bio_clone(clone, bio);
1080 clone->bi_destructor = dm_bio_destructor;
1075 clone->bi_sector = sector; 1081 clone->bi_sector = sector;
1076 clone->bi_idx = idx; 1082 clone->bi_idx = idx;
1077 clone->bi_vcnt = idx + bv_count; 1083 clone->bi_vcnt = idx + bv_count;
@@ -1079,27 +1085,24 @@ static void clone_bio(struct dm_target_io *tio, struct bio *bio,
1079 clone->bi_flags &= ~(1 << BIO_SEG_VALID); 1085 clone->bi_flags &= ~(1 << BIO_SEG_VALID);
1080 1086
1081 if (bio_integrity(bio)) { 1087 if (bio_integrity(bio)) {
1082 bio_integrity_clone(clone, bio, GFP_NOIO); 1088 bio_integrity_clone(clone, bio, GFP_NOIO, bs);
1083 1089
1084 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) 1090 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
1085 bio_integrity_trim(clone, 1091 bio_integrity_trim(clone,
1086 bio_sector_offset(bio, idx, 0), len); 1092 bio_sector_offset(bio, idx, 0), len);
1087 } 1093 }
1094
1095 return clone;
1088} 1096}
1089 1097
1090static struct dm_target_io *alloc_tio(struct clone_info *ci, 1098static struct dm_target_io *alloc_tio(struct clone_info *ci,
1091 struct dm_target *ti, int nr_iovecs) 1099 struct dm_target *ti)
1092{ 1100{
1093 struct dm_target_io *tio; 1101 struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);
1094 struct bio *clone;
1095
1096 clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, ci->md->bs);
1097 tio = container_of(clone, struct dm_target_io, clone);
1098 1102
1099 tio->io = ci->io; 1103 tio->io = ci->io;
1100 tio->ti = ti; 1104 tio->ti = ti;
1101 memset(&tio->info, 0, sizeof(tio->info)); 1105 memset(&tio->info, 0, sizeof(tio->info));
1102 tio->target_request_nr = 0;
1103 1106
1104 return tio; 1107 return tio;
1105} 1108}
@@ -1107,24 +1110,25 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci,
1107static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, 1110static void __issue_target_request(struct clone_info *ci, struct dm_target *ti,
1108 unsigned request_nr, sector_t len) 1111 unsigned request_nr, sector_t len)
1109{ 1112{
1110 struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs); 1113 struct dm_target_io *tio = alloc_tio(ci, ti);
1111 struct bio *clone = &tio->clone; 1114 struct bio *clone;
1112 1115
1113 tio->target_request_nr = request_nr; 1116 tio->info.target_request_nr = request_nr;
1114 1117
1115 /* 1118 /*
1116 * Discard requests require the bio's inline iovecs be initialized. 1119 * Discard requests require the bio's inline iovecs be initialized.
1117 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush 1120 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
1118 * and discard, so no need for concern about wasted bvec allocations. 1121 * and discard, so no need for concern about wasted bvec allocations.
1119 */ 1122 */
1120 1123 clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs);
1121 __bio_clone(clone, ci->bio); 1124 __bio_clone(clone, ci->bio);
1125 clone->bi_destructor = dm_bio_destructor;
1122 if (len) { 1126 if (len) {
1123 clone->bi_sector = ci->sector; 1127 clone->bi_sector = ci->sector;
1124 clone->bi_size = to_bytes(len); 1128 clone->bi_size = to_bytes(len);
1125 } 1129 }
1126 1130
1127 __map_bio(ti, tio); 1131 __map_bio(ti, clone, tio);
1128} 1132}
1129 1133
1130static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, 1134static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
@@ -1153,38 +1157,18 @@ static int __clone_and_map_empty_flush(struct clone_info *ci)
1153 */ 1157 */
1154static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) 1158static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti)
1155{ 1159{
1156 struct bio *bio = ci->bio; 1160 struct bio *clone, *bio = ci->bio;
1157 struct dm_target_io *tio; 1161 struct dm_target_io *tio;
1158 1162
1159 tio = alloc_tio(ci, ti, bio->bi_max_vecs); 1163 tio = alloc_tio(ci, ti);
1160 clone_bio(tio, bio, ci->sector, ci->idx, bio->bi_vcnt - ci->idx, 1164 clone = clone_bio(bio, ci->sector, ci->idx,
1161 ci->sector_count, ci->md->bs); 1165 bio->bi_vcnt - ci->idx, ci->sector_count,
1162 __map_bio(ti, tio); 1166 ci->md->bs);
1167 __map_bio(ti, clone, tio);
1163 ci->sector_count = 0; 1168 ci->sector_count = 0;
1164} 1169}
1165 1170
1166typedef unsigned (*get_num_requests_fn)(struct dm_target *ti); 1171static int __clone_and_map_discard(struct clone_info *ci)
1167
1168static unsigned get_num_discard_requests(struct dm_target *ti)
1169{
1170 return ti->num_discard_requests;
1171}
1172
1173static unsigned get_num_write_same_requests(struct dm_target *ti)
1174{
1175 return ti->num_write_same_requests;
1176}
1177
1178typedef bool (*is_split_required_fn)(struct dm_target *ti);
1179
1180static bool is_split_required_for_discard(struct dm_target *ti)
1181{
1182 return ti->split_discard_requests;
1183}
1184
1185static int __clone_and_map_changing_extent_only(struct clone_info *ci,
1186 get_num_requests_fn get_num_requests,
1187 is_split_required_fn is_split_required)
1188{ 1172{
1189 struct dm_target *ti; 1173 struct dm_target *ti;
1190 sector_t len; 1174 sector_t len;
@@ -1195,18 +1179,15 @@ static int __clone_and_map_changing_extent_only(struct clone_info *ci,
1195 return -EIO; 1179 return -EIO;
1196 1180
1197 /* 1181 /*
1198 * Even though the device advertised support for this type of 1182 * Even though the device advertised discard support,
1199 * request, that does not mean every target supports it, and 1183 * that does not mean every target supports it, and
1200 * reconfiguration might also have changed that since the 1184 * reconfiguration might also have changed that since the
1201 * check was performed. 1185 * check was performed.
1202 */ 1186 */
1203 if (!get_num_requests || !get_num_requests(ti)) 1187 if (!ti->num_discard_requests)
1204 return -EOPNOTSUPP; 1188 return -EOPNOTSUPP;
1205 1189
1206 if (is_split_required && !is_split_required(ti)) 1190 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1207 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1208 else
1209 len = min(ci->sector_count, max_io_len(ci->sector, ti));
1210 1191
1211 __issue_target_requests(ci, ti, ti->num_discard_requests, len); 1192 __issue_target_requests(ci, ti, ti->num_discard_requests, len);
1212 1193
@@ -1216,28 +1197,15 @@ static int __clone_and_map_changing_extent_only(struct clone_info *ci,
1216 return 0; 1197 return 0;
1217} 1198}
1218 1199
1219static int __clone_and_map_discard(struct clone_info *ci)
1220{
1221 return __clone_and_map_changing_extent_only(ci, get_num_discard_requests,
1222 is_split_required_for_discard);
1223}
1224
1225static int __clone_and_map_write_same(struct clone_info *ci)
1226{
1227 return __clone_and_map_changing_extent_only(ci, get_num_write_same_requests, NULL);
1228}
1229
1230static int __clone_and_map(struct clone_info *ci) 1200static int __clone_and_map(struct clone_info *ci)
1231{ 1201{
1232 struct bio *bio = ci->bio; 1202 struct bio *clone, *bio = ci->bio;
1233 struct dm_target *ti; 1203 struct dm_target *ti;
1234 sector_t len = 0, max; 1204 sector_t len = 0, max;
1235 struct dm_target_io *tio; 1205 struct dm_target_io *tio;
1236 1206
1237 if (unlikely(bio->bi_rw & REQ_DISCARD)) 1207 if (unlikely(bio->bi_rw & REQ_DISCARD))
1238 return __clone_and_map_discard(ci); 1208 return __clone_and_map_discard(ci);
1239 else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
1240 return __clone_and_map_write_same(ci);
1241 1209
1242 ti = dm_table_find_target(ci->map, ci->sector); 1210 ti = dm_table_find_target(ci->map, ci->sector);
1243 if (!dm_target_is_valid(ti)) 1211 if (!dm_target_is_valid(ti))
@@ -1271,10 +1239,10 @@ static int __clone_and_map(struct clone_info *ci)
1271 len += bv_len; 1239 len += bv_len;
1272 } 1240 }
1273 1241
1274 tio = alloc_tio(ci, ti, bio->bi_max_vecs); 1242 tio = alloc_tio(ci, ti);
1275 clone_bio(tio, bio, ci->sector, ci->idx, i - ci->idx, len, 1243 clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
1276 ci->md->bs); 1244 ci->md->bs);
1277 __map_bio(ti, tio); 1245 __map_bio(ti, clone, tio);
1278 1246
1279 ci->sector += len; 1247 ci->sector += len;
1280 ci->sector_count -= len; 1248 ci->sector_count -= len;
@@ -1299,11 +1267,12 @@ static int __clone_and_map(struct clone_info *ci)
1299 1267
1300 len = min(remaining, max); 1268 len = min(remaining, max);
1301 1269
1302 tio = alloc_tio(ci, ti, 1); 1270 tio = alloc_tio(ci, ti);
1303 split_bvec(tio, bio, ci->sector, ci->idx, 1271 clone = split_bvec(bio, ci->sector, ci->idx,
1304 bv->bv_offset + offset, len, ci->md->bs); 1272 bv->bv_offset + offset, len,
1273 ci->md->bs);
1305 1274
1306 __map_bio(ti, tio); 1275 __map_bio(ti, clone, tio);
1307 1276
1308 ci->sector += len; 1277 ci->sector += len;
1309 ci->sector_count -= len; 1278 ci->sector_count -= len;
@@ -1422,7 +1391,7 @@ out:
1422 * The request function that just remaps the bio built up by 1391 * The request function that just remaps the bio built up by
1423 * dm_merge_bvec. 1392 * dm_merge_bvec.
1424 */ 1393 */
1425static void _dm_request(struct request_queue *q, struct bio *bio) 1394static int _dm_request(struct request_queue *q, struct bio *bio)
1426{ 1395{
1427 int rw = bio_data_dir(bio); 1396 int rw = bio_data_dir(bio);
1428 struct mapped_device *md = q->queuedata; 1397 struct mapped_device *md = q->queuedata;
@@ -1443,12 +1412,19 @@ static void _dm_request(struct request_queue *q, struct bio *bio)
1443 queue_io(md, bio); 1412 queue_io(md, bio);
1444 else 1413 else
1445 bio_io_error(bio); 1414 bio_io_error(bio);
1446 return; 1415 return 0;
1447 } 1416 }
1448 1417
1449 __split_and_process_bio(md, bio); 1418 __split_and_process_bio(md, bio);
1450 up_read(&md->io_lock); 1419 up_read(&md->io_lock);
1451 return; 1420 return 0;
1421}
1422
1423static int dm_make_request(struct request_queue *q, struct bio *bio)
1424{
1425 struct mapped_device *md = q->queuedata;
1426
1427 return md->saved_make_request_fn(q, bio); /* call __make_request() */
1452} 1428}
1453 1429
1454static int dm_request_based(struct mapped_device *md) 1430static int dm_request_based(struct mapped_device *md)
@@ -1456,14 +1432,14 @@ static int dm_request_based(struct mapped_device *md)
1456 return blk_queue_stackable(md->queue); 1432 return blk_queue_stackable(md->queue);
1457} 1433}
1458 1434
1459static void dm_request(struct request_queue *q, struct bio *bio) 1435static int dm_request(struct request_queue *q, struct bio *bio)
1460{ 1436{
1461 struct mapped_device *md = q->queuedata; 1437 struct mapped_device *md = q->queuedata;
1462 1438
1463 if (dm_request_based(md)) 1439 if (dm_request_based(md))
1464 blk_queue_bio(q, bio); 1440 return dm_make_request(q, bio);
1465 else 1441
1466 _dm_request(q, bio); 1442 return _dm_request(q, bio);
1467} 1443}
1468 1444
1469void dm_dispatch_request(struct request *rq) 1445void dm_dispatch_request(struct request *rq)
@@ -1480,17 +1456,30 @@ void dm_dispatch_request(struct request *rq)
1480} 1456}
1481EXPORT_SYMBOL_GPL(dm_dispatch_request); 1457EXPORT_SYMBOL_GPL(dm_dispatch_request);
1482 1458
1459static void dm_rq_bio_destructor(struct bio *bio)
1460{
1461 struct dm_rq_clone_bio_info *info = bio->bi_private;
1462 struct mapped_device *md = info->tio->md;
1463
1464 free_bio_info(info);
1465 bio_free(bio, md->bs);
1466}
1467
1483static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 1468static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1484 void *data) 1469 void *data)
1485{ 1470{
1486 struct dm_rq_target_io *tio = data; 1471 struct dm_rq_target_io *tio = data;
1487 struct dm_rq_clone_bio_info *info = 1472 struct mapped_device *md = tio->md;
1488 container_of(bio, struct dm_rq_clone_bio_info, clone); 1473 struct dm_rq_clone_bio_info *info = alloc_bio_info(md);
1474
1475 if (!info)
1476 return -ENOMEM;
1489 1477
1490 info->orig = bio_orig; 1478 info->orig = bio_orig;
1491 info->tio = tio; 1479 info->tio = tio;
1492 bio->bi_end_io = end_clone_bio; 1480 bio->bi_end_io = end_clone_bio;
1493 bio->bi_private = info; 1481 bio->bi_private = info;
1482 bio->bi_destructor = dm_rq_bio_destructor;
1494 1483
1495 return 0; 1484 return 0;
1496} 1485}
@@ -1575,6 +1564,15 @@ static int map_request(struct dm_target *ti, struct request *clone,
1575 int r, requeued = 0; 1564 int r, requeued = 0;
1576 struct dm_rq_target_io *tio = clone->end_io_data; 1565 struct dm_rq_target_io *tio = clone->end_io_data;
1577 1566
1567 /*
1568 * Hold the md reference here for the in-flight I/O.
1569 * We can't rely on the reference count by device opener,
1570 * because the device may be closed during the request completion
1571 * when all bios are completed.
1572 * See the comment in rq_completed() too.
1573 */
1574 dm_get(md);
1575
1578 tio->ti = ti; 1576 tio->ti = ti;
1579 r = ti->type->map_rq(ti, clone, &tio->info); 1577 r = ti->type->map_rq(ti, clone, &tio->info);
1580 switch (r) { 1578 switch (r) {
@@ -1606,26 +1604,6 @@ static int map_request(struct dm_target *ti, struct request *clone,
1606 return requeued; 1604 return requeued;
1607} 1605}
1608 1606
1609static struct request *dm_start_request(struct mapped_device *md, struct request *orig)
1610{
1611 struct request *clone;
1612
1613 blk_start_request(orig);
1614 clone = orig->special;
1615 atomic_inc(&md->pending[rq_data_dir(clone)]);
1616
1617 /*
1618 * Hold the md reference here for the in-flight I/O.
1619 * We can't rely on the reference count by device opener,
1620 * because the device may be closed during the request completion
1621 * when all bios are completed.
1622 * See the comment in rq_completed() too.
1623 */
1624 dm_get(md);
1625
1626 return clone;
1627}
1628
1629/* 1607/*
1630 * q->request_fn for request-based dm. 1608 * q->request_fn for request-based dm.
1631 * Called with the queue lock held. 1609 * Called with the queue lock held.
@@ -1655,21 +1633,14 @@ static void dm_request_fn(struct request_queue *q)
1655 pos = blk_rq_pos(rq); 1633 pos = blk_rq_pos(rq);
1656 1634
1657 ti = dm_table_find_target(map, pos); 1635 ti = dm_table_find_target(map, pos);
1658 if (!dm_target_is_valid(ti)) { 1636 BUG_ON(!dm_target_is_valid(ti));
1659 /*
1660 * Must perform setup, that dm_done() requires,
1661 * before calling dm_kill_unmapped_request
1662 */
1663 DMERR_LIMIT("request attempted access beyond the end of device");
1664 clone = dm_start_request(md, rq);
1665 dm_kill_unmapped_request(clone, -EIO);
1666 continue;
1667 }
1668 1637
1669 if (ti->type->busy && ti->type->busy(ti)) 1638 if (ti->type->busy && ti->type->busy(ti))
1670 goto delay_and_out; 1639 goto delay_and_out;
1671 1640
1672 clone = dm_start_request(md, rq); 1641 blk_start_request(rq);
1642 clone = rq->special;
1643 atomic_inc(&md->pending[rq_data_dir(clone)]);
1673 1644
1674 spin_unlock(q->queue_lock); 1645 spin_unlock(q->queue_lock);
1675 if (map_request(ti, clone, md)) 1646 if (map_request(ti, clone, md))
@@ -1689,6 +1660,8 @@ delay_and_out:
1689 blk_delay_queue(q, HZ / 10); 1660 blk_delay_queue(q, HZ / 10);
1690out: 1661out:
1691 dm_table_put(map); 1662 dm_table_put(map);
1663
1664 return;
1692} 1665}
1693 1666
1694int dm_underlying_device_busy(struct request_queue *q) 1667int dm_underlying_device_busy(struct request_queue *q)
@@ -1969,20 +1942,13 @@ static void free_dev(struct mapped_device *md)
1969 1942
1970static void __bind_mempools(struct mapped_device *md, struct dm_table *t) 1943static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
1971{ 1944{
1972 struct dm_md_mempools *p = dm_table_get_md_mempools(t); 1945 struct dm_md_mempools *p;
1973 1946
1974 if (md->io_pool && (md->tio_pool || dm_table_get_type(t) == DM_TYPE_BIO_BASED) && md->bs) { 1947 if (md->io_pool && md->tio_pool && md->bs)
1975 /* 1948 /* the md already has necessary mempools */
1976 * The md already has necessary mempools. Reload just the
1977 * bioset because front_pad may have changed because
1978 * a different table was loaded.
1979 */
1980 bioset_free(md->bs);
1981 md->bs = p->bs;
1982 p->bs = NULL;
1983 goto out; 1949 goto out;
1984 }
1985 1950
1951 p = dm_table_get_md_mempools(t);
1986 BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); 1952 BUG_ON(!p || md->io_pool || md->tio_pool || md->bs);
1987 1953
1988 md->io_pool = p->io_pool; 1954 md->io_pool = p->io_pool;
@@ -2120,8 +2086,6 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2120 write_lock_irqsave(&md->map_lock, flags); 2086 write_lock_irqsave(&md->map_lock, flags);
2121 old_map = md->map; 2087 old_map = md->map;
2122 md->map = t; 2088 md->map = t;
2123 md->immutable_target_type = dm_table_get_immutable_target_type(t);
2124
2125 dm_table_set_restrictions(t, q, limits); 2089 dm_table_set_restrictions(t, q, limits);
2126 if (merge_is_optional) 2090 if (merge_is_optional)
2127 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2091 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
@@ -2192,11 +2156,6 @@ unsigned dm_get_md_type(struct mapped_device *md)
2192 return md->type; 2156 return md->type;
2193} 2157}
2194 2158
2195struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2196{
2197 return md->immutable_target_type;
2198}
2199
2200/* 2159/*
2201 * Fully initialize a request-based queue (->elevator, ->request_fn, etc). 2160 * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
2202 */ 2161 */
@@ -2213,6 +2172,7 @@ static int dm_init_request_based_queue(struct mapped_device *md)
2213 return 0; 2172 return 0;
2214 2173
2215 md->queue = q; 2174 md->queue = q;
2175 md->saved_make_request_fn = md->queue->make_request_fn;
2216 dm_init_md_queue(md); 2176 dm_init_md_queue(md);
2217 blk_queue_softirq_done(md->queue, dm_softirq_done); 2177 blk_queue_softirq_done(md->queue, dm_softirq_done);
2218 blk_queue_prep_rq(md->queue, dm_prep_fn); 2178 blk_queue_prep_rq(md->queue, dm_prep_fn);
@@ -2271,7 +2231,6 @@ struct mapped_device *dm_get_md(dev_t dev)
2271 2231
2272 return md; 2232 return md;
2273} 2233}
2274EXPORT_SYMBOL_GPL(dm_get_md);
2275 2234
2276void *dm_get_mdptr(struct mapped_device *md) 2235void *dm_get_mdptr(struct mapped_device *md)
2277{ 2236{
@@ -2357,6 +2316,7 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2357 while (1) { 2316 while (1) {
2358 set_current_state(interruptible); 2317 set_current_state(interruptible);
2359 2318
2319 smp_mb();
2360 if (!md_in_flight(md)) 2320 if (!md_in_flight(md))
2361 break; 2321 break;
2362 2322
@@ -2419,7 +2379,7 @@ static void dm_queue_flush(struct mapped_device *md)
2419 */ 2379 */
2420struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) 2380struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2421{ 2381{
2422 struct dm_table *live_map, *map = ERR_PTR(-EINVAL); 2382 struct dm_table *map = ERR_PTR(-EINVAL);
2423 struct queue_limits limits; 2383 struct queue_limits limits;
2424 int r; 2384 int r;
2425 2385
@@ -2429,19 +2389,6 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2429 if (!dm_suspended_md(md)) 2389 if (!dm_suspended_md(md))
2430 goto out; 2390 goto out;
2431 2391
2432 /*
2433 * If the new table has no data devices, retain the existing limits.
2434 * This helps multipath with queue_if_no_path if all paths disappear,
2435 * then new I/O is queued based on these limits, and then some paths
2436 * reappear.
2437 */
2438 if (dm_table_has_no_data_devices(table)) {
2439 live_map = dm_get_live_table(md);
2440 if (live_map)
2441 limits = md->queue->limits;
2442 dm_table_put(live_map);
2443 }
2444
2445 r = dm_calculate_queue_limits(table, &limits); 2392 r = dm_calculate_queue_limits(table, &limits);
2446 if (r) { 2393 if (r) {
2447 map = ERR_PTR(r); 2394 map = ERR_PTR(r);
@@ -2741,7 +2688,7 @@ int dm_noflush_suspending(struct dm_target *ti)
2741} 2688}
2742EXPORT_SYMBOL_GPL(dm_noflush_suspending); 2689EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2743 2690
2744struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size) 2691struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
2745{ 2692{
2746 struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL); 2693 struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
2747 unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS; 2694 unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;
@@ -2749,26 +2696,19 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
2749 if (!pools) 2696 if (!pools)
2750 return NULL; 2697 return NULL;
2751 2698
2752 per_bio_data_size = roundup(per_bio_data_size, __alignof__(struct dm_target_io));
2753
2754 pools->io_pool = (type == DM_TYPE_BIO_BASED) ? 2699 pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
2755 mempool_create_slab_pool(MIN_IOS, _io_cache) : 2700 mempool_create_slab_pool(MIN_IOS, _io_cache) :
2756 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache); 2701 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
2757 if (!pools->io_pool) 2702 if (!pools->io_pool)
2758 goto free_pools_and_out; 2703 goto free_pools_and_out;
2759 2704
2760 pools->tio_pool = NULL; 2705 pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
2761 if (type == DM_TYPE_REQUEST_BASED) { 2706 mempool_create_slab_pool(MIN_IOS, _tio_cache) :
2762 pools->tio_pool = mempool_create_slab_pool(MIN_IOS, _rq_tio_cache); 2707 mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
2763 if (!pools->tio_pool) 2708 if (!pools->tio_pool)
2764 goto free_io_pool_and_out; 2709 goto free_io_pool_and_out;
2765 }
2766 2710
2767 pools->bs = (type == DM_TYPE_BIO_BASED) ? 2711 pools->bs = bioset_create(pool_size, 0);
2768 bioset_create(pool_size,
2769 per_bio_data_size + offsetof(struct dm_target_io, clone)) :
2770 bioset_create(pool_size,
2771 offsetof(struct dm_rq_clone_bio_info, clone));
2772 if (!pools->bs) 2712 if (!pools->bs)
2773 goto free_tio_pool_and_out; 2713 goto free_tio_pool_and_out;
2774 2714
@@ -2781,8 +2721,7 @@ free_bioset_and_out:
2781 bioset_free(pools->bs); 2721 bioset_free(pools->bs);
2782 2722
2783free_tio_pool_and_out: 2723free_tio_pool_and_out:
2784 if (pools->tio_pool) 2724 mempool_destroy(pools->tio_pool);
2785 mempool_destroy(pools->tio_pool);
2786 2725
2787free_io_pool_and_out: 2726free_io_pool_and_out:
2788 mempool_destroy(pools->io_pool); 2727 mempool_destroy(pools->io_pool);
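
The dm.c hunks above revert dm_alloc_md_mempools() to the older scheme: no per_bio_data_size argument, a tio_pool created for both bio-based and request-based devices, and a plain bioset. Either way, the point of preallocating mempools is that I/O bookkeeping structures can still be obtained under memory pressure. Below is a minimal user-space sketch of that reserve-then-fallback idea; it is only an analogy, not DM code, and the names (tiny_pool and friends) are invented for illustration.

#include <stdio.h>
#include <stdlib.h>

#define POOL_MIN 16

struct tiny_pool {
        void *slots[POOL_MIN];          /* preallocated reserve objects */
        int nr_free;
        size_t obj_size;
};

static struct tiny_pool *tiny_pool_create(size_t obj_size)
{
        struct tiny_pool *p = calloc(1, sizeof(*p));

        if (!p)
                return NULL;
        p->obj_size = obj_size;
        /* Fill the reserve up front, roughly what mempool_create_slab_pool(MIN_IOS, ...) does. */
        for (p->nr_free = 0; p->nr_free < POOL_MIN; p->nr_free++) {
                p->slots[p->nr_free] = malloc(obj_size);
                if (!p->slots[p->nr_free])
                        break;
        }
        return p;
}

static void *tiny_pool_alloc(struct tiny_pool *p)
{
        /* Prefer the reserve; fall back to the general allocator. */
        if (p->nr_free > 0)
                return p->slots[--p->nr_free];
        return malloc(p->obj_size);
}

static void tiny_pool_free(struct tiny_pool *p, void *obj)
{
        /* Refill the reserve before giving memory back to the system. */
        if (p->nr_free < POOL_MIN)
                p->slots[p->nr_free++] = obj;
        else
                free(obj);
}

int main(void)
{
        struct tiny_pool *p = tiny_pool_create(64);
        void *io;

        if (!p)
                return 1;
        io = tiny_pool_alloc(p);
        tiny_pool_free(p, io);
        printf("reserve objects available: %d\n", p->nr_free);
        return 0;
}

The real mempool_alloc() goes further: when the reserve is empty and the allocation may block, it waits for an element to be returned rather than failing, which is what guarantees forward progress during writeback.
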
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 45b97da1bd0..6745dbd278a 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -23,11 +23,6 @@
23#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) 23#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1)
24 24
25/* 25/*
26 * Status feature flags
27 */
28#define DM_STATUS_NOFLUSH_FLAG (1 << 0)
29
30/*
31 * Type of table and mapped_device's mempool 26 * Type of table and mapped_device's mempool
32 */ 27 */
33#define DM_TYPE_NONE 0 28#define DM_TYPE_NONE 0
@@ -54,7 +49,6 @@ void dm_table_event_callback(struct dm_table *t,
54 void (*fn)(void *), void *context); 49 void (*fn)(void *), void *context);
55struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); 50struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
56struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); 51struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
57bool dm_table_has_no_data_devices(struct dm_table *table);
58int dm_calculate_queue_limits(struct dm_table *table, 52int dm_calculate_queue_limits(struct dm_table *table,
59 struct queue_limits *limits); 53 struct queue_limits *limits);
60void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, 54void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
@@ -66,7 +60,6 @@ int dm_table_resume_targets(struct dm_table *t);
66int dm_table_any_congested(struct dm_table *t, int bdi_bits); 60int dm_table_any_congested(struct dm_table *t, int bdi_bits);
67int dm_table_any_busy_target(struct dm_table *t); 61int dm_table_any_busy_target(struct dm_table *t);
68unsigned dm_table_get_type(struct dm_table *t); 62unsigned dm_table_get_type(struct dm_table *t);
69struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
70bool dm_table_request_based(struct dm_table *t); 63bool dm_table_request_based(struct dm_table *t);
71bool dm_table_supports_discards(struct dm_table *t); 64bool dm_table_supports_discards(struct dm_table *t);
72int dm_table_alloc_md_mempools(struct dm_table *t); 65int dm_table_alloc_md_mempools(struct dm_table *t);
@@ -79,7 +72,6 @@ void dm_lock_md_type(struct mapped_device *md);
79void dm_unlock_md_type(struct mapped_device *md); 72void dm_unlock_md_type(struct mapped_device *md);
80void dm_set_md_type(struct mapped_device *md, unsigned type); 73void dm_set_md_type(struct mapped_device *md, unsigned type);
81unsigned dm_get_md_type(struct mapped_device *md); 74unsigned dm_get_md_type(struct mapped_device *md);
82struct target_type *dm_get_immutable_target_type(struct mapped_device *md);
83 75
84int dm_setup_md_queue(struct mapped_device *md); 76int dm_setup_md_queue(struct mapped_device *md);
85 77
@@ -159,7 +151,7 @@ void dm_kcopyd_exit(void);
159/* 151/*
160 * Mempool operations 152 * Mempool operations
161 */ 153 */
162struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size); 154struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity);
163void dm_free_md_mempools(struct dm_md_mempools *pools); 155void dm_free_md_mempools(struct dm_md_mempools *pools);
164 156
165#endif 157#endif
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 5e7dc772f5d..23078dabb6d 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -63,7 +63,6 @@
63 63
64#define MaxFault 50 64#define MaxFault 50
65#include <linux/blkdev.h> 65#include <linux/blkdev.h>
66#include <linux/module.h>
67#include <linux/raid/md_u.h> 66#include <linux/raid/md_u.h>
68#include <linux/slab.h> 67#include <linux/slab.h>
69#include "md.h" 68#include "md.h"
@@ -82,16 +81,16 @@ static void faulty_fail(struct bio *bio, int error)
82 bio_io_error(b); 81 bio_io_error(b);
83} 82}
84 83
85struct faulty_conf { 84typedef struct faulty_conf {
86 int period[Modes]; 85 int period[Modes];
87 atomic_t counters[Modes]; 86 atomic_t counters[Modes];
88 sector_t faults[MaxFault]; 87 sector_t faults[MaxFault];
89 int modes[MaxFault]; 88 int modes[MaxFault];
90 int nfaults; 89 int nfaults;
91 struct md_rdev *rdev; 90 mdk_rdev_t *rdev;
92}; 91} conf_t;
93 92
94static int check_mode(struct faulty_conf *conf, int mode) 93static int check_mode(conf_t *conf, int mode)
95{ 94{
96 if (conf->period[mode] == 0 && 95 if (conf->period[mode] == 0 &&
97 atomic_read(&conf->counters[mode]) <= 0) 96 atomic_read(&conf->counters[mode]) <= 0)
@@ -106,7 +105,7 @@ static int check_mode(struct faulty_conf *conf, int mode)
106 return 0; 105 return 0;
107} 106}
108 107
109static int check_sector(struct faulty_conf *conf, sector_t start, sector_t end, int dir) 108static int check_sector(conf_t *conf, sector_t start, sector_t end, int dir)
110{ 109{
111 /* If we find a ReadFixable sector, we fix it ... */ 110 /* If we find a ReadFixable sector, we fix it ... */
112 int i; 111 int i;
@@ -130,7 +129,7 @@ static int check_sector(struct faulty_conf *conf, sector_t start, sector_t end,
130 return 0; 129 return 0;
131} 130}
132 131
133static void add_sector(struct faulty_conf *conf, sector_t start, int mode) 132static void add_sector(conf_t *conf, sector_t start, int mode)
134{ 133{
135 int i; 134 int i;
136 int n = conf->nfaults; 135 int n = conf->nfaults;
@@ -170,9 +169,9 @@ static void add_sector(struct faulty_conf *conf, sector_t start, int mode)
170 conf->nfaults = n+1; 169 conf->nfaults = n+1;
171} 170}
172 171
173static void make_request(struct mddev *mddev, struct bio *bio) 172static int make_request(mddev_t *mddev, struct bio *bio)
174{ 173{
175 struct faulty_conf *conf = mddev->private; 174 conf_t *conf = mddev->private;
176 int failit = 0; 175 int failit = 0;
177 176
178 if (bio_data_dir(bio) == WRITE) { 177 if (bio_data_dir(bio) == WRITE) {
@@ -182,7 +181,7 @@ static void make_request(struct mddev *mddev, struct bio *bio)
182 * just fail immediately 181 * just fail immediately
183 */ 182 */
184 bio_endio(bio, -EIO); 183 bio_endio(bio, -EIO);
185 return; 184 return 0;
186 } 185 }
187 186
188 if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9), 187 if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9),
@@ -212,20 +211,20 @@ static void make_request(struct mddev *mddev, struct bio *bio)
212 } 211 }
213 if (failit) { 212 if (failit) {
214 struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev); 213 struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev);
215
216 b->bi_bdev = conf->rdev->bdev; 214 b->bi_bdev = conf->rdev->bdev;
217 b->bi_private = bio; 215 b->bi_private = bio;
218 b->bi_end_io = faulty_fail; 216 b->bi_end_io = faulty_fail;
219 bio = b; 217 generic_make_request(b);
220 } else 218 return 0;
219 } else {
221 bio->bi_bdev = conf->rdev->bdev; 220 bio->bi_bdev = conf->rdev->bdev;
222 221 return 1;
223 generic_make_request(bio); 222 }
224} 223}
225 224
226static void status(struct seq_file *seq, struct mddev *mddev) 225static void status(struct seq_file *seq, mddev_t *mddev)
227{ 226{
228 struct faulty_conf *conf = mddev->private; 227 conf_t *conf = mddev->private;
229 int n; 228 int n;
230 229
231 if ((n=atomic_read(&conf->counters[WriteTransient])) != 0) 230 if ((n=atomic_read(&conf->counters[WriteTransient])) != 0)
@@ -256,11 +255,11 @@ static void status(struct seq_file *seq, struct mddev *mddev)
256} 255}
257 256
258 257
259static int reshape(struct mddev *mddev) 258static int reshape(mddev_t *mddev)
260{ 259{
261 int mode = mddev->new_layout & ModeMask; 260 int mode = mddev->new_layout & ModeMask;
262 int count = mddev->new_layout >> ModeShift; 261 int count = mddev->new_layout >> ModeShift;
263 struct faulty_conf *conf = mddev->private; 262 conf_t *conf = mddev->private;
264 263
265 if (mddev->new_layout < 0) 264 if (mddev->new_layout < 0)
266 return 0; 265 return 0;
@@ -285,7 +284,7 @@ static int reshape(struct mddev *mddev)
285 return 0; 284 return 0;
286} 285}
287 286
288static sector_t faulty_size(struct mddev *mddev, sector_t sectors, int raid_disks) 287static sector_t faulty_size(mddev_t *mddev, sector_t sectors, int raid_disks)
289{ 288{
290 WARN_ONCE(raid_disks, 289 WARN_ONCE(raid_disks,
291 "%s does not support generic reshape\n", __func__); 290 "%s does not support generic reshape\n", __func__);
@@ -296,11 +295,11 @@ static sector_t faulty_size(struct mddev *mddev, sector_t sectors, int raid_disk
296 return sectors; 295 return sectors;
297} 296}
298 297
299static int run(struct mddev *mddev) 298static int run(mddev_t *mddev)
300{ 299{
301 struct md_rdev *rdev; 300 mdk_rdev_t *rdev;
302 int i; 301 int i;
303 struct faulty_conf *conf; 302 conf_t *conf;
304 303
305 if (md_check_no_bitmap(mddev)) 304 if (md_check_no_bitmap(mddev))
306 return -EINVAL; 305 return -EINVAL;
@@ -315,11 +314,8 @@ static int run(struct mddev *mddev)
315 } 314 }
316 conf->nfaults = 0; 315 conf->nfaults = 0;
317 316
318 rdev_for_each(rdev, mddev) { 317 list_for_each_entry(rdev, &mddev->disks, same_set)
319 conf->rdev = rdev; 318 conf->rdev = rdev;
320 disk_stack_limits(mddev->gendisk, rdev->bdev,
321 rdev->data_offset << 9);
322 }
323 319
324 md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); 320 md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
325 mddev->private = conf; 321 mddev->private = conf;
@@ -329,16 +325,16 @@ static int run(struct mddev *mddev)
329 return 0; 325 return 0;
330} 326}
331 327
332static int stop(struct mddev *mddev) 328static int stop(mddev_t *mddev)
333{ 329{
334 struct faulty_conf *conf = mddev->private; 330 conf_t *conf = mddev->private;
335 331
336 kfree(conf); 332 kfree(conf);
337 mddev->private = NULL; 333 mddev->private = NULL;
338 return 0; 334 return 0;
339} 335}
340 336
341static struct md_personality faulty_personality = 337static struct mdk_personality faulty_personality =
342{ 338{
343 .name = "faulty", 339 .name = "faulty",
344 .level = LEVEL_FAULTY, 340 .level = LEVEL_FAULTY,
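
The md/faulty personality above keeps a small table of injected fault sectors (add_sector()/check_sector()) and fails, fixes or persists requests that overlap one, depending on the mode. A rough self-contained sketch of just the sector-table bookkeeping, with invented names and without the mode/period handling, might look like this:

#include <stdio.h>

#define MAX_FAULT 50

struct fault_table {
        unsigned long long sectors[MAX_FAULT];
        int nfaults;
};

/* Remember a sector as faulty; silently drop it if the table is full. */
static void add_fault(struct fault_table *t, unsigned long long sector)
{
        if (t->nfaults < MAX_FAULT)
                t->sectors[t->nfaults++] = sector;
}

/* Does the request [start, end) touch any recorded fault? */
static int hits_fault(const struct fault_table *t,
                      unsigned long long start, unsigned long long end)
{
        int i;

        for (i = 0; i < t->nfaults; i++)
                if (t->sectors[i] >= start && t->sectors[i] < end)
                        return 1;
        return 0;
}

int main(void)
{
        struct fault_table t = { .nfaults = 0 };

        add_fault(&t, 128);
        printf("0..64   hits fault: %d\n", hits_fault(&t, 0, 64));
        printf("96..160 hits fault: %d\n", hits_fault(&t, 96, 160));
        return 0;
}
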
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 21014836bdb..6cd2c313e80 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -19,7 +19,6 @@
19#include <linux/blkdev.h> 19#include <linux/blkdev.h>
20#include <linux/raid/md_u.h> 20#include <linux/raid/md_u.h>
21#include <linux/seq_file.h> 21#include <linux/seq_file.h>
22#include <linux/module.h>
23#include <linux/slab.h> 22#include <linux/slab.h>
24#include "md.h" 23#include "md.h"
25#include "linear.h" 24#include "linear.h"
@@ -27,10 +26,10 @@
27/* 26/*
28 * find which device holds a particular offset 27 * find which device holds a particular offset
29 */ 28 */
30static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) 29static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
31{ 30{
32 int lo, mid, hi; 31 int lo, mid, hi;
33 struct linear_conf *conf; 32 linear_conf_t *conf;
34 33
35 lo = 0; 34 lo = 0;
36 hi = mddev->raid_disks - 1; 35 hi = mddev->raid_disks - 1;
@@ -64,23 +63,14 @@ static int linear_mergeable_bvec(struct request_queue *q,
64 struct bvec_merge_data *bvm, 63 struct bvec_merge_data *bvm,
65 struct bio_vec *biovec) 64 struct bio_vec *biovec)
66{ 65{
67 struct mddev *mddev = q->queuedata; 66 mddev_t *mddev = q->queuedata;
68 struct dev_info *dev0; 67 dev_info_t *dev0;
69 unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; 68 unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
70 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 69 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
71 int maxbytes = biovec->bv_len;
72 struct request_queue *subq;
73 70
74 rcu_read_lock(); 71 rcu_read_lock();
75 dev0 = which_dev(mddev, sector); 72 dev0 = which_dev(mddev, sector);
76 maxsectors = dev0->end_sector - sector; 73 maxsectors = dev0->end_sector - sector;
77 subq = bdev_get_queue(dev0->rdev->bdev);
78 if (subq->merge_bvec_fn) {
79 bvm->bi_bdev = dev0->rdev->bdev;
80 bvm->bi_sector -= dev0->end_sector - dev0->rdev->sectors;
81 maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm,
82 biovec));
83 }
84 rcu_read_unlock(); 74 rcu_read_unlock();
85 75
86 if (maxsectors < bio_sectors) 76 if (maxsectors < bio_sectors)
@@ -89,18 +79,18 @@ static int linear_mergeable_bvec(struct request_queue *q,
89 maxsectors -= bio_sectors; 79 maxsectors -= bio_sectors;
90 80
91 if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0) 81 if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0)
92 return maxbytes; 82 return biovec->bv_len;
93 83 /* The bytes available at this offset could be really big,
94 if (maxsectors > (maxbytes >> 9)) 84 * so we cap at 2^31 to avoid overflow */
95 return maxbytes; 85 if (maxsectors > (1 << (31-9)))
96 else 86 return 1<<31;
97 return maxsectors << 9; 87 return maxsectors << 9;
98} 88}
99 89
100static int linear_congested(void *data, int bits) 90static int linear_congested(void *data, int bits)
101{ 91{
102 struct mddev *mddev = data; 92 mddev_t *mddev = data;
103 struct linear_conf *conf; 93 linear_conf_t *conf;
104 int i, ret = 0; 94 int i, ret = 0;
105 95
106 if (mddev_congested(mddev, bits)) 96 if (mddev_congested(mddev, bits))
@@ -118,9 +108,9 @@ static int linear_congested(void *data, int bits)
118 return ret; 108 return ret;
119} 109}
120 110
121static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks) 111static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks)
122{ 112{
123 struct linear_conf *conf; 113 linear_conf_t *conf;
124 sector_t array_sectors; 114 sector_t array_sectors;
125 115
126 rcu_read_lock(); 116 rcu_read_lock();
@@ -133,14 +123,13 @@ static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disk
133 return array_sectors; 123 return array_sectors;
134} 124}
135 125
136static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) 126static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
137{ 127{
138 struct linear_conf *conf; 128 linear_conf_t *conf;
139 struct md_rdev *rdev; 129 mdk_rdev_t *rdev;
140 int i, cnt; 130 int i, cnt;
141 bool discard_supported = false;
142 131
143 conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info), 132 conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t),
144 GFP_KERNEL); 133 GFP_KERNEL);
145 if (!conf) 134 if (!conf)
146 return NULL; 135 return NULL;
@@ -148,9 +137,9 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
148 cnt = 0; 137 cnt = 0;
149 conf->array_sectors = 0; 138 conf->array_sectors = 0;
150 139
151 rdev_for_each(rdev, mddev) { 140 list_for_each_entry(rdev, &mddev->disks, same_set) {
152 int j = rdev->raid_disk; 141 int j = rdev->raid_disk;
153 struct dev_info *disk = conf->disks + j; 142 dev_info_t *disk = conf->disks + j;
154 sector_t sectors; 143 sector_t sectors;
155 144
156 if (j < 0 || j >= raid_disks || disk->rdev) { 145 if (j < 0 || j >= raid_disks || disk->rdev) {
@@ -168,12 +157,19 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
168 157
169 disk_stack_limits(mddev->gendisk, rdev->bdev, 158 disk_stack_limits(mddev->gendisk, rdev->bdev,
170 rdev->data_offset << 9); 159 rdev->data_offset << 9);
160 /* as we don't honour merge_bvec_fn, we must never risk
161 * violating it, so limit max_segments to 1 lying within
162 * a single page.
163 */
164 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
165 blk_queue_max_segments(mddev->queue, 1);
166 blk_queue_segment_boundary(mddev->queue,
167 PAGE_CACHE_SIZE - 1);
168 }
171 169
172 conf->array_sectors += rdev->sectors; 170 conf->array_sectors += rdev->sectors;
173 cnt++; 171 cnt++;
174 172
175 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
176 discard_supported = true;
177 } 173 }
178 if (cnt != raid_disks) { 174 if (cnt != raid_disks) {
179 printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n", 175 printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n",
@@ -181,11 +177,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
181 goto out; 177 goto out;
182 } 178 }
183 179
184 if (!discard_supported)
185 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
186 else
187 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
188
189 /* 180 /*
190 * Here we calculate the device offsets. 181 * Here we calculate the device offsets.
191 */ 182 */
@@ -203,10 +194,9 @@ out:
203 return NULL; 194 return NULL;
204} 195}
205 196
206static int linear_run (struct mddev *mddev) 197static int linear_run (mddev_t *mddev)
207{ 198{
208 struct linear_conf *conf; 199 linear_conf_t *conf;
209 int ret;
210 200
211 if (md_check_no_bitmap(mddev)) 201 if (md_check_no_bitmap(mddev))
212 return -EINVAL; 202 return -EINVAL;
@@ -220,16 +210,10 @@ static int linear_run (struct mddev *mddev)
220 blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); 210 blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
221 mddev->queue->backing_dev_info.congested_fn = linear_congested; 211 mddev->queue->backing_dev_info.congested_fn = linear_congested;
222 mddev->queue->backing_dev_info.congested_data = mddev; 212 mddev->queue->backing_dev_info.congested_data = mddev;
223 213 return md_integrity_register(mddev);
224 ret = md_integrity_register(mddev);
225 if (ret) {
226 kfree(conf);
227 mddev->private = NULL;
228 }
229 return ret;
230} 214}
231 215
232static int linear_add(struct mddev *mddev, struct md_rdev *rdev) 216static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
233{ 217{
234 /* Adding a drive to a linear array allows the array to grow. 218 /* Adding a drive to a linear array allows the array to grow.
235 * It is permitted if the new drive has a matching superblock 219 * It is permitted if the new drive has a matching superblock
@@ -239,22 +223,19 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
239 * The current one is never freed until the array is stopped. 223 * The current one is never freed until the array is stopped.
240 * This avoids races. 224 * This avoids races.
241 */ 225 */
242 struct linear_conf *newconf, *oldconf; 226 linear_conf_t *newconf, *oldconf;
243 227
244 if (rdev->saved_raid_disk != mddev->raid_disks) 228 if (rdev->saved_raid_disk != mddev->raid_disks)
245 return -EINVAL; 229 return -EINVAL;
246 230
247 rdev->raid_disk = rdev->saved_raid_disk; 231 rdev->raid_disk = rdev->saved_raid_disk;
248 rdev->saved_raid_disk = -1;
249 232
250 newconf = linear_conf(mddev,mddev->raid_disks+1); 233 newconf = linear_conf(mddev,mddev->raid_disks+1);
251 234
252 if (!newconf) 235 if (!newconf)
253 return -ENOMEM; 236 return -ENOMEM;
254 237
255 oldconf = rcu_dereference_protected(mddev->private, 238 oldconf = rcu_dereference(mddev->private);
256 lockdep_is_held(
257 &mddev->reconfig_mutex));
258 mddev->raid_disks++; 239 mddev->raid_disks++;
259 rcu_assign_pointer(mddev->private, newconf); 240 rcu_assign_pointer(mddev->private, newconf);
260 md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); 241 md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
@@ -264,12 +245,9 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
264 return 0; 245 return 0;
265} 246}
266 247
267static int linear_stop (struct mddev *mddev) 248static int linear_stop (mddev_t *mddev)
268{ 249{
269 struct linear_conf *conf = 250 linear_conf_t *conf = mddev->private;
270 rcu_dereference_protected(mddev->private,
271 lockdep_is_held(
272 &mddev->reconfig_mutex));
273 251
274 /* 252 /*
275 * We do not require rcu protection here since 253 * We do not require rcu protection here since
@@ -286,14 +264,14 @@ static int linear_stop (struct mddev *mddev)
286 return 0; 264 return 0;
287} 265}
288 266
289static void linear_make_request(struct mddev *mddev, struct bio *bio) 267static int linear_make_request (mddev_t *mddev, struct bio *bio)
290{ 268{
291 struct dev_info *tmp_dev; 269 dev_info_t *tmp_dev;
292 sector_t start_sector; 270 sector_t start_sector;
293 271
294 if (unlikely(bio->bi_rw & REQ_FLUSH)) { 272 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
295 md_flush_request(mddev, bio); 273 md_flush_request(mddev, bio);
296 return; 274 return 0;
297 } 275 }
298 276
299 rcu_read_lock(); 277 rcu_read_lock();
@@ -315,7 +293,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
315 (unsigned long long)start_sector); 293 (unsigned long long)start_sector);
316 rcu_read_unlock(); 294 rcu_read_unlock();
317 bio_io_error(bio); 295 bio_io_error(bio);
318 return; 296 return 0;
319 } 297 }
320 if (unlikely(bio->bi_sector + (bio->bi_size >> 9) > 298 if (unlikely(bio->bi_sector + (bio->bi_size >> 9) >
321 tmp_dev->end_sector)) { 299 tmp_dev->end_sector)) {
@@ -329,10 +307,12 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
329 307
330 bp = bio_split(bio, end_sector - bio->bi_sector); 308 bp = bio_split(bio, end_sector - bio->bi_sector);
331 309
332 linear_make_request(mddev, &bp->bio1); 310 if (linear_make_request(mddev, &bp->bio1))
333 linear_make_request(mddev, &bp->bio2); 311 generic_make_request(&bp->bio1);
312 if (linear_make_request(mddev, &bp->bio2))
313 generic_make_request(&bp->bio2);
334 bio_pair_release(bp); 314 bio_pair_release(bp);
335 return; 315 return 0;
336 } 316 }
337 317
338 bio->bi_bdev = tmp_dev->rdev->bdev; 318 bio->bi_bdev = tmp_dev->rdev->bdev;
@@ -340,24 +320,17 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
340 + tmp_dev->rdev->data_offset; 320 + tmp_dev->rdev->data_offset;
341 rcu_read_unlock(); 321 rcu_read_unlock();
342 322
343 if (unlikely((bio->bi_rw & REQ_DISCARD) && 323 return 1;
344 !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
345 /* Just ignore it */
346 bio_endio(bio, 0);
347 return;
348 }
349
350 generic_make_request(bio);
351} 324}
352 325
353static void linear_status (struct seq_file *seq, struct mddev *mddev) 326static void linear_status (struct seq_file *seq, mddev_t *mddev)
354{ 327{
355 328
356 seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); 329 seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
357} 330}
358 331
359 332
360static struct md_personality linear_personality = 333static struct mdk_personality linear_personality =
361{ 334{
362 .name = "linear", 335 .name = "linear",
363 .level = LEVEL_LINEAR, 336 .level = LEVEL_LINEAR,
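
which_dev() in the linear code above locates the member device holding a given sector by binary-searching the per-device end_sector values: device i covers every sector below its end_sector that device i-1 does not. A standalone sketch of that lookup, with made-up device sizes, could be:

#include <stdio.h>

/* Cumulative end sectors of three member devices: 0-99, 100-299, 300-349. */
static const unsigned long long end_sector[] = { 100, 300, 350 };
static const int nr_devs = 3;

/* Return the index of the device that holds 'sector' (binary search). */
static int which_dev(unsigned long long sector)
{
        int lo = 0, hi = nr_devs - 1;

        while (lo < hi) {
                int mid = (lo + hi) / 2;

                if (sector < end_sector[mid])
                        hi = mid;
                else
                        lo = mid + 1;
        }
        return lo;
}

int main(void)
{
        printf("sector 50  -> dev %d\n", which_dev(50));
        printf("sector 150 -> dev %d\n", which_dev(150));
        printf("sector 320 -> dev %d\n", which_dev(320));
        return 0;
}
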
diff --git a/drivers/md/linear.h b/drivers/md/linear.h
index b685ddd7d7f..2f2da05b2ce 100644
--- a/drivers/md/linear.h
+++ b/drivers/md/linear.h
@@ -2,14 +2,20 @@
2#define _LINEAR_H 2#define _LINEAR_H
3 3
4struct dev_info { 4struct dev_info {
5 struct md_rdev *rdev; 5 mdk_rdev_t *rdev;
6 sector_t end_sector; 6 sector_t end_sector;
7}; 7};
8 8
9struct linear_conf 9typedef struct dev_info dev_info_t;
10
11struct linear_private_data
10{ 12{
11 struct rcu_head rcu; 13 struct rcu_head rcu;
12 sector_t array_sectors; 14 sector_t array_sectors;
13 struct dev_info disks[0]; 15 dev_info_t disks[0];
14}; 16};
17
18
19typedef struct linear_private_data linear_conf_t;
20
15#endif 21#endif
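
linear_private_data above ends in a zero-length disks[] array, and linear_conf() sizes the single allocation as sizeof(*conf) + raid_disks * sizeof(dev_info_t). A minimal illustration of that header-plus-array allocation pattern, written here with a C99 flexible array member as an analogy rather than the kernel's exact declaration:

#include <stdio.h>
#include <stdlib.h>

struct dev_info {
        unsigned long long end_sector;
};

struct linear_conf {
        unsigned long long array_sectors;
        int raid_disks;
        struct dev_info disks[];        /* flexible array member */
};

static struct linear_conf *linear_conf_alloc(int raid_disks)
{
        struct linear_conf *conf;

        /* One allocation covers the header and all per-device slots. */
        conf = calloc(1, sizeof(*conf) + raid_disks * sizeof(conf->disks[0]));
        if (!conf)
                return NULL;
        conf->raid_disks = raid_disks;
        return conf;
}

int main(void)
{
        struct linear_conf *conf = linear_conf_alloc(4);

        if (!conf)
                return 1;
        conf->disks[3].end_sector = 1000;
        printf("disks: %d, last end_sector: %llu\n",
               conf->raid_disks, conf->disks[3].end_sector);
        free(conf);
        return 0;
}
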
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3db3d1b271f..5c95ccb5950 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -36,14 +36,14 @@
36#include <linux/blkdev.h> 36#include <linux/blkdev.h>
37#include <linux/sysctl.h> 37#include <linux/sysctl.h>
38#include <linux/seq_file.h> 38#include <linux/seq_file.h>
39#include <linux/fs.h> 39#include <linux/mutex.h>
40#include <linux/buffer_head.h> /* for invalidate_bdev */
40#include <linux/poll.h> 41#include <linux/poll.h>
41#include <linux/ctype.h> 42#include <linux/ctype.h>
42#include <linux/string.h> 43#include <linux/string.h>
43#include <linux/hdreg.h> 44#include <linux/hdreg.h>
44#include <linux/proc_fs.h> 45#include <linux/proc_fs.h>
45#include <linux/random.h> 46#include <linux/random.h>
46#include <linux/module.h>
47#include <linux/reboot.h> 47#include <linux/reboot.h>
48#include <linux/file.h> 48#include <linux/file.h>
49#include <linux/compat.h> 49#include <linux/compat.h>
@@ -54,6 +54,9 @@
54#include "md.h" 54#include "md.h"
55#include "bitmap.h" 55#include "bitmap.h"
56 56
57#define DEBUG 0
58#define dprintk(x...) ((void)(DEBUG && printk(x)))
59
57#ifndef MODULE 60#ifndef MODULE
58static void autostart_arrays(int part); 61static void autostart_arrays(int part);
59#endif 62#endif
@@ -95,13 +98,13 @@ static struct workqueue_struct *md_misc_wq;
95 98
96static int sysctl_speed_limit_min = 1000; 99static int sysctl_speed_limit_min = 1000;
97static int sysctl_speed_limit_max = 200000; 100static int sysctl_speed_limit_max = 200000;
98static inline int speed_min(struct mddev *mddev) 101static inline int speed_min(mddev_t *mddev)
99{ 102{
100 return mddev->sync_speed_min ? 103 return mddev->sync_speed_min ?
101 mddev->sync_speed_min : sysctl_speed_limit_min; 104 mddev->sync_speed_min : sysctl_speed_limit_min;
102} 105}
103 106
104static inline int speed_max(struct mddev *mddev) 107static inline int speed_max(mddev_t *mddev)
105{ 108{
106 return mddev->sync_speed_max ? 109 return mddev->sync_speed_max ?
107 mddev->sync_speed_max : sysctl_speed_limit_max; 110 mddev->sync_speed_max : sysctl_speed_limit_max;
@@ -155,28 +158,65 @@ static int start_readonly;
155 * like bio_clone, but with a local bio set 158 * like bio_clone, but with a local bio set
156 */ 159 */
157 160
161static void mddev_bio_destructor(struct bio *bio)
162{
163 mddev_t *mddev, **mddevp;
164
165 mddevp = (void*)bio;
166 mddev = mddevp[-1];
167
168 bio_free(bio, mddev->bio_set);
169}
170
158struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, 171struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
159 struct mddev *mddev) 172 mddev_t *mddev)
160{ 173{
161 struct bio *b; 174 struct bio *b;
175 mddev_t **mddevp;
162 176
163 if (!mddev || !mddev->bio_set) 177 if (!mddev || !mddev->bio_set)
164 return bio_alloc(gfp_mask, nr_iovecs); 178 return bio_alloc(gfp_mask, nr_iovecs);
165 179
166 b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set); 180 b = bio_alloc_bioset(gfp_mask, nr_iovecs,
181 mddev->bio_set);
167 if (!b) 182 if (!b)
168 return NULL; 183 return NULL;
184 mddevp = (void*)b;
185 mddevp[-1] = mddev;
186 b->bi_destructor = mddev_bio_destructor;
169 return b; 187 return b;
170} 188}
171EXPORT_SYMBOL_GPL(bio_alloc_mddev); 189EXPORT_SYMBOL_GPL(bio_alloc_mddev);
172 190
173struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, 191struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
174 struct mddev *mddev) 192 mddev_t *mddev)
175{ 193{
194 struct bio *b;
195 mddev_t **mddevp;
196
176 if (!mddev || !mddev->bio_set) 197 if (!mddev || !mddev->bio_set)
177 return bio_clone(bio, gfp_mask); 198 return bio_clone(bio, gfp_mask);
178 199
179 return bio_clone_bioset(bio, gfp_mask, mddev->bio_set); 200 b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs,
201 mddev->bio_set);
202 if (!b)
203 return NULL;
204 mddevp = (void*)b;
205 mddevp[-1] = mddev;
206 b->bi_destructor = mddev_bio_destructor;
207 __bio_clone(b, bio);
208 if (bio_integrity(bio)) {
209 int ret;
210
211 ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set);
212
213 if (ret < 0) {
214 bio_put(b);
215 return NULL;
216 }
217 }
218
219 return b;
180} 220}
181EXPORT_SYMBOL_GPL(bio_clone_mddev); 221EXPORT_SYMBOL_GPL(bio_clone_mddev);
182 222
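
The restored bio_alloc_mddev()/bio_clone_mddev() above stash the owning mddev just in front of the returned bio (mddevp[-1] = mddev) so mddev_bio_destructor() can recover the right bio_set later; that trick only works if the bio_set was created with front padding for one pointer. A small user-space analogue of the "hide a back-pointer in front of the object" pattern, with invented names, is:

#include <stdio.h>
#include <stdlib.h>

struct owner {
        const char *name;
};

/* Allocate an object with room for a back-pointer just in front of it. */
static void *obj_alloc(struct owner *who, size_t size)
{
        struct owner **slot = malloc(sizeof(*slot) + size);

        if (!slot)
                return NULL;
        slot[0] = who;          /* hidden back-pointer */
        return slot + 1;        /* the caller sees only the object */
}

/* Recover the owner from the hidden slot, then free the whole block. */
static void obj_free(void *obj)
{
        struct owner **slot = (struct owner **)obj - 1;

        printf("freeing object owned by %s\n", slot[0]->name);
        free(slot);
}

int main(void)
{
        struct owner md0 = { "md0" };
        void *bio = obj_alloc(&md0, 64);

        if (!bio)
                return 1;
        obj_free(bio);
        return 0;
}
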
@@ -241,7 +281,7 @@ EXPORT_SYMBOL_GPL(md_trim_bio);
241 */ 281 */
242static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 282static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
243static atomic_t md_event_count; 283static atomic_t md_event_count;
244void md_new_event(struct mddev *mddev) 284void md_new_event(mddev_t *mddev)
245{ 285{
246 atomic_inc(&md_event_count); 286 atomic_inc(&md_event_count);
247 wake_up(&md_event_waiters); 287 wake_up(&md_event_waiters);
@@ -251,7 +291,7 @@ EXPORT_SYMBOL_GPL(md_new_event);
251/* Alternate version that can be called from interrupts 291/* Alternate version that can be called from interrupts
252 * when calling sysfs_notify isn't needed. 292 * when calling sysfs_notify isn't needed.
253 */ 293 */
254static void md_new_event_inintr(struct mddev *mddev) 294static void md_new_event_inintr(mddev_t *mddev)
255{ 295{
256 atomic_inc(&md_event_count); 296 atomic_inc(&md_event_count);
257 wake_up(&md_event_waiters); 297 wake_up(&md_event_waiters);
@@ -272,19 +312,19 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
272 * Any code which breaks out of this loop while owning 312 * Any code which breaks out of this loop while owning
273 * a reference to the current mddev must mddev_put it. 313 * a reference to the current mddev must mddev_put it.
274 */ 314 */
275#define for_each_mddev(_mddev,_tmp) \ 315#define for_each_mddev(mddev,tmp) \
276 \ 316 \
277 for (({ spin_lock(&all_mddevs_lock); \ 317 for (({ spin_lock(&all_mddevs_lock); \
278 _tmp = all_mddevs.next; \ 318 tmp = all_mddevs.next; \
279 _mddev = NULL;}); \ 319 mddev = NULL;}); \
280 ({ if (_tmp != &all_mddevs) \ 320 ({ if (tmp != &all_mddevs) \
281 mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\ 321 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
282 spin_unlock(&all_mddevs_lock); \ 322 spin_unlock(&all_mddevs_lock); \
283 if (_mddev) mddev_put(_mddev); \ 323 if (mddev) mddev_put(mddev); \
284 _mddev = list_entry(_tmp, struct mddev, all_mddevs); \ 324 mddev = list_entry(tmp, mddev_t, all_mddevs); \
285 _tmp != &all_mddevs;}); \ 325 tmp != &all_mddevs;}); \
286 ({ spin_lock(&all_mddevs_lock); \ 326 ({ spin_lock(&all_mddevs_lock); \
287 _tmp = _tmp->next;}) \ 327 tmp = tmp->next;}) \
288 ) 328 )
289 329
290 330
@@ -295,17 +335,18 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
295 * call has finished, the bio has been linked into some internal structure 335 * call has finished, the bio has been linked into some internal structure
296 * and so is visible to ->quiesce(), so we don't need the refcount any more. 336 * and so is visible to ->quiesce(), so we don't need the refcount any more.
297 */ 337 */
298static void md_make_request(struct request_queue *q, struct bio *bio) 338static int md_make_request(struct request_queue *q, struct bio *bio)
299{ 339{
300 const int rw = bio_data_dir(bio); 340 const int rw = bio_data_dir(bio);
301 struct mddev *mddev = q->queuedata; 341 mddev_t *mddev = q->queuedata;
342 int rv;
302 int cpu; 343 int cpu;
303 unsigned int sectors; 344 unsigned int sectors;
304 345
305 if (mddev == NULL || mddev->pers == NULL 346 if (mddev == NULL || mddev->pers == NULL
306 || !mddev->ready) { 347 || !mddev->ready) {
307 bio_io_error(bio); 348 bio_io_error(bio);
308 return; 349 return 0;
309 } 350 }
310 smp_rmb(); /* Ensure implications of 'active' are visible */ 351 smp_rmb(); /* Ensure implications of 'active' are visible */
311 rcu_read_lock(); 352 rcu_read_lock();
@@ -330,7 +371,7 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
330 * go away inside make_request 371 * go away inside make_request
331 */ 372 */
332 sectors = bio_sectors(bio); 373 sectors = bio_sectors(bio);
333 mddev->pers->make_request(mddev, bio); 374 rv = mddev->pers->make_request(mddev, bio);
334 375
335 cpu = part_stat_lock(); 376 cpu = part_stat_lock();
336 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); 377 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
@@ -339,6 +380,8 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
339 380
340 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) 381 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
341 wake_up(&mddev->sb_wait); 382 wake_up(&mddev->sb_wait);
383
384 return rv;
342} 385}
343 386
344/* mddev_suspend makes sure no new requests are submitted 387/* mddev_suspend makes sure no new requests are submitted
@@ -347,31 +390,28 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
347 * Once ->stop is called and completes, the module will be completely 390 * Once ->stop is called and completes, the module will be completely
348 * unused. 391 * unused.
349 */ 392 */
350void mddev_suspend(struct mddev *mddev) 393void mddev_suspend(mddev_t *mddev)
351{ 394{
352 BUG_ON(mddev->suspended); 395 BUG_ON(mddev->suspended);
353 mddev->suspended = 1; 396 mddev->suspended = 1;
354 synchronize_rcu(); 397 synchronize_rcu();
355 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); 398 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
356 mddev->pers->quiesce(mddev, 1); 399 mddev->pers->quiesce(mddev, 1);
357
358 del_timer_sync(&mddev->safemode_timer);
359} 400}
360EXPORT_SYMBOL_GPL(mddev_suspend); 401EXPORT_SYMBOL_GPL(mddev_suspend);
361 402
362void mddev_resume(struct mddev *mddev) 403void mddev_resume(mddev_t *mddev)
363{ 404{
364 mddev->suspended = 0; 405 mddev->suspended = 0;
365 wake_up(&mddev->sb_wait); 406 wake_up(&mddev->sb_wait);
366 mddev->pers->quiesce(mddev, 0); 407 mddev->pers->quiesce(mddev, 0);
367 408
368 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
369 md_wakeup_thread(mddev->thread); 409 md_wakeup_thread(mddev->thread);
370 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 410 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
371} 411}
372EXPORT_SYMBOL_GPL(mddev_resume); 412EXPORT_SYMBOL_GPL(mddev_resume);
373 413
374int mddev_congested(struct mddev *mddev, int bits) 414int mddev_congested(mddev_t *mddev, int bits)
375{ 415{
376 return mddev->suspended; 416 return mddev->suspended;
377} 417}
@@ -383,8 +423,8 @@ EXPORT_SYMBOL(mddev_congested);
383 423
384static void md_end_flush(struct bio *bio, int err) 424static void md_end_flush(struct bio *bio, int err)
385{ 425{
386 struct md_rdev *rdev = bio->bi_private; 426 mdk_rdev_t *rdev = bio->bi_private;
387 struct mddev *mddev = rdev->mddev; 427 mddev_t *mddev = rdev->mddev;
388 428
389 rdev_dec_pending(rdev, mddev); 429 rdev_dec_pending(rdev, mddev);
390 430
@@ -399,13 +439,13 @@ static void md_submit_flush_data(struct work_struct *ws);
399 439
400static void submit_flushes(struct work_struct *ws) 440static void submit_flushes(struct work_struct *ws)
401{ 441{
402 struct mddev *mddev = container_of(ws, struct mddev, flush_work); 442 mddev_t *mddev = container_of(ws, mddev_t, flush_work);
403 struct md_rdev *rdev; 443 mdk_rdev_t *rdev;
404 444
405 INIT_WORK(&mddev->flush_work, md_submit_flush_data); 445 INIT_WORK(&mddev->flush_work, md_submit_flush_data);
406 atomic_set(&mddev->flush_pending, 1); 446 atomic_set(&mddev->flush_pending, 1);
407 rcu_read_lock(); 447 rcu_read_lock();
408 rdev_for_each_rcu(rdev, mddev) 448 list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
409 if (rdev->raid_disk >= 0 && 449 if (rdev->raid_disk >= 0 &&
410 !test_bit(Faulty, &rdev->flags)) { 450 !test_bit(Faulty, &rdev->flags)) {
411 /* Take two references, one is dropped 451 /* Take two references, one is dropped
@@ -416,7 +456,7 @@ static void submit_flushes(struct work_struct *ws)
416 atomic_inc(&rdev->nr_pending); 456 atomic_inc(&rdev->nr_pending);
417 atomic_inc(&rdev->nr_pending); 457 atomic_inc(&rdev->nr_pending);
418 rcu_read_unlock(); 458 rcu_read_unlock();
419 bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); 459 bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev);
420 bi->bi_end_io = md_end_flush; 460 bi->bi_end_io = md_end_flush;
421 bi->bi_private = rdev; 461 bi->bi_private = rdev;
422 bi->bi_bdev = rdev->bdev; 462 bi->bi_bdev = rdev->bdev;
@@ -432,7 +472,7 @@ static void submit_flushes(struct work_struct *ws)
432 472
433static void md_submit_flush_data(struct work_struct *ws) 473static void md_submit_flush_data(struct work_struct *ws)
434{ 474{
435 struct mddev *mddev = container_of(ws, struct mddev, flush_work); 475 mddev_t *mddev = container_of(ws, mddev_t, flush_work);
436 struct bio *bio = mddev->flush_bio; 476 struct bio *bio = mddev->flush_bio;
437 477
438 if (bio->bi_size == 0) 478 if (bio->bi_size == 0)
@@ -440,19 +480,20 @@ static void md_submit_flush_data(struct work_struct *ws)
440 bio_endio(bio, 0); 480 bio_endio(bio, 0);
441 else { 481 else {
442 bio->bi_rw &= ~REQ_FLUSH; 482 bio->bi_rw &= ~REQ_FLUSH;
443 mddev->pers->make_request(mddev, bio); 483 if (mddev->pers->make_request(mddev, bio))
484 generic_make_request(bio);
444 } 485 }
445 486
446 mddev->flush_bio = NULL; 487 mddev->flush_bio = NULL;
447 wake_up(&mddev->sb_wait); 488 wake_up(&mddev->sb_wait);
448} 489}
449 490
450void md_flush_request(struct mddev *mddev, struct bio *bio) 491void md_flush_request(mddev_t *mddev, struct bio *bio)
451{ 492{
452 spin_lock_irq(&mddev->write_lock); 493 spin_lock_irq(&mddev->write_lock);
453 wait_event_lock_irq(mddev->sb_wait, 494 wait_event_lock_irq(mddev->sb_wait,
454 !mddev->flush_bio, 495 !mddev->flush_bio,
455 mddev->write_lock); 496 mddev->write_lock, /*nothing*/);
456 mddev->flush_bio = bio; 497 mddev->flush_bio = bio;
457 spin_unlock_irq(&mddev->write_lock); 498 spin_unlock_irq(&mddev->write_lock);
458 499
@@ -461,15 +502,63 @@ void md_flush_request(struct mddev *mddev, struct bio *bio)
461} 502}
462EXPORT_SYMBOL(md_flush_request); 503EXPORT_SYMBOL(md_flush_request);
463 504
464void md_unplug(struct blk_plug_cb *cb, bool from_schedule) 505/* Support for plugging.
506 * This mirrors the plugging support in request_queue, but does not
507 * require having a whole queue or request structures.
508 * We allocate an md_plug_cb for each md device and each thread it gets
509 * plugged on. This links to the private plug_handle structure in the
510 * personality data where we keep a count of the number of outstanding
511 * plugs so other code can see if a plug is active.
512 */
513struct md_plug_cb {
514 struct blk_plug_cb cb;
515 mddev_t *mddev;
516};
517
518static void plugger_unplug(struct blk_plug_cb *cb)
465{ 519{
466 struct mddev *mddev = cb->data; 520 struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb);
467 md_wakeup_thread(mddev->thread); 521 if (atomic_dec_and_test(&mdcb->mddev->plug_cnt))
468 kfree(cb); 522 md_wakeup_thread(mdcb->mddev->thread);
523 kfree(mdcb);
469} 524}
470EXPORT_SYMBOL(md_unplug);
471 525
472static inline struct mddev *mddev_get(struct mddev *mddev) 526/* Check that an unplug wakeup will come shortly.
527 * If not, wakeup the md thread immediately
528 */
529int mddev_check_plugged(mddev_t *mddev)
530{
531 struct blk_plug *plug = current->plug;
532 struct md_plug_cb *mdcb;
533
534 if (!plug)
535 return 0;
536
537 list_for_each_entry(mdcb, &plug->cb_list, cb.list) {
538 if (mdcb->cb.callback == plugger_unplug &&
539 mdcb->mddev == mddev) {
540 /* Already on the list, move to top */
541 if (mdcb != list_first_entry(&plug->cb_list,
542 struct md_plug_cb,
543 cb.list))
544 list_move(&mdcb->cb.list, &plug->cb_list);
545 return 1;
546 }
547 }
548 /* Not currently on the callback list */
549 mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC);
550 if (!mdcb)
551 return 0;
552
553 mdcb->mddev = mddev;
554 mdcb->cb.callback = plugger_unplug;
555 atomic_inc(&mddev->plug_cnt);
556 list_add(&mdcb->cb.list, &plug->cb_list);
557 return 1;
558}
559EXPORT_SYMBOL_GPL(mddev_check_plugged);
560
561static inline mddev_t *mddev_get(mddev_t *mddev)
473{ 562{
474 atomic_inc(&mddev->active); 563 atomic_inc(&mddev->active);
475 return mddev; 564 return mddev;
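
The plugging support added back above (md_plug_cb, mddev_check_plugged(), plugger_unplug()) hangs one callback per md device off the per-task blk_plug list and wakes the md thread when the plug is flushed. A rough standalone sketch of that "register a callback on the current plug, run it on unplug" idea, with invented names and without the locking, plug_cnt accounting, or move-to-top de-duplication of the real code, is:

#include <stdio.h>
#include <stdlib.h>

struct plug_cb {
        struct plug_cb *next;
        void (*callback)(struct plug_cb *cb);
};

/* Stands in for the per-task current->plug list. */
struct plug {
        struct plug_cb *cb_list;
};

struct md_plug_cb {
        struct plug_cb cb;              /* must stay the first member */
        const char *mddev_name;
};

static void plugger_unplug(struct plug_cb *cb)
{
        struct md_plug_cb *mdcb = (struct md_plug_cb *)cb;

        printf("unplug: waking %s thread\n", mdcb->mddev_name);
        free(mdcb);
}

/* Register one callback for this device on the current plug, if any. */
static int check_plugged(struct plug *plug, const char *mddev_name)
{
        struct md_plug_cb *mdcb;

        if (!plug)
                return 0;
        mdcb = malloc(sizeof(*mdcb));
        if (!mdcb)
                return 0;
        mdcb->mddev_name = mddev_name;
        mdcb->cb.callback = plugger_unplug;
        mdcb->cb.next = plug->cb_list;
        plug->cb_list = &mdcb->cb;
        return 1;
}

/* Run and drain every registered callback, as unplugging would. */
static void finish_plug(struct plug *plug)
{
        while (plug->cb_list) {
                struct plug_cb *cb = plug->cb_list;

                plug->cb_list = cb->next;
                cb->callback(cb);
        }
}

int main(void)
{
        struct plug plug = { .cb_list = NULL };

        check_plugged(&plug, "md0");
        check_plugged(&plug, "md1");
        finish_plug(&plug);
        return 0;
}

In the kernel the list lives on the submitting task's plug, so the wakeups are batched until that task unplugs instead of firing once per queued request.
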
@@ -477,7 +566,7 @@ static inline struct mddev *mddev_get(struct mddev *mddev)
477 566
478static void mddev_delayed_delete(struct work_struct *ws); 567static void mddev_delayed_delete(struct work_struct *ws);
479 568
480static void mddev_put(struct mddev *mddev) 569static void mddev_put(mddev_t *mddev)
481{ 570{
482 struct bio_set *bs = NULL; 571 struct bio_set *bs = NULL;
483 572
@@ -487,7 +576,7 @@ static void mddev_put(struct mddev *mddev)
487 mddev->ctime == 0 && !mddev->hold_active) { 576 mddev->ctime == 0 && !mddev->hold_active) {
488 /* Array is not configured at all, and not held active, 577 /* Array is not configured at all, and not held active,
489 * so destroy it */ 578 * so destroy it */
490 list_del_init(&mddev->all_mddevs); 579 list_del(&mddev->all_mddevs);
491 bs = mddev->bio_set; 580 bs = mddev->bio_set;
492 mddev->bio_set = NULL; 581 mddev->bio_set = NULL;
493 if (mddev->gendisk) { 582 if (mddev->gendisk) {
@@ -506,7 +595,7 @@ static void mddev_put(struct mddev *mddev)
506 bioset_free(bs); 595 bioset_free(bs);
507} 596}
508 597
509void mddev_init(struct mddev *mddev) 598void mddev_init(mddev_t *mddev)
510{ 599{
511 mutex_init(&mddev->open_mutex); 600 mutex_init(&mddev->open_mutex);
512 mutex_init(&mddev->reconfig_mutex); 601 mutex_init(&mddev->reconfig_mutex);
@@ -517,21 +606,21 @@ void mddev_init(struct mddev *mddev)
517 atomic_set(&mddev->active, 1); 606 atomic_set(&mddev->active, 1);
518 atomic_set(&mddev->openers, 0); 607 atomic_set(&mddev->openers, 0);
519 atomic_set(&mddev->active_io, 0); 608 atomic_set(&mddev->active_io, 0);
609 atomic_set(&mddev->plug_cnt, 0);
520 spin_lock_init(&mddev->write_lock); 610 spin_lock_init(&mddev->write_lock);
521 atomic_set(&mddev->flush_pending, 0); 611 atomic_set(&mddev->flush_pending, 0);
522 init_waitqueue_head(&mddev->sb_wait); 612 init_waitqueue_head(&mddev->sb_wait);
523 init_waitqueue_head(&mddev->recovery_wait); 613 init_waitqueue_head(&mddev->recovery_wait);
524 mddev->reshape_position = MaxSector; 614 mddev->reshape_position = MaxSector;
525 mddev->reshape_backwards = 0;
526 mddev->resync_min = 0; 615 mddev->resync_min = 0;
527 mddev->resync_max = MaxSector; 616 mddev->resync_max = MaxSector;
528 mddev->level = LEVEL_NONE; 617 mddev->level = LEVEL_NONE;
529} 618}
530EXPORT_SYMBOL_GPL(mddev_init); 619EXPORT_SYMBOL_GPL(mddev_init);
531 620
532static struct mddev * mddev_find(dev_t unit) 621static mddev_t * mddev_find(dev_t unit)
533{ 622{
534 struct mddev *mddev, *new = NULL; 623 mddev_t *mddev, *new = NULL;
535 624
536 if (unit && MAJOR(unit) != MD_MAJOR) 625 if (unit && MAJOR(unit) != MD_MAJOR)
537 unit &= ~((1<<MdpMinorShift)-1); 626 unit &= ~((1<<MdpMinorShift)-1);
@@ -603,24 +692,24 @@ static struct mddev * mddev_find(dev_t unit)
603 goto retry; 692 goto retry;
604} 693}
605 694
606static inline int mddev_lock(struct mddev * mddev) 695static inline int mddev_lock(mddev_t * mddev)
607{ 696{
608 return mutex_lock_interruptible(&mddev->reconfig_mutex); 697 return mutex_lock_interruptible(&mddev->reconfig_mutex);
609} 698}
610 699
611static inline int mddev_is_locked(struct mddev *mddev) 700static inline int mddev_is_locked(mddev_t *mddev)
612{ 701{
613 return mutex_is_locked(&mddev->reconfig_mutex); 702 return mutex_is_locked(&mddev->reconfig_mutex);
614} 703}
615 704
616static inline int mddev_trylock(struct mddev * mddev) 705static inline int mddev_trylock(mddev_t * mddev)
617{ 706{
618 return mutex_trylock(&mddev->reconfig_mutex); 707 return mutex_trylock(&mddev->reconfig_mutex);
619} 708}
620 709
621static struct attribute_group md_redundancy_group; 710static struct attribute_group md_redundancy_group;
622 711
623static void mddev_unlock(struct mddev * mddev) 712static void mddev_unlock(mddev_t * mddev)
624{ 713{
625 if (mddev->to_remove) { 714 if (mddev->to_remove) {
626 /* These cannot be removed under reconfig_mutex as 715 /* These cannot be removed under reconfig_mutex as
@@ -655,61 +744,39 @@ static void mddev_unlock(struct mddev * mddev)
655 } else 744 } else
656 mutex_unlock(&mddev->reconfig_mutex); 745 mutex_unlock(&mddev->reconfig_mutex);
657 746
658 /* As we've dropped the mutex we need a spinlock to 747 /* was we've dropped the mutex we need a spinlock to
659 * make sure the thread doesn't disappear 748 * make sur the thread doesn't disappear
660 */ 749 */
661 spin_lock(&pers_lock); 750 spin_lock(&pers_lock);
662 md_wakeup_thread(mddev->thread); 751 md_wakeup_thread(mddev->thread);
663 spin_unlock(&pers_lock); 752 spin_unlock(&pers_lock);
664} 753}
665 754
666static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr) 755static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
667{ 756{
668 struct md_rdev *rdev; 757 mdk_rdev_t *rdev;
669 758
670 rdev_for_each(rdev, mddev) 759 list_for_each_entry(rdev, &mddev->disks, same_set)
671 if (rdev->desc_nr == nr) 760 if (rdev->desc_nr == nr)
672 return rdev; 761 return rdev;
673 762
674 return NULL; 763 return NULL;
675} 764}
676 765
677static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) 766static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
678{
679 struct md_rdev *rdev;
680
681 rdev_for_each_rcu(rdev, mddev)
682 if (rdev->desc_nr == nr)
683 return rdev;
684
685 return NULL;
686}
687
688static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
689{
690 struct md_rdev *rdev;
691
692 rdev_for_each(rdev, mddev)
693 if (rdev->bdev->bd_dev == dev)
694 return rdev;
695
696 return NULL;
697}
698
699static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
700{ 767{
701 struct md_rdev *rdev; 768 mdk_rdev_t *rdev;
702 769
703 rdev_for_each_rcu(rdev, mddev) 770 list_for_each_entry(rdev, &mddev->disks, same_set)
704 if (rdev->bdev->bd_dev == dev) 771 if (rdev->bdev->bd_dev == dev)
705 return rdev; 772 return rdev;
706 773
707 return NULL; 774 return NULL;
708} 775}
709 776
710static struct md_personality *find_pers(int level, char *clevel) 777static struct mdk_personality *find_pers(int level, char *clevel)
711{ 778{
712 struct md_personality *pers; 779 struct mdk_personality *pers;
713 list_for_each_entry(pers, &pers_list, list) { 780 list_for_each_entry(pers, &pers_list, list) {
714 if (level != LEVEL_NONE && pers->level == level) 781 if (level != LEVEL_NONE && pers->level == level)
715 return pers; 782 return pers;
@@ -720,13 +787,13 @@ static struct md_personality *find_pers(int level, char *clevel)
720} 787}
721 788
722/* return the offset of the super block in 512byte sectors */ 789/* return the offset of the super block in 512byte sectors */
723static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) 790static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev)
724{ 791{
725 sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512; 792 sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
726 return MD_NEW_SIZE_SECTORS(num_sectors); 793 return MD_NEW_SIZE_SECTORS(num_sectors);
727} 794}
728 795
729static int alloc_disk_sb(struct md_rdev * rdev) 796static int alloc_disk_sb(mdk_rdev_t * rdev)
730{ 797{
731 if (rdev->sb_page) 798 if (rdev->sb_page)
732 MD_BUG(); 799 MD_BUG();
@@ -740,7 +807,7 @@ static int alloc_disk_sb(struct md_rdev * rdev)
740 return 0; 807 return 0;
741} 808}
742 809
743void md_rdev_clear(struct md_rdev *rdev) 810static void free_disk_sb(mdk_rdev_t * rdev)
744{ 811{
745 if (rdev->sb_page) { 812 if (rdev->sb_page) {
746 put_page(rdev->sb_page); 813 put_page(rdev->sb_page);
@@ -753,15 +820,13 @@ void md_rdev_clear(struct md_rdev *rdev)
753 put_page(rdev->bb_page); 820 put_page(rdev->bb_page);
754 rdev->bb_page = NULL; 821 rdev->bb_page = NULL;
755 } 822 }
756 kfree(rdev->badblocks.page);
757 rdev->badblocks.page = NULL;
758} 823}
759EXPORT_SYMBOL_GPL(md_rdev_clear); 824
760 825
761static void super_written(struct bio *bio, int error) 826static void super_written(struct bio *bio, int error)
762{ 827{
763 struct md_rdev *rdev = bio->bi_private; 828 mdk_rdev_t *rdev = bio->bi_private;
764 struct mddev *mddev = rdev->mddev; 829 mddev_t *mddev = rdev->mddev;
765 830
766 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 831 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
767 printk("md: super_written gets error=%d, uptodate=%d\n", 832 printk("md: super_written gets error=%d, uptodate=%d\n",
@@ -775,7 +840,7 @@ static void super_written(struct bio *bio, int error)
775 bio_put(bio); 840 bio_put(bio);
776} 841}
777 842
778void md_super_write(struct mddev *mddev, struct md_rdev *rdev, 843void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
779 sector_t sector, int size, struct page *page) 844 sector_t sector, int size, struct page *page)
780{ 845{
781 /* write first size bytes of page to sector of rdev 846 /* write first size bytes of page to sector of rdev
@@ -796,7 +861,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
796 submit_bio(WRITE_FLUSH_FUA, bio); 861 submit_bio(WRITE_FLUSH_FUA, bio);
797} 862}
798 863
799void md_super_wait(struct mddev *mddev) 864void md_super_wait(mddev_t *mddev)
800{ 865{
801 /* wait for all superblock writes that were scheduled to complete */ 866 /* wait for all superblock writes that were scheduled to complete */
802 DEFINE_WAIT(wq); 867 DEFINE_WAIT(wq);
@@ -814,7 +879,7 @@ static void bi_complete(struct bio *bio, int error)
814 complete((struct completion*)bio->bi_private); 879 complete((struct completion*)bio->bi_private);
815} 880}
816 881
817int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 882int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
818 struct page *page, int rw, bool metadata_op) 883 struct page *page, int rw, bool metadata_op)
819{ 884{
820 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); 885 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
@@ -827,10 +892,6 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
827 rdev->meta_bdev : rdev->bdev; 892 rdev->meta_bdev : rdev->bdev;
828 if (metadata_op) 893 if (metadata_op)
829 bio->bi_sector = sector + rdev->sb_start; 894 bio->bi_sector = sector + rdev->sb_start;
830 else if (rdev->mddev->reshape_position != MaxSector &&
831 (rdev->mddev->reshape_backwards ==
832 (sector >= rdev->mddev->reshape_position)))
833 bio->bi_sector = sector + rdev->new_data_offset;
834 else 895 else
835 bio->bi_sector = sector + rdev->data_offset; 896 bio->bi_sector = sector + rdev->data_offset;
836 bio_add_page(bio, page, size, 0); 897 bio_add_page(bio, page, size, 0);
@@ -846,7 +907,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
846} 907}
847EXPORT_SYMBOL_GPL(sync_page_io); 908EXPORT_SYMBOL_GPL(sync_page_io);
848 909
849static int read_disk_sb(struct md_rdev * rdev, int size) 910static int read_disk_sb(mdk_rdev_t * rdev, int size)
850{ 911{
851 char b[BDEVNAME_SIZE]; 912 char b[BDEVNAME_SIZE];
852 if (!rdev->sb_page) { 913 if (!rdev->sb_page) {
@@ -953,7 +1014,7 @@ static unsigned int calc_sb_csum(mdp_super_t * sb)
953 * We rely on user-space to write the initial superblock, and support 1014 * We rely on user-space to write the initial superblock, and support
954 * reading and updating of superblocks. 1015 * reading and updating of superblocks.
955 * Interface methods are: 1016 * Interface methods are:
956 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) 1017 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
957 * loads and validates a superblock on dev. 1018 * loads and validates a superblock on dev.
958 * if refdev != NULL, compare superblocks on both devices 1019 * if refdev != NULL, compare superblocks on both devices
959 * Return: 1020 * Return:
@@ -963,13 +1024,13 @@ static unsigned int calc_sb_csum(mdp_super_t * sb)
963 * -EINVAL superblock incompatible or invalid 1024 * -EINVAL superblock incompatible or invalid
964 * -othererror e.g. -EIO 1025 * -othererror e.g. -EIO
965 * 1026 *
966 * int validate_super(struct mddev *mddev, struct md_rdev *dev) 1027 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
967 * Verify that dev is acceptable into mddev. 1028 * Verify that dev is acceptable into mddev.
968 * The first time, mddev->raid_disks will be 0, and data from 1029 * The first time, mddev->raid_disks will be 0, and data from
969 * dev should be merged in. Subsequent calls check that dev 1030 * dev should be merged in. Subsequent calls check that dev
970 * is new enough. Return 0 or -EINVAL 1031 * is new enough. Return 0 or -EINVAL
971 * 1032 *
972 * void sync_super(struct mddev *mddev, struct md_rdev *dev) 1033 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
973 * Update the superblock for rdev with data in mddev 1034 * Update the superblock for rdev with data in mddev
974 * This does not write to disc. 1035 * This does not write to disc.
975 * 1036 *
@@ -978,17 +1039,12 @@ static unsigned int calc_sb_csum(mdp_super_t * sb)
978struct super_type { 1039struct super_type {
979 char *name; 1040 char *name;
980 struct module *owner; 1041 struct module *owner;
981 int (*load_super)(struct md_rdev *rdev, 1042 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
982 struct md_rdev *refdev,
983 int minor_version); 1043 int minor_version);
984 int (*validate_super)(struct mddev *mddev, 1044 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
985 struct md_rdev *rdev); 1045 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
986 void (*sync_super)(struct mddev *mddev, 1046 unsigned long long (*rdev_size_change)(mdk_rdev_t *rdev,
987 struct md_rdev *rdev);
988 unsigned long long (*rdev_size_change)(struct md_rdev *rdev,
989 sector_t num_sectors); 1047 sector_t num_sectors);
990 int (*allow_new_offset)(struct md_rdev *rdev,
991 unsigned long long new_offset);
992}; 1048};
993 1049
994/* 1050/*
@@ -999,7 +1055,7 @@ struct super_type {
999 * has a bitmap. Otherwise, it returns 0. 1055 * has a bitmap. Otherwise, it returns 0.
1000 * 1056 *
1001 */ 1057 */
1002int md_check_no_bitmap(struct mddev *mddev) 1058int md_check_no_bitmap(mddev_t *mddev)
1003{ 1059{
1004 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) 1060 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
1005 return 0; 1061 return 0;
@@ -1012,7 +1068,7 @@ EXPORT_SYMBOL(md_check_no_bitmap);
1012/* 1068/*
1013 * load_super for 0.90.0 1069 * load_super for 0.90.0
1014 */ 1070 */
1015static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1071static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1016{ 1072{
1017 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1073 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1018 mdp_super_t *sb; 1074 mdp_super_t *sb;
@@ -1060,7 +1116,6 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
1060 1116
1061 rdev->preferred_minor = sb->md_minor; 1117 rdev->preferred_minor = sb->md_minor;
1062 rdev->data_offset = 0; 1118 rdev->data_offset = 0;
1063 rdev->new_data_offset = 0;
1064 rdev->sb_size = MD_SB_BYTES; 1119 rdev->sb_size = MD_SB_BYTES;
1065 rdev->badblocks.shift = -1; 1120 rdev->badblocks.shift = -1;
1066 1121
@@ -1093,11 +1148,8 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
1093 ret = 0; 1148 ret = 0;
1094 } 1149 }
1095 rdev->sectors = rdev->sb_start; 1150 rdev->sectors = rdev->sb_start;
1096 /* Limit to 4TB as metadata cannot record more than that. 1151 /* Limit to 4TB as metadata cannot record more than that */
1097 * (not needed for Linear and RAID0 as metadata doesn't 1152 if (rdev->sectors >= (2ULL << 32))
1098 * record this size)
1099 */
1100 if (rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1101 rdev->sectors = (2ULL << 32) - 2; 1153 rdev->sectors = (2ULL << 32) - 2;
1102 1154
1103 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) 1155 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
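
For reference on the cap restored above: (2ULL << 32) is 2^33 sectors, and with 512-byte sectors that is 2^42 bytes, i.e. the 4TB the comment mentions. A one-line check:

#include <stdio.h>

int main(void)
{
        unsigned long long max_sectors = 2ULL << 32;    /* 2^33 sectors */

        /* 2^33 sectors * 512 bytes = 2^42 bytes = 4 TiB */
        printf("%llu bytes (%llu TiB)\n",
               max_sectors * 512, (max_sectors * 512) >> 40);
        return 0;
}
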
@@ -1111,7 +1163,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
1111/* 1163/*
1112 * validate_super for 0.90.0 1164 * validate_super for 0.90.0
1113 */ 1165 */
1114static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) 1166static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1115{ 1167{
1116 mdp_disk_t *desc; 1168 mdp_disk_t *desc;
1117 mdp_super_t *sb = page_address(rdev->sb_page); 1169 mdp_super_t *sb = page_address(rdev->sb_page);
@@ -1137,11 +1189,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1137 mddev->dev_sectors = ((sector_t)sb->size) * 2; 1189 mddev->dev_sectors = ((sector_t)sb->size) * 2;
1138 mddev->events = ev1; 1190 mddev->events = ev1;
1139 mddev->bitmap_info.offset = 0; 1191 mddev->bitmap_info.offset = 0;
1140 mddev->bitmap_info.space = 0;
1141 /* bitmap can use 60 K after the 4K superblocks */
1142 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 1192 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1143 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1144 mddev->reshape_backwards = 0;
1145 1193
1146 if (mddev->minor_version >= 91) { 1194 if (mddev->minor_version >= 91) {
1147 mddev->reshape_position = sb->reshape_position; 1195 mddev->reshape_position = sb->reshape_position;
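
As the removed comment notes, the 0.90 layout reserves 64 KiB at the end of each device: a 4 KiB superblock followed by up to 60 KiB usable for an internal write-intent bitmap, which is where the default offset of MD_SB_BYTES >> 9 sectors and the 64*2 - 8 sector budget come from. A small check of that arithmetic (plain C, assuming MD_SB_BYTES is 4096):

#include <stdio.h>

#define MD_SB_BYTES   4096           /* size of a 0.90 superblock  */
#define SECTOR_SHIFT  9              /* 512-byte sectors           */

int main(void)
{
	unsigned default_offset = MD_SB_BYTES >> SECTOR_SHIFT;   /* 8 sectors   */
	unsigned default_space  = 64 * 2 - default_offset;       /* 120 sectors */

	/* 8 sectors = 4 KiB of superblock, 120 sectors = 60 KiB left for the bitmap */
	printf("bitmap offset: %u sectors (%u KiB)\n", default_offset, default_offset / 2);
	printf("bitmap space : %u sectors (%u KiB)\n", default_space,  default_space  / 2);
	return 0;
}
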
@@ -1149,8 +1197,6 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1149 mddev->new_level = sb->new_level; 1197 mddev->new_level = sb->new_level;
1150 mddev->new_layout = sb->new_layout; 1198 mddev->new_layout = sb->new_layout;
1151 mddev->new_chunk_sectors = sb->new_chunk >> 9; 1199 mddev->new_chunk_sectors = sb->new_chunk >> 9;
1152 if (mddev->delta_disks < 0)
1153 mddev->reshape_backwards = 1;
1154 } else { 1200 } else {
1155 mddev->reshape_position = MaxSector; 1201 mddev->reshape_position = MaxSector;
1156 mddev->delta_disks = 0; 1202 mddev->delta_disks = 0;
@@ -1177,12 +1223,9 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1177 mddev->max_disks = MD_SB_DISKS; 1223 mddev->max_disks = MD_SB_DISKS;
1178 1224
1179 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 1225 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1180 mddev->bitmap_info.file == NULL) { 1226 mddev->bitmap_info.file == NULL)
1181 mddev->bitmap_info.offset = 1227 mddev->bitmap_info.offset =
1182 mddev->bitmap_info.default_offset; 1228 mddev->bitmap_info.default_offset;
1183 mddev->bitmap_info.space =
1184 mddev->bitmap_info.space;
1185 }
1186 1229
1187 } else if (mddev->pers == NULL) { 1230 } else if (mddev->pers == NULL) {
1188 /* Insist on good event counter while assembling, except 1231 /* Insist on good event counter while assembling, except
@@ -1232,10 +1275,10 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1232/* 1275/*
1233 * sync_super for 0.90.0 1276 * sync_super for 0.90.0
1234 */ 1277 */
1235static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) 1278static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1236{ 1279{
1237 mdp_super_t *sb; 1280 mdp_super_t *sb;
1238 struct md_rdev *rdev2; 1281 mdk_rdev_t *rdev2;
1239 int next_spare = mddev->raid_disks; 1282 int next_spare = mddev->raid_disks;
1240 1283
1241 1284
@@ -1306,7 +1349,7 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1306 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1349 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1307 1350
1308 sb->disks[0].state = (1<<MD_DISK_REMOVED); 1351 sb->disks[0].state = (1<<MD_DISK_REMOVED);
1309 rdev_for_each(rdev2, mddev) { 1352 list_for_each_entry(rdev2, &mddev->disks, same_set) {
1310 mdp_disk_t *d; 1353 mdp_disk_t *d;
1311 int desc_nr; 1354 int desc_nr;
1312 int is_active = test_bit(In_sync, &rdev2->flags); 1355 int is_active = test_bit(In_sync, &rdev2->flags);
@@ -1376,7 +1419,7 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1376 * rdev_size_change for 0.90.0 1419 * rdev_size_change for 0.90.0
1377 */ 1420 */
1378static unsigned long long 1421static unsigned long long
1379super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1422super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1380{ 1423{
1381 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1424 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1382 return 0; /* component must fit device */ 1425 return 0; /* component must fit device */
@@ -1388,7 +1431,7 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1388 /* Limit to 4TB as metadata cannot record more than that. 1431 /* Limit to 4TB as metadata cannot record more than that.
1389 * 4TB == 2^32 KB, or 2*2^32 sectors. 1432 * 4TB == 2^32 KB, or 2*2^32 sectors.
1390 */ 1433 */
1391 if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) 1434 if (num_sectors >= (2ULL << 32))
1392 num_sectors = (2ULL << 32) - 2; 1435 num_sectors = (2ULL << 32) - 2;
1393 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1436 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1394 rdev->sb_page); 1437 rdev->sb_page);
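
Both copies of this clamp exist because the 0.90 superblock stores the component size as a 32-bit kibibyte count: anything at or above 2^32 KiB (4 TiB) cannot be recorded, and stopping two sectors short keeps the result a whole number of KiB. The arithmetic, as a self-contained check in plain C:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t limit_sectors = 2ULL << 32;            /* 2^33 sectors          */
	uint64_t limit_bytes   = limit_sectors * 512;   /* 2^42 bytes            */
	uint64_t capped        = limit_sectors - 2;     /* largest value kept    */

	printf("limit  : %llu sectors = %llu KiB = %llu TiB\n",
	       (unsigned long long)limit_sectors,
	       (unsigned long long)(limit_bytes >> 10),
	       (unsigned long long)(limit_bytes >> 40));
	printf("clamped: %llu sectors (%llu KiB)\n",
	       (unsigned long long)capped,
	       (unsigned long long)(capped / 2));
	return 0;
}
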
@@ -1396,12 +1439,6 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1396 return num_sectors; 1439 return num_sectors;
1397} 1440}
1398 1441
1399static int
1400super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
1401{
1402 /* non-zero offset changes not possible with v0.90 */
1403 return new_offset == 0;
1404}
1405 1442
1406/* 1443/*
1407 * version 1 superblock 1444 * version 1 superblock
@@ -1414,11 +1451,12 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1414 unsigned long long newcsum; 1451 unsigned long long newcsum;
1415 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1452 int size = 256 + le32_to_cpu(sb->max_dev)*2;
1416 __le32 *isuper = (__le32*)sb; 1453 __le32 *isuper = (__le32*)sb;
1454 int i;
1417 1455
1418 disk_csum = sb->sb_csum; 1456 disk_csum = sb->sb_csum;
1419 sb->sb_csum = 0; 1457 sb->sb_csum = 0;
1420 newcsum = 0; 1458 newcsum = 0;
1421 for (; size >= 4; size -= 4) 1459 for (i=0; size>=4; size -= 4 )
1422 newcsum += le32_to_cpu(*isuper++); 1460 newcsum += le32_to_cpu(*isuper++);
1423 1461
1424 if (size == 2) 1462 if (size == 2)
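
calc_sb_1_csum() walks the superblock as little-endian 32-bit words (the `i` index reintroduced on the right-hand side is set but never read), adds a possible two-byte tail, and folds the 64-bit sum back into 32 bits. A simplified userspace sketch of the same idea; the real routine also zeroes sb_csum before summing and works on the actual struct layout:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Sum a buffer as 32-bit words into a 64-bit accumulator, add a possible
 * 2-byte tail, then fold the high half back into the low half.  Assumes
 * little-endian data, and skips the sb_csum handling for brevity. */
static uint32_t csum_fold(const void *buf, size_t size)
{
	const unsigned char *p = buf;
	uint64_t sum = 0;

	for (; size >= 4; size -= 4, p += 4) {
		uint32_t w;
		memcpy(&w, p, 4);
		sum += w;
	}
	if (size == 2) {
		uint16_t h;
		memcpy(&h, p, 2);
		sum += h;
	}
	return (uint32_t)(sum & 0xffffffff) + (uint32_t)(sum >> 32);
}

int main(void)
{
	unsigned char sb[256] = { 1, 2, 3, 4 };
	printf("csum = 0x%08x\n", csum_fold(sb, sizeof(sb)));
	return 0;
}
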
@@ -1431,12 +1469,11 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1431 1469
1432static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, 1470static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
1433 int acknowledged); 1471 int acknowledged);
1434static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1472static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1435{ 1473{
1436 struct mdp_superblock_1 *sb; 1474 struct mdp_superblock_1 *sb;
1437 int ret; 1475 int ret;
1438 sector_t sb_start; 1476 sector_t sb_start;
1439 sector_t sectors;
1440 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1477 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1441 int bmask; 1478 int bmask;
1442 1479
@@ -1491,18 +1528,9 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
1491 bdevname(rdev->bdev,b)); 1528 bdevname(rdev->bdev,b));
1492 return -EINVAL; 1529 return -EINVAL;
1493 } 1530 }
1494 if (sb->pad0 ||
1495 sb->pad3[0] ||
1496 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1497 /* Some padding is non-zero, might be a new feature */
1498 return -EINVAL;
1499 1531
1500 rdev->preferred_minor = 0xffff; 1532 rdev->preferred_minor = 0xffff;
1501 rdev->data_offset = le64_to_cpu(sb->data_offset); 1533 rdev->data_offset = le64_to_cpu(sb->data_offset);
1502 rdev->new_data_offset = rdev->data_offset;
1503 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1504 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1505 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1506 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1534 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1507 1535
1508 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1536 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
@@ -1513,9 +1541,6 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
1513 if (minor_version 1541 if (minor_version
1514 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1542 && rdev->data_offset < sb_start + (rdev->sb_size/512))
1515 return -EINVAL; 1543 return -EINVAL;
1516 if (minor_version
1517 && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1518 return -EINVAL;
1519 1544
1520 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1545 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1521 rdev->desc_nr = -1; 1546 rdev->desc_nr = -1;
@@ -1587,18 +1612,20 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
1587 else 1612 else
1588 ret = 0; 1613 ret = 0;
1589 } 1614 }
1590 if (minor_version) { 1615 if (minor_version)
1591 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9); 1616 rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
1592 sectors -= rdev->data_offset; 1617 le64_to_cpu(sb->data_offset);
1593 } else 1618 else
1594 sectors = rdev->sb_start; 1619 rdev->sectors = rdev->sb_start;
1595 if (sectors < le64_to_cpu(sb->data_size)) 1620 if (rdev->sectors < le64_to_cpu(sb->data_size))
1596 return -EINVAL; 1621 return -EINVAL;
1597 rdev->sectors = le64_to_cpu(sb->data_size); 1622 rdev->sectors = le64_to_cpu(sb->data_size);
1623 if (le64_to_cpu(sb->size) > rdev->sectors)
1624 return -EINVAL;
1598 return ret; 1625 return ret;
1599} 1626}
1600 1627
1601static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) 1628static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1602{ 1629{
1603 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 1630 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1604 __u64 ev1 = le64_to_cpu(sb->events); 1631 __u64 ev1 = le64_to_cpu(sb->events);
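
The size logic in the hunk above distinguishes the 1.x sub-formats: minor versions 1 and 2 keep the superblock near the start of the device, so the data area is the device size minus data_offset, while 1.0 keeps it at the end, so the data area runs up to sb_start; either way it must cover the data_size recorded in the superblock. A rough sketch with hypothetical numbers, all in 512-byte sectors:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t dev_size      = 7814037168ULL;  /* ~4 TB disk                 */
	uint64_t data_offset   = 2048;           /* v1.2: data starts at 1 MiB */
	uint64_t data_size     = 7814032384ULL;  /* size recorded in the sb    */
	uint64_t sb_start      = 8;              /* only relevant for 1.0      */
	int      minor_version = 2;

	uint64_t avail = minor_version ? dev_size - data_offset : sb_start;

	if (avail < data_size)
		printf("superblock claims more data than the device can hold\n");
	else
		printf("component size: %llu sectors\n", (unsigned long long)data_size);
	return 0;
}
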
@@ -1622,37 +1649,17 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1622 mddev->dev_sectors = le64_to_cpu(sb->size); 1649 mddev->dev_sectors = le64_to_cpu(sb->size);
1623 mddev->events = ev1; 1650 mddev->events = ev1;
1624 mddev->bitmap_info.offset = 0; 1651 mddev->bitmap_info.offset = 0;
1625 mddev->bitmap_info.space = 0;
1626 /* Default location for bitmap is 1K after superblock
1627 * using 3K - total of 4K
1628 */
1629 mddev->bitmap_info.default_offset = 1024 >> 9; 1652 mddev->bitmap_info.default_offset = 1024 >> 9;
1630 mddev->bitmap_info.default_space = (4096-1024) >> 9; 1653
1631 mddev->reshape_backwards = 0;
1632
1633 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1654 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1634 memcpy(mddev->uuid, sb->set_uuid, 16); 1655 memcpy(mddev->uuid, sb->set_uuid, 16);
1635 1656
1636 mddev->max_disks = (4096-256)/2; 1657 mddev->max_disks = (4096-256)/2;
1637 1658
1638 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1659 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1639 mddev->bitmap_info.file == NULL) { 1660 mddev->bitmap_info.file == NULL )
1640 mddev->bitmap_info.offset = 1661 mddev->bitmap_info.offset =
1641 (__s32)le32_to_cpu(sb->bitmap_offset); 1662 (__s32)le32_to_cpu(sb->bitmap_offset);
1642 /* Metadata doesn't record how much space is available.
1643 * For 1.0, we assume we can use up to the superblock
1644 * if before, else to 4K beyond superblock.
1645 * For others, assume no change is possible.
1646 */
1647 if (mddev->minor_version > 0)
1648 mddev->bitmap_info.space = 0;
1649 else if (mddev->bitmap_info.offset > 0)
1650 mddev->bitmap_info.space =
1651 8 - mddev->bitmap_info.offset;
1652 else
1653 mddev->bitmap_info.space =
1654 -mddev->bitmap_info.offset;
1655 }
1656 1663
1657 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1664 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1658 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1665 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
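
The removed bitmap_info.space hunk encodes a conservative rule, since 1.x metadata never records how much room the bitmap may occupy: on 1.0 a bitmap placed before the superblock may grow up to the superblock and one placed after it may use the following 4 KiB, while 1.1/1.2 assume no spare room at all. A sketch of that rule (plain C, hypothetical offsets in sectors relative to the superblock):

#include <stdio.h>

static long bitmap_space(int minor_version, long offset)
{
	if (minor_version > 0)   /* 1.1/1.2: no safe assumption, leave 0     */
		return 0;
	if (offset > 0)          /* 1.0, bitmap after sb: up to 4 KiB past it */
		return 8 - offset;
	return -offset;          /* 1.0, bitmap before sb: up to the sb       */
}

int main(void)
{
	printf("1.0, offset +2 : %ld sectors\n", bitmap_space(0, 2));   /* 6  */
	printf("1.0, offset -16: %ld sectors\n", bitmap_space(0, -16)); /* 16 */
	printf("1.2, offset +8 : %ld sectors\n", bitmap_space(2, 8));   /* 0  */
	return 0;
}
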
@@ -1660,11 +1667,6 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1660 mddev->new_level = le32_to_cpu(sb->new_level); 1667 mddev->new_level = le32_to_cpu(sb->new_level);
1661 mddev->new_layout = le32_to_cpu(sb->new_layout); 1668 mddev->new_layout = le32_to_cpu(sb->new_layout);
1662 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); 1669 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1663 if (mddev->delta_disks < 0 ||
1664 (mddev->delta_disks == 0 &&
1665 (le32_to_cpu(sb->feature_map)
1666 & MD_FEATURE_RESHAPE_BACKWARDS)))
1667 mddev->reshape_backwards = 1;
1668 } else { 1670 } else {
1669 mddev->reshape_position = MaxSector; 1671 mddev->reshape_position = MaxSector;
1670 mddev->delta_disks = 0; 1672 mddev->delta_disks = 0;
@@ -1718,18 +1720,16 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1718 } 1720 }
1719 if (sb->devflags & WriteMostly1) 1721 if (sb->devflags & WriteMostly1)
1720 set_bit(WriteMostly, &rdev->flags); 1722 set_bit(WriteMostly, &rdev->flags);
1721 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1722 set_bit(Replacement, &rdev->flags);
1723 } else /* MULTIPATH are always insync */ 1723 } else /* MULTIPATH are always insync */
1724 set_bit(In_sync, &rdev->flags); 1724 set_bit(In_sync, &rdev->flags);
1725 1725
1726 return 0; 1726 return 0;
1727} 1727}
1728 1728
1729static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) 1729static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1730{ 1730{
1731 struct mdp_superblock_1 *sb; 1731 struct mdp_superblock_1 *sb;
1732 struct md_rdev *rdev2; 1732 mdk_rdev_t *rdev2;
1733 int max_dev, i; 1733 int max_dev, i;
1734 /* make rdev->sb match mddev and rdev data. */ 1734 /* make rdev->sb match mddev and rdev data. */
1735 1735
@@ -1738,6 +1738,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1738 sb->feature_map = 0; 1738 sb->feature_map = 0;
1739 sb->pad0 = 0; 1739 sb->pad0 = 0;
1740 sb->recovery_offset = cpu_to_le64(0); 1740 sb->recovery_offset = cpu_to_le64(0);
1741 memset(sb->pad1, 0, sizeof(sb->pad1));
1741 memset(sb->pad3, 0, sizeof(sb->pad3)); 1742 memset(sb->pad3, 0, sizeof(sb->pad3));
1742 1743
1743 sb->utime = cpu_to_le64((__u64)mddev->utime); 1744 sb->utime = cpu_to_le64((__u64)mddev->utime);
@@ -1759,8 +1760,6 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1759 sb->devflags |= WriteMostly1; 1760 sb->devflags |= WriteMostly1;
1760 else 1761 else
1761 sb->devflags &= ~WriteMostly1; 1762 sb->devflags &= ~WriteMostly1;
1762 sb->data_offset = cpu_to_le64(rdev->data_offset);
1763 sb->data_size = cpu_to_le64(rdev->sectors);
1764 1763
1765 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 1764 if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1766 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 1765 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
@@ -1774,9 +1773,6 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1774 sb->recovery_offset = 1773 sb->recovery_offset =
1775 cpu_to_le64(rdev->recovery_offset); 1774 cpu_to_le64(rdev->recovery_offset);
1776 } 1775 }
1777 if (test_bit(Replacement, &rdev->flags))
1778 sb->feature_map |=
1779 cpu_to_le32(MD_FEATURE_REPLACEMENT);
1780 1776
1781 if (mddev->reshape_position != MaxSector) { 1777 if (mddev->reshape_position != MaxSector) {
1782 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1778 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
@@ -1785,16 +1781,6 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1785 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1781 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1786 sb->new_level = cpu_to_le32(mddev->new_level); 1782 sb->new_level = cpu_to_le32(mddev->new_level);
1787 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 1783 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1788 if (mddev->delta_disks == 0 &&
1789 mddev->reshape_backwards)
1790 sb->feature_map
1791 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
1792 if (rdev->new_data_offset != rdev->data_offset) {
1793 sb->feature_map
1794 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
1795 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
1796 - rdev->data_offset));
1797 }
1798 } 1784 }
1799 1785
1800 if (rdev->badblocks.count == 0) 1786 if (rdev->badblocks.count == 0)
@@ -1816,23 +1802,23 @@ retry:
1816 memset(bbp, 0xff, PAGE_SIZE); 1802 memset(bbp, 0xff, PAGE_SIZE);
1817 1803
1818 for (i = 0 ; i < bb->count ; i++) { 1804 for (i = 0 ; i < bb->count ; i++) {
1819 u64 internal_bb = p[i]; 1805 u64 internal_bb = *p++;
1820 u64 store_bb = ((BB_OFFSET(internal_bb) << 10) 1806 u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
1821 | BB_LEN(internal_bb)); 1807 | BB_LEN(internal_bb));
1822 bbp[i] = cpu_to_le64(store_bb); 1808 *bbp++ = cpu_to_le64(store_bb);
1823 } 1809 }
1824 bb->changed = 0;
1825 if (read_seqretry(&bb->lock, seq)) 1810 if (read_seqretry(&bb->lock, seq))
1826 goto retry; 1811 goto retry;
1827 1812
1828 bb->sector = (rdev->sb_start + 1813 bb->sector = (rdev->sb_start +
1829 (int)le32_to_cpu(sb->bblog_offset)); 1814 (int)le32_to_cpu(sb->bblog_offset));
1830 bb->size = le16_to_cpu(sb->bblog_size); 1815 bb->size = le16_to_cpu(sb->bblog_size);
1816 bb->changed = 0;
1831 } 1817 }
1832 } 1818 }
1833 1819
1834 max_dev = 0; 1820 max_dev = 0;
1835 rdev_for_each(rdev2, mddev) 1821 list_for_each_entry(rdev2, &mddev->disks, same_set)
1836 if (rdev2->desc_nr+1 > max_dev) 1822 if (rdev2->desc_nr+1 > max_dev)
1837 max_dev = rdev2->desc_nr+1; 1823 max_dev = rdev2->desc_nr+1;
1838 1824
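
Each record in the bad-block log written above is a single 64-bit word: the start sector shifted up by 10 bits, with the length (at most 512 sectors in this scheme, so it fits in the low 10 bits) packed beneath it, stored little-endian on disk. A sketch of the packing and unpacking:

#include <stdio.h>
#include <stdint.h>

static uint64_t bb_pack(uint64_t sector, unsigned len)
{
	return (sector << 10) | len;
}

static void bb_unpack(uint64_t entry, uint64_t *sector, unsigned *len)
{
	*sector = entry >> 10;
	*len    = (unsigned)(entry & 0x3ff);
}

int main(void)
{
	uint64_t sector;
	unsigned len;
	uint64_t entry = bb_pack(123456789ULL, 8);

	bb_unpack(entry, &sector, &len);
	printf("entry 0x%016llx -> sector %llu, %u sectors\n",
	       (unsigned long long)entry, (unsigned long long)sector, len);
	return 0;
}
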
@@ -1849,7 +1835,7 @@ retry:
1849 for (i=0; i<max_dev;i++) 1835 for (i=0; i<max_dev;i++)
1850 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1836 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1851 1837
1852 rdev_for_each(rdev2, mddev) { 1838 list_for_each_entry(rdev2, &mddev->disks, same_set) {
1853 i = rdev2->desc_nr; 1839 i = rdev2->desc_nr;
1854 if (test_bit(Faulty, &rdev2->flags)) 1840 if (test_bit(Faulty, &rdev2->flags))
1855 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1841 sb->dev_roles[i] = cpu_to_le16(0xfffe);
@@ -1865,14 +1851,12 @@ retry:
1865} 1851}
1866 1852
1867static unsigned long long 1853static unsigned long long
1868super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1854super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1869{ 1855{
1870 struct mdp_superblock_1 *sb; 1856 struct mdp_superblock_1 *sb;
1871 sector_t max_sectors; 1857 sector_t max_sectors;
1872 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1858 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1873 return 0; /* component must fit device */ 1859 return 0; /* component must fit device */
1874 if (rdev->data_offset != rdev->new_data_offset)
1875 return 0; /* too confusing */
1876 if (rdev->sb_start < rdev->data_offset) { 1860 if (rdev->sb_start < rdev->data_offset) {
1877 /* minor versions 1 and 2; superblock before data */ 1861 /* minor versions 1 and 2; superblock before data */
1878 max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; 1862 max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
@@ -1900,40 +1884,6 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1900 rdev->sb_page); 1884 rdev->sb_page);
1901 md_super_wait(rdev->mddev); 1885 md_super_wait(rdev->mddev);
1902 return num_sectors; 1886 return num_sectors;
1903
1904}
1905
1906static int
1907super_1_allow_new_offset(struct md_rdev *rdev,
1908 unsigned long long new_offset)
1909{
1910 /* All necessary checks on new >= old have been done */
1911 struct bitmap *bitmap;
1912 if (new_offset >= rdev->data_offset)
1913 return 1;
1914
1915 /* with 1.0 metadata, there is no metadata to tread on
1916 * so we can always move back */
1917 if (rdev->mddev->minor_version == 0)
1918 return 1;
1919
1920 /* otherwise we must be sure not to step on
1921 * any metadata, so stay:
1922 * 36K beyond start of superblock
1923 * beyond end of badblocks
1924 * beyond write-intent bitmap
1925 */
1926 if (rdev->sb_start + (32+4)*2 > new_offset)
1927 return 0;
1928 bitmap = rdev->mddev->bitmap;
1929 if (bitmap && !rdev->mddev->bitmap_info.file &&
1930 rdev->sb_start + rdev->mddev->bitmap_info.offset +
1931 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
1932 return 0;
1933 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
1934 return 0;
1935
1936 return 1;
1937} 1887}
1938 1888
1939static struct super_type super_types[] = { 1889static struct super_type super_types[] = {
@@ -1944,7 +1894,6 @@ static struct super_type super_types[] = {
1944 .validate_super = super_90_validate, 1894 .validate_super = super_90_validate,
1945 .sync_super = super_90_sync, 1895 .sync_super = super_90_sync,
1946 .rdev_size_change = super_90_rdev_size_change, 1896 .rdev_size_change = super_90_rdev_size_change,
1947 .allow_new_offset = super_90_allow_new_offset,
1948 }, 1897 },
1949 [1] = { 1898 [1] = {
1950 .name = "md-1", 1899 .name = "md-1",
@@ -1953,11 +1902,10 @@ static struct super_type super_types[] = {
1953 .validate_super = super_1_validate, 1902 .validate_super = super_1_validate,
1954 .sync_super = super_1_sync, 1903 .sync_super = super_1_sync,
1955 .rdev_size_change = super_1_rdev_size_change, 1904 .rdev_size_change = super_1_rdev_size_change,
1956 .allow_new_offset = super_1_allow_new_offset,
1957 }, 1905 },
1958}; 1906};
1959 1907
1960static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 1908static void sync_super(mddev_t *mddev, mdk_rdev_t *rdev)
1961{ 1909{
1962 if (mddev->sync_super) { 1910 if (mddev->sync_super) {
1963 mddev->sync_super(mddev, rdev); 1911 mddev->sync_super(mddev, rdev);
@@ -1969,9 +1917,9 @@ static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
1969 super_types[mddev->major_version].sync_super(mddev, rdev); 1917 super_types[mddev->major_version].sync_super(mddev, rdev);
1970} 1918}
1971 1919
1972static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 1920static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1973{ 1921{
1974 struct md_rdev *rdev, *rdev2; 1922 mdk_rdev_t *rdev, *rdev2;
1975 1923
1976 rcu_read_lock(); 1924 rcu_read_lock();
1977 rdev_for_each_rcu(rdev, mddev1) 1925 rdev_for_each_rcu(rdev, mddev1)
@@ -1994,15 +1942,15 @@ static LIST_HEAD(pending_raid_disks);
1994 * from the array. It only succeeds if all working and active component devices 1942 * from the array. It only succeeds if all working and active component devices
1995 * are integrity capable with matching profiles. 1943 * are integrity capable with matching profiles.
1996 */ 1944 */
1997int md_integrity_register(struct mddev *mddev) 1945int md_integrity_register(mddev_t *mddev)
1998{ 1946{
1999 struct md_rdev *rdev, *reference = NULL; 1947 mdk_rdev_t *rdev, *reference = NULL;
2000 1948
2001 if (list_empty(&mddev->disks)) 1949 if (list_empty(&mddev->disks))
2002 return 0; /* nothing to do */ 1950 return 0; /* nothing to do */
2003 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) 1951 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
2004 return 0; /* shouldn't register, or already is */ 1952 return 0; /* shouldn't register, or already is */
2005 rdev_for_each(rdev, mddev) { 1953 list_for_each_entry(rdev, &mddev->disks, same_set) {
2006 /* skip spares and non-functional disks */ 1954 /* skip spares and non-functional disks */
2007 if (test_bit(Faulty, &rdev->flags)) 1955 if (test_bit(Faulty, &rdev->flags))
2008 continue; 1956 continue;
@@ -2041,16 +1989,10 @@ int md_integrity_register(struct mddev *mddev)
2041EXPORT_SYMBOL(md_integrity_register); 1989EXPORT_SYMBOL(md_integrity_register);
2042 1990
2043/* Disable data integrity if non-capable/non-matching disk is being added */ 1991/* Disable data integrity if non-capable/non-matching disk is being added */
2044void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) 1992void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
2045{ 1993{
2046 struct blk_integrity *bi_rdev; 1994 struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
2047 struct blk_integrity *bi_mddev; 1995 struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk);
2048
2049 if (!mddev->gendisk)
2050 return;
2051
2052 bi_rdev = bdev_get_integrity(rdev->bdev);
2053 bi_mddev = blk_get_integrity(mddev->gendisk);
2054 1996
2055 if (!bi_mddev) /* nothing to do */ 1997 if (!bi_mddev) /* nothing to do */
2056 return; 1998 return;
@@ -2064,7 +2006,7 @@ void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
2064} 2006}
2065EXPORT_SYMBOL(md_integrity_add_rdev); 2007EXPORT_SYMBOL(md_integrity_add_rdev);
2066 2008
2067static int bind_rdev_to_array(struct md_rdev * rdev, struct mddev * mddev) 2009static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
2068{ 2010{
2069 char b[BDEVNAME_SIZE]; 2011 char b[BDEVNAME_SIZE];
2070 struct kobject *ko; 2012 struct kobject *ko;
@@ -2144,12 +2086,12 @@ static int bind_rdev_to_array(struct md_rdev * rdev, struct mddev * mddev)
2144 2086
2145static void md_delayed_delete(struct work_struct *ws) 2087static void md_delayed_delete(struct work_struct *ws)
2146{ 2088{
2147 struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); 2089 mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work);
2148 kobject_del(&rdev->kobj); 2090 kobject_del(&rdev->kobj);
2149 kobject_put(&rdev->kobj); 2091 kobject_put(&rdev->kobj);
2150} 2092}
2151 2093
2152static void unbind_rdev_from_array(struct md_rdev * rdev) 2094static void unbind_rdev_from_array(mdk_rdev_t * rdev)
2153{ 2095{
2154 char b[BDEVNAME_SIZE]; 2096 char b[BDEVNAME_SIZE];
2155 if (!rdev->mddev) { 2097 if (!rdev->mddev) {
@@ -2163,7 +2105,9 @@ static void unbind_rdev_from_array(struct md_rdev * rdev)
2163 sysfs_remove_link(&rdev->kobj, "block"); 2105 sysfs_remove_link(&rdev->kobj, "block");
2164 sysfs_put(rdev->sysfs_state); 2106 sysfs_put(rdev->sysfs_state);
2165 rdev->sysfs_state = NULL; 2107 rdev->sysfs_state = NULL;
2108 kfree(rdev->badblocks.page);
2166 rdev->badblocks.count = 0; 2109 rdev->badblocks.count = 0;
2110 rdev->badblocks.page = NULL;
2167 /* We need to delay this, otherwise we can deadlock when 2111 /* We need to delay this, otherwise we can deadlock when
2168 * writing to 'remove' to "dev/state". We also need 2112 * writing to 'remove' to "dev/state". We also need
2169 * to delay it due to rcu usage. 2113 * to delay it due to rcu usage.
@@ -2179,14 +2123,14 @@ static void unbind_rdev_from_array(struct md_rdev * rdev)
2179 * otherwise reused by a RAID array (or any other kernel 2123 * otherwise reused by a RAID array (or any other kernel
2180 * subsystem), by bd_claiming the device. 2124 * subsystem), by bd_claiming the device.
2181 */ 2125 */
2182static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) 2126static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
2183{ 2127{
2184 int err = 0; 2128 int err = 0;
2185 struct block_device *bdev; 2129 struct block_device *bdev;
2186 char b[BDEVNAME_SIZE]; 2130 char b[BDEVNAME_SIZE];
2187 2131
2188 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, 2132 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2189 shared ? (struct md_rdev *)lock_rdev : rdev); 2133 shared ? (mdk_rdev_t *)lock_rdev : rdev);
2190 if (IS_ERR(bdev)) { 2134 if (IS_ERR(bdev)) {
2191 printk(KERN_ERR "md: could not open %s.\n", 2135 printk(KERN_ERR "md: could not open %s.\n",
2192 __bdevname(dev, b)); 2136 __bdevname(dev, b));
@@ -2196,7 +2140,7 @@ static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
2196 return err; 2140 return err;
2197} 2141}
2198 2142
2199static void unlock_rdev(struct md_rdev *rdev) 2143static void unlock_rdev(mdk_rdev_t *rdev)
2200{ 2144{
2201 struct block_device *bdev = rdev->bdev; 2145 struct block_device *bdev = rdev->bdev;
2202 rdev->bdev = NULL; 2146 rdev->bdev = NULL;
@@ -2207,14 +2151,14 @@ static void unlock_rdev(struct md_rdev *rdev)
2207 2151
2208void md_autodetect_dev(dev_t dev); 2152void md_autodetect_dev(dev_t dev);
2209 2153
2210static void export_rdev(struct md_rdev * rdev) 2154static void export_rdev(mdk_rdev_t * rdev)
2211{ 2155{
2212 char b[BDEVNAME_SIZE]; 2156 char b[BDEVNAME_SIZE];
2213 printk(KERN_INFO "md: export_rdev(%s)\n", 2157 printk(KERN_INFO "md: export_rdev(%s)\n",
2214 bdevname(rdev->bdev,b)); 2158 bdevname(rdev->bdev,b));
2215 if (rdev->mddev) 2159 if (rdev->mddev)
2216 MD_BUG(); 2160 MD_BUG();
2217 md_rdev_clear(rdev); 2161 free_disk_sb(rdev);
2218#ifndef MODULE 2162#ifndef MODULE
2219 if (test_bit(AutoDetected, &rdev->flags)) 2163 if (test_bit(AutoDetected, &rdev->flags))
2220 md_autodetect_dev(rdev->bdev->bd_dev); 2164 md_autodetect_dev(rdev->bdev->bd_dev);
@@ -2223,17 +2167,17 @@ static void export_rdev(struct md_rdev * rdev)
2223 kobject_put(&rdev->kobj); 2167 kobject_put(&rdev->kobj);
2224} 2168}
2225 2169
2226static void kick_rdev_from_array(struct md_rdev * rdev) 2170static void kick_rdev_from_array(mdk_rdev_t * rdev)
2227{ 2171{
2228 unbind_rdev_from_array(rdev); 2172 unbind_rdev_from_array(rdev);
2229 export_rdev(rdev); 2173 export_rdev(rdev);
2230} 2174}
2231 2175
2232static void export_array(struct mddev *mddev) 2176static void export_array(mddev_t *mddev)
2233{ 2177{
2234 struct md_rdev *rdev, *tmp; 2178 mdk_rdev_t *rdev, *tmp;
2235 2179
2236 rdev_for_each_safe(rdev, tmp, mddev) { 2180 rdev_for_each(rdev, tmp, mddev) {
2237 if (!rdev->mddev) { 2181 if (!rdev->mddev) {
2238 MD_BUG(); 2182 MD_BUG();
2239 continue; 2183 continue;
@@ -2327,7 +2271,7 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
2327 ); 2271 );
2328} 2272}
2329 2273
2330static void print_rdev(struct md_rdev *rdev, int major_version) 2274static void print_rdev(mdk_rdev_t *rdev, int major_version)
2331{ 2275{
2332 char b[BDEVNAME_SIZE]; 2276 char b[BDEVNAME_SIZE];
2333 printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n", 2277 printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
@@ -2351,8 +2295,8 @@ static void print_rdev(struct md_rdev *rdev, int major_version)
2351static void md_print_devices(void) 2295static void md_print_devices(void)
2352{ 2296{
2353 struct list_head *tmp; 2297 struct list_head *tmp;
2354 struct md_rdev *rdev; 2298 mdk_rdev_t *rdev;
2355 struct mddev *mddev; 2299 mddev_t *mddev;
2356 char b[BDEVNAME_SIZE]; 2300 char b[BDEVNAME_SIZE];
2357 2301
2358 printk("\n"); 2302 printk("\n");
@@ -2365,11 +2309,11 @@ static void md_print_devices(void)
2365 bitmap_print_sb(mddev->bitmap); 2309 bitmap_print_sb(mddev->bitmap);
2366 else 2310 else
2367 printk("%s: ", mdname(mddev)); 2311 printk("%s: ", mdname(mddev));
2368 rdev_for_each(rdev, mddev) 2312 list_for_each_entry(rdev, &mddev->disks, same_set)
2369 printk("<%s>", bdevname(rdev->bdev,b)); 2313 printk("<%s>", bdevname(rdev->bdev,b));
2370 printk("\n"); 2314 printk("\n");
2371 2315
2372 rdev_for_each(rdev, mddev) 2316 list_for_each_entry(rdev, &mddev->disks, same_set)
2373 print_rdev(rdev, mddev->major_version); 2317 print_rdev(rdev, mddev->major_version);
2374 } 2318 }
2375 printk("md: **********************************\n"); 2319 printk("md: **********************************\n");
@@ -2377,7 +2321,7 @@ static void md_print_devices(void)
2377} 2321}
2378 2322
2379 2323
2380static void sync_sbs(struct mddev * mddev, int nospares) 2324static void sync_sbs(mddev_t * mddev, int nospares)
2381{ 2325{
2382 /* Update each superblock (in-memory image), but 2326 /* Update each superblock (in-memory image), but
2383 * if we are allowed to, skip spares which already 2327 * if we are allowed to, skip spares which already
@@ -2385,8 +2329,8 @@ static void sync_sbs(struct mddev * mddev, int nospares)
2385 * (which would mean they aren't being marked as dirty 2329 * (which would mean they aren't being marked as dirty
2386 * with the rest of the array) 2330 * with the rest of the array)
2387 */ 2331 */
2388 struct md_rdev *rdev; 2332 mdk_rdev_t *rdev;
2389 rdev_for_each(rdev, mddev) { 2333 list_for_each_entry(rdev, &mddev->disks, same_set) {
2390 if (rdev->sb_events == mddev->events || 2334 if (rdev->sb_events == mddev->events ||
2391 (nospares && 2335 (nospares &&
2392 rdev->raid_disk < 0 && 2336 rdev->raid_disk < 0 &&
@@ -2400,16 +2344,16 @@ static void sync_sbs(struct mddev * mddev, int nospares)
2400 } 2344 }
2401} 2345}
2402 2346
2403static void md_update_sb(struct mddev * mddev, int force_change) 2347static void md_update_sb(mddev_t * mddev, int force_change)
2404{ 2348{
2405 struct md_rdev *rdev; 2349 mdk_rdev_t *rdev;
2406 int sync_req; 2350 int sync_req;
2407 int nospares = 0; 2351 int nospares = 0;
2408 int any_badblocks_changed = 0; 2352 int any_badblocks_changed = 0;
2409 2353
2410repeat: 2354repeat:
2411 /* First make sure individual recovery_offsets are correct */ 2355 /* First make sure individual recovery_offsets are correct */
2412 rdev_for_each(rdev, mddev) { 2356 list_for_each_entry(rdev, &mddev->disks, same_set) {
2413 if (rdev->raid_disk >= 0 && 2357 if (rdev->raid_disk >= 0 &&
2414 mddev->delta_disks >= 0 && 2358 mddev->delta_disks >= 0 &&
2415 !test_bit(In_sync, &rdev->flags) && 2359 !test_bit(In_sync, &rdev->flags) &&
@@ -2422,9 +2366,8 @@ repeat:
2422 clear_bit(MD_CHANGE_DEVS, &mddev->flags); 2366 clear_bit(MD_CHANGE_DEVS, &mddev->flags);
2423 if (!mddev->external) { 2367 if (!mddev->external) {
2424 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2368 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2425 rdev_for_each(rdev, mddev) { 2369 list_for_each_entry(rdev, &mddev->disks, same_set) {
2426 if (rdev->badblocks.changed) { 2370 if (rdev->badblocks.changed) {
2427 rdev->badblocks.changed = 0;
2428 md_ack_all_badblocks(&rdev->badblocks); 2371 md_ack_all_badblocks(&rdev->badblocks);
2429 md_error(mddev, rdev); 2372 md_error(mddev, rdev);
2430 } 2373 }
@@ -2489,7 +2432,7 @@ repeat:
2489 mddev->events --; 2432 mddev->events --;
2490 } 2433 }
2491 2434
2492 rdev_for_each(rdev, mddev) { 2435 list_for_each_entry(rdev, &mddev->disks, same_set) {
2493 if (rdev->badblocks.changed) 2436 if (rdev->badblocks.changed)
2494 any_badblocks_changed++; 2437 any_badblocks_changed++;
2495 if (test_bit(Faulty, &rdev->flags)) 2438 if (test_bit(Faulty, &rdev->flags))
@@ -2499,24 +2442,27 @@ repeat:
2499 sync_sbs(mddev, nospares); 2442 sync_sbs(mddev, nospares);
2500 spin_unlock_irq(&mddev->write_lock); 2443 spin_unlock_irq(&mddev->write_lock);
2501 2444
2502 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2445 dprintk(KERN_INFO
2503 mdname(mddev), mddev->in_sync); 2446 "md: updating %s RAID superblock on device (in sync %d)\n",
2447 mdname(mddev),mddev->in_sync);
2504 2448
2505 bitmap_update_sb(mddev->bitmap); 2449 bitmap_update_sb(mddev->bitmap);
2506 rdev_for_each(rdev, mddev) { 2450 list_for_each_entry(rdev, &mddev->disks, same_set) {
2507 char b[BDEVNAME_SIZE]; 2451 char b[BDEVNAME_SIZE];
2508 2452 dprintk(KERN_INFO "md: ");
2509 if (rdev->sb_loaded != 1) 2453 if (rdev->sb_loaded != 1)
2510 continue; /* no noise on spare devices */ 2454 continue; /* no noise on spare devices */
2455 if (test_bit(Faulty, &rdev->flags))
2456 dprintk("(skipping faulty ");
2511 2457
2512 if (!test_bit(Faulty, &rdev->flags) && 2458 dprintk("%s ", bdevname(rdev->bdev,b));
2513 rdev->saved_raid_disk == -1) { 2459 if (!test_bit(Faulty, &rdev->flags)) {
2514 md_super_write(mddev,rdev, 2460 md_super_write(mddev,rdev,
2515 rdev->sb_start, rdev->sb_size, 2461 rdev->sb_start, rdev->sb_size,
2516 rdev->sb_page); 2462 rdev->sb_page);
2517 pr_debug("md: (write) %s's sb offset: %llu\n", 2463 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
2518 bdevname(rdev->bdev, b), 2464 bdevname(rdev->bdev,b),
2519 (unsigned long long)rdev->sb_start); 2465 (unsigned long long)rdev->sb_start);
2520 rdev->sb_events = mddev->events; 2466 rdev->sb_events = mddev->events;
2521 if (rdev->badblocks.size) { 2467 if (rdev->badblocks.size) {
2522 md_super_write(mddev, rdev, 2468 md_super_write(mddev, rdev,
@@ -2526,12 +2472,8 @@ repeat:
2526 rdev->badblocks.size = 0; 2472 rdev->badblocks.size = 0;
2527 } 2473 }
2528 2474
2529 } else if (test_bit(Faulty, &rdev->flags)) 2475 } else
2530 pr_debug("md: %s (skipping faulty)\n", 2476 dprintk(")\n");
2531 bdevname(rdev->bdev, b));
2532 else
2533 pr_debug("(skipping incremental s/r ");
2534
2535 if (mddev->level == LEVEL_MULTIPATH) 2477 if (mddev->level == LEVEL_MULTIPATH)
2536 /* only need to write one superblock... */ 2478 /* only need to write one superblock... */
2537 break; 2479 break;
@@ -2552,7 +2494,7 @@ repeat:
2552 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2494 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2553 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2495 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2554 2496
2555 rdev_for_each(rdev, mddev) { 2497 list_for_each_entry(rdev, &mddev->disks, same_set) {
2556 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2498 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2557 clear_bit(Blocked, &rdev->flags); 2499 clear_bit(Blocked, &rdev->flags);
2558 2500
@@ -2585,12 +2527,12 @@ static int cmd_match(const char *cmd, const char *str)
2585 2527
2586struct rdev_sysfs_entry { 2528struct rdev_sysfs_entry {
2587 struct attribute attr; 2529 struct attribute attr;
2588 ssize_t (*show)(struct md_rdev *, char *); 2530 ssize_t (*show)(mdk_rdev_t *, char *);
2589 ssize_t (*store)(struct md_rdev *, const char *, size_t); 2531 ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
2590}; 2532};
2591 2533
2592static ssize_t 2534static ssize_t
2593state_show(struct md_rdev *rdev, char *page) 2535state_show(mdk_rdev_t *rdev, char *page)
2594{ 2536{
2595 char *sep = ""; 2537 char *sep = "";
2596 size_t len = 0; 2538 size_t len = 0;
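
Every rdev sysfs file is described by a struct rdev_sysfs_entry pairing an attribute with show/store callbacks, and the show side builds a comma-separated flag list into the page buffer it is handed. A userspace sketch of that formatting style, with hypothetical flag bits standing in for rdev->flags:

#include <stdio.h>

#define F_FAULTY       (1 << 0)
#define F_IN_SYNC      (1 << 1)
#define F_WRITE_MOSTLY (1 << 2)

static int state_show(unsigned flags, char *page, int cap)
{
	const char *sep = "";
	int len = 0;

	if (flags & F_FAULTY)       { len += snprintf(page + len, cap - len, "%sfaulty", sep);       sep = ","; }
	if (flags & F_IN_SYNC)      { len += snprintf(page + len, cap - len, "%sin_sync", sep);      sep = ","; }
	if (flags & F_WRITE_MOSTLY) { len += snprintf(page + len, cap - len, "%swrite_mostly", sep); sep = ","; }

	return len + snprintf(page + len, cap - len, "\n");
}

int main(void)
{
	char page[64];

	state_show(F_IN_SYNC | F_WRITE_MOSTLY, page, sizeof(page));
	fputs(page, stdout);    /* prints "in_sync,write_mostly" */
	return 0;
}
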
@@ -2609,8 +2551,7 @@ state_show(struct md_rdev *rdev, char *page)
2609 sep = ","; 2551 sep = ",";
2610 } 2552 }
2611 if (test_bit(Blocked, &rdev->flags) || 2553 if (test_bit(Blocked, &rdev->flags) ||
2612 (rdev->badblocks.unacked_exist 2554 rdev->badblocks.unacked_exist) {
2613 && !test_bit(Faulty, &rdev->flags))) {
2614 len += sprintf(page+len, "%sblocked", sep); 2555 len += sprintf(page+len, "%sblocked", sep);
2615 sep = ","; 2556 sep = ",";
2616 } 2557 }
@@ -2623,20 +2564,11 @@ state_show(struct md_rdev *rdev, char *page)
2623 len += sprintf(page+len, "%swrite_error", sep); 2564 len += sprintf(page+len, "%swrite_error", sep);
2624 sep = ","; 2565 sep = ",";
2625 } 2566 }
2626 if (test_bit(WantReplacement, &rdev->flags)) {
2627 len += sprintf(page+len, "%swant_replacement", sep);
2628 sep = ",";
2629 }
2630 if (test_bit(Replacement, &rdev->flags)) {
2631 len += sprintf(page+len, "%sreplacement", sep);
2632 sep = ",";
2633 }
2634
2635 return len+sprintf(page+len, "\n"); 2567 return len+sprintf(page+len, "\n");
2636} 2568}
2637 2569
2638static ssize_t 2570static ssize_t
2639state_store(struct md_rdev *rdev, const char *buf, size_t len) 2571state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2640{ 2572{
2641 /* can write 2573 /* can write
2642 * faulty - simulates an error 2574 * faulty - simulates an error
@@ -2660,7 +2592,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
2660 if (rdev->raid_disk >= 0) 2592 if (rdev->raid_disk >= 0)
2661 err = -EBUSY; 2593 err = -EBUSY;
2662 else { 2594 else {
2663 struct mddev *mddev = rdev->mddev; 2595 mddev_t *mddev = rdev->mddev;
2664 kick_rdev_from_array(rdev); 2596 kick_rdev_from_array(rdev);
2665 if (mddev->pers) 2597 if (mddev->pers)
2666 md_update_sb(mddev, 1); 2598 md_update_sb(mddev, 1);
@@ -2700,42 +2632,6 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
2700 } else if (cmd_match(buf, "-write_error")) { 2632 } else if (cmd_match(buf, "-write_error")) {
2701 clear_bit(WriteErrorSeen, &rdev->flags); 2633 clear_bit(WriteErrorSeen, &rdev->flags);
2702 err = 0; 2634 err = 0;
2703 } else if (cmd_match(buf, "want_replacement")) {
2704 /* Any non-spare device that is not a replacement can
2705 * become want_replacement at any time, but we then need to
2706 * check if recovery is needed.
2707 */
2708 if (rdev->raid_disk >= 0 &&
2709 !test_bit(Replacement, &rdev->flags))
2710 set_bit(WantReplacement, &rdev->flags);
2711 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2712 md_wakeup_thread(rdev->mddev->thread);
2713 err = 0;
2714 } else if (cmd_match(buf, "-want_replacement")) {
2715 /* Clearing 'want_replacement' is always allowed.
2716 * Once replacements starts it is too late though.
2717 */
2718 err = 0;
2719 clear_bit(WantReplacement, &rdev->flags);
2720 } else if (cmd_match(buf, "replacement")) {
2721 /* Can only set a device as a replacement when array has not
2722 * yet been started. Once running, replacement is automatic
2723 * from spares, or by assigning 'slot'.
2724 */
2725 if (rdev->mddev->pers)
2726 err = -EBUSY;
2727 else {
2728 set_bit(Replacement, &rdev->flags);
2729 err = 0;
2730 }
2731 } else if (cmd_match(buf, "-replacement")) {
2732 /* Similarly, can only clear Replacement before start */
2733 if (rdev->mddev->pers)
2734 err = -EBUSY;
2735 else {
2736 clear_bit(Replacement, &rdev->flags);
2737 err = 0;
2738 }
2739 } 2635 }
2740 if (!err) 2636 if (!err)
2741 sysfs_notify_dirent_safe(rdev->sysfs_state); 2637 sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -2745,13 +2641,13 @@ static struct rdev_sysfs_entry rdev_state =
2745__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); 2641__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
2746 2642
2747static ssize_t 2643static ssize_t
2748errors_show(struct md_rdev *rdev, char *page) 2644errors_show(mdk_rdev_t *rdev, char *page)
2749{ 2645{
2750 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 2646 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2751} 2647}
2752 2648
2753static ssize_t 2649static ssize_t
2754errors_store(struct md_rdev *rdev, const char *buf, size_t len) 2650errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2755{ 2651{
2756 char *e; 2652 char *e;
2757 unsigned long n = simple_strtoul(buf, &e, 10); 2653 unsigned long n = simple_strtoul(buf, &e, 10);
@@ -2765,7 +2661,7 @@ static struct rdev_sysfs_entry rdev_errors =
2765__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 2661__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2766 2662
2767static ssize_t 2663static ssize_t
2768slot_show(struct md_rdev *rdev, char *page) 2664slot_show(mdk_rdev_t *rdev, char *page)
2769{ 2665{
2770 if (rdev->raid_disk < 0) 2666 if (rdev->raid_disk < 0)
2771 return sprintf(page, "none\n"); 2667 return sprintf(page, "none\n");
@@ -2774,7 +2670,7 @@ slot_show(struct md_rdev *rdev, char *page)
2774} 2670}
2775 2671
2776static ssize_t 2672static ssize_t
2777slot_store(struct md_rdev *rdev, const char *buf, size_t len) 2673slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2778{ 2674{
2779 char *e; 2675 char *e;
2780 int err; 2676 int err;
@@ -2797,7 +2693,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2797 if (rdev->mddev->pers->hot_remove_disk == NULL) 2693 if (rdev->mddev->pers->hot_remove_disk == NULL)
2798 return -EINVAL; 2694 return -EINVAL;
2799 err = rdev->mddev->pers-> 2695 err = rdev->mddev->pers->
2800 hot_remove_disk(rdev->mddev, rdev); 2696 hot_remove_disk(rdev->mddev, rdev->raid_disk);
2801 if (err) 2697 if (err)
2802 return err; 2698 return err;
2803 sysfs_unlink_rdev(rdev->mddev, rdev); 2699 sysfs_unlink_rdev(rdev->mddev, rdev);
@@ -2805,6 +2701,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2805 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2701 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2806 md_wakeup_thread(rdev->mddev->thread); 2702 md_wakeup_thread(rdev->mddev->thread);
2807 } else if (rdev->mddev->pers) { 2703 } else if (rdev->mddev->pers) {
2704 mdk_rdev_t *rdev2;
2808 /* Activating a spare .. or possibly reactivating 2705 /* Activating a spare .. or possibly reactivating
2809 * if we ever get bitmaps working here. 2706 * if we ever get bitmaps working here.
2810 */ 2707 */
@@ -2818,6 +2715,10 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2818 if (rdev->mddev->pers->hot_add_disk == NULL) 2715 if (rdev->mddev->pers->hot_add_disk == NULL)
2819 return -EINVAL; 2716 return -EINVAL;
2820 2717
2718 list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
2719 if (rdev2->raid_disk == slot)
2720 return -EEXIST;
2721
2821 if (slot >= rdev->mddev->raid_disks && 2722 if (slot >= rdev->mddev->raid_disks &&
2822 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 2723 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2823 return -ENOSPC; 2724 return -ENOSPC;
@@ -2827,7 +2728,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2827 rdev->saved_raid_disk = slot; 2728 rdev->saved_raid_disk = slot;
2828 else 2729 else
2829 rdev->saved_raid_disk = -1; 2730 rdev->saved_raid_disk = -1;
2830 clear_bit(In_sync, &rdev->flags);
2831 err = rdev->mddev->pers-> 2731 err = rdev->mddev->pers->
2832 hot_add_disk(rdev->mddev, rdev); 2732 hot_add_disk(rdev->mddev, rdev);
2833 if (err) { 2733 if (err) {
@@ -2857,16 +2757,17 @@ static struct rdev_sysfs_entry rdev_slot =
2857__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 2757__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
2858 2758
2859static ssize_t 2759static ssize_t
2860offset_show(struct md_rdev *rdev, char *page) 2760offset_show(mdk_rdev_t *rdev, char *page)
2861{ 2761{
2862 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 2762 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
2863} 2763}
2864 2764
2865static ssize_t 2765static ssize_t
2866offset_store(struct md_rdev *rdev, const char *buf, size_t len) 2766offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2867{ 2767{
2868 unsigned long long offset; 2768 char *e;
2869 if (strict_strtoull(buf, 10, &offset) < 0) 2769 unsigned long long offset = simple_strtoull(buf, &e, 10);
2770 if (e==buf || (*e && *e != '\n'))
2870 return -EINVAL; 2771 return -EINVAL;
2871 if (rdev->mddev->pers && rdev->raid_disk >= 0) 2772 if (rdev->mddev->pers && rdev->raid_disk >= 0)
2872 return -EBUSY; 2773 return -EBUSY;
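
offset_store() on the right-hand side accepts a plain decimal value optionally followed by a single newline (sysfs writes usually end with one) and rejects empty or trailing-garbage input. Roughly the same check with portable strtoull, slightly stricter about what may follow the newline:

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

/* Accept "1234" or "1234\n"; reject "", "12x", "0x10", trailing junk. */
static int parse_sectors(const char *buf, unsigned long long *out)
{
	char *end;

	errno = 0;
	*out = strtoull(buf, &end, 10);
	if (end == buf || errno == ERANGE)
		return -1;
	if (*end && !(*end == '\n' && end[1] == '\0'))
		return -1;
	return 0;
}

int main(void)
{
	unsigned long long v;

	printf("'2048\\n' -> %d (v=%llu)\n", parse_sectors("2048\n", &v), v);
	printf("'12x'    -> %d\n", parse_sectors("12x", &v));
	return 0;
}
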
@@ -2875,72 +2776,14 @@ offset_store(struct md_rdev *rdev, const char *buf, size_t len)
2875 * can be sane */ 2776 * can be sane */
2876 return -EBUSY; 2777 return -EBUSY;
2877 rdev->data_offset = offset; 2778 rdev->data_offset = offset;
2878 rdev->new_data_offset = offset;
2879 return len; 2779 return len;
2880} 2780}
2881 2781
2882static struct rdev_sysfs_entry rdev_offset = 2782static struct rdev_sysfs_entry rdev_offset =
2883__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 2783__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2884 2784
2885static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
2886{
2887 return sprintf(page, "%llu\n",
2888 (unsigned long long)rdev->new_data_offset);
2889}
2890
2891static ssize_t new_offset_store(struct md_rdev *rdev,
2892 const char *buf, size_t len)
2893{
2894 unsigned long long new_offset;
2895 struct mddev *mddev = rdev->mddev;
2896
2897 if (strict_strtoull(buf, 10, &new_offset) < 0)
2898 return -EINVAL;
2899
2900 if (mddev->sync_thread)
2901 return -EBUSY;
2902 if (new_offset == rdev->data_offset)
2903 /* reset is always permitted */
2904 ;
2905 else if (new_offset > rdev->data_offset) {
2906 /* must not push array size beyond rdev_sectors */
2907 if (new_offset - rdev->data_offset
2908 + mddev->dev_sectors > rdev->sectors)
2909 return -E2BIG;
2910 }
2911 /* Metadata worries about other space details. */
2912
2913 /* decreasing the offset is inconsistent with a backwards
2914 * reshape.
2915 */
2916 if (new_offset < rdev->data_offset &&
2917 mddev->reshape_backwards)
2918 return -EINVAL;
2919 /* Increasing offset is inconsistent with forwards
2920 * reshape. reshape_direction should be set to
2921 * 'backwards' first.
2922 */
2923 if (new_offset > rdev->data_offset &&
2924 !mddev->reshape_backwards)
2925 return -EINVAL;
2926
2927 if (mddev->pers && mddev->persistent &&
2928 !super_types[mddev->major_version]
2929 .allow_new_offset(rdev, new_offset))
2930 return -E2BIG;
2931 rdev->new_data_offset = new_offset;
2932 if (new_offset > rdev->data_offset)
2933 mddev->reshape_backwards = 1;
2934 else if (new_offset < rdev->data_offset)
2935 mddev->reshape_backwards = 0;
2936
2937 return len;
2938}
2939static struct rdev_sysfs_entry rdev_new_offset =
2940__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
2941
2942static ssize_t 2785static ssize_t
2943rdev_size_show(struct md_rdev *rdev, char *page) 2786rdev_size_show(mdk_rdev_t *rdev, char *page)
2944{ 2787{
2945 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 2788 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
2946} 2789}
@@ -2975,16 +2818,14 @@ static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
2975} 2818}
2976 2819
2977static ssize_t 2820static ssize_t
2978rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) 2821rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2979{ 2822{
2980 struct mddev *my_mddev = rdev->mddev; 2823 mddev_t *my_mddev = rdev->mddev;
2981 sector_t oldsectors = rdev->sectors; 2824 sector_t oldsectors = rdev->sectors;
2982 sector_t sectors; 2825 sector_t sectors;
2983 2826
2984 if (strict_blocks_to_sectors(buf, &sectors) < 0) 2827 if (strict_blocks_to_sectors(buf, &sectors) < 0)
2985 return -EINVAL; 2828 return -EINVAL;
2986 if (rdev->data_offset != rdev->new_data_offset)
2987 return -EINVAL; /* too confusing */
2988 if (my_mddev->pers && rdev->raid_disk >= 0) { 2829 if (my_mddev->pers && rdev->raid_disk >= 0) {
2989 if (my_mddev->persistent) { 2830 if (my_mddev->persistent) {
2990 sectors = super_types[my_mddev->major_version]. 2831 sectors = super_types[my_mddev->major_version].
@@ -3005,16 +2846,16 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3005 * a deadlock. We have already changed rdev->sectors, and if 2846 * a deadlock. We have already changed rdev->sectors, and if
3006 * we have to change it back, we will have the lock again. 2847 * we have to change it back, we will have the lock again.
3007 */ 2848 */
3008 struct mddev *mddev; 2849 mddev_t *mddev;
3009 int overlap = 0; 2850 int overlap = 0;
3010 struct list_head *tmp; 2851 struct list_head *tmp;
3011 2852
3012 mddev_unlock(my_mddev); 2853 mddev_unlock(my_mddev);
3013 for_each_mddev(mddev, tmp) { 2854 for_each_mddev(mddev, tmp) {
3014 struct md_rdev *rdev2; 2855 mdk_rdev_t *rdev2;
3015 2856
3016 mddev_lock(mddev); 2857 mddev_lock(mddev);
3017 rdev_for_each(rdev2, mddev) 2858 list_for_each_entry(rdev2, &mddev->disks, same_set)
3018 if (rdev->bdev == rdev2->bdev && 2859 if (rdev->bdev == rdev2->bdev &&
3019 rdev != rdev2 && 2860 rdev != rdev2 &&
3020 overlaps(rdev->data_offset, rdev->sectors, 2861 overlaps(rdev->data_offset, rdev->sectors,
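
rdev_size_store() refuses a resize whose [data_offset, data_offset + sectors) range would collide with another md component sitting on the same block device. The overlaps() helper itself is not shown in this hunk; it amounts to the usual half-open interval test, sketched here standalone:

#include <stdio.h>
#include <stdint.h>

/* Two half-open sector ranges [s1, s1+l1) and [s2, s2+l2) overlap unless
 * one ends at or before the other begins. */
static int overlaps(uint64_t s1, uint64_t l1, uint64_t s2, uint64_t l2)
{
	if (s1 + l1 <= s2)
		return 0;
	if (s2 + l2 <= s1)
		return 0;
	return 1;
}

int main(void)
{
	/* two components carved out of the same underlying disk */
	printf("%d\n", overlaps(0, 1024, 1024, 2048));  /* 0: back to back          */
	printf("%d\n", overlaps(0, 2048, 1024, 2048));  /* 1: 1024 sectors shared   */
	return 0;
}
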
@@ -3048,7 +2889,7 @@ static struct rdev_sysfs_entry rdev_size =
3048__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 2889__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
3049 2890
3050 2891
3051static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 2892static ssize_t recovery_start_show(mdk_rdev_t *rdev, char *page)
3052{ 2893{
3053 unsigned long long recovery_start = rdev->recovery_offset; 2894 unsigned long long recovery_start = rdev->recovery_offset;
3054 2895
@@ -3059,7 +2900,7 @@ static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3059 return sprintf(page, "%llu\n", recovery_start); 2900 return sprintf(page, "%llu\n", recovery_start);
3060} 2901}
3061 2902
3062static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 2903static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t len)
3063{ 2904{
3064 unsigned long long recovery_start; 2905 unsigned long long recovery_start;
3065 2906
@@ -3089,11 +2930,11 @@ badblocks_show(struct badblocks *bb, char *page, int unack);
3089static ssize_t 2930static ssize_t
3090badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); 2931badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
3091 2932
3092static ssize_t bb_show(struct md_rdev *rdev, char *page) 2933static ssize_t bb_show(mdk_rdev_t *rdev, char *page)
3093{ 2934{
3094 return badblocks_show(&rdev->badblocks, page, 0); 2935 return badblocks_show(&rdev->badblocks, page, 0);
3095} 2936}
3096static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) 2937static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len)
3097{ 2938{
3098 int rv = badblocks_store(&rdev->badblocks, page, len, 0); 2939 int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3099 /* Maybe that ack was all we needed */ 2940 /* Maybe that ack was all we needed */
@@ -3105,11 +2946,11 @@ static struct rdev_sysfs_entry rdev_bad_blocks =
3105__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 2946__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3106 2947
3107 2948
3108static ssize_t ubb_show(struct md_rdev *rdev, char *page) 2949static ssize_t ubb_show(mdk_rdev_t *rdev, char *page)
3109{ 2950{
3110 return badblocks_show(&rdev->badblocks, page, 1); 2951 return badblocks_show(&rdev->badblocks, page, 1);
3111} 2952}
3112static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) 2953static ssize_t ubb_store(mdk_rdev_t *rdev, const char *page, size_t len)
3113{ 2954{
3114 return badblocks_store(&rdev->badblocks, page, len, 1); 2955 return badblocks_store(&rdev->badblocks, page, len, 1);
3115} 2956}
@@ -3121,7 +2962,6 @@ static struct attribute *rdev_default_attrs[] = {
3121 &rdev_errors.attr, 2962 &rdev_errors.attr,
3122 &rdev_slot.attr, 2963 &rdev_slot.attr,
3123 &rdev_offset.attr, 2964 &rdev_offset.attr,
3124 &rdev_new_offset.attr,
3125 &rdev_size.attr, 2965 &rdev_size.attr,
3126 &rdev_recovery_start.attr, 2966 &rdev_recovery_start.attr,
3127 &rdev_bad_blocks.attr, 2967 &rdev_bad_blocks.attr,
@@ -3132,8 +2972,8 @@ static ssize_t
3132rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 2972rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3133{ 2973{
3134 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 2974 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3135 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 2975 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
3136 struct mddev *mddev = rdev->mddev; 2976 mddev_t *mddev = rdev->mddev;
3137 ssize_t rv; 2977 ssize_t rv;
3138 2978
3139 if (!entry->show) 2979 if (!entry->show)
@@ -3155,9 +2995,9 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3155 const char *page, size_t length) 2995 const char *page, size_t length)
3156{ 2996{
3157 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 2997 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3158 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 2998 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
3159 ssize_t rv; 2999 ssize_t rv;
3160 struct mddev *mddev = rdev->mddev; 3000 mddev_t *mddev = rdev->mddev;
3161 3001
3162 if (!entry->store) 3002 if (!entry->store)
3163 return -EIO; 3003 return -EIO;
@@ -3176,7 +3016,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3176 3016
3177static void rdev_free(struct kobject *ko) 3017static void rdev_free(struct kobject *ko)
3178{ 3018{
3179 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3019 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
3180 kfree(rdev); 3020 kfree(rdev);
3181} 3021}
3182static const struct sysfs_ops rdev_sysfs_ops = { 3022static const struct sysfs_ops rdev_sysfs_ops = {
@@ -3189,14 +3029,13 @@ static struct kobj_type rdev_ktype = {
3189 .default_attrs = rdev_default_attrs, 3029 .default_attrs = rdev_default_attrs,
3190}; 3030};
3191 3031
3192int md_rdev_init(struct md_rdev *rdev) 3032int md_rdev_init(mdk_rdev_t *rdev)
3193{ 3033{
3194 rdev->desc_nr = -1; 3034 rdev->desc_nr = -1;
3195 rdev->saved_raid_disk = -1; 3035 rdev->saved_raid_disk = -1;
3196 rdev->raid_disk = -1; 3036 rdev->raid_disk = -1;
3197 rdev->flags = 0; 3037 rdev->flags = 0;
3198 rdev->data_offset = 0; 3038 rdev->data_offset = 0;
3199 rdev->new_data_offset = 0;
3200 rdev->sb_events = 0; 3039 rdev->sb_events = 0;
3201 rdev->last_read_error.tv_sec = 0; 3040 rdev->last_read_error.tv_sec = 0;
3202 rdev->last_read_error.tv_nsec = 0; 3041 rdev->last_read_error.tv_nsec = 0;
@@ -3233,11 +3072,11 @@ EXPORT_SYMBOL_GPL(md_rdev_init);
3233 * 3072 *
3234 * a faulty rdev _never_ has rdev->sb set. 3073 * a faulty rdev _never_ has rdev->sb set.
3235 */ 3074 */
3236static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3075static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
3237{ 3076{
3238 char b[BDEVNAME_SIZE]; 3077 char b[BDEVNAME_SIZE];
3239 int err; 3078 int err;
3240 struct md_rdev *rdev; 3079 mdk_rdev_t *rdev;
3241 sector_t size; 3080 sector_t size;
3242 3081
3243 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3082 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
@@ -3295,7 +3134,8 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
3295abort_free: 3134abort_free:
3296 if (rdev->bdev) 3135 if (rdev->bdev)
3297 unlock_rdev(rdev); 3136 unlock_rdev(rdev);
3298 md_rdev_clear(rdev); 3137 free_disk_sb(rdev);
3138 kfree(rdev->badblocks.page);
3299 kfree(rdev); 3139 kfree(rdev);
3300 return ERR_PTR(err); 3140 return ERR_PTR(err);
3301} 3141}
@@ -3305,14 +3145,14 @@ abort_free:
3305 */ 3145 */
3306 3146
3307 3147
3308static void analyze_sbs(struct mddev * mddev) 3148static void analyze_sbs(mddev_t * mddev)
3309{ 3149{
3310 int i; 3150 int i;
3311 struct md_rdev *rdev, *freshest, *tmp; 3151 mdk_rdev_t *rdev, *freshest, *tmp;
3312 char b[BDEVNAME_SIZE]; 3152 char b[BDEVNAME_SIZE];
3313 3153
3314 freshest = NULL; 3154 freshest = NULL;
3315 rdev_for_each_safe(rdev, tmp, mddev) 3155 rdev_for_each(rdev, tmp, mddev)
3316 switch (super_types[mddev->major_version]. 3156 switch (super_types[mddev->major_version].
3317 load_super(rdev, freshest, mddev->minor_version)) { 3157 load_super(rdev, freshest, mddev->minor_version)) {
3318 case 1: 3158 case 1:
@@ -3333,7 +3173,7 @@ static void analyze_sbs(struct mddev * mddev)
3333 validate_super(mddev, freshest); 3173 validate_super(mddev, freshest);
3334 3174
3335 i = 0; 3175 i = 0;
3336 rdev_for_each_safe(rdev, tmp, mddev) { 3176 rdev_for_each(rdev, tmp, mddev) {
3337 if (mddev->max_disks && 3177 if (mddev->max_disks &&
3338 (rdev->desc_nr >= mddev->max_disks || 3178 (rdev->desc_nr >= mddev->max_disks ||
3339 i > mddev->max_disks)) { 3179 i > mddev->max_disks)) {
@@ -3408,13 +3248,13 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3408static void md_safemode_timeout(unsigned long data); 3248static void md_safemode_timeout(unsigned long data);
3409 3249
3410static ssize_t 3250static ssize_t
3411safe_delay_show(struct mddev *mddev, char *page) 3251safe_delay_show(mddev_t *mddev, char *page)
3412{ 3252{
3413 int msec = (mddev->safemode_delay*1000)/HZ; 3253 int msec = (mddev->safemode_delay*1000)/HZ;
3414 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 3254 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
3415} 3255}
3416static ssize_t 3256static ssize_t
3417safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) 3257safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
3418{ 3258{
3419 unsigned long msec; 3259 unsigned long msec;
3420 3260
@@ -3436,9 +3276,9 @@ static struct md_sysfs_entry md_safe_delay =
3436__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 3276__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
3437 3277
3438static ssize_t 3278static ssize_t
3439level_show(struct mddev *mddev, char *page) 3279level_show(mddev_t *mddev, char *page)
3440{ 3280{
3441 struct md_personality *p = mddev->pers; 3281 struct mdk_personality *p = mddev->pers;
3442 if (p) 3282 if (p)
3443 return sprintf(page, "%s\n", p->name); 3283 return sprintf(page, "%s\n", p->name);
3444 else if (mddev->clevel[0]) 3284 else if (mddev->clevel[0])
@@ -3450,14 +3290,14 @@ level_show(struct mddev *mddev, char *page)
3450} 3290}
3451 3291
3452static ssize_t 3292static ssize_t
3453level_store(struct mddev *mddev, const char *buf, size_t len) 3293level_store(mddev_t *mddev, const char *buf, size_t len)
3454{ 3294{
3455 char clevel[16]; 3295 char clevel[16];
3456 ssize_t rv = len; 3296 ssize_t rv = len;
3457 struct md_personality *pers; 3297 struct mdk_personality *pers;
3458 long level; 3298 long level;
3459 void *priv; 3299 void *priv;
3460 struct md_rdev *rdev; 3300 mdk_rdev_t *rdev;
3461 3301
3462 if (mddev->pers == NULL) { 3302 if (mddev->pers == NULL) {
3463 if (len == 0) 3303 if (len == 0)
@@ -3522,7 +3362,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3522 return -EINVAL; 3362 return -EINVAL;
3523 } 3363 }
3524 3364
3525 rdev_for_each(rdev, mddev) 3365 list_for_each_entry(rdev, &mddev->disks, same_set)
3526 rdev->new_raid_disk = rdev->raid_disk; 3366 rdev->new_raid_disk = rdev->raid_disk;
3527 3367
3528 /* ->takeover must set new_* and/or delta_disks 3368 /* ->takeover must set new_* and/or delta_disks
@@ -3535,7 +3375,6 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3535 mddev->new_chunk_sectors = mddev->chunk_sectors; 3375 mddev->new_chunk_sectors = mddev->chunk_sectors;
3536 mddev->raid_disks -= mddev->delta_disks; 3376 mddev->raid_disks -= mddev->delta_disks;
3537 mddev->delta_disks = 0; 3377 mddev->delta_disks = 0;
3538 mddev->reshape_backwards = 0;
3539 module_put(pers->owner); 3378 module_put(pers->owner);
3540 printk(KERN_WARNING "md: %s: %s would not accept array\n", 3379 printk(KERN_WARNING "md: %s: %s would not accept array\n",
3541 mdname(mddev), clevel); 3380 mdname(mddev), clevel);
@@ -3576,7 +3415,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3576 mddev->safemode = 0; 3415 mddev->safemode = 0;
3577 } 3416 }
3578 3417
3579 rdev_for_each(rdev, mddev) { 3418 list_for_each_entry(rdev, &mddev->disks, same_set) {
3580 if (rdev->raid_disk < 0) 3419 if (rdev->raid_disk < 0)
3581 continue; 3420 continue;
3582 if (rdev->new_raid_disk >= mddev->raid_disks) 3421 if (rdev->new_raid_disk >= mddev->raid_disks)
@@ -3585,7 +3424,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3585 continue; 3424 continue;
3586 sysfs_unlink_rdev(mddev, rdev); 3425 sysfs_unlink_rdev(mddev, rdev);
3587 } 3426 }
3588 rdev_for_each(rdev, mddev) { 3427 list_for_each_entry(rdev, &mddev->disks, same_set) {
3589 if (rdev->raid_disk < 0) 3428 if (rdev->raid_disk < 0)
3590 continue; 3429 continue;
3591 if (rdev->new_raid_disk == rdev->raid_disk) 3430 if (rdev->new_raid_disk == rdev->raid_disk)
@@ -3609,7 +3448,6 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3609 mddev->layout = mddev->new_layout; 3448 mddev->layout = mddev->new_layout;
3610 mddev->chunk_sectors = mddev->new_chunk_sectors; 3449 mddev->chunk_sectors = mddev->new_chunk_sectors;
3611 mddev->delta_disks = 0; 3450 mddev->delta_disks = 0;
3612 mddev->reshape_backwards = 0;
3613 mddev->degraded = 0; 3451 mddev->degraded = 0;
3614 if (mddev->pers->sync_request == NULL) { 3452 if (mddev->pers->sync_request == NULL) {
3615 /* this is now an array without redundancy, so 3453 /* this is now an array without redundancy, so
@@ -3619,8 +3457,10 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3619 del_timer_sync(&mddev->safemode_timer); 3457 del_timer_sync(&mddev->safemode_timer);
3620 } 3458 }
3621 pers->run(mddev); 3459 pers->run(mddev);
3622 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3623 mddev_resume(mddev); 3460 mddev_resume(mddev);
3461 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3462 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3463 md_wakeup_thread(mddev->thread);
3624 sysfs_notify(&mddev->kobj, NULL, "level"); 3464 sysfs_notify(&mddev->kobj, NULL, "level");
3625 md_new_event(mddev); 3465 md_new_event(mddev);
3626 return rv; 3466 return rv;
@@ -3631,7 +3471,7 @@ __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
3631 3471
3632 3472
3633static ssize_t 3473static ssize_t
3634layout_show(struct mddev *mddev, char *page) 3474layout_show(mddev_t *mddev, char *page)
3635{ 3475{
3636 /* just a number, not meaningful for all levels */ 3476 /* just a number, not meaningful for all levels */
3637 if (mddev->reshape_position != MaxSector && 3477 if (mddev->reshape_position != MaxSector &&
@@ -3642,7 +3482,7 @@ layout_show(struct mddev *mddev, char *page)
3642} 3482}
3643 3483
3644static ssize_t 3484static ssize_t
3645layout_store(struct mddev *mddev, const char *buf, size_t len) 3485layout_store(mddev_t *mddev, const char *buf, size_t len)
3646{ 3486{
3647 char *e; 3487 char *e;
3648 unsigned long n = simple_strtoul(buf, &e, 10); 3488 unsigned long n = simple_strtoul(buf, &e, 10);
@@ -3672,7 +3512,7 @@ __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
3672 3512
3673 3513
3674static ssize_t 3514static ssize_t
3675raid_disks_show(struct mddev *mddev, char *page) 3515raid_disks_show(mddev_t *mddev, char *page)
3676{ 3516{
3677 if (mddev->raid_disks == 0) 3517 if (mddev->raid_disks == 0)
3678 return 0; 3518 return 0;
@@ -3683,10 +3523,10 @@ raid_disks_show(struct mddev *mddev, char *page)
3683 return sprintf(page, "%d\n", mddev->raid_disks); 3523 return sprintf(page, "%d\n", mddev->raid_disks);
3684} 3524}
3685 3525
3686static int update_raid_disks(struct mddev *mddev, int raid_disks); 3526static int update_raid_disks(mddev_t *mddev, int raid_disks);
3687 3527
3688static ssize_t 3528static ssize_t
3689raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 3529raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
3690{ 3530{
3691 char *e; 3531 char *e;
3692 int rv = 0; 3532 int rv = 0;
@@ -3698,20 +3538,9 @@ raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
3698 if (mddev->pers) 3538 if (mddev->pers)
3699 rv = update_raid_disks(mddev, n); 3539 rv = update_raid_disks(mddev, n);
3700 else if (mddev->reshape_position != MaxSector) { 3540 else if (mddev->reshape_position != MaxSector) {
3701 struct md_rdev *rdev;
3702 int olddisks = mddev->raid_disks - mddev->delta_disks; 3541 int olddisks = mddev->raid_disks - mddev->delta_disks;
3703
3704 rdev_for_each(rdev, mddev) {
3705 if (olddisks < n &&
3706 rdev->data_offset < rdev->new_data_offset)
3707 return -EINVAL;
3708 if (olddisks > n &&
3709 rdev->data_offset > rdev->new_data_offset)
3710 return -EINVAL;
3711 }
3712 mddev->delta_disks = n - olddisks; 3542 mddev->delta_disks = n - olddisks;
3713 mddev->raid_disks = n; 3543 mddev->raid_disks = n;
3714 mddev->reshape_backwards = (mddev->delta_disks < 0);
3715 } else 3544 } else
3716 mddev->raid_disks = n; 3545 mddev->raid_disks = n;
3717 return rv ? rv : len; 3546 return rv ? rv : len;
@@ -3720,7 +3549,7 @@ static struct md_sysfs_entry md_raid_disks =
3720__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 3549__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
3721 3550
3722static ssize_t 3551static ssize_t
3723chunk_size_show(struct mddev *mddev, char *page) 3552chunk_size_show(mddev_t *mddev, char *page)
3724{ 3553{
3725 if (mddev->reshape_position != MaxSector && 3554 if (mddev->reshape_position != MaxSector &&
3726 mddev->chunk_sectors != mddev->new_chunk_sectors) 3555 mddev->chunk_sectors != mddev->new_chunk_sectors)
@@ -3731,7 +3560,7 @@ chunk_size_show(struct mddev *mddev, char *page)
3731} 3560}
3732 3561
3733static ssize_t 3562static ssize_t
3734chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 3563chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
3735{ 3564{
3736 char *e; 3565 char *e;
3737 unsigned long n = simple_strtoul(buf, &e, 10); 3566 unsigned long n = simple_strtoul(buf, &e, 10);
@@ -3760,7 +3589,7 @@ static struct md_sysfs_entry md_chunk_size =
3760__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 3589__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
3761 3590
3762static ssize_t 3591static ssize_t
3763resync_start_show(struct mddev *mddev, char *page) 3592resync_start_show(mddev_t *mddev, char *page)
3764{ 3593{
3765 if (mddev->recovery_cp == MaxSector) 3594 if (mddev->recovery_cp == MaxSector)
3766 return sprintf(page, "none\n"); 3595 return sprintf(page, "none\n");
@@ -3768,7 +3597,7 @@ resync_start_show(struct mddev *mddev, char *page)
3768} 3597}
3769 3598
3770static ssize_t 3599static ssize_t
3771resync_start_store(struct mddev *mddev, const char *buf, size_t len) 3600resync_start_store(mddev_t *mddev, const char *buf, size_t len)
3772{ 3601{
3773 char *e; 3602 char *e;
3774 unsigned long long n = simple_strtoull(buf, &e, 10); 3603 unsigned long long n = simple_strtoull(buf, &e, 10);
@@ -3781,8 +3610,6 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len)
3781 return -EINVAL; 3610 return -EINVAL;
3782 3611
3783 mddev->recovery_cp = n; 3612 mddev->recovery_cp = n;
3784 if (mddev->pers)
3785 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
3786 return len; 3613 return len;
3787} 3614}
3788static struct md_sysfs_entry md_resync_start = 3615static struct md_sysfs_entry md_resync_start =
@@ -3840,7 +3667,7 @@ static int match_word(const char *word, char **list)
3840} 3667}
3841 3668
3842static ssize_t 3669static ssize_t
3843array_state_show(struct mddev *mddev, char *page) 3670array_state_show(mddev_t *mddev, char *page)
3844{ 3671{
3845 enum array_state st = inactive; 3672 enum array_state st = inactive;
3846 3673
@@ -3873,13 +3700,13 @@ array_state_show(struct mddev *mddev, char *page)
3873 return sprintf(page, "%s\n", array_states[st]); 3700 return sprintf(page, "%s\n", array_states[st]);
3874} 3701}
3875 3702
3876static int do_md_stop(struct mddev * mddev, int ro, struct block_device *bdev); 3703static int do_md_stop(mddev_t * mddev, int ro, int is_open);
3877static int md_set_readonly(struct mddev * mddev, struct block_device *bdev); 3704static int md_set_readonly(mddev_t * mddev, int is_open);
3878static int do_md_run(struct mddev * mddev); 3705static int do_md_run(mddev_t * mddev);
3879static int restart_array(struct mddev *mddev); 3706static int restart_array(mddev_t *mddev);
3880 3707
3881static ssize_t 3708static ssize_t
3882array_state_store(struct mddev *mddev, const char *buf, size_t len) 3709array_state_store(mddev_t *mddev, const char *buf, size_t len)
3883{ 3710{
3884 int err = -EINVAL; 3711 int err = -EINVAL;
3885 enum array_state st = match_word(buf, array_states); 3712 enum array_state st = match_word(buf, array_states);
@@ -3888,20 +3715,24 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
3888 break; 3715 break;
3889 case clear: 3716 case clear:
3890 /* stopping an active array */ 3717 /* stopping an active array */
3891 err = do_md_stop(mddev, 0, NULL); 3718 if (atomic_read(&mddev->openers) > 0)
3719 return -EBUSY;
3720 err = do_md_stop(mddev, 0, 0);
3892 break; 3721 break;
3893 case inactive: 3722 case inactive:
3894 /* stopping an active array */ 3723 /* stopping an active array */
3895 if (mddev->pers) 3724 if (mddev->pers) {
3896 err = do_md_stop(mddev, 2, NULL); 3725 if (atomic_read(&mddev->openers) > 0)
3897 else 3726 return -EBUSY;
3727 err = do_md_stop(mddev, 2, 0);
3728 } else
3898 err = 0; /* already inactive */ 3729 err = 0; /* already inactive */
3899 break; 3730 break;
3900 case suspended: 3731 case suspended:
3901 break; /* not supported yet */ 3732 break; /* not supported yet */
3902 case readonly: 3733 case readonly:
3903 if (mddev->pers) 3734 if (mddev->pers)
3904 err = md_set_readonly(mddev, NULL); 3735 err = md_set_readonly(mddev, 0);
3905 else { 3736 else {
3906 mddev->ro = 1; 3737 mddev->ro = 1;
3907 set_disk_ro(mddev->gendisk, 1); 3738 set_disk_ro(mddev->gendisk, 1);
@@ -3911,7 +3742,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
3911 case read_auto: 3742 case read_auto:
3912 if (mddev->pers) { 3743 if (mddev->pers) {
3913 if (mddev->ro == 0) 3744 if (mddev->ro == 0)
3914 err = md_set_readonly(mddev, NULL); 3745 err = md_set_readonly(mddev, 0);
3915 else if (mddev->ro == 1) 3746 else if (mddev->ro == 1)
3916 err = restart_array(mddev); 3747 err = restart_array(mddev);
3917 if (err == 0) { 3748 if (err == 0) {
@@ -3961,8 +3792,6 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
3961 if (err) 3792 if (err)
3962 return err; 3793 return err;
3963 else { 3794 else {
3964 if (mddev->hold_active == UNTIL_IOCTL)
3965 mddev->hold_active = 0;
3966 sysfs_notify_dirent_safe(mddev->sysfs_state); 3795 sysfs_notify_dirent_safe(mddev->sysfs_state);
3967 return len; 3796 return len;
3968 } 3797 }
@@ -3971,13 +3800,13 @@ static struct md_sysfs_entry md_array_state =
3971__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 3800__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
3972 3801
3973static ssize_t 3802static ssize_t
3974max_corrected_read_errors_show(struct mddev *mddev, char *page) { 3803max_corrected_read_errors_show(mddev_t *mddev, char *page) {
3975 return sprintf(page, "%d\n", 3804 return sprintf(page, "%d\n",
3976 atomic_read(&mddev->max_corr_read_errors)); 3805 atomic_read(&mddev->max_corr_read_errors));
3977} 3806}
3978 3807
3979static ssize_t 3808static ssize_t
3980max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 3809max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len)
3981{ 3810{
3982 char *e; 3811 char *e;
3983 unsigned long n = simple_strtoul(buf, &e, 10); 3812 unsigned long n = simple_strtoul(buf, &e, 10);
@@ -3994,13 +3823,13 @@ __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
3994 max_corrected_read_errors_store); 3823 max_corrected_read_errors_store);
3995 3824
3996static ssize_t 3825static ssize_t
3997null_show(struct mddev *mddev, char *page) 3826null_show(mddev_t *mddev, char *page)
3998{ 3827{
3999 return -EINVAL; 3828 return -EINVAL;
4000} 3829}
4001 3830
4002static ssize_t 3831static ssize_t
4003new_dev_store(struct mddev *mddev, const char *buf, size_t len) 3832new_dev_store(mddev_t *mddev, const char *buf, size_t len)
4004{ 3833{
4005 /* buf must be %d:%d\n? giving major and minor numbers */ 3834 /* buf must be %d:%d\n? giving major and minor numbers */
4006 /* The new device is added to the array. 3835 /* The new device is added to the array.
@@ -4013,7 +3842,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
4013 int major = simple_strtoul(buf, &e, 10); 3842 int major = simple_strtoul(buf, &e, 10);
4014 int minor; 3843 int minor;
4015 dev_t dev; 3844 dev_t dev;
4016 struct md_rdev *rdev; 3845 mdk_rdev_t *rdev;
4017 int err; 3846 int err;
4018 3847
4019 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 3848 if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
@@ -4031,9 +3860,8 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
4031 rdev = md_import_device(dev, mddev->major_version, 3860 rdev = md_import_device(dev, mddev->major_version,
4032 mddev->minor_version); 3861 mddev->minor_version);
4033 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 3862 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4034 struct md_rdev *rdev0 3863 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
4035 = list_entry(mddev->disks.next, 3864 mdk_rdev_t, same_set);
4036 struct md_rdev, same_set);
4037 err = super_types[mddev->major_version] 3865 err = super_types[mddev->major_version]
4038 .load_super(rdev, rdev0, mddev->minor_version); 3866 .load_super(rdev, rdev0, mddev->minor_version);
4039 if (err < 0) 3867 if (err < 0)
@@ -4057,7 +3885,7 @@ static struct md_sysfs_entry md_new_device =
4057__ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 3885__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
4058 3886
4059static ssize_t 3887static ssize_t
4060bitmap_store(struct mddev *mddev, const char *buf, size_t len) 3888bitmap_store(mddev_t *mddev, const char *buf, size_t len)
4061{ 3889{
4062 char *end; 3890 char *end;
4063 unsigned long chunk, end_chunk; 3891 unsigned long chunk, end_chunk;
@@ -4086,16 +3914,16 @@ static struct md_sysfs_entry md_bitmap =
4086__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 3914__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
4087 3915
4088static ssize_t 3916static ssize_t
4089size_show(struct mddev *mddev, char *page) 3917size_show(mddev_t *mddev, char *page)
4090{ 3918{
4091 return sprintf(page, "%llu\n", 3919 return sprintf(page, "%llu\n",
4092 (unsigned long long)mddev->dev_sectors / 2); 3920 (unsigned long long)mddev->dev_sectors / 2);
4093} 3921}
4094 3922
4095static int update_size(struct mddev *mddev, sector_t num_sectors); 3923static int update_size(mddev_t *mddev, sector_t num_sectors);
4096 3924
4097static ssize_t 3925static ssize_t
4098size_store(struct mddev *mddev, const char *buf, size_t len) 3926size_store(mddev_t *mddev, const char *buf, size_t len)
4099{ 3927{
4100 /* If array is inactive, we can reduce the component size, but 3928 /* If array is inactive, we can reduce the component size, but
4101 * not increase it (except from 0). 3929 * not increase it (except from 0).
@@ -4123,14 +3951,14 @@ static struct md_sysfs_entry md_size =
4123__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 3951__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
4124 3952
4125 3953
4126/* Metadata version. 3954/* Metdata version.
4127 * This is one of 3955 * This is one of
4128 * 'none' for arrays with no metadata (good luck...) 3956 * 'none' for arrays with no metadata (good luck...)
4129 * 'external' for arrays with externally managed metadata, 3957 * 'external' for arrays with externally managed metadata,
4130 * or N.M for internally known formats 3958 * or N.M for internally known formats
4131 */ 3959 */
4132static ssize_t 3960static ssize_t
4133metadata_show(struct mddev *mddev, char *page) 3961metadata_show(mddev_t *mddev, char *page)
4134{ 3962{
4135 if (mddev->persistent) 3963 if (mddev->persistent)
4136 return sprintf(page, "%d.%d\n", 3964 return sprintf(page, "%d.%d\n",
@@ -4142,7 +3970,7 @@ metadata_show(struct mddev *mddev, char *page)
4142} 3970}
4143 3971
4144static ssize_t 3972static ssize_t
4145metadata_store(struct mddev *mddev, const char *buf, size_t len) 3973metadata_store(mddev_t *mddev, const char *buf, size_t len)
4146{ 3974{
4147 int major, minor; 3975 int major, minor;
4148 char *e; 3976 char *e;
@@ -4196,7 +4024,7 @@ static struct md_sysfs_entry md_metadata =
4196__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 4024__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
4197 4025
4198static ssize_t 4026static ssize_t
4199action_show(struct mddev *mddev, char *page) 4027action_show(mddev_t *mddev, char *page)
4200{ 4028{
4201 char *type = "idle"; 4029 char *type = "idle";
4202 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 4030 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
@@ -4218,10 +4046,10 @@ action_show(struct mddev *mddev, char *page)
4218 return sprintf(page, "%s\n", type); 4046 return sprintf(page, "%s\n", type);
4219} 4047}
4220 4048
4221static void reap_sync_thread(struct mddev *mddev); 4049static void reap_sync_thread(mddev_t *mddev);
4222 4050
4223static ssize_t 4051static ssize_t
4224action_store(struct mddev *mddev, const char *page, size_t len) 4052action_store(mddev_t *mddev, const char *page, size_t len)
4225{ 4053{
4226 if (!mddev->pers || !mddev->pers->sync_request) 4054 if (!mddev->pers || !mddev->pers->sync_request)
4227 return -EINVAL; 4055 return -EINVAL;
@@ -4260,13 +4088,6 @@ action_store(struct mddev *mddev, const char *page, size_t len)
4260 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 4088 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
4261 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4089 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4262 } 4090 }
4263 if (mddev->ro == 2) {
4264 /* A write to sync_action is enough to justify
4265 * canceling read-auto mode
4266 */
4267 mddev->ro = 0;
4268 md_wakeup_thread(mddev->sync_thread);
4269 }
4270 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4091 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4271 md_wakeup_thread(mddev->thread); 4092 md_wakeup_thread(mddev->thread);
4272 sysfs_notify_dirent_safe(mddev->sysfs_action); 4093 sysfs_notify_dirent_safe(mddev->sysfs_action);
@@ -4274,11 +4095,10 @@ action_store(struct mddev *mddev, const char *page, size_t len)
4274} 4095}
4275 4096
4276static ssize_t 4097static ssize_t
4277mismatch_cnt_show(struct mddev *mddev, char *page) 4098mismatch_cnt_show(mddev_t *mddev, char *page)
4278{ 4099{
4279 return sprintf(page, "%llu\n", 4100 return sprintf(page, "%llu\n",
4280 (unsigned long long) 4101 (unsigned long long) mddev->resync_mismatches);
4281 atomic64_read(&mddev->resync_mismatches));
4282} 4102}
4283 4103
4284static struct md_sysfs_entry md_scan_mode = 4104static struct md_sysfs_entry md_scan_mode =
@@ -4288,14 +4108,14 @@ __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
4288static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 4108static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
4289 4109
4290static ssize_t 4110static ssize_t
4291sync_min_show(struct mddev *mddev, char *page) 4111sync_min_show(mddev_t *mddev, char *page)
4292{ 4112{
4293 return sprintf(page, "%d (%s)\n", speed_min(mddev), 4113 return sprintf(page, "%d (%s)\n", speed_min(mddev),
4294 mddev->sync_speed_min ? "local": "system"); 4114 mddev->sync_speed_min ? "local": "system");
4295} 4115}
4296 4116
4297static ssize_t 4117static ssize_t
4298sync_min_store(struct mddev *mddev, const char *buf, size_t len) 4118sync_min_store(mddev_t *mddev, const char *buf, size_t len)
4299{ 4119{
4300 int min; 4120 int min;
4301 char *e; 4121 char *e;
@@ -4314,14 +4134,14 @@ static struct md_sysfs_entry md_sync_min =
4314__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 4134__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
4315 4135
4316static ssize_t 4136static ssize_t
4317sync_max_show(struct mddev *mddev, char *page) 4137sync_max_show(mddev_t *mddev, char *page)
4318{ 4138{
4319 return sprintf(page, "%d (%s)\n", speed_max(mddev), 4139 return sprintf(page, "%d (%s)\n", speed_max(mddev),
4320 mddev->sync_speed_max ? "local": "system"); 4140 mddev->sync_speed_max ? "local": "system");
4321} 4141}
4322 4142
4323static ssize_t 4143static ssize_t
4324sync_max_store(struct mddev *mddev, const char *buf, size_t len) 4144sync_max_store(mddev_t *mddev, const char *buf, size_t len)
4325{ 4145{
4326 int max; 4146 int max;
4327 char *e; 4147 char *e;
@@ -4340,20 +4160,20 @@ static struct md_sysfs_entry md_sync_max =
4340__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 4160__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
4341 4161
4342static ssize_t 4162static ssize_t
4343degraded_show(struct mddev *mddev, char *page) 4163degraded_show(mddev_t *mddev, char *page)
4344{ 4164{
4345 return sprintf(page, "%d\n", mddev->degraded); 4165 return sprintf(page, "%d\n", mddev->degraded);
4346} 4166}
4347static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 4167static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
4348 4168
4349static ssize_t 4169static ssize_t
4350sync_force_parallel_show(struct mddev *mddev, char *page) 4170sync_force_parallel_show(mddev_t *mddev, char *page)
4351{ 4171{
4352 return sprintf(page, "%d\n", mddev->parallel_resync); 4172 return sprintf(page, "%d\n", mddev->parallel_resync);
4353} 4173}
4354 4174
4355static ssize_t 4175static ssize_t
4356sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 4176sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len)
4357{ 4177{
4358 long n; 4178 long n;
4359 4179
@@ -4377,7 +4197,7 @@ __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
4377 sync_force_parallel_show, sync_force_parallel_store); 4197 sync_force_parallel_show, sync_force_parallel_store);
4378 4198
4379static ssize_t 4199static ssize_t
4380sync_speed_show(struct mddev *mddev, char *page) 4200sync_speed_show(mddev_t *mddev, char *page)
4381{ 4201{
4382 unsigned long resync, dt, db; 4202 unsigned long resync, dt, db;
4383 if (mddev->curr_resync == 0) 4203 if (mddev->curr_resync == 0)
@@ -4392,19 +4212,14 @@ sync_speed_show(struct mddev *mddev, char *page)
4392static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 4212static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
4393 4213
4394static ssize_t 4214static ssize_t
4395sync_completed_show(struct mddev *mddev, char *page) 4215sync_completed_show(mddev_t *mddev, char *page)
4396{ 4216{
4397 unsigned long long max_sectors, resync; 4217 unsigned long long max_sectors, resync;
4398 4218
4399 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4219 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4400 return sprintf(page, "none\n"); 4220 return sprintf(page, "none\n");
4401 4221
4402 if (mddev->curr_resync == 1 || 4222 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
4403 mddev->curr_resync == 2)
4404 return sprintf(page, "delayed\n");
4405
4406 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
4407 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4408 max_sectors = mddev->resync_max_sectors; 4223 max_sectors = mddev->resync_max_sectors;
4409 else 4224 else
4410 max_sectors = mddev->dev_sectors; 4225 max_sectors = mddev->dev_sectors;
@@ -4416,13 +4231,13 @@ sync_completed_show(struct mddev *mddev, char *page)
4416static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 4231static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
4417 4232
4418static ssize_t 4233static ssize_t
4419min_sync_show(struct mddev *mddev, char *page) 4234min_sync_show(mddev_t *mddev, char *page)
4420{ 4235{
4421 return sprintf(page, "%llu\n", 4236 return sprintf(page, "%llu\n",
4422 (unsigned long long)mddev->resync_min); 4237 (unsigned long long)mddev->resync_min);
4423} 4238}
4424static ssize_t 4239static ssize_t
4425min_sync_store(struct mddev *mddev, const char *buf, size_t len) 4240min_sync_store(mddev_t *mddev, const char *buf, size_t len)
4426{ 4241{
4427 unsigned long long min; 4242 unsigned long long min;
4428 if (strict_strtoull(buf, 10, &min)) 4243 if (strict_strtoull(buf, 10, &min))
@@ -4447,7 +4262,7 @@ static struct md_sysfs_entry md_min_sync =
4447__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 4262__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
4448 4263
4449static ssize_t 4264static ssize_t
4450max_sync_show(struct mddev *mddev, char *page) 4265max_sync_show(mddev_t *mddev, char *page)
4451{ 4266{
4452 if (mddev->resync_max == MaxSector) 4267 if (mddev->resync_max == MaxSector)
4453 return sprintf(page, "max\n"); 4268 return sprintf(page, "max\n");
@@ -4456,7 +4271,7 @@ max_sync_show(struct mddev *mddev, char *page)
4456 (unsigned long long)mddev->resync_max); 4271 (unsigned long long)mddev->resync_max);
4457} 4272}
4458static ssize_t 4273static ssize_t
4459max_sync_store(struct mddev *mddev, const char *buf, size_t len) 4274max_sync_store(mddev_t *mddev, const char *buf, size_t len)
4460{ 4275{
4461 if (strncmp(buf, "max", 3) == 0) 4276 if (strncmp(buf, "max", 3) == 0)
4462 mddev->resync_max = MaxSector; 4277 mddev->resync_max = MaxSector;
@@ -4487,13 +4302,13 @@ static struct md_sysfs_entry md_max_sync =
4487__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 4302__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
4488 4303
4489static ssize_t 4304static ssize_t
4490suspend_lo_show(struct mddev *mddev, char *page) 4305suspend_lo_show(mddev_t *mddev, char *page)
4491{ 4306{
4492 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 4307 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
4493} 4308}
4494 4309
4495static ssize_t 4310static ssize_t
4496suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 4311suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
4497{ 4312{
4498 char *e; 4313 char *e;
4499 unsigned long long new = simple_strtoull(buf, &e, 10); 4314 unsigned long long new = simple_strtoull(buf, &e, 10);
@@ -4521,13 +4336,13 @@ __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
4521 4336
4522 4337
4523static ssize_t 4338static ssize_t
4524suspend_hi_show(struct mddev *mddev, char *page) 4339suspend_hi_show(mddev_t *mddev, char *page)
4525{ 4340{
4526 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 4341 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
4527} 4342}
4528 4343
4529static ssize_t 4344static ssize_t
4530suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 4345suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
4531{ 4346{
4532 char *e; 4347 char *e;
4533 unsigned long long new = simple_strtoull(buf, &e, 10); 4348 unsigned long long new = simple_strtoull(buf, &e, 10);
@@ -4554,7 +4369,7 @@ static struct md_sysfs_entry md_suspend_hi =
4554__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 4369__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
4555 4370
4556static ssize_t 4371static ssize_t
4557reshape_position_show(struct mddev *mddev, char *page) 4372reshape_position_show(mddev_t *mddev, char *page)
4558{ 4373{
4559 if (mddev->reshape_position != MaxSector) 4374 if (mddev->reshape_position != MaxSector)
4560 return sprintf(page, "%llu\n", 4375 return sprintf(page, "%llu\n",
@@ -4564,9 +4379,8 @@ reshape_position_show(struct mddev *mddev, char *page)
4564} 4379}
4565 4380
4566static ssize_t 4381static ssize_t
4567reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 4382reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
4568{ 4383{
4569 struct md_rdev *rdev;
4570 char *e; 4384 char *e;
4571 unsigned long long new = simple_strtoull(buf, &e, 10); 4385 unsigned long long new = simple_strtoull(buf, &e, 10);
4572 if (mddev->pers) 4386 if (mddev->pers)
@@ -4575,12 +4389,9 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
4575 return -EINVAL; 4389 return -EINVAL;
4576 mddev->reshape_position = new; 4390 mddev->reshape_position = new;
4577 mddev->delta_disks = 0; 4391 mddev->delta_disks = 0;
4578 mddev->reshape_backwards = 0;
4579 mddev->new_level = mddev->level; 4392 mddev->new_level = mddev->level;
4580 mddev->new_layout = mddev->layout; 4393 mddev->new_layout = mddev->layout;
4581 mddev->new_chunk_sectors = mddev->chunk_sectors; 4394 mddev->new_chunk_sectors = mddev->chunk_sectors;
4582 rdev_for_each(rdev, mddev)
4583 rdev->new_data_offset = rdev->data_offset;
4584 return len; 4395 return len;
4585} 4396}
4586 4397
@@ -4589,43 +4400,7 @@ __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
4589 reshape_position_store); 4400 reshape_position_store);
4590 4401
4591static ssize_t 4402static ssize_t
4592reshape_direction_show(struct mddev *mddev, char *page) 4403array_size_show(mddev_t *mddev, char *page)
4593{
4594 return sprintf(page, "%s\n",
4595 mddev->reshape_backwards ? "backwards" : "forwards");
4596}
4597
4598static ssize_t
4599reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
4600{
4601 int backwards = 0;
4602 if (cmd_match(buf, "forwards"))
4603 backwards = 0;
4604 else if (cmd_match(buf, "backwards"))
4605 backwards = 1;
4606 else
4607 return -EINVAL;
4608 if (mddev->reshape_backwards == backwards)
4609 return len;
4610
4611 /* check if we are allowed to change */
4612 if (mddev->delta_disks)
4613 return -EBUSY;
4614
4615 if (mddev->persistent &&
4616 mddev->major_version == 0)
4617 return -EINVAL;
4618
4619 mddev->reshape_backwards = backwards;
4620 return len;
4621}
4622
4623static struct md_sysfs_entry md_reshape_direction =
4624__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
4625 reshape_direction_store);
4626
4627static ssize_t
4628array_size_show(struct mddev *mddev, char *page)
4629{ 4404{
4630 if (mddev->external_size) 4405 if (mddev->external_size)
4631 return sprintf(page, "%llu\n", 4406 return sprintf(page, "%llu\n",
@@ -4635,7 +4410,7 @@ array_size_show(struct mddev *mddev, char *page)
4635} 4410}
4636 4411
4637static ssize_t 4412static ssize_t
4638array_size_store(struct mddev *mddev, const char *buf, size_t len) 4413array_size_store(mddev_t *mddev, const char *buf, size_t len)
4639{ 4414{
4640 sector_t sectors; 4415 sector_t sectors;
4641 4416
@@ -4679,7 +4454,6 @@ static struct attribute *md_default_attrs[] = {
4679 &md_safe_delay.attr, 4454 &md_safe_delay.attr,
4680 &md_array_state.attr, 4455 &md_array_state.attr,
4681 &md_reshape_position.attr, 4456 &md_reshape_position.attr,
4682 &md_reshape_direction.attr,
4683 &md_array_size.attr, 4457 &md_array_size.attr,
4684 &max_corr_read_errors.attr, 4458 &max_corr_read_errors.attr,
4685 NULL, 4459 NULL,
@@ -4711,25 +4485,16 @@ static ssize_t
4711md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 4485md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
4712{ 4486{
4713 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 4487 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
4714 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 4488 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
4715 ssize_t rv; 4489 ssize_t rv;
4716 4490
4717 if (!entry->show) 4491 if (!entry->show)
4718 return -EIO; 4492 return -EIO;
4719 spin_lock(&all_mddevs_lock);
4720 if (list_empty(&mddev->all_mddevs)) {
4721 spin_unlock(&all_mddevs_lock);
4722 return -EBUSY;
4723 }
4724 mddev_get(mddev);
4725 spin_unlock(&all_mddevs_lock);
4726
4727 rv = mddev_lock(mddev); 4493 rv = mddev_lock(mddev);
4728 if (!rv) { 4494 if (!rv) {
4729 rv = entry->show(mddev, page); 4495 rv = entry->show(mddev, page);
4730 mddev_unlock(mddev); 4496 mddev_unlock(mddev);
4731 } 4497 }
4732 mddev_put(mddev);
4733 return rv; 4498 return rv;
4734} 4499}
4735 4500
@@ -4738,34 +4503,26 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
4738 const char *page, size_t length) 4503 const char *page, size_t length)
4739{ 4504{
4740 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 4505 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
4741 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 4506 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
4742 ssize_t rv; 4507 ssize_t rv;
4743 4508
4744 if (!entry->store) 4509 if (!entry->store)
4745 return -EIO; 4510 return -EIO;
4746 if (!capable(CAP_SYS_ADMIN)) 4511 if (!capable(CAP_SYS_ADMIN))
4747 return -EACCES; 4512 return -EACCES;
4748 spin_lock(&all_mddevs_lock);
4749 if (list_empty(&mddev->all_mddevs)) {
4750 spin_unlock(&all_mddevs_lock);
4751 return -EBUSY;
4752 }
4753 mddev_get(mddev);
4754 spin_unlock(&all_mddevs_lock);
4755 if (entry->store == new_dev_store)
4756 flush_workqueue(md_misc_wq);
4757 rv = mddev_lock(mddev); 4513 rv = mddev_lock(mddev);
4514 if (mddev->hold_active == UNTIL_IOCTL)
4515 mddev->hold_active = 0;
4758 if (!rv) { 4516 if (!rv) {
4759 rv = entry->store(mddev, page, length); 4517 rv = entry->store(mddev, page, length);
4760 mddev_unlock(mddev); 4518 mddev_unlock(mddev);
4761 } 4519 }
4762 mddev_put(mddev);
4763 return rv; 4520 return rv;
4764} 4521}
4765 4522
4766static void md_free(struct kobject *ko) 4523static void md_free(struct kobject *ko)
4767{ 4524{
4768 struct mddev *mddev = container_of(ko, struct mddev, kobj); 4525 mddev_t *mddev = container_of(ko, mddev_t, kobj);
4769 4526
4770 if (mddev->sysfs_state) 4527 if (mddev->sysfs_state)
4771 sysfs_put(mddev->sysfs_state); 4528 sysfs_put(mddev->sysfs_state);
@@ -4794,7 +4551,7 @@ int mdp_major = 0;
4794 4551
4795static void mddev_delayed_delete(struct work_struct *ws) 4552static void mddev_delayed_delete(struct work_struct *ws)
4796{ 4553{
4797 struct mddev *mddev = container_of(ws, struct mddev, del_work); 4554 mddev_t *mddev = container_of(ws, mddev_t, del_work);
4798 4555
4799 sysfs_remove_group(&mddev->kobj, &md_bitmap_group); 4556 sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
4800 kobject_del(&mddev->kobj); 4557 kobject_del(&mddev->kobj);
@@ -4804,7 +4561,7 @@ static void mddev_delayed_delete(struct work_struct *ws)
4804static int md_alloc(dev_t dev, char *name) 4561static int md_alloc(dev_t dev, char *name)
4805{ 4562{
4806 static DEFINE_MUTEX(disks_mutex); 4563 static DEFINE_MUTEX(disks_mutex);
4807 struct mddev *mddev = mddev_find(dev); 4564 mddev_t *mddev = mddev_find(dev);
4808 struct gendisk *disk; 4565 struct gendisk *disk;
4809 int partitioned; 4566 int partitioned;
4810 int shift; 4567 int shift;
@@ -4831,7 +4588,7 @@ static int md_alloc(dev_t dev, char *name)
4831 if (name) { 4588 if (name) {
4832 /* Need to ensure that 'name' is not a duplicate. 4589 /* Need to ensure that 'name' is not a duplicate.
4833 */ 4590 */
4834 struct mddev *mddev2; 4591 mddev_t *mddev2;
4835 spin_lock(&all_mddevs_lock); 4592 spin_lock(&all_mddevs_lock);
4836 4593
4837 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 4594 list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
@@ -4850,7 +4607,6 @@ static int md_alloc(dev_t dev, char *name)
4850 mddev->queue->queuedata = mddev; 4607 mddev->queue->queuedata = mddev;
4851 4608
4852 blk_queue_make_request(mddev->queue, md_make_request); 4609 blk_queue_make_request(mddev->queue, md_make_request);
4853 blk_set_stacking_limits(&mddev->queue->limits);
4854 4610
4855 disk = alloc_disk(1 << shift); 4611 disk = alloc_disk(1 << shift);
4856 if (!disk) { 4612 if (!disk) {
@@ -4933,7 +4689,7 @@ static int add_named_array(const char *val, struct kernel_param *kp)
4933 4689
4934static void md_safemode_timeout(unsigned long data) 4690static void md_safemode_timeout(unsigned long data)
4935{ 4691{
4936 struct mddev *mddev = (struct mddev *) data; 4692 mddev_t *mddev = (mddev_t *) data;
4937 4693
4938 if (!atomic_read(&mddev->writes_pending)) { 4694 if (!atomic_read(&mddev->writes_pending)) {
4939 mddev->safemode = 1; 4695 mddev->safemode = 1;
@@ -4945,11 +4701,11 @@ static void md_safemode_timeout(unsigned long data)
4945 4701
4946static int start_dirty_degraded; 4702static int start_dirty_degraded;
4947 4703
4948int md_run(struct mddev *mddev) 4704int md_run(mddev_t *mddev)
4949{ 4705{
4950 int err; 4706 int err;
4951 struct md_rdev *rdev; 4707 mdk_rdev_t *rdev;
4952 struct md_personality *pers; 4708 struct mdk_personality *pers;
4953 4709
4954 if (list_empty(&mddev->disks)) 4710 if (list_empty(&mddev->disks))
4955 /* cannot run an array with no devices.. */ 4711 /* cannot run an array with no devices.. */
@@ -4980,7 +4736,7 @@ int md_run(struct mddev *mddev)
4980 * the only valid external interface is through the md 4736 * the only valid external interface is through the md
4981 * device. 4737 * device.
4982 */ 4738 */
4983 rdev_for_each(rdev, mddev) { 4739 list_for_each_entry(rdev, &mddev->disks, same_set) {
4984 if (test_bit(Faulty, &rdev->flags)) 4740 if (test_bit(Faulty, &rdev->flags))
4985 continue; 4741 continue;
4986 sync_blockdev(rdev->bdev); 4742 sync_blockdev(rdev->bdev);
@@ -5012,7 +4768,8 @@ int md_run(struct mddev *mddev)
5012 } 4768 }
5013 4769
5014 if (mddev->bio_set == NULL) 4770 if (mddev->bio_set == NULL)
5015 mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); 4771 mddev->bio_set = bioset_create(BIO_POOL_SIZE,
4772 sizeof(mddev_t *));
5016 4773
5017 spin_lock(&pers_lock); 4774 spin_lock(&pers_lock);
5018 pers = find_pers(mddev->level, mddev->clevel); 4775 pers = find_pers(mddev->level, mddev->clevel);
@@ -5047,11 +4804,11 @@ int md_run(struct mddev *mddev)
5047 * configuration. 4804 * configuration.
5048 */ 4805 */
5049 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 4806 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5050 struct md_rdev *rdev2; 4807 mdk_rdev_t *rdev2;
5051 int warned = 0; 4808 int warned = 0;
5052 4809
5053 rdev_for_each(rdev, mddev) 4810 list_for_each_entry(rdev, &mddev->disks, same_set)
5054 rdev_for_each(rdev2, mddev) { 4811 list_for_each_entry(rdev2, &mddev->disks, same_set) {
5055 if (rdev < rdev2 && 4812 if (rdev < rdev2 &&
5056 rdev->bdev->bd_contains == 4813 rdev->bdev->bd_contains ==
5057 rdev2->bdev->bd_contains) { 4814 rdev2->bdev->bd_contains) {
@@ -5094,8 +4851,7 @@ int md_run(struct mddev *mddev)
5094 err = -EINVAL; 4851 err = -EINVAL;
5095 mddev->pers->stop(mddev); 4852 mddev->pers->stop(mddev);
5096 } 4853 }
5097 if (err == 0 && mddev->pers->sync_request && 4854 if (err == 0 && mddev->pers->sync_request) {
5098 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
5099 err = bitmap_create(mddev); 4855 err = bitmap_create(mddev);
5100 if (err) { 4856 if (err) {
5101 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 4857 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
@@ -5129,7 +4885,7 @@ int md_run(struct mddev *mddev)
5129 mddev->in_sync = 1; 4885 mddev->in_sync = 1;
5130 smp_wmb(); 4886 smp_wmb();
5131 mddev->ready = 1; 4887 mddev->ready = 1;
5132 rdev_for_each(rdev, mddev) 4888 list_for_each_entry(rdev, &mddev->disks, same_set)
5133 if (rdev->raid_disk >= 0) 4889 if (rdev->raid_disk >= 0)
5134 if (sysfs_link_rdev(mddev, rdev)) 4890 if (sysfs_link_rdev(mddev, rdev))
5135 /* failure here is OK */; 4891 /* failure here is OK */;
@@ -5147,7 +4903,7 @@ int md_run(struct mddev *mddev)
5147} 4903}
5148EXPORT_SYMBOL_GPL(md_run); 4904EXPORT_SYMBOL_GPL(md_run);
5149 4905
5150static int do_md_run(struct mddev *mddev) 4906static int do_md_run(mddev_t *mddev)
5151{ 4907{
5152 int err; 4908 int err;
5153 4909
@@ -5171,7 +4927,7 @@ out:
5171 return err; 4927 return err;
5172} 4928}
5173 4929
5174static int restart_array(struct mddev *mddev) 4930static int restart_array(mddev_t *mddev)
5175{ 4931{
5176 struct gendisk *disk = mddev->gendisk; 4932 struct gendisk *disk = mddev->gendisk;
5177 4933
@@ -5221,7 +4977,7 @@ void restore_bitmap_write_access(struct file *file)
5221 spin_unlock(&inode->i_lock); 4977 spin_unlock(&inode->i_lock);
5222} 4978}
5223 4979
5224static void md_clean(struct mddev *mddev) 4980static void md_clean(mddev_t *mddev)
5225{ 4981{
5226 mddev->array_sectors = 0; 4982 mddev->array_sectors = 0;
5227 mddev->external_size = 0; 4983 mddev->external_size = 0;
@@ -5245,12 +5001,11 @@ static void md_clean(struct mddev *mddev)
5245 mddev->events = 0; 5001 mddev->events = 0;
5246 mddev->can_decrease_events = 0; 5002 mddev->can_decrease_events = 0;
5247 mddev->delta_disks = 0; 5003 mddev->delta_disks = 0;
5248 mddev->reshape_backwards = 0;
5249 mddev->new_level = LEVEL_NONE; 5004 mddev->new_level = LEVEL_NONE;
5250 mddev->new_layout = 0; 5005 mddev->new_layout = 0;
5251 mddev->new_chunk_sectors = 0; 5006 mddev->new_chunk_sectors = 0;
5252 mddev->curr_resync = 0; 5007 mddev->curr_resync = 0;
5253 atomic64_set(&mddev->resync_mismatches, 0); 5008 mddev->resync_mismatches = 0;
5254 mddev->suspend_lo = mddev->suspend_hi = 0; 5009 mddev->suspend_lo = mddev->suspend_hi = 0;
5255 mddev->sync_speed_min = mddev->sync_speed_max = 0; 5010 mddev->sync_speed_min = mddev->sync_speed_max = 0;
5256 mddev->recovery = 0; 5011 mddev->recovery = 0;
@@ -5258,16 +5013,14 @@ static void md_clean(struct mddev *mddev)
5258 mddev->changed = 0; 5013 mddev->changed = 0;
5259 mddev->degraded = 0; 5014 mddev->degraded = 0;
5260 mddev->safemode = 0; 5015 mddev->safemode = 0;
5261 mddev->merge_check_needed = 0;
5262 mddev->bitmap_info.offset = 0; 5016 mddev->bitmap_info.offset = 0;
5263 mddev->bitmap_info.default_offset = 0; 5017 mddev->bitmap_info.default_offset = 0;
5264 mddev->bitmap_info.default_space = 0;
5265 mddev->bitmap_info.chunksize = 0; 5018 mddev->bitmap_info.chunksize = 0;
5266 mddev->bitmap_info.daemon_sleep = 0; 5019 mddev->bitmap_info.daemon_sleep = 0;
5267 mddev->bitmap_info.max_write_behind = 0; 5020 mddev->bitmap_info.max_write_behind = 0;
5268} 5021}
5269 5022
5270static void __md_stop_writes(struct mddev *mddev) 5023static void __md_stop_writes(mddev_t *mddev)
5271{ 5024{
5272 if (mddev->sync_thread) { 5025 if (mddev->sync_thread) {
5273 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5026 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@@ -5287,7 +5040,7 @@ static void __md_stop_writes(struct mddev *mddev)
5287 } 5040 }
5288} 5041}
5289 5042
5290void md_stop_writes(struct mddev *mddev) 5043void md_stop_writes(mddev_t *mddev)
5291{ 5044{
5292 mddev_lock(mddev); 5045 mddev_lock(mddev);
5293 __md_stop_writes(mddev); 5046 __md_stop_writes(mddev);
@@ -5295,7 +5048,7 @@ void md_stop_writes(struct mddev *mddev)
5295} 5048}
5296EXPORT_SYMBOL_GPL(md_stop_writes); 5049EXPORT_SYMBOL_GPL(md_stop_writes);
5297 5050
5298static void __md_stop(struct mddev *mddev) 5051void md_stop(mddev_t *mddev)
5299{ 5052{
5300 mddev->ready = 0; 5053 mddev->ready = 0;
5301 mddev->pers->stop(mddev); 5054 mddev->pers->stop(mddev);
@@ -5305,31 +5058,17 @@ static void __md_stop(struct mddev *mddev)
5305 mddev->pers = NULL; 5058 mddev->pers = NULL;
5306 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5059 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5307} 5060}
5308
5309void md_stop(struct mddev *mddev)
5310{
5311 /* stop the array and free an attached data structures.
5312 * This is called from dm-raid
5313 */
5314 __md_stop(mddev);
5315 bitmap_destroy(mddev);
5316 if (mddev->bio_set)
5317 bioset_free(mddev->bio_set);
5318}
5319
5320EXPORT_SYMBOL_GPL(md_stop); 5061EXPORT_SYMBOL_GPL(md_stop);
5321 5062
5322static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) 5063static int md_set_readonly(mddev_t *mddev, int is_open)
5323{ 5064{
5324 int err = 0; 5065 int err = 0;
5325 mutex_lock(&mddev->open_mutex); 5066 mutex_lock(&mddev->open_mutex);
5326 if (atomic_read(&mddev->openers) > !!bdev) { 5067 if (atomic_read(&mddev->openers) > is_open) {
5327 printk("md: %s still in use.\n",mdname(mddev)); 5068 printk("md: %s still in use.\n",mdname(mddev));
5328 err = -EBUSY; 5069 err = -EBUSY;
5329 goto out; 5070 goto out;
5330 } 5071 }
5331 if (bdev)
5332 sync_blockdev(bdev);
5333 if (mddev->pers) { 5072 if (mddev->pers) {
5334 __md_stop_writes(mddev); 5073 __md_stop_writes(mddev);
5335 5074
@@ -5351,40 +5090,32 @@ out:
5351 * 0 - completely stop and dis-assemble array 5090 * 0 - completely stop and dis-assemble array
5352 * 2 - stop but do not disassemble array 5091 * 2 - stop but do not disassemble array
5353 */ 5092 */
5354static int do_md_stop(struct mddev * mddev, int mode, 5093static int do_md_stop(mddev_t * mddev, int mode, int is_open)
5355 struct block_device *bdev)
5356{ 5094{
5357 struct gendisk *disk = mddev->gendisk; 5095 struct gendisk *disk = mddev->gendisk;
5358 struct md_rdev *rdev; 5096 mdk_rdev_t *rdev;
5359 5097
5360 mutex_lock(&mddev->open_mutex); 5098 mutex_lock(&mddev->open_mutex);
5361 if (atomic_read(&mddev->openers) > !!bdev || 5099 if (atomic_read(&mddev->openers) > is_open ||
5362 mddev->sysfs_active) { 5100 mddev->sysfs_active) {
5363 printk("md: %s still in use.\n",mdname(mddev)); 5101 printk("md: %s still in use.\n",mdname(mddev));
5364 mutex_unlock(&mddev->open_mutex); 5102 mutex_unlock(&mddev->open_mutex);
5365 return -EBUSY; 5103 return -EBUSY;
5366 } 5104 }
5367 if (bdev)
5368 /* It is possible IO was issued on some other
5369 * open file which was closed before we took ->open_mutex.
5370 * As that was not the last close __blkdev_put will not
5371 * have called sync_blockdev, so we must.
5372 */
5373 sync_blockdev(bdev);
5374 5105
5375 if (mddev->pers) { 5106 if (mddev->pers) {
5376 if (mddev->ro) 5107 if (mddev->ro)
5377 set_disk_ro(disk, 0); 5108 set_disk_ro(disk, 0);
5378 5109
5379 __md_stop_writes(mddev); 5110 __md_stop_writes(mddev);
5380 __md_stop(mddev); 5111 md_stop(mddev);
5381 mddev->queue->merge_bvec_fn = NULL; 5112 mddev->queue->merge_bvec_fn = NULL;
5382 mddev->queue->backing_dev_info.congested_fn = NULL; 5113 mddev->queue->backing_dev_info.congested_fn = NULL;
5383 5114
5384 /* tell userspace to handle 'inactive' */ 5115 /* tell userspace to handle 'inactive' */
5385 sysfs_notify_dirent_safe(mddev->sysfs_state); 5116 sysfs_notify_dirent_safe(mddev->sysfs_state);
5386 5117
5387 rdev_for_each(rdev, mddev) 5118 list_for_each_entry(rdev, &mddev->disks, same_set)
5388 if (rdev->raid_disk >= 0) 5119 if (rdev->raid_disk >= 0)
5389 sysfs_unlink_rdev(mddev, rdev); 5120 sysfs_unlink_rdev(mddev, rdev);
5390 5121
@@ -5425,9 +5156,9 @@ static int do_md_stop(struct mddev * mddev, int mode,
5425} 5156}
5426 5157
5427#ifndef MODULE 5158#ifndef MODULE
5428static void autorun_array(struct mddev *mddev) 5159static void autorun_array(mddev_t *mddev)
5429{ 5160{
5430 struct md_rdev *rdev; 5161 mdk_rdev_t *rdev;
5431 int err; 5162 int err;
5432 5163
5433 if (list_empty(&mddev->disks)) 5164 if (list_empty(&mddev->disks))
@@ -5435,7 +5166,7 @@ static void autorun_array(struct mddev *mddev)
5435 5166
5436 printk(KERN_INFO "md: running: "); 5167 printk(KERN_INFO "md: running: ");
5437 5168
5438 rdev_for_each(rdev, mddev) { 5169 list_for_each_entry(rdev, &mddev->disks, same_set) {
5439 char b[BDEVNAME_SIZE]; 5170 char b[BDEVNAME_SIZE];
5440 printk("<%s>", bdevname(rdev->bdev,b)); 5171 printk("<%s>", bdevname(rdev->bdev,b));
5441 } 5172 }
@@ -5444,7 +5175,7 @@ static void autorun_array(struct mddev *mddev)
5444 err = do_md_run(mddev); 5175 err = do_md_run(mddev);
5445 if (err) { 5176 if (err) {
5446 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 5177 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
5447 do_md_stop(mddev, 0, NULL); 5178 do_md_stop(mddev, 0, 0);
5448 } 5179 }
5449} 5180}
5450 5181
@@ -5462,8 +5193,8 @@ static void autorun_array(struct mddev *mddev)
5462 */ 5193 */
5463static void autorun_devices(int part) 5194static void autorun_devices(int part)
5464{ 5195{
5465 struct md_rdev *rdev0, *rdev, *tmp; 5196 mdk_rdev_t *rdev0, *rdev, *tmp;
5466 struct mddev *mddev; 5197 mddev_t *mddev;
5467 char b[BDEVNAME_SIZE]; 5198 char b[BDEVNAME_SIZE];
5468 5199
5469 printk(KERN_INFO "md: autorun ...\n"); 5200 printk(KERN_INFO "md: autorun ...\n");
@@ -5472,7 +5203,7 @@ static void autorun_devices(int part)
5472 dev_t dev; 5203 dev_t dev;
5473 LIST_HEAD(candidates); 5204 LIST_HEAD(candidates);
5474 rdev0 = list_entry(pending_raid_disks.next, 5205 rdev0 = list_entry(pending_raid_disks.next,
5475 struct md_rdev, same_set); 5206 mdk_rdev_t, same_set);
5476 5207
5477 printk(KERN_INFO "md: considering %s ...\n", 5208 printk(KERN_INFO "md: considering %s ...\n",
5478 bdevname(rdev0->bdev,b)); 5209 bdevname(rdev0->bdev,b));
@@ -5558,15 +5289,14 @@ static int get_version(void __user * arg)
5558 return 0; 5289 return 0;
5559} 5290}
5560 5291
5561static int get_array_info(struct mddev * mddev, void __user * arg) 5292static int get_array_info(mddev_t * mddev, void __user * arg)
5562{ 5293{
5563 mdu_array_info_t info; 5294 mdu_array_info_t info;
5564 int nr,working,insync,failed,spare; 5295 int nr,working,insync,failed,spare;
5565 struct md_rdev *rdev; 5296 mdk_rdev_t *rdev;
5566 5297
5567 nr = working = insync = failed = spare = 0; 5298 nr=working=insync=failed=spare=0;
5568 rcu_read_lock(); 5299 list_for_each_entry(rdev, &mddev->disks, same_set) {
5569 rdev_for_each_rcu(rdev, mddev) {
5570 nr++; 5300 nr++;
5571 if (test_bit(Faulty, &rdev->flags)) 5301 if (test_bit(Faulty, &rdev->flags))
5572 failed++; 5302 failed++;
@@ -5578,7 +5308,6 @@ static int get_array_info(struct mddev * mddev, void __user * arg)
5578 spare++; 5308 spare++;
5579 } 5309 }
5580 } 5310 }
5581 rcu_read_unlock();
5582 5311
5583 info.major_version = mddev->major_version; 5312 info.major_version = mddev->major_version;
5584 info.minor_version = mddev->minor_version; 5313 info.minor_version = mddev->minor_version;
@@ -5613,7 +5342,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg)
5613 return 0; 5342 return 0;
5614} 5343}
5615 5344
5616static int get_bitmap_file(struct mddev * mddev, void __user * arg) 5345static int get_bitmap_file(mddev_t * mddev, void __user * arg)
5617{ 5346{
5618 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 5347 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
5619 char *ptr, *buf = NULL; 5348 char *ptr, *buf = NULL;
@@ -5628,7 +5357,7 @@ static int get_bitmap_file(struct mddev * mddev, void __user * arg)
5628 goto out; 5357 goto out;
5629 5358
5630 /* bitmap disabled, zero the first byte and copy out */ 5359 /* bitmap disabled, zero the first byte and copy out */
5631 if (!mddev->bitmap || !mddev->bitmap->storage.file) { 5360 if (!mddev->bitmap || !mddev->bitmap->file) {
5632 file->pathname[0] = '\0'; 5361 file->pathname[0] = '\0';
5633 goto copy_out; 5362 goto copy_out;
5634 } 5363 }
@@ -5637,8 +5366,7 @@ static int get_bitmap_file(struct mddev * mddev, void __user * arg)
5637 if (!buf) 5366 if (!buf)
5638 goto out; 5367 goto out;
5639 5368
5640 ptr = d_path(&mddev->bitmap->storage.file->f_path, 5369 ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname));
5641 buf, sizeof(file->pathname));
5642 if (IS_ERR(ptr)) 5370 if (IS_ERR(ptr))
5643 goto out; 5371 goto out;
5644 5372
@@ -5654,16 +5382,15 @@ out:
5654 return err; 5382 return err;
5655} 5383}
5656 5384
5657static int get_disk_info(struct mddev * mddev, void __user * arg) 5385static int get_disk_info(mddev_t * mddev, void __user * arg)
5658{ 5386{
5659 mdu_disk_info_t info; 5387 mdu_disk_info_t info;
5660 struct md_rdev *rdev; 5388 mdk_rdev_t *rdev;
5661 5389
5662 if (copy_from_user(&info, arg, sizeof(info))) 5390 if (copy_from_user(&info, arg, sizeof(info)))
5663 return -EFAULT; 5391 return -EFAULT;
5664 5392
5665 rcu_read_lock(); 5393 rdev = find_rdev_nr(mddev, info.number);
5666 rdev = find_rdev_nr_rcu(mddev, info.number);
5667 if (rdev) { 5394 if (rdev) {
5668 info.major = MAJOR(rdev->bdev->bd_dev); 5395 info.major = MAJOR(rdev->bdev->bd_dev);
5669 info.minor = MINOR(rdev->bdev->bd_dev); 5396 info.minor = MINOR(rdev->bdev->bd_dev);
@@ -5682,7 +5409,6 @@ static int get_disk_info(struct mddev * mddev, void __user * arg)
5682 info.raid_disk = -1; 5409 info.raid_disk = -1;
5683 info.state = (1<<MD_DISK_REMOVED); 5410 info.state = (1<<MD_DISK_REMOVED);
5684 } 5411 }
5685 rcu_read_unlock();
5686 5412
5687 if (copy_to_user(arg, &info, sizeof(info))) 5413 if (copy_to_user(arg, &info, sizeof(info)))
5688 return -EFAULT; 5414 return -EFAULT;
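
For context, get_array_info() and get_disk_info() above back the classic GET_ARRAY_INFO and GET_DISK_INFO ioctls. A minimal user-space sketch that exercises them could look like the following; the /dev/md0 path is an assumption, not part of this patch, and error handling is trimmed:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/major.h>        /* MD_MAJOR, used by the md ioctl numbers */
#include <linux/raid/md_u.h>    /* GET_ARRAY_INFO, GET_DISK_INFO, mdu_*_t */

int main(void)
{
	mdu_array_info_t array;
	mdu_disk_info_t disk = { .number = 0 };      /* query slot 0 */
	int fd = open("/dev/md0", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/md0");
		return 1;
	}
	/* Kernel fills mdu_array_info_t and copies it back, as above. */
	if (ioctl(fd, GET_ARRAY_INFO, &array) == 0)
		printf("level %d, raid_disks %d, active %d, failed %d\n",
		       array.level, array.raid_disks,
		       array.active_disks, array.failed_disks);
	/* Caller sets .number; the kernel fills in major/minor/state. */
	if (ioctl(fd, GET_DISK_INFO, &disk) == 0)
		printf("disk 0: %d:%d state 0x%x\n",
		       disk.major, disk.minor, disk.state);
	close(fd);
	return 0;
}
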
@@ -5690,10 +5416,10 @@ static int get_disk_info(struct mddev * mddev, void __user * arg)
5690 return 0; 5416 return 0;
5691} 5417}
5692 5418
5693static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) 5419static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
5694{ 5420{
5695 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 5421 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5696 struct md_rdev *rdev; 5422 mdk_rdev_t *rdev;
5697 dev_t dev = MKDEV(info->major,info->minor); 5423 dev_t dev = MKDEV(info->major,info->minor);
5698 5424
5699 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 5425 if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
@@ -5710,9 +5436,8 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
5710 return PTR_ERR(rdev); 5436 return PTR_ERR(rdev);
5711 } 5437 }
5712 if (!list_empty(&mddev->disks)) { 5438 if (!list_empty(&mddev->disks)) {
5713 struct md_rdev *rdev0 5439 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
5714 = list_entry(mddev->disks.next, 5440 mdk_rdev_t, same_set);
5715 struct md_rdev, same_set);
5716 err = super_types[mddev->major_version] 5441 err = super_types[mddev->major_version]
5717 .load_super(rdev, rdev0, mddev->minor_version); 5442 .load_super(rdev, rdev0, mddev->minor_version);
5718 if (err < 0) { 5443 if (err < 0) {
@@ -5766,7 +5491,8 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
5766 super_types[mddev->major_version]. 5491 super_types[mddev->major_version].
5767 validate_super(mddev, rdev); 5492 validate_super(mddev, rdev);
5768 if ((info->state & (1<<MD_DISK_SYNC)) && 5493 if ((info->state & (1<<MD_DISK_SYNC)) &&
5769 rdev->raid_disk != info->raid_disk) { 5494 (!test_bit(In_sync, &rdev->flags) ||
5495 rdev->raid_disk != info->raid_disk)) {
5770 /* This was a hot-add request, but events doesn't 5496 /* This was a hot-add request, but events doesn't
5771 * match, so reject it. 5497 * match, so reject it.
5772 */ 5498 */
@@ -5861,10 +5587,10 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
5861 return 0; 5587 return 0;
5862} 5588}
5863 5589
5864static int hot_remove_disk(struct mddev * mddev, dev_t dev) 5590static int hot_remove_disk(mddev_t * mddev, dev_t dev)
5865{ 5591{
5866 char b[BDEVNAME_SIZE]; 5592 char b[BDEVNAME_SIZE];
5867 struct md_rdev *rdev; 5593 mdk_rdev_t *rdev;
5868 5594
5869 rdev = find_rdev(mddev, dev); 5595 rdev = find_rdev(mddev, dev);
5870 if (!rdev) 5596 if (!rdev)
@@ -5884,11 +5610,11 @@ busy:
5884 return -EBUSY; 5610 return -EBUSY;
5885} 5611}
5886 5612
5887static int hot_add_disk(struct mddev * mddev, dev_t dev) 5613static int hot_add_disk(mddev_t * mddev, dev_t dev)
5888{ 5614{
5889 char b[BDEVNAME_SIZE]; 5615 char b[BDEVNAME_SIZE];
5890 int err; 5616 int err;
5891 struct md_rdev *rdev; 5617 mdk_rdev_t *rdev;
5892 5618
5893 if (!mddev->pers) 5619 if (!mddev->pers)
5894 return -ENODEV; 5620 return -ENODEV;
@@ -5958,7 +5684,7 @@ abort_export:
5958 return err; 5684 return err;
5959} 5685}
5960 5686
5961static int set_bitmap_file(struct mddev *mddev, int fd) 5687static int set_bitmap_file(mddev_t *mddev, int fd)
5962{ 5688{
5963 int err; 5689 int err;
5964 5690
@@ -6031,7 +5757,7 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
6031 * The minor and patch _version numbers are also kept incase the 5757 * The minor and patch _version numbers are also kept incase the
6032 * super_block handler wishes to interpret them. 5758 * super_block handler wishes to interpret them.
6033 */ 5759 */
6034static int set_array_info(struct mddev * mddev, mdu_array_info_t *info) 5760static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
6035{ 5761{
6036 5762
6037 if (info->raid_disks == 0) { 5763 if (info->raid_disks == 0) {
@@ -6084,7 +5810,6 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info)
6084 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5810 set_bit(MD_CHANGE_DEVS, &mddev->flags);
6085 5811
6086 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 5812 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
6087 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
6088 mddev->bitmap_info.offset = 0; 5813 mddev->bitmap_info.offset = 0;
6089 5814
6090 mddev->reshape_position = MaxSector; 5815 mddev->reshape_position = MaxSector;
@@ -6098,12 +5823,11 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info)
6098 mddev->new_chunk_sectors = mddev->chunk_sectors; 5823 mddev->new_chunk_sectors = mddev->chunk_sectors;
6099 mddev->new_layout = mddev->layout; 5824 mddev->new_layout = mddev->layout;
6100 mddev->delta_disks = 0; 5825 mddev->delta_disks = 0;
6101 mddev->reshape_backwards = 0;
6102 5826
6103 return 0; 5827 return 0;
6104} 5828}
6105 5829
6106void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) 5830void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors)
6107{ 5831{
6108 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); 5832 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
6109 5833
@@ -6114,9 +5838,9 @@ void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
6114} 5838}
6115EXPORT_SYMBOL(md_set_array_sectors); 5839EXPORT_SYMBOL(md_set_array_sectors);
6116 5840
6117static int update_size(struct mddev *mddev, sector_t num_sectors) 5841static int update_size(mddev_t *mddev, sector_t num_sectors)
6118{ 5842{
6119 struct md_rdev *rdev; 5843 mdk_rdev_t *rdev;
6120 int rv; 5844 int rv;
6121 int fit = (num_sectors == 0); 5845 int fit = (num_sectors == 0);
6122 5846
@@ -6133,8 +5857,12 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
6133 */ 5857 */
6134 if (mddev->sync_thread) 5858 if (mddev->sync_thread)
6135 return -EBUSY; 5859 return -EBUSY;
6136 5860 if (mddev->bitmap)
6137 rdev_for_each(rdev, mddev) { 5861 /* Sorry, cannot grow a bitmap yet, just remove it,
5862 * grow, and re-add.
5863 */
5864 return -EBUSY;
5865 list_for_each_entry(rdev, &mddev->disks, same_set) {
6138 sector_t avail = rdev->sectors; 5866 sector_t avail = rdev->sectors;
6139 5867
6140 if (fit && (num_sectors == 0 || num_sectors > avail)) 5868 if (fit && (num_sectors == 0 || num_sectors > avail))
@@ -6148,10 +5876,9 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
6148 return rv; 5876 return rv;
6149} 5877}
6150 5878
6151static int update_raid_disks(struct mddev *mddev, int raid_disks) 5879static int update_raid_disks(mddev_t *mddev, int raid_disks)
6152{ 5880{
6153 int rv; 5881 int rv;
6154 struct md_rdev *rdev;
6155 /* change the number of raid disks */ 5882 /* change the number of raid disks */
6156 if (mddev->pers->check_reshape == NULL) 5883 if (mddev->pers->check_reshape == NULL)
6157 return -EINVAL; 5884 return -EINVAL;
@@ -6160,27 +5887,11 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
6160 return -EINVAL; 5887 return -EINVAL;
6161 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 5888 if (mddev->sync_thread || mddev->reshape_position != MaxSector)
6162 return -EBUSY; 5889 return -EBUSY;
6163
6164 rdev_for_each(rdev, mddev) {
6165 if (mddev->raid_disks < raid_disks &&
6166 rdev->data_offset < rdev->new_data_offset)
6167 return -EINVAL;
6168 if (mddev->raid_disks > raid_disks &&
6169 rdev->data_offset > rdev->new_data_offset)
6170 return -EINVAL;
6171 }
6172
6173 mddev->delta_disks = raid_disks - mddev->raid_disks; 5890 mddev->delta_disks = raid_disks - mddev->raid_disks;
6174 if (mddev->delta_disks < 0)
6175 mddev->reshape_backwards = 1;
6176 else if (mddev->delta_disks > 0)
6177 mddev->reshape_backwards = 0;
6178 5891
6179 rv = mddev->pers->check_reshape(mddev); 5892 rv = mddev->pers->check_reshape(mddev);
6180 if (rv < 0) { 5893 if (rv < 0)
6181 mddev->delta_disks = 0; 5894 mddev->delta_disks = 0;
6182 mddev->reshape_backwards = 0;
6183 }
6184 return rv; 5895 return rv;
6185} 5896}
6186 5897
@@ -6193,7 +5904,7 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
6193 * Any differences that cannot be handled will cause an error. 5904 * Any differences that cannot be handled will cause an error.
6194 * Normally, only one change can be managed at a time. 5905 * Normally, only one change can be managed at a time.
6195 */ 5906 */
6196static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 5907static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
6197{ 5908{
6198 int rv = 0; 5909 int rv = 0;
6199 int cnt = 0; 5910 int cnt = 0;
@@ -6263,8 +5974,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
6263 return -EINVAL; 5974 return -EINVAL;
6264 mddev->bitmap_info.offset = 5975 mddev->bitmap_info.offset =
6265 mddev->bitmap_info.default_offset; 5976 mddev->bitmap_info.default_offset;
6266 mddev->bitmap_info.space =
6267 mddev->bitmap_info.default_space;
6268 mddev->pers->quiesce(mddev, 1); 5977 mddev->pers->quiesce(mddev, 1);
6269 rv = bitmap_create(mddev); 5978 rv = bitmap_create(mddev);
6270 if (!rv) 5979 if (!rv)
@@ -6276,7 +5985,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
6276 /* remove the bitmap */ 5985 /* remove the bitmap */
6277 if (!mddev->bitmap) 5986 if (!mddev->bitmap)
6278 return -ENOENT; 5987 return -ENOENT;
6279 if (mddev->bitmap->storage.file) 5988 if (mddev->bitmap->file)
6280 return -EINVAL; 5989 return -EINVAL;
6281 mddev->pers->quiesce(mddev, 1); 5990 mddev->pers->quiesce(mddev, 1);
6282 bitmap_destroy(mddev); 5991 bitmap_destroy(mddev);
@@ -6288,25 +5997,21 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
6288 return rv; 5997 return rv;
6289} 5998}
6290 5999
6291static int set_disk_faulty(struct mddev *mddev, dev_t dev) 6000static int set_disk_faulty(mddev_t *mddev, dev_t dev)
6292{ 6001{
6293 struct md_rdev *rdev; 6002 mdk_rdev_t *rdev;
6294 int err = 0;
6295 6003
6296 if (mddev->pers == NULL) 6004 if (mddev->pers == NULL)
6297 return -ENODEV; 6005 return -ENODEV;
6298 6006
6299 rcu_read_lock(); 6007 rdev = find_rdev(mddev, dev);
6300 rdev = find_rdev_rcu(mddev, dev);
6301 if (!rdev) 6008 if (!rdev)
6302 err = -ENODEV; 6009 return -ENODEV;
6303 else { 6010
6304 md_error(mddev, rdev); 6011 md_error(mddev, rdev);
6305 if (!test_bit(Faulty, &rdev->flags)) 6012 if (!test_bit(Faulty, &rdev->flags))
6306 err = -EBUSY; 6013 return -EBUSY;
6307 } 6014 return 0;
6308 rcu_read_unlock();
6309 return err;
6310} 6015}
6311 6016
6312/* 6017/*
@@ -6317,7 +6022,7 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev)
6317 */ 6022 */
6318static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 6023static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
6319{ 6024{
6320 struct mddev *mddev = bdev->bd_disk->private_data; 6025 mddev_t *mddev = bdev->bd_disk->private_data;
6321 6026
6322 geo->heads = 2; 6027 geo->heads = 2;
6323 geo->sectors = 4; 6028 geo->sectors = 4;
@@ -6330,40 +6035,34 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
6330{ 6035{
6331 int err = 0; 6036 int err = 0;
6332 void __user *argp = (void __user *)arg; 6037 void __user *argp = (void __user *)arg;
6333 struct mddev *mddev = NULL; 6038 mddev_t *mddev = NULL;
6334 int ro; 6039 int ro;
6335 6040
6336 switch (cmd) { 6041 if (!capable(CAP_SYS_ADMIN))
6337 case RAID_VERSION: 6042 return -EACCES;
6338 case GET_ARRAY_INFO:
6339 case GET_DISK_INFO:
6340 break;
6341 default:
6342 if (!capable(CAP_SYS_ADMIN))
6343 return -EACCES;
6344 }
6345 6043
6346 /* 6044 /*
6347 * Commands dealing with the RAID driver but not any 6045 * Commands dealing with the RAID driver but not any
6348 * particular array: 6046 * particular array:
6349 */ 6047 */
6350 switch (cmd) { 6048 switch (cmd)
6351 case RAID_VERSION: 6049 {
6352 err = get_version(argp); 6050 case RAID_VERSION:
6353 goto done; 6051 err = get_version(argp);
6052 goto done;
6354 6053
6355 case PRINT_RAID_DEBUG: 6054 case PRINT_RAID_DEBUG:
6356 err = 0; 6055 err = 0;
6357 md_print_devices(); 6056 md_print_devices();
6358 goto done; 6057 goto done;
6359 6058
6360#ifndef MODULE 6059#ifndef MODULE
6361 case RAID_AUTORUN: 6060 case RAID_AUTORUN:
6362 err = 0; 6061 err = 0;
6363 autostart_arrays(arg); 6062 autostart_arrays(arg);
6364 goto done; 6063 goto done;
6365#endif 6064#endif
6366 default:; 6065 default:;
6367 } 6066 }
6368 6067
6369 /* 6068 /*
@@ -6377,31 +6076,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
6377 goto abort; 6076 goto abort;
6378 } 6077 }
6379 6078
6380 /* Some actions do not requires the mutex */
6381 switch (cmd) {
6382 case GET_ARRAY_INFO:
6383 if (!mddev->raid_disks && !mddev->external)
6384 err = -ENODEV;
6385 else
6386 err = get_array_info(mddev, argp);
6387 goto abort;
6388
6389 case GET_DISK_INFO:
6390 if (!mddev->raid_disks && !mddev->external)
6391 err = -ENODEV;
6392 else
6393 err = get_disk_info(mddev, argp);
6394 goto abort;
6395
6396 case SET_DISK_FAULTY:
6397 err = set_disk_faulty(mddev, new_decode_dev(arg));
6398 goto abort;
6399 }
6400
6401 if (cmd == ADD_NEW_DISK)
6402 /* need to ensure md_delayed_delete() has completed */
6403 flush_workqueue(md_misc_wq);
6404
6405 err = mddev_lock(mddev); 6079 err = mddev_lock(mddev);
6406 if (err) { 6080 if (err) {
6407 printk(KERN_INFO 6081 printk(KERN_INFO
@@ -6410,44 +6084,50 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
6410 goto abort; 6084 goto abort;
6411 } 6085 }
6412 6086
6413 if (cmd == SET_ARRAY_INFO) { 6087 switch (cmd)
6414 mdu_array_info_t info; 6088 {
6415 if (!arg) 6089 case SET_ARRAY_INFO:
6416 memset(&info, 0, sizeof(info)); 6090 {
6417 else if (copy_from_user(&info, argp, sizeof(info))) { 6091 mdu_array_info_t info;
6418 err = -EFAULT; 6092 if (!arg)
6419 goto abort_unlock; 6093 memset(&info, 0, sizeof(info));
6420 } 6094 else if (copy_from_user(&info, argp, sizeof(info))) {
6421 if (mddev->pers) { 6095 err = -EFAULT;
6422 err = update_array_info(mddev, &info); 6096 goto abort_unlock;
6423 if (err) { 6097 }
6424 printk(KERN_WARNING "md: couldn't update" 6098 if (mddev->pers) {
6425 " array info. %d\n", err); 6099 err = update_array_info(mddev, &info);
6426 goto abort_unlock; 6100 if (err) {
6101 printk(KERN_WARNING "md: couldn't update"
6102 " array info. %d\n", err);
6103 goto abort_unlock;
6104 }
6105 goto done_unlock;
6106 }
6107 if (!list_empty(&mddev->disks)) {
6108 printk(KERN_WARNING
6109 "md: array %s already has disks!\n",
6110 mdname(mddev));
6111 err = -EBUSY;
6112 goto abort_unlock;
6113 }
6114 if (mddev->raid_disks) {
6115 printk(KERN_WARNING
6116 "md: array %s already initialised!\n",
6117 mdname(mddev));
6118 err = -EBUSY;
6119 goto abort_unlock;
6120 }
6121 err = set_array_info(mddev, &info);
6122 if (err) {
6123 printk(KERN_WARNING "md: couldn't set"
6124 " array info. %d\n", err);
6125 goto abort_unlock;
6126 }
6427 } 6127 }
6428 goto done_unlock; 6128 goto done_unlock;
6429 } 6129
6430 if (!list_empty(&mddev->disks)) { 6130 default:;
6431 printk(KERN_WARNING
6432 "md: array %s already has disks!\n",
6433 mdname(mddev));
6434 err = -EBUSY;
6435 goto abort_unlock;
6436 }
6437 if (mddev->raid_disks) {
6438 printk(KERN_WARNING
6439 "md: array %s already initialised!\n",
6440 mdname(mddev));
6441 err = -EBUSY;
6442 goto abort_unlock;
6443 }
6444 err = set_array_info(mddev, &info);
6445 if (err) {
6446 printk(KERN_WARNING "md: couldn't set"
6447 " array info. %d\n", err);
6448 goto abort_unlock;
6449 }
6450 goto done_unlock;
6451 } 6131 }
6452 6132
6453 /* 6133 /*
@@ -6466,51 +6146,60 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
6466 /* 6146 /*
6467 * Commands even a read-only array can execute: 6147 * Commands even a read-only array can execute:
6468 */ 6148 */
6469 switch (cmd) { 6149 switch (cmd)
6470 case GET_BITMAP_FILE: 6150 {
6471 err = get_bitmap_file(mddev, argp); 6151 case GET_ARRAY_INFO:
6472 goto done_unlock; 6152 err = get_array_info(mddev, argp);
6473 6153 goto done_unlock;
6474 case RESTART_ARRAY_RW:
6475 err = restart_array(mddev);
6476 goto done_unlock;
6477 6154
6478 case STOP_ARRAY: 6155 case GET_BITMAP_FILE:
6479 err = do_md_stop(mddev, 0, bdev); 6156 err = get_bitmap_file(mddev, argp);
6480 goto done_unlock; 6157 goto done_unlock;
6481 6158
6482 case STOP_ARRAY_RO: 6159 case GET_DISK_INFO:
6483 err = md_set_readonly(mddev, bdev); 6160 err = get_disk_info(mddev, argp);
6484 goto done_unlock; 6161 goto done_unlock;
6485 6162
6486 case BLKROSET: 6163 case RESTART_ARRAY_RW:
6487 if (get_user(ro, (int __user *)(arg))) { 6164 err = restart_array(mddev);
6488 err = -EFAULT;
6489 goto done_unlock; 6165 goto done_unlock;
6490 }
6491 err = -EINVAL;
6492 6166
6493 /* if the bdev is going readonly the value of mddev->ro 6167 case STOP_ARRAY:
6494 * does not matter, no writes are coming 6168 err = do_md_stop(mddev, 0, 1);
6495 */
6496 if (ro)
6497 goto done_unlock; 6169 goto done_unlock;
6498 6170
6499 /* are we are already prepared for writes? */ 6171 case STOP_ARRAY_RO:
6500 if (mddev->ro != 1) 6172 err = md_set_readonly(mddev, 1);
6501 goto done_unlock; 6173 goto done_unlock;
6502 6174
6503 /* transitioning to readauto need only happen for 6175 case BLKROSET:
6504 * arrays that call md_write_start 6176 if (get_user(ro, (int __user *)(arg))) {
6505 */ 6177 err = -EFAULT;
6506 if (mddev->pers) { 6178 goto done_unlock;
6507 err = restart_array(mddev);
6508 if (err == 0) {
6509 mddev->ro = 2;
6510 set_disk_ro(mddev->gendisk, 0);
6511 } 6179 }
6512 } 6180 err = -EINVAL;
6513 goto done_unlock; 6181
6182 /* if the bdev is going readonly the value of mddev->ro
6183 * does not matter, no writes are coming
6184 */
6185 if (ro)
6186 goto done_unlock;
6187
6188 /* are we are already prepared for writes? */
6189 if (mddev->ro != 1)
6190 goto done_unlock;
6191
6192 /* transitioning to readauto need only happen for
6193 * arrays that call md_write_start
6194 */
6195 if (mddev->pers) {
6196 err = restart_array(mddev);
6197 if (err == 0) {
6198 mddev->ro = 2;
6199 set_disk_ro(mddev->gendisk, 0);
6200 }
6201 }
6202 goto done_unlock;
6514 } 6203 }
6515 6204
6516 /* 6205 /*
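
The BLKROSET branch above is md's hook into the generic block-device read-only ioctl: clearing the flag on an array that is currently read-only (mddev->ro == 1) moves it to read-auto (ro set to 2 by the handler), so the array stays clean until the first write arrives. From user space this is just the standard ioctl pair from <linux/fs.h>; a rough sketch, with /dev/md0 assumed:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>           /* BLKROGET, BLKROSET */

int main(void)
{
	int fd = open("/dev/md0", O_RDONLY);
	int ro;

	if (fd < 0) {
		perror("open /dev/md0");
		return 1;
	}
	if (ioctl(fd, BLKROGET, &ro) == 0)
		printf("read-only flag: %d\n", ro);

	ro = 0;                         /* ask for read-write again */
	if (ioctl(fd, BLKROSET, &ro) != 0)
		perror("BLKROSET");
	close(fd);
	return 0;
}
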
@@ -6532,36 +6221,41 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
6532 } 6221 }
6533 } 6222 }
6534 6223
6535 switch (cmd) { 6224 switch (cmd)
6536 case ADD_NEW_DISK:
6537 { 6225 {
6538 mdu_disk_info_t info; 6226 case ADD_NEW_DISK:
6539 if (copy_from_user(&info, argp, sizeof(info))) 6227 {
6540 err = -EFAULT; 6228 mdu_disk_info_t info;
6541 else 6229 if (copy_from_user(&info, argp, sizeof(info)))
6542 err = add_new_disk(mddev, &info); 6230 err = -EFAULT;
6543 goto done_unlock; 6231 else
6544 } 6232 err = add_new_disk(mddev, &info);
6233 goto done_unlock;
6234 }
6545 6235
6546 case HOT_REMOVE_DISK: 6236 case HOT_REMOVE_DISK:
6547 err = hot_remove_disk(mddev, new_decode_dev(arg)); 6237 err = hot_remove_disk(mddev, new_decode_dev(arg));
6548 goto done_unlock; 6238 goto done_unlock;
6549 6239
6550 case HOT_ADD_DISK: 6240 case HOT_ADD_DISK:
6551 err = hot_add_disk(mddev, new_decode_dev(arg)); 6241 err = hot_add_disk(mddev, new_decode_dev(arg));
6552 goto done_unlock; 6242 goto done_unlock;
6553 6243
6554 case RUN_ARRAY: 6244 case SET_DISK_FAULTY:
6555 err = do_md_run(mddev); 6245 err = set_disk_faulty(mddev, new_decode_dev(arg));
6556 goto done_unlock; 6246 goto done_unlock;
6557 6247
6558 case SET_BITMAP_FILE: 6248 case RUN_ARRAY:
6559 err = set_bitmap_file(mddev, (int)arg); 6249 err = do_md_run(mddev);
6560 goto done_unlock; 6250 goto done_unlock;
6561 6251
6562 default: 6252 case SET_BITMAP_FILE:
6563 err = -EINVAL; 6253 err = set_bitmap_file(mddev, (int)arg);
6564 goto abort_unlock; 6254 goto done_unlock;
6255
6256 default:
6257 err = -EINVAL;
6258 goto abort_unlock;
6565 } 6259 }
6566 6260
6567done_unlock: 6261done_unlock:
@@ -6604,12 +6298,9 @@ static int md_open(struct block_device *bdev, fmode_t mode)
6604 * Succeed if we can lock the mddev, which confirms that 6298 * Succeed if we can lock the mddev, which confirms that
6605 * it isn't being stopped right now. 6299 * it isn't being stopped right now.
6606 */ 6300 */
6607 struct mddev *mddev = mddev_find(bdev->bd_dev); 6301 mddev_t *mddev = mddev_find(bdev->bd_dev);
6608 int err; 6302 int err;
6609 6303
6610 if (!mddev)
6611 return -ENODEV;
6612
6613 if (mddev->gendisk != bdev->bd_disk) { 6304 if (mddev->gendisk != bdev->bd_disk) {
6614 /* we are racing with mddev_put which is discarding this 6305 /* we are racing with mddev_put which is discarding this
6615 * bd_disk. 6306 * bd_disk.
@@ -6636,7 +6327,7 @@ static int md_open(struct block_device *bdev, fmode_t mode)
6636 6327
6637static int md_release(struct gendisk *disk, fmode_t mode) 6328static int md_release(struct gendisk *disk, fmode_t mode)
6638{ 6329{
6639 struct mddev *mddev = disk->private_data; 6330 mddev_t *mddev = disk->private_data;
6640 6331
6641 BUG_ON(!mddev); 6332 BUG_ON(!mddev);
6642 atomic_dec(&mddev->openers); 6333 atomic_dec(&mddev->openers);
@@ -6647,14 +6338,14 @@ static int md_release(struct gendisk *disk, fmode_t mode)
6647 6338
6648static int md_media_changed(struct gendisk *disk) 6339static int md_media_changed(struct gendisk *disk)
6649{ 6340{
6650 struct mddev *mddev = disk->private_data; 6341 mddev_t *mddev = disk->private_data;
6651 6342
6652 return mddev->changed; 6343 return mddev->changed;
6653} 6344}
6654 6345
6655static int md_revalidate(struct gendisk *disk) 6346static int md_revalidate(struct gendisk *disk)
6656{ 6347{
6657 struct mddev *mddev = disk->private_data; 6348 mddev_t *mddev = disk->private_data;
6658 6349
6659 mddev->changed = 0; 6350 mddev->changed = 0;
6660 return 0; 6351 return 0;
@@ -6675,7 +6366,7 @@ static const struct block_device_operations md_fops =
6675 6366
6676static int md_thread(void * arg) 6367static int md_thread(void * arg)
6677{ 6368{
6678 struct md_thread *thread = arg; 6369 mdk_thread_t *thread = arg;
6679 6370
6680 /* 6371 /*
6681 * md_thread is a 'system-thread', it's priority should be very 6372 * md_thread is a 'system-thread', it's priority should be very
@@ -6708,27 +6399,27 @@ static int md_thread(void * arg)
6708 6399
6709 clear_bit(THREAD_WAKEUP, &thread->flags); 6400 clear_bit(THREAD_WAKEUP, &thread->flags);
6710 if (!kthread_should_stop()) 6401 if (!kthread_should_stop())
6711 thread->run(thread); 6402 thread->run(thread->mddev);
6712 } 6403 }
6713 6404
6714 return 0; 6405 return 0;
6715} 6406}
6716 6407
6717void md_wakeup_thread(struct md_thread *thread) 6408void md_wakeup_thread(mdk_thread_t *thread)
6718{ 6409{
6719 if (thread) { 6410 if (thread) {
6720 pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); 6411 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
6721 set_bit(THREAD_WAKEUP, &thread->flags); 6412 set_bit(THREAD_WAKEUP, &thread->flags);
6722 wake_up(&thread->wqueue); 6413 wake_up(&thread->wqueue);
6723 } 6414 }
6724} 6415}
6725 6416
6726struct md_thread *md_register_thread(void (*run) (struct md_thread *), 6417mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
6727 struct mddev *mddev, const char *name) 6418 const char *name)
6728{ 6419{
6729 struct md_thread *thread; 6420 mdk_thread_t *thread;
6730 6421
6731 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 6422 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
6732 if (!thread) 6423 if (!thread)
6733 return NULL; 6424 return NULL;
6734 6425
@@ -6740,7 +6431,7 @@ struct md_thread *md_register_thread(void (*run) (struct md_thread *),
6740 thread->tsk = kthread_run(md_thread, thread, 6431 thread->tsk = kthread_run(md_thread, thread,
6741 "%s_%s", 6432 "%s_%s",
6742 mdname(thread->mddev), 6433 mdname(thread->mddev),
6743 name); 6434 name ?: mddev->pers->name);
6744 if (IS_ERR(thread->tsk)) { 6435 if (IS_ERR(thread->tsk)) {
6745 kfree(thread); 6436 kfree(thread);
6746 return NULL; 6437 return NULL;
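
md_register_thread() and md_wakeup_thread() above implement a simple flag-plus-waitqueue protocol around a kthread. A stripped-down sketch of the same pattern follows; it is not the md code itself, the names are invented, and the waitqueue/kthread setup calls (init_waitqueue_head(), kthread_run()) are omitted:

/* Illustrative kthread wake-up pattern, modelled on md_thread() above. */
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/sched.h>

struct worker {
	unsigned long flags;            /* bit 0: wake-up pending */
	wait_queue_head_t wqueue;
	void (*run)(struct worker *w);
	struct task_struct *tsk;
};

static int worker_fn(void *arg)
{
	struct worker *w = arg;

	while (!kthread_should_stop()) {
		/* Sleep until someone sets the flag or asks us to stop. */
		wait_event_interruptible(w->wqueue,
			test_bit(0, &w->flags) || kthread_should_stop());
		clear_bit(0, &w->flags);
		if (!kthread_should_stop())
			w->run(w);
	}
	return 0;
}

static void worker_wakeup(struct worker *w)
{
	set_bit(0, &w->flags);          /* mark work pending ... */
	wake_up(&w->wqueue);            /* ... then wake the sleeper */
}
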
@@ -6748,12 +6439,12 @@ struct md_thread *md_register_thread(void (*run) (struct md_thread *),
6748 return thread; 6439 return thread;
6749} 6440}
6750 6441
6751void md_unregister_thread(struct md_thread **threadp) 6442void md_unregister_thread(mdk_thread_t **threadp)
6752{ 6443{
6753 struct md_thread *thread = *threadp; 6444 mdk_thread_t *thread = *threadp;
6754 if (!thread) 6445 if (!thread)
6755 return; 6446 return;
6756 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 6447 dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
6757 /* Locking ensures that mddev_unlock does not wake_up a 6448 /* Locking ensures that mddev_unlock does not wake_up a
6758 * non-existent thread 6449 * non-existent thread
6759 */ 6450 */
@@ -6765,7 +6456,7 @@ void md_unregister_thread(struct md_thread **threadp)
6765 kfree(thread); 6456 kfree(thread);
6766} 6457}
6767 6458
6768void md_error(struct mddev *mddev, struct md_rdev *rdev) 6459void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
6769{ 6460{
6770 if (!mddev) { 6461 if (!mddev) {
6771 MD_BUG(); 6462 MD_BUG();
@@ -6794,7 +6485,7 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev)
6794static void status_unused(struct seq_file *seq) 6485static void status_unused(struct seq_file *seq)
6795{ 6486{
6796 int i = 0; 6487 int i = 0;
6797 struct md_rdev *rdev; 6488 mdk_rdev_t *rdev;
6798 6489
6799 seq_printf(seq, "unused devices: "); 6490 seq_printf(seq, "unused devices: ");
6800 6491
@@ -6811,7 +6502,7 @@ static void status_unused(struct seq_file *seq)
6811} 6502}
6812 6503
6813 6504
6814static void status_resync(struct seq_file *seq, struct mddev * mddev) 6505static void status_resync(struct seq_file *seq, mddev_t * mddev)
6815{ 6506{
6816 sector_t max_sectors, resync, res; 6507 sector_t max_sectors, resync, res;
6817 unsigned long dt, db; 6508 unsigned long dt, db;
@@ -6819,14 +6510,9 @@ static void status_resync(struct seq_file *seq, struct mddev * mddev)
6819 int scale; 6510 int scale;
6820 unsigned int per_milli; 6511 unsigned int per_milli;
6821 6512
6822 if (mddev->curr_resync <= 3) 6513 resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
6823 resync = 0;
6824 else
6825 resync = mddev->curr_resync
6826 - atomic_read(&mddev->recovery_active);
6827 6514
6828 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 6515 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
6829 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6830 max_sectors = mddev->resync_max_sectors; 6516 max_sectors = mddev->resync_max_sectors;
6831 else 6517 else
6832 max_sectors = mddev->dev_sectors; 6518 max_sectors = mddev->dev_sectors;
@@ -6907,7 +6593,7 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos)
6907{ 6593{
6908 struct list_head *tmp; 6594 struct list_head *tmp;
6909 loff_t l = *pos; 6595 loff_t l = *pos;
6910 struct mddev *mddev; 6596 mddev_t *mddev;
6911 6597
6912 if (l >= 0x10000) 6598 if (l >= 0x10000)
6913 return NULL; 6599 return NULL;
@@ -6918,7 +6604,7 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos)
6918 spin_lock(&all_mddevs_lock); 6604 spin_lock(&all_mddevs_lock);
6919 list_for_each(tmp,&all_mddevs) 6605 list_for_each(tmp,&all_mddevs)
6920 if (!l--) { 6606 if (!l--) {
6921 mddev = list_entry(tmp, struct mddev, all_mddevs); 6607 mddev = list_entry(tmp, mddev_t, all_mddevs);
6922 mddev_get(mddev); 6608 mddev_get(mddev);
6923 spin_unlock(&all_mddevs_lock); 6609 spin_unlock(&all_mddevs_lock);
6924 return mddev; 6610 return mddev;
@@ -6932,7 +6618,7 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos)
6932static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 6618static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
6933{ 6619{
6934 struct list_head *tmp; 6620 struct list_head *tmp;
6935 struct mddev *next_mddev, *mddev = v; 6621 mddev_t *next_mddev, *mddev = v;
6936 6622
6937 ++*pos; 6623 ++*pos;
6938 if (v == (void*)2) 6624 if (v == (void*)2)
@@ -6944,7 +6630,7 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
6944 else 6630 else
6945 tmp = mddev->all_mddevs.next; 6631 tmp = mddev->all_mddevs.next;
6946 if (tmp != &all_mddevs) 6632 if (tmp != &all_mddevs)
6947 next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs)); 6633 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
6948 else { 6634 else {
6949 next_mddev = (void*)2; 6635 next_mddev = (void*)2;
6950 *pos = 0x10000; 6636 *pos = 0x10000;
@@ -6959,7 +6645,7 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
6959 6645
6960static void md_seq_stop(struct seq_file *seq, void *v) 6646static void md_seq_stop(struct seq_file *seq, void *v)
6961{ 6647{
6962 struct mddev *mddev = v; 6648 mddev_t *mddev = v;
6963 6649
6964 if (mddev && v != (void*)1 && v != (void*)2) 6650 if (mddev && v != (void*)1 && v != (void*)2)
6965 mddev_put(mddev); 6651 mddev_put(mddev);
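
md_seq_start/next/stop/show above follow the standard seq_file iterator contract: start returns the element for *pos, next advances the position, stop releases any state, show prints one record. A minimal sketch of that contract over fictitious data, using the same proc_create() + file_operations style that md_geninit() uses further down for /proc/mdstat:

/* Minimal seq_file iterator sketch; data and names are illustrative only. */
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_vals[] = { 1, 2, 3 };

static void *demo_start(struct seq_file *seq, loff_t *pos)
{
	return *pos < ARRAY_SIZE(demo_vals) ? &demo_vals[*pos] : NULL;
}

static void *demo_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return *pos < ARRAY_SIZE(demo_vals) ? &demo_vals[*pos] : NULL;
}

static void demo_stop(struct seq_file *seq, void *v)
{
	/* nothing to release in this sketch */
}

static int demo_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "value: %d\n", *(int *)v);
	return 0;
}

static const struct seq_operations demo_ops = {
	.start = demo_start,
	.next  = demo_next,
	.stop  = demo_stop,
	.show  = demo_show,
};

static int demo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &demo_ops);
}

static const struct file_operations demo_fops = {
	.owner   = THIS_MODULE,
	.open    = demo_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
/* registered with proc_create("demo", S_IRUGO, NULL, &demo_fops); */
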
@@ -6967,12 +6653,13 @@ static void md_seq_stop(struct seq_file *seq, void *v)
6967 6653
6968static int md_seq_show(struct seq_file *seq, void *v) 6654static int md_seq_show(struct seq_file *seq, void *v)
6969{ 6655{
6970 struct mddev *mddev = v; 6656 mddev_t *mddev = v;
6971 sector_t sectors; 6657 sector_t sectors;
6972 struct md_rdev *rdev; 6658 mdk_rdev_t *rdev;
6659 struct bitmap *bitmap;
6973 6660
6974 if (v == (void*)1) { 6661 if (v == (void*)1) {
6975 struct md_personality *pers; 6662 struct mdk_personality *pers;
6976 seq_printf(seq, "Personalities : "); 6663 seq_printf(seq, "Personalities : ");
6977 spin_lock(&pers_lock); 6664 spin_lock(&pers_lock);
6978 list_for_each_entry(pers, &pers_list, list) 6665 list_for_each_entry(pers, &pers_list, list)
@@ -7003,7 +6690,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
7003 } 6690 }
7004 6691
7005 sectors = 0; 6692 sectors = 0;
7006 rdev_for_each(rdev, mddev) { 6693 list_for_each_entry(rdev, &mddev->disks, same_set) {
7007 char b[BDEVNAME_SIZE]; 6694 char b[BDEVNAME_SIZE];
7008 seq_printf(seq, " %s[%d]", 6695 seq_printf(seq, " %s[%d]",
7009 bdevname(rdev->bdev,b), rdev->desc_nr); 6696 bdevname(rdev->bdev,b), rdev->desc_nr);
@@ -7012,11 +6699,8 @@ static int md_seq_show(struct seq_file *seq, void *v)
7012 if (test_bit(Faulty, &rdev->flags)) { 6699 if (test_bit(Faulty, &rdev->flags)) {
7013 seq_printf(seq, "(F)"); 6700 seq_printf(seq, "(F)");
7014 continue; 6701 continue;
7015 } 6702 } else if (rdev->raid_disk < 0)
7016 if (rdev->raid_disk < 0)
7017 seq_printf(seq, "(S)"); /* spare */ 6703 seq_printf(seq, "(S)"); /* spare */
7018 if (test_bit(Replacement, &rdev->flags))
7019 seq_printf(seq, "(R)");
7020 sectors += rdev->sectors; 6704 sectors += rdev->sectors;
7021 } 6705 }
7022 6706
@@ -7049,7 +6733,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
7049 if (mddev->curr_resync > 2) { 6733 if (mddev->curr_resync > 2) {
7050 status_resync(seq, mddev); 6734 status_resync(seq, mddev);
7051 seq_printf(seq, "\n "); 6735 seq_printf(seq, "\n ");
7052 } else if (mddev->curr_resync >= 1) 6736 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
7053 seq_printf(seq, "\tresync=DELAYED\n "); 6737 seq_printf(seq, "\tresync=DELAYED\n ");
7054 else if (mddev->recovery_cp < MaxSector) 6738 else if (mddev->recovery_cp < MaxSector)
7055 seq_printf(seq, "\tresync=PENDING\n "); 6739 seq_printf(seq, "\tresync=PENDING\n ");
@@ -7057,7 +6741,27 @@ static int md_seq_show(struct seq_file *seq, void *v)
7057 } else 6741 } else
7058 seq_printf(seq, "\n "); 6742 seq_printf(seq, "\n ");
7059 6743
7060 bitmap_status(seq, mddev->bitmap); 6744 if ((bitmap = mddev->bitmap)) {
6745 unsigned long chunk_kb;
6746 unsigned long flags;
6747 spin_lock_irqsave(&bitmap->lock, flags);
6748 chunk_kb = mddev->bitmap_info.chunksize >> 10;
6749 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
6750 "%lu%s chunk",
6751 bitmap->pages - bitmap->missing_pages,
6752 bitmap->pages,
6753 (bitmap->pages - bitmap->missing_pages)
6754 << (PAGE_SHIFT - 10),
6755 chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
6756 chunk_kb ? "KB" : "B");
6757 if (bitmap->file) {
6758 seq_printf(seq, ", file: ");
6759 seq_path(seq, &bitmap->file->f_path, " \t\n");
6760 }
6761
6762 seq_printf(seq, "\n");
6763 spin_unlock_irqrestore(&bitmap->lock, flags);
6764 }
7061 6765
7062 seq_printf(seq, "\n"); 6766 seq_printf(seq, "\n");
7063 } 6767 }
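
Rendered through the seq_printf() calls above, a /proc/mdstat entry with a file-backed bitmap looks roughly like the excerpt below. The device names and counts are invented for illustration, and the blocks/[UU] status line comes from the personality's status callback, which is outside this file:

    md0 : active raid1 sdb1[1] sda1[0]
          1048512 blocks [2/2] [UU]
          bitmap: 3/8 pages [12KB], 65536KB chunk, file: /var/md0-bitmap
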
@@ -7111,7 +6815,7 @@ static const struct file_operations md_seq_fops = {
7111 .poll = mdstat_poll, 6815 .poll = mdstat_poll,
7112}; 6816};
7113 6817
7114int register_md_personality(struct md_personality *p) 6818int register_md_personality(struct mdk_personality *p)
7115{ 6819{
7116 spin_lock(&pers_lock); 6820 spin_lock(&pers_lock);
7117 list_add_tail(&p->list, &pers_list); 6821 list_add_tail(&p->list, &pers_list);
@@ -7120,7 +6824,7 @@ int register_md_personality(struct md_personality *p)
7120 return 0; 6824 return 0;
7121} 6825}
7122 6826
7123int unregister_md_personality(struct md_personality *p) 6827int unregister_md_personality(struct mdk_personality *p)
7124{ 6828{
7125 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 6829 printk(KERN_INFO "md: %s personality unregistered\n", p->name);
7126 spin_lock(&pers_lock); 6830 spin_lock(&pers_lock);
@@ -7129,9 +6833,9 @@ int unregister_md_personality(struct md_personality *p)
7129 return 0; 6833 return 0;
7130} 6834}
7131 6835
7132static int is_mddev_idle(struct mddev *mddev, int init) 6836static int is_mddev_idle(mddev_t *mddev, int init)
7133{ 6837{
7134 struct md_rdev * rdev; 6838 mdk_rdev_t * rdev;
7135 int idle; 6839 int idle;
7136 int curr_events; 6840 int curr_events;
7137 6841
@@ -7173,14 +6877,13 @@ static int is_mddev_idle(struct mddev *mddev, int init)
7173 return idle; 6877 return idle;
7174} 6878}
7175 6879
7176void md_done_sync(struct mddev *mddev, int blocks, int ok) 6880void md_done_sync(mddev_t *mddev, int blocks, int ok)
7177{ 6881{
7178 /* another "blocks" (512byte) blocks have been synced */ 6882 /* another "blocks" (512byte) blocks have been synced */
7179 atomic_sub(blocks, &mddev->recovery_active); 6883 atomic_sub(blocks, &mddev->recovery_active);
7180 wake_up(&mddev->recovery_wait); 6884 wake_up(&mddev->recovery_wait);
7181 if (!ok) { 6885 if (!ok) {
7182 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6886 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7183 set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
7184 md_wakeup_thread(mddev->thread); 6887 md_wakeup_thread(mddev->thread);
7185 // stop recovery, signal do_sync .... 6888 // stop recovery, signal do_sync ....
7186 } 6889 }
@@ -7192,7 +6895,7 @@ void md_done_sync(struct mddev *mddev, int blocks, int ok)
7192 * in superblock) before writing, schedule a superblock update 6895 * in superblock) before writing, schedule a superblock update
7193 * and wait for it to complete. 6896 * and wait for it to complete.
7194 */ 6897 */
7195void md_write_start(struct mddev *mddev, struct bio *bi) 6898void md_write_start(mddev_t *mddev, struct bio *bi)
7196{ 6899{
7197 int did_change = 0; 6900 int did_change = 0;
7198 if (bio_data_dir(bi) != WRITE) 6901 if (bio_data_dir(bi) != WRITE)
@@ -7227,7 +6930,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
7227 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 6930 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
7228} 6931}
7229 6932
7230void md_write_end(struct mddev *mddev) 6933void md_write_end(mddev_t *mddev)
7231{ 6934{
7232 if (atomic_dec_and_test(&mddev->writes_pending)) { 6935 if (atomic_dec_and_test(&mddev->writes_pending)) {
7233 if (mddev->safemode == 2) 6936 if (mddev->safemode == 2)
@@ -7246,7 +6949,7 @@ void md_write_end(struct mddev *mddev)
7246 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock 6949 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
7247 * is dropped, so return -EAGAIN after notifying userspace. 6950 * is dropped, so return -EAGAIN after notifying userspace.
7248 */ 6951 */
7249int md_allow_write(struct mddev *mddev) 6952int md_allow_write(mddev_t *mddev)
7250{ 6953{
7251 if (!mddev->pers) 6954 if (!mddev->pers)
7252 return 0; 6955 return 0;
@@ -7278,24 +6981,20 @@ EXPORT_SYMBOL_GPL(md_allow_write);
7278 6981
7279#define SYNC_MARKS 10 6982#define SYNC_MARKS 10
7280#define SYNC_MARK_STEP (3*HZ) 6983#define SYNC_MARK_STEP (3*HZ)
7281#define UPDATE_FREQUENCY (5*60*HZ) 6984void md_do_sync(mddev_t *mddev)
7282void md_do_sync(struct md_thread *thread)
7283{ 6985{
7284 struct mddev *mddev = thread->mddev; 6986 mddev_t *mddev2;
7285 struct mddev *mddev2;
7286 unsigned int currspeed = 0, 6987 unsigned int currspeed = 0,
7287 window; 6988 window;
7288 sector_t max_sectors,j, io_sectors; 6989 sector_t max_sectors,j, io_sectors;
7289 unsigned long mark[SYNC_MARKS]; 6990 unsigned long mark[SYNC_MARKS];
7290 unsigned long update_time;
7291 sector_t mark_cnt[SYNC_MARKS]; 6991 sector_t mark_cnt[SYNC_MARKS];
7292 int last_mark,m; 6992 int last_mark,m;
7293 struct list_head *tmp; 6993 struct list_head *tmp;
7294 sector_t last_check; 6994 sector_t last_check;
7295 int skipped = 0; 6995 int skipped = 0;
7296 struct md_rdev *rdev; 6996 mdk_rdev_t *rdev;
7297 char *desc; 6997 char *desc;
7298 struct blk_plug plug;
7299 6998
7300 /* just incase thread restarts... */ 6999 /* just incase thread restarts... */
7301 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 7000 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
@@ -7386,7 +7085,7 @@ void md_do_sync(struct md_thread *thread)
7386 * which defaults to physical size, but can be virtual size 7085 * which defaults to physical size, but can be virtual size
7387 */ 7086 */
7388 max_sectors = mddev->resync_max_sectors; 7087 max_sectors = mddev->resync_max_sectors;
7389 atomic64_set(&mddev->resync_mismatches, 0); 7088 mddev->resync_mismatches = 0;
7390 /* we don't use the checkpoint if there's a bitmap */ 7089 /* we don't use the checkpoint if there's a bitmap */
7391 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 7090 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7392 j = mddev->resync_min; 7091 j = mddev->resync_min;
@@ -7394,13 +7093,13 @@ void md_do_sync(struct md_thread *thread)
7394 j = mddev->recovery_cp; 7093 j = mddev->recovery_cp;
7395 7094
7396 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 7095 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7397 max_sectors = mddev->resync_max_sectors; 7096 max_sectors = mddev->dev_sectors;
7398 else { 7097 else {
7399 /* recovery follows the physical size of devices */ 7098 /* recovery follows the physical size of devices */
7400 max_sectors = mddev->dev_sectors; 7099 max_sectors = mddev->dev_sectors;
7401 j = MaxSector; 7100 j = MaxSector;
7402 rcu_read_lock(); 7101 rcu_read_lock();
7403 rdev_for_each_rcu(rdev, mddev) 7102 list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
7404 if (rdev->raid_disk >= 0 && 7103 if (rdev->raid_disk >= 0 &&
7405 !test_bit(Faulty, &rdev->flags) && 7104 !test_bit(Faulty, &rdev->flags) &&
7406 !test_bit(In_sync, &rdev->flags) && 7105 !test_bit(In_sync, &rdev->flags) &&
@@ -7442,14 +7141,9 @@ void md_do_sync(struct md_thread *thread)
7442 "md: resuming %s of %s from checkpoint.\n", 7141 "md: resuming %s of %s from checkpoint.\n",
7443 desc, mdname(mddev)); 7142 desc, mdname(mddev));
7444 mddev->curr_resync = j; 7143 mddev->curr_resync = j;
7445 } else 7144 }
7446 mddev->curr_resync = 3; /* no longer delayed */
7447 mddev->curr_resync_completed = j; 7145 mddev->curr_resync_completed = j;
7448 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
7449 md_new_event(mddev);
7450 update_time = jiffies;
7451 7146
7452 blk_start_plug(&plug);
7453 while (j < max_sectors) { 7147 while (j < max_sectors) {
7454 sector_t sectors; 7148 sector_t sectors;
7455 7149
@@ -7459,7 +7153,6 @@ void md_do_sync(struct md_thread *thread)
7459 ((mddev->curr_resync > mddev->curr_resync_completed && 7153 ((mddev->curr_resync > mddev->curr_resync_completed &&
7460 (mddev->curr_resync - mddev->curr_resync_completed) 7154 (mddev->curr_resync - mddev->curr_resync_completed)
7461 > (max_sectors >> 4)) || 7155 > (max_sectors >> 4)) ||
7462 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
7463 (j - mddev->curr_resync_completed)*2 7156 (j - mddev->curr_resync_completed)*2
7464 >= mddev->resync_max - mddev->curr_resync_completed 7157 >= mddev->resync_max - mddev->curr_resync_completed
7465 )) { 7158 )) {
@@ -7467,10 +7160,6 @@ void md_do_sync(struct md_thread *thread)
7467 wait_event(mddev->recovery_wait, 7160 wait_event(mddev->recovery_wait,
7468 atomic_read(&mddev->recovery_active) == 0); 7161 atomic_read(&mddev->recovery_active) == 0);
7469 mddev->curr_resync_completed = j; 7162 mddev->curr_resync_completed = j;
7470 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
7471 j > mddev->recovery_cp)
7472 mddev->recovery_cp = j;
7473 update_time = jiffies;
7474 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7163 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7475 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 7164 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
7476 } 7165 }
@@ -7505,8 +7194,7 @@ void md_do_sync(struct md_thread *thread)
7505 break; 7194 break;
7506 7195
7507 j += sectors; 7196 j += sectors;
7508 if (j > 2) 7197 if (j>1) mddev->curr_resync = j;
7509 mddev->curr_resync = j;
7510 mddev->curr_mark_cnt = io_sectors; 7198 mddev->curr_mark_cnt = io_sectors;
7511 if (last_check == 0) 7199 if (last_check == 0)
7512 /* this is the earliest that rebuild will be 7200 /* this is the earliest that rebuild will be
@@ -7561,7 +7249,6 @@ void md_do_sync(struct md_thread *thread)
7561 * this also signals 'finished resyncing' to md_stop 7249 * this also signals 'finished resyncing' to md_stop
7562 */ 7250 */
7563 out: 7251 out:
7564 blk_finish_plug(&plug);
7565 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 7252 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
7566 7253
7567 /* tell personality that we are finished */ 7254 /* tell personality that we are finished */
@@ -7575,13 +7262,7 @@ void md_do_sync(struct md_thread *thread)
7575 printk(KERN_INFO 7262 printk(KERN_INFO
7576 "md: checkpointing %s of %s.\n", 7263 "md: checkpointing %s of %s.\n",
7577 desc, mdname(mddev)); 7264 desc, mdname(mddev));
7578 if (test_bit(MD_RECOVERY_ERROR, 7265 mddev->recovery_cp = mddev->curr_resync;
7579 &mddev->recovery))
7580 mddev->recovery_cp =
7581 mddev->curr_resync_completed;
7582 else
7583 mddev->recovery_cp =
7584 mddev->curr_resync;
7585 } 7266 }
7586 } else 7267 } else
7587 mddev->recovery_cp = MaxSector; 7268 mddev->recovery_cp = MaxSector;
@@ -7589,7 +7270,7 @@ void md_do_sync(struct md_thread *thread)
7589 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7270 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7590 mddev->curr_resync = MaxSector; 7271 mddev->curr_resync = MaxSector;
7591 rcu_read_lock(); 7272 rcu_read_lock();
7592 rdev_for_each_rcu(rdev, mddev) 7273 list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
7593 if (rdev->raid_disk >= 0 && 7274 if (rdev->raid_disk >= 0 &&
7594 mddev->delta_disks >= 0 && 7275 mddev->delta_disks >= 0 &&
7595 !test_bit(Faulty, &rdev->flags) && 7276 !test_bit(Faulty, &rdev->flags) &&
@@ -7599,9 +7280,9 @@ void md_do_sync(struct md_thread *thread)
7599 rcu_read_unlock(); 7280 rcu_read_unlock();
7600 } 7281 }
7601 } 7282 }
7602 skip:
7603 set_bit(MD_CHANGE_DEVS, &mddev->flags); 7283 set_bit(MD_CHANGE_DEVS, &mddev->flags);
7604 7284
7285 skip:
7605 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 7286 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7606 /* We completed so min/max setting can be forgotten if used. */ 7287 /* We completed so min/max setting can be forgotten if used. */
7607 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 7288 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
@@ -7627,56 +7308,53 @@ void md_do_sync(struct md_thread *thread)
7627} 7308}
7628EXPORT_SYMBOL_GPL(md_do_sync); 7309EXPORT_SYMBOL_GPL(md_do_sync);
7629 7310
7630static int remove_and_add_spares(struct mddev *mddev) 7311static int remove_and_add_spares(mddev_t *mddev)
7631{ 7312{
7632 struct md_rdev *rdev; 7313 mdk_rdev_t *rdev;
7633 int spares = 0; 7314 int spares = 0;
7634 int removed = 0;
7635 7315
7636 rdev_for_each(rdev, mddev) 7316 mddev->curr_resync_completed = 0;
7317
7318 list_for_each_entry(rdev, &mddev->disks, same_set)
7637 if (rdev->raid_disk >= 0 && 7319 if (rdev->raid_disk >= 0 &&
7638 !test_bit(Blocked, &rdev->flags) && 7320 !test_bit(Blocked, &rdev->flags) &&
7639 (test_bit(Faulty, &rdev->flags) || 7321 (test_bit(Faulty, &rdev->flags) ||
7640 ! test_bit(In_sync, &rdev->flags)) && 7322 ! test_bit(In_sync, &rdev->flags)) &&
7641 atomic_read(&rdev->nr_pending)==0) { 7323 atomic_read(&rdev->nr_pending)==0) {
7642 if (mddev->pers->hot_remove_disk( 7324 if (mddev->pers->hot_remove_disk(
7643 mddev, rdev) == 0) { 7325 mddev, rdev->raid_disk)==0) {
7644 sysfs_unlink_rdev(mddev, rdev); 7326 sysfs_unlink_rdev(mddev, rdev);
7645 rdev->raid_disk = -1; 7327 rdev->raid_disk = -1;
7646 removed++;
7647 } 7328 }
7648 } 7329 }
7649 if (removed)
7650 sysfs_notify(&mddev->kobj, NULL,
7651 "degraded");
7652
7653 7330
7654 rdev_for_each(rdev, mddev) { 7331 if (mddev->degraded) {
7655 if (rdev->raid_disk >= 0 && 7332 list_for_each_entry(rdev, &mddev->disks, same_set) {
7656 !test_bit(In_sync, &rdev->flags) && 7333 if (rdev->raid_disk >= 0 &&
7657 !test_bit(Faulty, &rdev->flags)) 7334 !test_bit(In_sync, &rdev->flags) &&
7658 spares++; 7335 !test_bit(Faulty, &rdev->flags))
7659 if (rdev->raid_disk < 0
7660 && !test_bit(Faulty, &rdev->flags)) {
7661 rdev->recovery_offset = 0;
7662 if (mddev->pers->
7663 hot_add_disk(mddev, rdev) == 0) {
7664 if (sysfs_link_rdev(mddev, rdev))
7665 /* failure here is OK */;
7666 spares++; 7336 spares++;
7667 md_new_event(mddev); 7337 if (rdev->raid_disk < 0
7668 set_bit(MD_CHANGE_DEVS, &mddev->flags); 7338 && !test_bit(Faulty, &rdev->flags)) {
7339 rdev->recovery_offset = 0;
7340 if (mddev->pers->
7341 hot_add_disk(mddev, rdev) == 0) {
7342 if (sysfs_link_rdev(mddev, rdev))
7343 /* failure here is OK */;
7344 spares++;
7345 md_new_event(mddev);
7346 set_bit(MD_CHANGE_DEVS, &mddev->flags);
7347 } else
7348 break;
7669 } 7349 }
7670 } 7350 }
7671 } 7351 }
7672 if (removed)
7673 set_bit(MD_CHANGE_DEVS, &mddev->flags);
7674 return spares; 7352 return spares;
7675} 7353}
7676 7354
7677static void reap_sync_thread(struct mddev *mddev) 7355static void reap_sync_thread(mddev_t *mddev)
7678{ 7356{
7679 struct md_rdev *rdev; 7357 mdk_rdev_t *rdev;
7680 7358
7681 /* resync has finished, collect result */ 7359 /* resync has finished, collect result */
7682 md_unregister_thread(&mddev->sync_thread); 7360 md_unregister_thread(&mddev->sync_thread);
@@ -7684,28 +7362,22 @@ static void reap_sync_thread(struct mddev *mddev)
7684 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 7362 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7685 /* success...*/ 7363 /* success...*/
7686 /* activate any spares */ 7364 /* activate any spares */
7687 if (mddev->pers->spare_active(mddev)) { 7365 if (mddev->pers->spare_active(mddev))
7688 sysfs_notify(&mddev->kobj, NULL, 7366 sysfs_notify(&mddev->kobj, NULL,
7689 "degraded"); 7367 "degraded");
7690 set_bit(MD_CHANGE_DEVS, &mddev->flags);
7691 }
7692 } 7368 }
7693 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 7369 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7694 mddev->pers->finish_reshape) 7370 mddev->pers->finish_reshape)
7695 mddev->pers->finish_reshape(mddev); 7371 mddev->pers->finish_reshape(mddev);
7372 md_update_sb(mddev, 1);
7696 7373
7697 /* If array is no-longer degraded, then any saved_raid_disk 7374 /* if array is no-longer degraded, then any saved_raid_disk
7698 * information must be scrapped. Also if any device is now 7375 * information must be scrapped
7699 * In_sync we must scrape the saved_raid_disk for that device
7700 * do the superblock for an incrementally recovered device
7701 * written out.
7702 */ 7376 */
7703 rdev_for_each(rdev, mddev) 7377 if (!mddev->degraded)
7704 if (!mddev->degraded || 7378 list_for_each_entry(rdev, &mddev->disks, same_set)
7705 test_bit(In_sync, &rdev->flags))
7706 rdev->saved_raid_disk = -1; 7379 rdev->saved_raid_disk = -1;
7707 7380
7708 md_update_sb(mddev, 1);
7709 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7381 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7710 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7382 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7711 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7383 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
@@ -7741,7 +7413,7 @@ static void reap_sync_thread(struct mddev *mddev)
7741 * 5/ If array is degraded, try to add spares devices 7413 * 5/ If array is degraded, try to add spares devices
7742 * 6/ If array has spares or is not in-sync, start a resync thread. 7414 * 6/ If array has spares or is not in-sync, start a resync thread.
7743 */ 7415 */
7744void md_check_recovery(struct mddev *mddev) 7416void md_check_recovery(mddev_t *mddev)
7745{ 7417{
7746 if (mddev->suspended) 7418 if (mddev->suspended)
7747 return; 7419 return;
@@ -7777,14 +7449,14 @@ void md_check_recovery(struct mddev *mddev)
7777 /* Only thing we do on a ro array is remove 7449 /* Only thing we do on a ro array is remove
7778 * failed devices. 7450 * failed devices.
7779 */ 7451 */
7780 struct md_rdev *rdev; 7452 mdk_rdev_t *rdev;
7781 rdev_for_each(rdev, mddev) 7453 list_for_each_entry(rdev, &mddev->disks, same_set)
7782 if (rdev->raid_disk >= 0 && 7454 if (rdev->raid_disk >= 0 &&
7783 !test_bit(Blocked, &rdev->flags) && 7455 !test_bit(Blocked, &rdev->flags) &&
7784 test_bit(Faulty, &rdev->flags) && 7456 test_bit(Faulty, &rdev->flags) &&
7785 atomic_read(&rdev->nr_pending)==0) { 7457 atomic_read(&rdev->nr_pending)==0) {
7786 if (mddev->pers->hot_remove_disk( 7458 if (mddev->pers->hot_remove_disk(
7787 mddev, rdev) == 0) { 7459 mddev, rdev->raid_disk)==0) {
7788 sysfs_unlink_rdev(mddev, rdev); 7460 sysfs_unlink_rdev(mddev, rdev);
7789 rdev->raid_disk = -1; 7461 rdev->raid_disk = -1;
7790 } 7462 }
@@ -7827,21 +7499,20 @@ void md_check_recovery(struct mddev *mddev)
7827 /* Set RUNNING before clearing NEEDED to avoid 7499 /* Set RUNNING before clearing NEEDED to avoid
7828 * any transients in the value of "sync_action". 7500 * any transients in the value of "sync_action".
7829 */ 7501 */
7830 mddev->curr_resync_completed = 0;
7831 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7502 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7503 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7832 /* Clear some bits that don't mean anything, but 7504 /* Clear some bits that don't mean anything, but
7833 * might be left set 7505 * might be left set
7834 */ 7506 */
7835 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 7507 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
7836 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 7508 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
7837 7509
7838 if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 7510 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
7839 test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
7840 goto unlock; 7511 goto unlock;
7841 /* no recovery is running. 7512 /* no recovery is running.
7842 * remove any failed drives, then 7513 * remove any failed drives, then
7843 * add spares if possible. 7514 * add spares if possible.
7844 * Spares are also removed and re-added, to allow 7515 * Spare are also removed and re-added, to allow
7845 * the personality to fail the re-add. 7516 * the personality to fail the re-add.
7846 */ 7517 */
7847 7518
@@ -7865,7 +7536,7 @@ void md_check_recovery(struct mddev *mddev)
7865 goto unlock; 7536 goto unlock;
7866 7537
7867 if (mddev->pers->sync_request) { 7538 if (mddev->pers->sync_request) {
7868 if (spares) { 7539 if (spares && mddev->bitmap && ! mddev->bitmap->file) {
7869 /* We are adding a device or devices to an array 7540 /* We are adding a device or devices to an array
7870 * which has the bitmap stored on all devices. 7541 * which has the bitmap stored on all devices.
7871 * So make sure all bitmap pages get written 7542 * So make sure all bitmap pages get written
@@ -7902,7 +7573,7 @@ void md_check_recovery(struct mddev *mddev)
7902 } 7573 }
7903} 7574}
7904 7575
7905void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) 7576void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
7906{ 7577{
7907 sysfs_notify_dirent_safe(rdev->sysfs_state); 7578 sysfs_notify_dirent_safe(rdev->sysfs_state);
7908 wait_event_timeout(rdev->blocked_wait, 7579 wait_event_timeout(rdev->blocked_wait,
@@ -7913,20 +7584,6 @@ void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
7913} 7584}
7914EXPORT_SYMBOL(md_wait_for_blocked_rdev); 7585EXPORT_SYMBOL(md_wait_for_blocked_rdev);
7915 7586
7916void md_finish_reshape(struct mddev *mddev)
7917{
7918 /* called be personality module when reshape completes. */
7919 struct md_rdev *rdev;
7920
7921 rdev_for_each(rdev, mddev) {
7922 if (rdev->data_offset > rdev->new_data_offset)
7923 rdev->sectors += rdev->data_offset - rdev->new_data_offset;
7924 else
7925 rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
7926 rdev->data_offset = rdev->new_data_offset;
7927 }
7928}
7929EXPORT_SYMBOL(md_finish_reshape);
7930 7587
7931/* Bad block management. 7588/* Bad block management.
7932 * We can record which blocks on each device are 'bad' and so just 7589 * We can record which blocks on each device are 'bad' and so just
@@ -7958,9 +7615,9 @@ int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
7958 sector_t *first_bad, int *bad_sectors) 7615 sector_t *first_bad, int *bad_sectors)
7959{ 7616{
7960 int hi; 7617 int hi;
7961 int lo; 7618 int lo = 0;
7962 u64 *p = bb->page; 7619 u64 *p = bb->page;
7963 int rv; 7620 int rv = 0;
7964 sector_t target = s + sectors; 7621 sector_t target = s + sectors;
7965 unsigned seq; 7622 unsigned seq;
7966 7623
@@ -7975,8 +7632,7 @@ int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
7975 7632
7976retry: 7633retry:
7977 seq = read_seqbegin(&bb->lock); 7634 seq = read_seqbegin(&bb->lock);
7978 lo = 0; 7635
7979 rv = 0;
7980 hi = bb->count; 7636 hi = bb->count;
7981 7637
7982 /* Binary search between lo and hi for 'target' 7638 /* Binary search between lo and hi for 'target'
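
As the comments above describe, the bad-block table is a single sorted page of 64-bit entries, each packing a start sector, a length and an acknowledged bit, and md_is_badblock() binary-searches it under a seqlock. The user-space model below mirrors that scheme; the field layout (ack in bit 63, length-minus-one in the low 9 bits, start sector in between) is my reading of the BB_* macros and should be treated as an assumption, and the seqlock retry loop is omitted:

#include <stdio.h>
#include <stdint.h>

#define BB_LEN_BITS   9
#define BB_ACK_BIT    63

static uint64_t bb_make(uint64_t start, unsigned len, int ack)
{
	return (start << BB_LEN_BITS) | (len - 1) |
	       ((uint64_t)(ack != 0) << BB_ACK_BIT);
}

/* Drop the ack bit, then shift away the length field. */
static uint64_t bb_start(uint64_t e) { return (e << 1) >> (BB_LEN_BITS + 1); }
static unsigned bb_len(uint64_t e)   { return (e & ((1u << BB_LEN_BITS) - 1)) + 1; }

/* Return 1 if [s, s+sectors) overlaps any entry; table is sorted by start. */
static int is_badblock(const uint64_t *tab, int count, uint64_t s, int sectors)
{
	uint64_t target = s + sectors;
	int lo = 0, hi = count;

	if (count == 0)
		return 0;
	/* Binary search for the last entry that starts before 'target'. */
	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		if (bb_start(tab[mid]) < target)
			lo = mid;
		else
			hi = mid;
	}
	/* Walk back over entries that still extend past 's'. */
	while (lo >= 0 && bb_start(tab[lo]) + bb_len(tab[lo]) > s) {
		if (bb_start(tab[lo]) < target)
			return 1;
		lo--;
	}
	return 0;
}

int main(void)
{
	uint64_t tab[] = { bb_make(100, 8, 1), bb_make(500, 16, 0) };

	printf("%d %d\n", is_badblock(tab, 2, 104, 4),   /* 1: hits 100..107 */
			  is_badblock(tab, 2, 200, 4));  /* 0: clean range   */
	return 0;
}
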
@@ -8175,19 +7831,13 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
8175 return rv; 7831 return rv;
8176} 7832}
8177 7833
8178int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 7834int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors,
8179 int is_new) 7835 int acknowledged)
8180{ 7836{
8181 int rv; 7837 int rv = md_set_badblocks(&rdev->badblocks,
8182 if (is_new) 7838 s + rdev->data_offset, sectors, acknowledged);
8183 s += rdev->new_data_offset;
8184 else
8185 s += rdev->data_offset;
8186 rv = md_set_badblocks(&rdev->badblocks,
8187 s, sectors, 0);
8188 if (rv) { 7839 if (rv) {
8189 /* Make sure they get written out promptly */ 7840 /* Make sure they get written out promptly */
8190 sysfs_notify_dirent_safe(rdev->sysfs_state);
8191 set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); 7841 set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
8192 md_wakeup_thread(rdev->mddev->thread); 7842 md_wakeup_thread(rdev->mddev->thread);
8193 } 7843 }
@@ -8290,15 +7940,11 @@ out:
8290 return rv; 7940 return rv;
8291} 7941}
8292 7942
8293int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 7943int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors)
8294 int is_new)
8295{ 7944{
8296 if (is_new)
8297 s += rdev->new_data_offset;
8298 else
8299 s += rdev->data_offset;
8300 return md_clear_badblocks(&rdev->badblocks, 7945 return md_clear_badblocks(&rdev->badblocks,
8301 s, sectors); 7946 s + rdev->data_offset,
7947 sectors);
8302} 7948}
8303EXPORT_SYMBOL_GPL(rdev_clear_badblocks); 7949EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
8304 7950
@@ -8314,7 +7960,7 @@ void md_ack_all_badblocks(struct badblocks *bb)
8314 return; 7960 return;
8315 write_seqlock_irq(&bb->lock); 7961 write_seqlock_irq(&bb->lock);
8316 7962
8317 if (bb->changed == 0 && bb->unacked_exist) { 7963 if (bb->changed == 0) {
8318 u64 *p = bb->page; 7964 u64 *p = bb->page;
8319 int i; 7965 int i;
8320 for (i = 0; i < bb->count ; i++) { 7966 for (i = 0; i < bb->count ; i++) {
@@ -8428,27 +8074,29 @@ static int md_notify_reboot(struct notifier_block *this,
8428 unsigned long code, void *x) 8074 unsigned long code, void *x)
8429{ 8075{
8430 struct list_head *tmp; 8076 struct list_head *tmp;
8431 struct mddev *mddev; 8077 mddev_t *mddev;
8432 int need_delay = 0;
8433 8078
8434 for_each_mddev(mddev, tmp) { 8079 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
8435 if (mddev_trylock(mddev)) { 8080
8436 if (mddev->pers) 8081 printk(KERN_INFO "md: stopping all md devices.\n");
8437 __md_stop_writes(mddev);
8438 mddev->safemode = 2;
8439 mddev_unlock(mddev);
8440 }
8441 need_delay = 1;
8442 }
8443 /*
8444 * certain more exotic SCSI devices are known to be
8445 * volatile wrt too early system reboots. While the
8446 * right place to handle this issue is the given
8447 * driver, we do want to have a safe RAID driver ...
8448 */
8449 if (need_delay)
8450 mdelay(1000*1);
8451 8082
8083 for_each_mddev(mddev, tmp)
8084 if (mddev_trylock(mddev)) {
 8085			/* Force a switch to readonly even if the
 8086			 * array appears to still be in use. Hence
8087 * the '100'.
8088 */
8089 md_set_readonly(mddev, 100);
8090 mddev_unlock(mddev);
8091 }
8092 /*
8093 * certain more exotic SCSI devices are known to be
8094 * volatile wrt too early system reboots. While the
8095 * right place to handle this issue is the given
8096 * driver, we do want to have a safe RAID driver ...
8097 */
8098 mdelay(1000*1);
8099 }
8452 return NOTIFY_DONE; 8100 return NOTIFY_DONE;
8453} 8101}
8454 8102
@@ -8460,7 +8108,7 @@ static struct notifier_block md_notifier = {
8460 8108
8461static void md_geninit(void) 8109static void md_geninit(void)
8462{ 8110{
8463 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 8111 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
8464 8112
8465 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); 8113 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
8466} 8114}
@@ -8535,7 +8183,7 @@ void md_autodetect_dev(dev_t dev)
8535 8183
8536static void autostart_arrays(int part) 8184static void autostart_arrays(int part)
8537{ 8185{
8538 struct md_rdev *rdev; 8186 mdk_rdev_t *rdev;
8539 struct detected_devices_node *node_detected_dev; 8187 struct detected_devices_node *node_detected_dev;
8540 dev_t dev; 8188 dev_t dev;
8541 int i_scanned, i_passed; 8189 int i_scanned, i_passed;
@@ -8575,7 +8223,7 @@ static void autostart_arrays(int part)
8575 8223
8576static __exit void md_exit(void) 8224static __exit void md_exit(void)
8577{ 8225{
8578 struct mddev *mddev; 8226 mddev_t *mddev;
8579 struct list_head *tmp; 8227 struct list_head *tmp;
8580 8228
8581 blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS); 8229 blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index eca59c3074e..0a309dc29b4 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -1,5 +1,5 @@
1/* 1/*
2 md.h : kernel internal structure of the Linux MD driver 2 md_k.h : kernel internal structure of the Linux MD driver
3 Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman 3 Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
4 4
5 This program is free software; you can redistribute it and/or modify 5 This program is free software; you can redistribute it and/or modify
@@ -26,6 +26,9 @@
26 26
27#define MaxSector (~(sector_t)0) 27#define MaxSector (~(sector_t)0)
28 28
29typedef struct mddev_s mddev_t;
30typedef struct mdk_rdev_s mdk_rdev_t;
31
29/* Bad block numbers are stored sorted in a single page. 32/* Bad block numbers are stored sorted in a single page.
30 * 64bits is used for each block or extent. 33 * 64bits is used for each block or extent.
31 * 54 bits are sector number, 9 bits are extent size, 34 * 54 bits are sector number, 9 bits are extent size,
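
A hypothetical round trip of that packing in plain C, using invented SK_* names instead of the kernel's BB_* masks defined further down, and assuming the remaining top bit carries the 'acknowledged' state:

#include <assert.h>
#include <stdint.h>

#define SK_LEN_MASK	0x00000000000001FFULL	/* low 9 bits: length - 1 */
#define SK_OFFSET_MASK	0x7FFFFFFFFFFFFE00ULL	/* next 54 bits: start sector */
#define SK_ACK_MASK	0x8000000000000000ULL	/* top bit: acknowledged */

static uint64_t sk_make(uint64_t sector, unsigned len, int ack)
{
	return (sector << 9) | (uint64_t)(len - 1) | (ack ? SK_ACK_MASK : 0);
}

int main(void)
{
	uint64_t e = sk_make(123456, 8, 1);	/* 8 bad sectors at 123456, acked */

	assert(((e & SK_OFFSET_MASK) >> 9) == 123456);	/* start sector */
	assert((e & SK_LEN_MASK) + 1 == 8);		/* extent length */
	assert(e & SK_ACK_MASK);			/* acknowledged */
	return 0;
}
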
@@ -36,11 +39,12 @@
36/* 39/*
37 * MD's 'extended' device 40 * MD's 'extended' device
38 */ 41 */
39struct md_rdev { 42struct mdk_rdev_s
43{
40 struct list_head same_set; /* RAID devices within the same set */ 44 struct list_head same_set; /* RAID devices within the same set */
41 45
42 sector_t sectors; /* Device size (in 512bytes sectors) */ 46 sector_t sectors; /* Device size (in 512bytes sectors) */
43 struct mddev *mddev; /* RAID array if running */ 47 mddev_t *mddev; /* RAID array if running */
44 int last_events; /* IO event timestamp */ 48 int last_events; /* IO event timestamp */
45 49
46 /* 50 /*
@@ -55,7 +59,6 @@ struct md_rdev {
55 int sb_loaded; 59 int sb_loaded;
56 __u64 sb_events; 60 __u64 sb_events;
57 sector_t data_offset; /* start of data in array */ 61 sector_t data_offset; /* start of data in array */
58 sector_t new_data_offset;/* only relevant while reshaping */
59 sector_t sb_start; /* offset of the super block (in 512byte sectors) */ 62 sector_t sb_start; /* offset of the super block (in 512byte sectors) */
60 int sb_size; /* bytes in the superblock */ 63 int sb_size; /* bytes in the superblock */
61 int preferred_minor; /* autorun support */ 64 int preferred_minor; /* autorun support */
@@ -73,7 +76,34 @@ struct md_rdev {
73 * This reduces the burden of testing multiple flags in many cases 76 * This reduces the burden of testing multiple flags in many cases
74 */ 77 */
75 78
76 unsigned long flags; /* bit set of 'enum flag_bits' bits. */ 79 unsigned long flags;
80#define Faulty 1 /* device is known to have a fault */
81#define In_sync 2 /* device is in_sync with rest of array */
82#define WriteMostly 4 /* Avoid reading if at all possible */
83#define AutoDetected 7 /* added by auto-detect */
84#define Blocked 8 /* An error occurred but has not yet
85 * been acknowledged by the metadata
86 * handler, so don't allow writes
87 * until it is cleared */
88#define WriteErrorSeen 9 /* A write error has been seen on this
89 * device
90 */
91#define FaultRecorded 10 /* Intermediate state for clearing
92 * Blocked. The Fault is/will-be
93 * recorded in the metadata, but that
94 * metadata hasn't been stored safely
95 * on disk yet.
96 */
97#define BlockedBadBlocks 11 /* A writer is blocked because they
98 * found an unacknowledged bad-block.
99 * This can safely be cleared at any
100 * time, and the writer will re-check.
101 * It may be set at any time, and at
102 * worst the writer will timeout and
103 * re-check. So setting it as
104 * accurately as possible is good, but
105 * not absolutely critical.
106 */
77 wait_queue_head_t blocked_wait; 107 wait_queue_head_t blocked_wait;
78 108
79 int desc_nr; /* descriptor index in the superblock */ 109 int desc_nr; /* descriptor index in the superblock */
@@ -126,48 +156,6 @@ struct md_rdev {
126 sector_t size; /* in sectors */ 156 sector_t size; /* in sectors */
127 } badblocks; 157 } badblocks;
128}; 158};
129enum flag_bits {
130 Faulty, /* device is known to have a fault */
131 In_sync, /* device is in_sync with rest of array */
132 Unmerged, /* device is being added to array and should
133				 * be considered for bvec_merge_fn but not
134 * yet for actual IO
135 */
136 WriteMostly, /* Avoid reading if at all possible */
137 AutoDetected, /* added by auto-detect */
138 Blocked, /* An error occurred but has not yet
139 * been acknowledged by the metadata
140 * handler, so don't allow writes
141 * until it is cleared */
142 WriteErrorSeen, /* A write error has been seen on this
143 * device
144 */
145 FaultRecorded, /* Intermediate state for clearing
146 * Blocked. The Fault is/will-be
147 * recorded in the metadata, but that
148 * metadata hasn't been stored safely
149 * on disk yet.
150 */
151 BlockedBadBlocks, /* A writer is blocked because they
152 * found an unacknowledged bad-block.
153 * This can safely be cleared at any
154 * time, and the writer will re-check.
155 * It may be set at any time, and at
156 * worst the writer will timeout and
157 * re-check. So setting it as
158 * accurately as possible is good, but
159 * not absolutely critical.
160 */
161 WantReplacement, /* This device is a candidate to be
162 * hot-replaced, either because it has
163 * reported some faults, or because
164 * of explicit request.
165 */
166 Replacement, /* This device is a replacement for
167 * a want_replacement device with same
168 * raid_disk number.
169 */
170};
171 159
172#define BB_LEN_MASK (0x00000000000001FFULL) 160#define BB_LEN_MASK (0x00000000000001FFULL)
173#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) 161#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
@@ -180,7 +168,7 @@ enum flag_bits {
180 168
181extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, 169extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
182 sector_t *first_bad, int *bad_sectors); 170 sector_t *first_bad, int *bad_sectors);
183static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, 171static inline int is_badblock(mdk_rdev_t *rdev, sector_t s, int sectors,
184 sector_t *first_bad, int *bad_sectors) 172 sector_t *first_bad, int *bad_sectors)
185{ 173{
186 if (unlikely(rdev->badblocks.count)) { 174 if (unlikely(rdev->badblocks.count)) {
@@ -193,15 +181,15 @@ static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
193 } 181 }
194 return 0; 182 return 0;
195} 183}
196extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 184extern int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors,
197 int is_new); 185 int acknowledged);
198extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 186extern int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors);
199 int is_new);
200extern void md_ack_all_badblocks(struct badblocks *bb); 187extern void md_ack_all_badblocks(struct badblocks *bb);
201 188
202struct mddev { 189struct mddev_s
190{
203 void *private; 191 void *private;
204 struct md_personality *pers; 192 struct mdk_personality *pers;
205 dev_t unit; 193 dev_t unit;
206 int md_minor; 194 int md_minor;
207 struct list_head disks; 195 struct list_head disks;
@@ -264,10 +252,12 @@ struct mddev {
264 sector_t reshape_position; 252 sector_t reshape_position;
265 int delta_disks, new_level, new_layout; 253 int delta_disks, new_level, new_layout;
266 int new_chunk_sectors; 254 int new_chunk_sectors;
267 int reshape_backwards;
268 255
269 struct md_thread *thread; /* management thread */ 256 atomic_t plug_cnt; /* If device is expecting
270 struct md_thread *sync_thread; /* doing resync or reconstruct */ 257 * more bios soon.
258 */
259 struct mdk_thread_s *thread; /* management thread */
260 struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */
271 sector_t curr_resync; /* last block scheduled */ 261 sector_t curr_resync; /* last block scheduled */
272 /* As resync requests can complete out of order, we cannot easily track 262 /* As resync requests can complete out of order, we cannot easily track
273 * how much resync has been completed. So we occasionally pause until 263 * how much resync has been completed. So we occasionally pause until
@@ -282,7 +272,7 @@ struct mddev {
282 272
283 sector_t resync_max_sectors; /* may be set by personality */ 273 sector_t resync_max_sectors; /* may be set by personality */
284 274
285 atomic64_t resync_mismatches; /* count of sectors where 275 sector_t resync_mismatches; /* count of sectors where
286 * parity/replica mismatch found 276 * parity/replica mismatch found
287 */ 277 */
288 278
@@ -307,7 +297,6 @@ struct mddev {
307 * REQUEST: user-space has requested a sync (used with SYNC) 297 * REQUEST: user-space has requested a sync (used with SYNC)
308 * CHECK: user-space request for check-only, no repair 298 * CHECK: user-space request for check-only, no repair
309 * RESHAPE: A reshape is happening 299 * RESHAPE: A reshape is happening
310 * ERROR: sync-action interrupted because io-error
311 * 300 *
312 * If neither SYNC or RESHAPE are set, then it is a recovery. 301 * If neither SYNC or RESHAPE are set, then it is a recovery.
313 */ 302 */
@@ -321,7 +310,6 @@ struct mddev {
321#define MD_RECOVERY_CHECK 7 310#define MD_RECOVERY_CHECK 7
322#define MD_RECOVERY_RESHAPE 8 311#define MD_RECOVERY_RESHAPE 8
323#define MD_RECOVERY_FROZEN 9 312#define MD_RECOVERY_FROZEN 9
324#define MD_RECOVERY_ERROR 10
325 313
326 unsigned long recovery; 314 unsigned long recovery;
327 /* If a RAID personality determines that recovery (of a particular 315 /* If a RAID personality determines that recovery (of a particular
@@ -351,10 +339,6 @@ struct mddev {
351 int degraded; /* whether md should consider 339 int degraded; /* whether md should consider
352 * adding a spare 340 * adding a spare
353 */ 341 */
354 int merge_check_needed; /* at least one
355 * member device
356 * has a
357 * merge_bvec_fn */
358 342
359 atomic_t recovery_active; /* blocks scheduled, but not written */ 343 atomic_t recovery_active; /* blocks scheduled, but not written */
360 wait_queue_head_t recovery_wait; 344 wait_queue_head_t recovery_wait;
@@ -392,13 +376,10 @@ struct mddev {
392 * For external metadata, offset 376 * For external metadata, offset
393 * from start of device. 377 * from start of device.
394 */ 378 */
395 unsigned long space; /* space available at this offset */
396 loff_t default_offset; /* this is the offset to use when 379 loff_t default_offset; /* this is the offset to use when
397 * hot-adding a bitmap. It should 380 * hot-adding a bitmap. It should
398 * eventually be settable by sysfs. 381 * eventually be settable by sysfs.
399 */ 382 */
400 unsigned long default_space; /* space available at
401 * default offset */
402 struct mutex mutex; 383 struct mutex mutex;
403 unsigned long chunksize; 384 unsigned long chunksize;
404 unsigned long daemon_sleep; /* how many jiffies between updates? */ 385 unsigned long daemon_sleep; /* how many jiffies between updates? */
@@ -421,11 +402,11 @@ struct mddev {
421 atomic_t flush_pending; 402 atomic_t flush_pending;
422 struct work_struct flush_work; 403 struct work_struct flush_work;
423 struct work_struct event_work; /* used by dm to report failure event */ 404 struct work_struct event_work; /* used by dm to report failure event */
424 void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); 405 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
425}; 406};
426 407
427 408
428static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) 409static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev)
429{ 410{
430 int faulty = test_bit(Faulty, &rdev->flags); 411 int faulty = test_bit(Faulty, &rdev->flags);
431 if (atomic_dec_and_test(&rdev->nr_pending) && faulty) 412 if (atomic_dec_and_test(&rdev->nr_pending) && faulty)
@@ -437,35 +418,35 @@ static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sect
437 atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); 418 atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
438} 419}
439 420
440struct md_personality 421struct mdk_personality
441{ 422{
442 char *name; 423 char *name;
443 int level; 424 int level;
444 struct list_head list; 425 struct list_head list;
445 struct module *owner; 426 struct module *owner;
446 void (*make_request)(struct mddev *mddev, struct bio *bio); 427 int (*make_request)(mddev_t *mddev, struct bio *bio);
447 int (*run)(struct mddev *mddev); 428 int (*run)(mddev_t *mddev);
448 int (*stop)(struct mddev *mddev); 429 int (*stop)(mddev_t *mddev);
449 void (*status)(struct seq_file *seq, struct mddev *mddev); 430 void (*status)(struct seq_file *seq, mddev_t *mddev);
450 /* error_handler must set ->faulty and clear ->in_sync 431 /* error_handler must set ->faulty and clear ->in_sync
451 * if appropriate, and should abort recovery if needed 432 * if appropriate, and should abort recovery if needed
452 */ 433 */
453 void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev); 434 void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev);
454 int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); 435 int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev);
455 int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev); 436 int (*hot_remove_disk) (mddev_t *mddev, int number);
456 int (*spare_active) (struct mddev *mddev); 437 int (*spare_active) (mddev_t *mddev);
457 sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster); 438 sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster);
458 int (*resize) (struct mddev *mddev, sector_t sectors); 439 int (*resize) (mddev_t *mddev, sector_t sectors);
459 sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks); 440 sector_t (*size) (mddev_t *mddev, sector_t sectors, int raid_disks);
460 int (*check_reshape) (struct mddev *mddev); 441 int (*check_reshape) (mddev_t *mddev);
461 int (*start_reshape) (struct mddev *mddev); 442 int (*start_reshape) (mddev_t *mddev);
462 void (*finish_reshape) (struct mddev *mddev); 443 void (*finish_reshape) (mddev_t *mddev);
463 /* quiesce moves between quiescence states 444 /* quiesce moves between quiescence states
464 * 0 - fully active 445 * 0 - fully active
465 * 1 - no new requests allowed 446 * 1 - no new requests allowed
466 * others - reserved 447 * others - reserved
467 */ 448 */
468 void (*quiesce) (struct mddev *mddev, int state); 449 void (*quiesce) (mddev_t *mddev, int state);
469 /* takeover is used to transition an array from one 450 /* takeover is used to transition an array from one
470 * personality to another. The new personality must be able 451 * personality to another. The new personality must be able
471 * to handle the data in the current layout. 452 * to handle the data in the current layout.
@@ -475,14 +456,14 @@ struct md_personality
475 * This needs to be installed and then ->run used to activate the 456 * This needs to be installed and then ->run used to activate the
476 * array. 457 * array.
477 */ 458 */
478 void *(*takeover) (struct mddev *mddev); 459 void *(*takeover) (mddev_t *mddev);
479}; 460};
480 461
481 462
482struct md_sysfs_entry { 463struct md_sysfs_entry {
483 struct attribute attr; 464 struct attribute attr;
484 ssize_t (*show)(struct mddev *, char *); 465 ssize_t (*show)(mddev_t *, char *);
485 ssize_t (*store)(struct mddev *, const char *, size_t); 466 ssize_t (*store)(mddev_t *, const char *, size_t);
486}; 467};
487extern struct attribute_group md_bitmap_group; 468extern struct attribute_group md_bitmap_group;
488 469
@@ -498,28 +479,23 @@ static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd)
498 sysfs_notify_dirent(sd); 479 sysfs_notify_dirent(sd);
499} 480}
500 481
501static inline char * mdname (struct mddev * mddev) 482static inline char * mdname (mddev_t * mddev)
502{ 483{
503 return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; 484 return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
504} 485}
505 486
506static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) 487static inline int sysfs_link_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
507{ 488{
508 char nm[20]; 489 char nm[20];
509 if (!test_bit(Replacement, &rdev->flags)) { 490 sprintf(nm, "rd%d", rdev->raid_disk);
510 sprintf(nm, "rd%d", rdev->raid_disk); 491 return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
511 return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
512 } else
513 return 0;
514} 492}
515 493
516static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) 494static inline void sysfs_unlink_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
517{ 495{
518 char nm[20]; 496 char nm[20];
519 if (!test_bit(Replacement, &rdev->flags)) { 497 sprintf(nm, "rd%d", rdev->raid_disk);
520 sprintf(nm, "rd%d", rdev->raid_disk); 498 sysfs_remove_link(&mddev->kobj, nm);
521 sysfs_remove_link(&mddev->kobj, nm);
522 }
523} 499}
524 500
525/* 501/*
@@ -532,84 +508,96 @@ static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
532/* 508/*
533 * iterates through the 'same array disks' ringlist 509 * iterates through the 'same array disks' ringlist
534 */ 510 */
535#define rdev_for_each(rdev, mddev) \ 511#define rdev_for_each(rdev, tmp, mddev) \
536 list_for_each_entry(rdev, &((mddev)->disks), same_set)
537
538#define rdev_for_each_safe(rdev, tmp, mddev) \
539 list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) 512 list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)
540 513
541#define rdev_for_each_rcu(rdev, mddev) \ 514#define rdev_for_each_rcu(rdev, mddev) \
542 list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) 515 list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
543 516
544struct md_thread { 517typedef struct mdk_thread_s {
545 void (*run) (struct md_thread *thread); 518 void (*run) (mddev_t *mddev);
546 struct mddev *mddev; 519 mddev_t *mddev;
547 wait_queue_head_t wqueue; 520 wait_queue_head_t wqueue;
548 unsigned long flags; 521 unsigned long flags;
549 struct task_struct *tsk; 522 struct task_struct *tsk;
550 unsigned long timeout; 523 unsigned long timeout;
551 void *private; 524} mdk_thread_t;
552};
553 525
554#define THREAD_WAKEUP 0 526#define THREAD_WAKEUP 0
555 527
528#define __wait_event_lock_irq(wq, condition, lock, cmd) \
529do { \
530 wait_queue_t __wait; \
531 init_waitqueue_entry(&__wait, current); \
532 \
533 add_wait_queue(&wq, &__wait); \
534 for (;;) { \
535 set_current_state(TASK_UNINTERRUPTIBLE); \
536 if (condition) \
537 break; \
538 spin_unlock_irq(&lock); \
539 cmd; \
540 schedule(); \
541 spin_lock_irq(&lock); \
542 } \
543 current->state = TASK_RUNNING; \
544 remove_wait_queue(&wq, &__wait); \
545} while (0)
546
547#define wait_event_lock_irq(wq, condition, lock, cmd) \
548do { \
549 if (condition) \
550 break; \
551 __wait_event_lock_irq(wq, condition, lock, cmd); \
552} while (0)
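
This md-private wait_event_lock_irq() expects the caller to already hold 'lock' via spin_lock_irq(); the macro drops the lock around each schedule(), runs 'cmd', and re-acquires it before re-testing the condition. A rough caller sketch, where 'conf' and its fields are invented names:

/* sketch only: conf, device_lock, wait_queue and pending are invented */
spin_lock_irq(&conf->device_lock);
wait_event_lock_irq(conf->wait_queue,
		    conf->pending == 0,		/* sleep until nothing is pending */
		    conf->device_lock,
		    md_wakeup_thread(conf->mddev->thread)); /* kick the daemon before sleeping */
/* the condition is now true and device_lock is held again */
spin_unlock_irq(&conf->device_lock);
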
553
556static inline void safe_put_page(struct page *p) 554static inline void safe_put_page(struct page *p)
557{ 555{
558 if (p) put_page(p); 556 if (p) put_page(p);
559} 557}
560 558
561extern int register_md_personality(struct md_personality *p); 559extern int register_md_personality(struct mdk_personality *p);
562extern int unregister_md_personality(struct md_personality *p); 560extern int unregister_md_personality(struct mdk_personality *p);
563extern struct md_thread *md_register_thread( 561extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
564 void (*run)(struct md_thread *thread), 562 mddev_t *mddev, const char *name);
565 struct mddev *mddev, 563extern void md_unregister_thread(mdk_thread_t **threadp);
566 const char *name); 564extern void md_wakeup_thread(mdk_thread_t *thread);
567extern void md_unregister_thread(struct md_thread **threadp); 565extern void md_check_recovery(mddev_t *mddev);
568extern void md_wakeup_thread(struct md_thread *thread); 566extern void md_write_start(mddev_t *mddev, struct bio *bi);
569extern void md_check_recovery(struct mddev *mddev); 567extern void md_write_end(mddev_t *mddev);
570extern void md_write_start(struct mddev *mddev, struct bio *bi); 568extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
571extern void md_write_end(struct mddev *mddev); 569extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
572extern void md_done_sync(struct mddev *mddev, int blocks, int ok); 570
573extern void md_error(struct mddev *mddev, struct md_rdev *rdev); 571extern int mddev_congested(mddev_t *mddev, int bits);
574extern void md_finish_reshape(struct mddev *mddev); 572extern void md_flush_request(mddev_t *mddev, struct bio *bio);
575 573extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
576extern int mddev_congested(struct mddev *mddev, int bits);
577extern void md_flush_request(struct mddev *mddev, struct bio *bio);
578extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
579 sector_t sector, int size, struct page *page); 574 sector_t sector, int size, struct page *page);
580extern void md_super_wait(struct mddev *mddev); 575extern void md_super_wait(mddev_t *mddev);
581extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 576extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
582 struct page *page, int rw, bool metadata_op); 577 struct page *page, int rw, bool metadata_op);
583extern void md_do_sync(struct md_thread *thread); 578extern void md_do_sync(mddev_t *mddev);
584extern void md_new_event(struct mddev *mddev); 579extern void md_new_event(mddev_t *mddev);
585extern int md_allow_write(struct mddev *mddev); 580extern int md_allow_write(mddev_t *mddev);
586extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); 581extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
587extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); 582extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
588extern int md_check_no_bitmap(struct mddev *mddev); 583extern int md_check_no_bitmap(mddev_t *mddev);
589extern int md_integrity_register(struct mddev *mddev); 584extern int md_integrity_register(mddev_t *mddev);
590extern void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); 585extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
591extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); 586extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
592extern void restore_bitmap_write_access(struct file *file); 587extern void restore_bitmap_write_access(struct file *file);
593 588
594extern void mddev_init(struct mddev *mddev); 589extern void mddev_init(mddev_t *mddev);
595extern int md_run(struct mddev *mddev); 590extern int md_run(mddev_t *mddev);
596extern void md_stop(struct mddev *mddev); 591extern void md_stop(mddev_t *mddev);
597extern void md_stop_writes(struct mddev *mddev); 592extern void md_stop_writes(mddev_t *mddev);
598extern int md_rdev_init(struct md_rdev *rdev); 593extern int md_rdev_init(mdk_rdev_t *rdev);
599extern void md_rdev_clear(struct md_rdev *rdev);
600 594
601extern void mddev_suspend(struct mddev *mddev); 595extern void mddev_suspend(mddev_t *mddev);
602extern void mddev_resume(struct mddev *mddev); 596extern void mddev_resume(mddev_t *mddev);
603extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, 597extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
604 struct mddev *mddev); 598 mddev_t *mddev);
605extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, 599extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
606 struct mddev *mddev); 600 mddev_t *mddev);
601extern int mddev_check_plugged(mddev_t *mddev);
607extern void md_trim_bio(struct bio *bio, int offset, int size); 602extern void md_trim_bio(struct bio *bio, int offset, int size);
608
609extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
610static inline int mddev_check_plugged(struct mddev *mddev)
611{
612 return !!blk_check_plugged(md_unplug, mddev,
613 sizeof(struct blk_plug_cb));
614}
615#endif /* _MD_MD_H */ 603#endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 1642eae75a3..d5b5fb30017 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -20,7 +20,6 @@
20 */ 20 */
21 21
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/module.h>
24#include <linux/raid/md_u.h> 23#include <linux/raid/md_u.h>
25#include <linux/seq_file.h> 24#include <linux/seq_file.h>
26#include <linux/slab.h> 25#include <linux/slab.h>
@@ -32,7 +31,7 @@
32#define NR_RESERVED_BUFS 32 31#define NR_RESERVED_BUFS 32
33 32
34 33
35static int multipath_map (struct mpconf *conf) 34static int multipath_map (multipath_conf_t *conf)
36{ 35{
37 int i, disks = conf->raid_disks; 36 int i, disks = conf->raid_disks;
38 37
@@ -43,7 +42,7 @@ static int multipath_map (struct mpconf *conf)
43 42
44 rcu_read_lock(); 43 rcu_read_lock();
45 for (i = 0; i < disks; i++) { 44 for (i = 0; i < disks; i++) {
46 struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); 45 mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
47 if (rdev && test_bit(In_sync, &rdev->flags)) { 46 if (rdev && test_bit(In_sync, &rdev->flags)) {
48 atomic_inc(&rdev->nr_pending); 47 atomic_inc(&rdev->nr_pending);
49 rcu_read_unlock(); 48 rcu_read_unlock();
@@ -59,8 +58,8 @@ static int multipath_map (struct mpconf *conf)
59static void multipath_reschedule_retry (struct multipath_bh *mp_bh) 58static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
60{ 59{
61 unsigned long flags; 60 unsigned long flags;
62 struct mddev *mddev = mp_bh->mddev; 61 mddev_t *mddev = mp_bh->mddev;
63 struct mpconf *conf = mddev->private; 62 multipath_conf_t *conf = mddev->private;
64 63
65 spin_lock_irqsave(&conf->device_lock, flags); 64 spin_lock_irqsave(&conf->device_lock, flags);
66 list_add(&mp_bh->retry_list, &conf->retry_list); 65 list_add(&mp_bh->retry_list, &conf->retry_list);
@@ -77,7 +76,7 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
77static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) 76static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
78{ 77{
79 struct bio *bio = mp_bh->master_bio; 78 struct bio *bio = mp_bh->master_bio;
80 struct mpconf *conf = mp_bh->mddev->private; 79 multipath_conf_t *conf = mp_bh->mddev->private;
81 80
82 bio_endio(bio, err); 81 bio_endio(bio, err);
83 mempool_free(mp_bh, conf->pool); 82 mempool_free(mp_bh, conf->pool);
@@ -87,8 +86,8 @@ static void multipath_end_request(struct bio *bio, int error)
87{ 86{
88 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 87 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
89 struct multipath_bh *mp_bh = bio->bi_private; 88 struct multipath_bh *mp_bh = bio->bi_private;
90 struct mpconf *conf = mp_bh->mddev->private; 89 multipath_conf_t *conf = mp_bh->mddev->private;
91 struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev; 90 mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev;
92 91
93 if (uptodate) 92 if (uptodate)
94 multipath_end_bh_io(mp_bh, 0); 93 multipath_end_bh_io(mp_bh, 0);
@@ -107,15 +106,15 @@ static void multipath_end_request(struct bio *bio, int error)
107 rdev_dec_pending(rdev, conf->mddev); 106 rdev_dec_pending(rdev, conf->mddev);
108} 107}
109 108
110static void multipath_make_request(struct mddev *mddev, struct bio * bio) 109static int multipath_make_request(mddev_t *mddev, struct bio * bio)
111{ 110{
112 struct mpconf *conf = mddev->private; 111 multipath_conf_t *conf = mddev->private;
113 struct multipath_bh * mp_bh; 112 struct multipath_bh * mp_bh;
114 struct multipath_info *multipath; 113 struct multipath_info *multipath;
115 114
116 if (unlikely(bio->bi_rw & REQ_FLUSH)) { 115 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
117 md_flush_request(mddev, bio); 116 md_flush_request(mddev, bio);
118 return; 117 return 0;
119 } 118 }
120 119
121 mp_bh = mempool_alloc(conf->pool, GFP_NOIO); 120 mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
@@ -127,7 +126,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
127 if (mp_bh->path < 0) { 126 if (mp_bh->path < 0) {
128 bio_endio(bio, -EIO); 127 bio_endio(bio, -EIO);
129 mempool_free(mp_bh, conf->pool); 128 mempool_free(mp_bh, conf->pool);
130 return; 129 return 0;
131 } 130 }
132 multipath = conf->multipaths + mp_bh->path; 131 multipath = conf->multipaths + mp_bh->path;
133 132
@@ -138,12 +137,12 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
138 mp_bh->bio.bi_end_io = multipath_end_request; 137 mp_bh->bio.bi_end_io = multipath_end_request;
139 mp_bh->bio.bi_private = mp_bh; 138 mp_bh->bio.bi_private = mp_bh;
140 generic_make_request(&mp_bh->bio); 139 generic_make_request(&mp_bh->bio);
141 return; 140 return 0;
142} 141}
143 142
144static void multipath_status (struct seq_file *seq, struct mddev *mddev) 143static void multipath_status (struct seq_file *seq, mddev_t *mddev)
145{ 144{
146 struct mpconf *conf = mddev->private; 145 multipath_conf_t *conf = mddev->private;
147 int i; 146 int i;
148 147
149 seq_printf (seq, " [%d/%d] [", conf->raid_disks, 148 seq_printf (seq, " [%d/%d] [", conf->raid_disks,
@@ -157,8 +156,8 @@ static void multipath_status (struct seq_file *seq, struct mddev *mddev)
157 156
158static int multipath_congested(void *data, int bits) 157static int multipath_congested(void *data, int bits)
159{ 158{
160 struct mddev *mddev = data; 159 mddev_t *mddev = data;
161 struct mpconf *conf = mddev->private; 160 multipath_conf_t *conf = mddev->private;
162 int i, ret = 0; 161 int i, ret = 0;
163 162
164 if (mddev_congested(mddev, bits)) 163 if (mddev_congested(mddev, bits))
@@ -166,7 +165,7 @@ static int multipath_congested(void *data, int bits)
166 165
167 rcu_read_lock(); 166 rcu_read_lock();
168 for (i = 0; i < mddev->raid_disks ; i++) { 167 for (i = 0; i < mddev->raid_disks ; i++) {
169 struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); 168 mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
170 if (rdev && !test_bit(Faulty, &rdev->flags)) { 169 if (rdev && !test_bit(Faulty, &rdev->flags)) {
171 struct request_queue *q = bdev_get_queue(rdev->bdev); 170 struct request_queue *q = bdev_get_queue(rdev->bdev);
172 171
@@ -184,9 +183,9 @@ static int multipath_congested(void *data, int bits)
184/* 183/*
185 * Careful, this can execute in IRQ contexts as well! 184 * Careful, this can execute in IRQ contexts as well!
186 */ 185 */
187static void multipath_error (struct mddev *mddev, struct md_rdev *rdev) 186static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
188{ 187{
189 struct mpconf *conf = mddev->private; 188 multipath_conf_t *conf = mddev->private;
190 char b[BDEVNAME_SIZE]; 189 char b[BDEVNAME_SIZE];
191 190
192 if (conf->raid_disks - mddev->degraded <= 1) { 191 if (conf->raid_disks - mddev->degraded <= 1) {
@@ -219,7 +218,7 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
219 conf->raid_disks - mddev->degraded); 218 conf->raid_disks - mddev->degraded);
220} 219}
221 220
222static void print_multipath_conf (struct mpconf *conf) 221static void print_multipath_conf (multipath_conf_t *conf)
223{ 222{
224 int i; 223 int i;
225 struct multipath_info *tmp; 224 struct multipath_info *tmp;
@@ -243,9 +242,9 @@ static void print_multipath_conf (struct mpconf *conf)
243} 242}
244 243
245 244
246static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) 245static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
247{ 246{
248 struct mpconf *conf = mddev->private; 247 multipath_conf_t *conf = mddev->private;
249 struct request_queue *q; 248 struct request_queue *q;
250 int err = -EEXIST; 249 int err = -EEXIST;
251 int path; 250 int path;
@@ -292,16 +291,17 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
292 return err; 291 return err;
293} 292}
294 293
295static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 294static int multipath_remove_disk(mddev_t *mddev, int number)
296{ 295{
297 struct mpconf *conf = mddev->private; 296 multipath_conf_t *conf = mddev->private;
298 int err = 0; 297 int err = 0;
299 int number = rdev->raid_disk; 298 mdk_rdev_t *rdev;
300 struct multipath_info *p = conf->multipaths + number; 299 struct multipath_info *p = conf->multipaths + number;
301 300
302 print_multipath_conf(conf); 301 print_multipath_conf(conf);
303 302
304 if (rdev == p->rdev) { 303 rdev = p->rdev;
304 if (rdev) {
305 if (test_bit(In_sync, &rdev->flags) || 305 if (test_bit(In_sync, &rdev->flags) ||
306 atomic_read(&rdev->nr_pending)) { 306 atomic_read(&rdev->nr_pending)) {
307 printk(KERN_ERR "hot-remove-disk, slot %d is identified" 307 printk(KERN_ERR "hot-remove-disk, slot %d is identified"
@@ -335,13 +335,12 @@ abort:
335 * 3. Performs writes following reads for array synchronising. 335 */
336 */ 336 */
337 337
338static void multipathd(struct md_thread *thread) 338static void multipathd (mddev_t *mddev)
339{ 339{
340 struct mddev *mddev = thread->mddev;
341 struct multipath_bh *mp_bh; 340 struct multipath_bh *mp_bh;
342 struct bio *bio; 341 struct bio *bio;
343 unsigned long flags; 342 unsigned long flags;
344 struct mpconf *conf = mddev->private; 343 multipath_conf_t *conf = mddev->private;
345 struct list_head *head = &conf->retry_list; 344 struct list_head *head = &conf->retry_list;
346 345
347 md_check_recovery(mddev); 346 md_check_recovery(mddev);
@@ -380,7 +379,7 @@ static void multipathd(struct md_thread *thread)
380 spin_unlock_irqrestore(&conf->device_lock, flags); 379 spin_unlock_irqrestore(&conf->device_lock, flags);
381} 380}
382 381
383static sector_t multipath_size(struct mddev *mddev, sector_t sectors, int raid_disks) 382static sector_t multipath_size(mddev_t *mddev, sector_t sectors, int raid_disks)
384{ 383{
385 WARN_ONCE(sectors || raid_disks, 384 WARN_ONCE(sectors || raid_disks,
386 "%s does not support generic reshape\n", __func__); 385 "%s does not support generic reshape\n", __func__);
@@ -388,12 +387,12 @@ static sector_t multipath_size(struct mddev *mddev, sector_t sectors, int raid_d
388 return mddev->dev_sectors; 387 return mddev->dev_sectors;
389} 388}
390 389
391static int multipath_run (struct mddev *mddev) 390static int multipath_run (mddev_t *mddev)
392{ 391{
393 struct mpconf *conf; 392 multipath_conf_t *conf;
394 int disk_idx; 393 int disk_idx;
395 struct multipath_info *disk; 394 struct multipath_info *disk;
396 struct md_rdev *rdev; 395 mdk_rdev_t *rdev;
397 int working_disks; 396 int working_disks;
398 397
399 if (md_check_no_bitmap(mddev)) 398 if (md_check_no_bitmap(mddev))
@@ -410,7 +409,7 @@ static int multipath_run (struct mddev *mddev)
410 * should be freed in multipath_stop()] 409 * should be freed in multipath_stop()]
411 */ 410 */
412 411
413 conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); 412 conf = kzalloc(sizeof(multipath_conf_t), GFP_KERNEL);
414 mddev->private = conf; 413 mddev->private = conf;
415 if (!conf) { 414 if (!conf) {
416 printk(KERN_ERR 415 printk(KERN_ERR
@@ -429,7 +428,7 @@ static int multipath_run (struct mddev *mddev)
429 } 428 }
430 429
431 working_disks = 0; 430 working_disks = 0;
432 rdev_for_each(rdev, mddev) { 431 list_for_each_entry(rdev, &mddev->disks, same_set) {
433 disk_idx = rdev->raid_disk; 432 disk_idx = rdev->raid_disk;
434 if (disk_idx < 0 || 433 if (disk_idx < 0 ||
435 disk_idx >= mddev->raid_disks) 434 disk_idx >= mddev->raid_disks)
@@ -475,8 +474,7 @@ static int multipath_run (struct mddev *mddev)
475 } 474 }
476 475
477 { 476 {
478 mddev->thread = md_register_thread(multipathd, mddev, 477 mddev->thread = md_register_thread(multipathd, mddev, NULL);
479 "multipath");
480 if (!mddev->thread) { 478 if (!mddev->thread) {
481 printk(KERN_ERR "multipath: couldn't allocate thread" 479 printk(KERN_ERR "multipath: couldn't allocate thread"
482 " for %s\n", mdname(mddev)); 480 " for %s\n", mdname(mddev));
@@ -512,9 +510,9 @@ out:
512} 510}
513 511
514 512
515static int multipath_stop (struct mddev *mddev) 513static int multipath_stop (mddev_t *mddev)
516{ 514{
517 struct mpconf *conf = mddev->private; 515 multipath_conf_t *conf = mddev->private;
518 516
519 md_unregister_thread(&mddev->thread); 517 md_unregister_thread(&mddev->thread);
520 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 518 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
@@ -525,7 +523,7 @@ static int multipath_stop (struct mddev *mddev)
525 return 0; 523 return 0;
526} 524}
527 525
528static struct md_personality multipath_personality = 526static struct mdk_personality multipath_personality =
529{ 527{
530 .name = "multipath", 528 .name = "multipath",
531 .level = LEVEL_MULTIPATH, 529 .level = LEVEL_MULTIPATH,
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h
index 717c60f6289..3c5a45eb5f8 100644
--- a/drivers/md/multipath.h
+++ b/drivers/md/multipath.h
@@ -2,11 +2,11 @@
2#define _MULTIPATH_H 2#define _MULTIPATH_H
3 3
4struct multipath_info { 4struct multipath_info {
5 struct md_rdev *rdev; 5 mdk_rdev_t *rdev;
6}; 6};
7 7
8struct mpconf { 8struct multipath_private_data {
9 struct mddev *mddev; 9 mddev_t *mddev;
10 struct multipath_info *multipaths; 10 struct multipath_info *multipaths;
11 int raid_disks; 11 int raid_disks;
12 spinlock_t device_lock; 12 spinlock_t device_lock;
@@ -15,6 +15,8 @@ struct mpconf {
15 mempool_t *pool; 15 mempool_t *pool;
16}; 16};
17 17
18typedef struct multipath_private_data multipath_conf_t;
19
18/* 20/*
19 * this is our 'private' 'collective' MULTIPATH buffer head. 21 * this is our 'private' 'collective' MULTIPATH buffer head.
20 * it contains information about what kind of IO operations were started 22 * it contains information about what kind of IO operations were started
@@ -22,7 +24,7 @@ struct mpconf {
22 */ 24 */
23 25
24struct multipath_bh { 26struct multipath_bh {
25 struct mddev *mddev; 27 mddev_t *mddev;
26 struct bio *master_bio; 28 struct bio *master_bio;
27 struct bio bio; 29 struct bio bio;
28 int path; 30 int path;
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
deleted file mode 100644
index ceb359050a5..00000000000
--- a/drivers/md/persistent-data/Kconfig
+++ /dev/null
@@ -1,8 +0,0 @@
1config DM_PERSISTENT_DATA
2 tristate
3 depends on BLK_DEV_DM && EXPERIMENTAL
4 select LIBCRC32C
5 select DM_BUFIO
6 ---help---
7 Library providing immutable on-disk data structure support for
8 device-mapper targets such as the thin provisioning target.
diff --git a/drivers/md/persistent-data/Makefile b/drivers/md/persistent-data/Makefile
deleted file mode 100644
index d8e7cb767c1..00000000000
--- a/drivers/md/persistent-data/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
1obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o
2dm-persistent-data-objs := \
3 dm-block-manager.o \
4 dm-space-map-common.o \
5 dm-space-map-disk.o \
6 dm-space-map-metadata.o \
7 dm-transaction-manager.o \
8 dm-btree.o \
9 dm-btree-remove.o \
10 dm-btree-spine.o
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
deleted file mode 100644
index 28c3ed072a7..00000000000
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ /dev/null
@@ -1,635 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6#include "dm-block-manager.h"
7#include "dm-persistent-data-internal.h"
8#include "../dm-bufio.h"
9
10#include <linux/crc32c.h>
11#include <linux/module.h>
12#include <linux/slab.h>
13#include <linux/rwsem.h>
14#include <linux/device-mapper.h>
15#include <linux/stacktrace.h>
16
17#define DM_MSG_PREFIX "block manager"
18
19/*----------------------------------------------------------------*/
20
21/*
22 * This is a read/write semaphore with a couple of differences.
23 *
24 * i) There is a restriction on the number of concurrent read locks that
25 * may be held at once. This is just an implementation detail.
26 *
27 * ii) Recursive locking attempts are detected and return EINVAL. A stack
28 * trace is also emitted for the previous lock acquisition.
29 *
30 * iii) Priority is given to write locks.
31 */
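
In the code below, lock->count carries the whole state: 0 means free, a positive value counts the active readers (capped at MAX_HOLDERS), and -1 marks a single writer. A toy userspace model of just that counting rule, with invented toy_* names and no waiter queue or spinlock:

#include <stdbool.h>

struct toy_lock { int count; };		/* 0 free, >0 readers, -1 writer */

#define TOY_MAX_HOLDERS 4

static bool toy_try_read(struct toy_lock *l)
{
	if (l->count < 0 || l->count >= TOY_MAX_HOLDERS)
		return false;		/* write-locked, or reader limit reached */
	l->count++;
	return true;
}

static bool toy_try_write(struct toy_lock *l)
{
	if (l->count != 0)
		return false;		/* any holder at all blocks a writer */
	l->count = -1;
	return true;
}
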
32#define MAX_HOLDERS 4
33#define MAX_STACK 10
34
35typedef unsigned long stack_entries[MAX_STACK];
36
37struct block_lock {
38 spinlock_t lock;
39 __s32 count;
40 struct list_head waiters;
41 struct task_struct *holders[MAX_HOLDERS];
42
43#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
44 struct stack_trace traces[MAX_HOLDERS];
45 stack_entries entries[MAX_HOLDERS];
46#endif
47};
48
49struct waiter {
50 struct list_head list;
51 struct task_struct *task;
52 int wants_write;
53};
54
55static unsigned __find_holder(struct block_lock *lock,
56 struct task_struct *task)
57{
58 unsigned i;
59
60 for (i = 0; i < MAX_HOLDERS; i++)
61 if (lock->holders[i] == task)
62 break;
63
64 BUG_ON(i == MAX_HOLDERS);
65 return i;
66}
67
68/* call this *after* you increment lock->count */
69static void __add_holder(struct block_lock *lock, struct task_struct *task)
70{
71 unsigned h = __find_holder(lock, NULL);
72#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
73 struct stack_trace *t;
74#endif
75
76 get_task_struct(task);
77 lock->holders[h] = task;
78
79#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
80 t = lock->traces + h;
81 t->nr_entries = 0;
82 t->max_entries = MAX_STACK;
83 t->entries = lock->entries[h];
84 t->skip = 2;
85 save_stack_trace(t);
86#endif
87}
88
89/* call this *before* you decrement lock->count */
90static void __del_holder(struct block_lock *lock, struct task_struct *task)
91{
92 unsigned h = __find_holder(lock, task);
93 lock->holders[h] = NULL;
94 put_task_struct(task);
95}
96
97static int __check_holder(struct block_lock *lock)
98{
99 unsigned i;
100#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
101 static struct stack_trace t;
102 static stack_entries entries;
103#endif
104
105 for (i = 0; i < MAX_HOLDERS; i++) {
106 if (lock->holders[i] == current) {
107 DMERR("recursive lock detected in pool metadata");
108#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
109 DMERR("previously held here:");
110 print_stack_trace(lock->traces + i, 4);
111
112 DMERR("subsequent acquisition attempted here:");
113 t.nr_entries = 0;
114 t.max_entries = MAX_STACK;
115 t.entries = entries;
116 t.skip = 3;
117 save_stack_trace(&t);
118 print_stack_trace(&t, 4);
119#endif
120 return -EINVAL;
121 }
122 }
123
124 return 0;
125}
126
127static void __wait(struct waiter *w)
128{
129 for (;;) {
130 set_task_state(current, TASK_UNINTERRUPTIBLE);
131
132 if (!w->task)
133 break;
134
135 schedule();
136 }
137
138 set_task_state(current, TASK_RUNNING);
139}
140
141static void __wake_waiter(struct waiter *w)
142{
143 struct task_struct *task;
144
145 list_del(&w->list);
146 task = w->task;
147 smp_mb();
148 w->task = NULL;
149 wake_up_process(task);
150}
151
152/*
153 * We either wake a few readers or a single writer.
154 */
155static void __wake_many(struct block_lock *lock)
156{
157 struct waiter *w, *tmp;
158
159 BUG_ON(lock->count < 0);
160 list_for_each_entry_safe(w, tmp, &lock->waiters, list) {
161 if (lock->count >= MAX_HOLDERS)
162 return;
163
164 if (w->wants_write) {
165 if (lock->count > 0)
166 return; /* still read locked */
167
168 lock->count = -1;
169 __add_holder(lock, w->task);
170 __wake_waiter(w);
171 return;
172 }
173
174 lock->count++;
175 __add_holder(lock, w->task);
176 __wake_waiter(w);
177 }
178}
179
180static void bl_init(struct block_lock *lock)
181{
182 int i;
183
184 spin_lock_init(&lock->lock);
185 lock->count = 0;
186 INIT_LIST_HEAD(&lock->waiters);
187 for (i = 0; i < MAX_HOLDERS; i++)
188 lock->holders[i] = NULL;
189}
190
191static int __available_for_read(struct block_lock *lock)
192{
193 return lock->count >= 0 &&
194 lock->count < MAX_HOLDERS &&
195 list_empty(&lock->waiters);
196}
197
198static int bl_down_read(struct block_lock *lock)
199{
200 int r;
201 struct waiter w;
202
203 spin_lock(&lock->lock);
204 r = __check_holder(lock);
205 if (r) {
206 spin_unlock(&lock->lock);
207 return r;
208 }
209
210 if (__available_for_read(lock)) {
211 lock->count++;
212 __add_holder(lock, current);
213 spin_unlock(&lock->lock);
214 return 0;
215 }
216
217 get_task_struct(current);
218
219 w.task = current;
220 w.wants_write = 0;
221 list_add_tail(&w.list, &lock->waiters);
222 spin_unlock(&lock->lock);
223
224 __wait(&w);
225 put_task_struct(current);
226 return 0;
227}
228
229static int bl_down_read_nonblock(struct block_lock *lock)
230{
231 int r;
232
233 spin_lock(&lock->lock);
234 r = __check_holder(lock);
235 if (r)
236 goto out;
237
238 if (__available_for_read(lock)) {
239 lock->count++;
240 __add_holder(lock, current);
241 r = 0;
242 } else
243 r = -EWOULDBLOCK;
244
245out:
246 spin_unlock(&lock->lock);
247 return r;
248}
249
250static void bl_up_read(struct block_lock *lock)
251{
252 spin_lock(&lock->lock);
253 BUG_ON(lock->count <= 0);
254 __del_holder(lock, current);
255 --lock->count;
256 if (!list_empty(&lock->waiters))
257 __wake_many(lock);
258 spin_unlock(&lock->lock);
259}
260
261static int bl_down_write(struct block_lock *lock)
262{
263 int r;
264 struct waiter w;
265
266 spin_lock(&lock->lock);
267 r = __check_holder(lock);
268 if (r) {
269 spin_unlock(&lock->lock);
270 return r;
271 }
272
273 if (lock->count == 0 && list_empty(&lock->waiters)) {
274 lock->count = -1;
275 __add_holder(lock, current);
276 spin_unlock(&lock->lock);
277 return 0;
278 }
279
280 get_task_struct(current);
281 w.task = current;
282 w.wants_write = 1;
283
284 /*
285	 * Writers are given priority. We know there's only one mutator in the
286	 * system, so we ignore the ordering reversal.
287 */
288 list_add(&w.list, &lock->waiters);
289 spin_unlock(&lock->lock);
290
291 __wait(&w);
292 put_task_struct(current);
293
294 return 0;
295}
296
297static void bl_up_write(struct block_lock *lock)
298{
299 spin_lock(&lock->lock);
300 __del_holder(lock, current);
301 lock->count = 0;
302 if (!list_empty(&lock->waiters))
303 __wake_many(lock);
304 spin_unlock(&lock->lock);
305}
306
307static void report_recursive_bug(dm_block_t b, int r)
308{
309 if (r == -EINVAL)
310 DMERR("recursive acquisition of block %llu requested.",
311 (unsigned long long) b);
312}
313
314/*----------------------------------------------------------------*/
315
316/*
317 * Block manager is currently implemented using dm-bufio. struct
318 * dm_block_manager and struct dm_block map directly onto a couple of
319 * structs in the bufio interface. I want to retain the freedom to move
320 * away from bufio in the future. So these structs are just cast within
321 * this .c file, rather than making it through to the public interface.
322 */
323static struct dm_buffer *to_buffer(struct dm_block *b)
324{
325 return (struct dm_buffer *) b;
326}
327
328dm_block_t dm_block_location(struct dm_block *b)
329{
330 return dm_bufio_get_block_number(to_buffer(b));
331}
332EXPORT_SYMBOL_GPL(dm_block_location);
333
334void *dm_block_data(struct dm_block *b)
335{
336 return dm_bufio_get_block_data(to_buffer(b));
337}
338EXPORT_SYMBOL_GPL(dm_block_data);
339
340struct buffer_aux {
341 struct dm_block_validator *validator;
342 struct block_lock lock;
343 int write_locked;
344};
345
346static void dm_block_manager_alloc_callback(struct dm_buffer *buf)
347{
348 struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
349 aux->validator = NULL;
350 bl_init(&aux->lock);
351}
352
353static void dm_block_manager_write_callback(struct dm_buffer *buf)
354{
355 struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
356 if (aux->validator) {
357 aux->validator->prepare_for_write(aux->validator, (struct dm_block *) buf,
358 dm_bufio_get_block_size(dm_bufio_get_client(buf)));
359 }
360}
361
362/*----------------------------------------------------------------
363 * Public interface
364 *--------------------------------------------------------------*/
365struct dm_block_manager {
366 struct dm_bufio_client *bufio;
367 bool read_only:1;
368};
369
370struct dm_block_manager *dm_block_manager_create(struct block_device *bdev,
371 unsigned block_size,
372 unsigned cache_size,
373 unsigned max_held_per_thread)
374{
375 int r;
376 struct dm_block_manager *bm;
377
378 bm = kmalloc(sizeof(*bm), GFP_KERNEL);
379 if (!bm) {
380 r = -ENOMEM;
381 goto bad;
382 }
383
384 bm->bufio = dm_bufio_client_create(bdev, block_size, max_held_per_thread,
385 sizeof(struct buffer_aux),
386 dm_block_manager_alloc_callback,
387 dm_block_manager_write_callback);
388 if (IS_ERR(bm->bufio)) {
389 r = PTR_ERR(bm->bufio);
390 kfree(bm);
391 goto bad;
392 }
393
394 bm->read_only = false;
395
396 return bm;
397
398bad:
399 return ERR_PTR(r);
400}
401EXPORT_SYMBOL_GPL(dm_block_manager_create);
402
403void dm_block_manager_destroy(struct dm_block_manager *bm)
404{
405 dm_bufio_client_destroy(bm->bufio);
406 kfree(bm);
407}
408EXPORT_SYMBOL_GPL(dm_block_manager_destroy);
409
410unsigned dm_bm_block_size(struct dm_block_manager *bm)
411{
412 return dm_bufio_get_block_size(bm->bufio);
413}
414EXPORT_SYMBOL_GPL(dm_bm_block_size);
415
416dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm)
417{
418 return dm_bufio_get_device_size(bm->bufio);
419}
420
421static int dm_bm_validate_buffer(struct dm_block_manager *bm,
422 struct dm_buffer *buf,
423 struct buffer_aux *aux,
424 struct dm_block_validator *v)
425{
426 if (unlikely(!aux->validator)) {
427 int r;
428 if (!v)
429 return 0;
430 r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(bm->bufio));
431 if (unlikely(r)) {
432 DMERR_LIMIT("%s validator check failed for block %llu", v->name,
433 (unsigned long long) dm_bufio_get_block_number(buf));
434 return r;
435 }
436 aux->validator = v;
437 } else {
438 if (unlikely(aux->validator != v)) {
439 DMERR_LIMIT("validator mismatch (old=%s vs new=%s) for block %llu",
440 aux->validator->name, v ? v->name : "NULL",
441 (unsigned long long) dm_bufio_get_block_number(buf));
442 return -EINVAL;
443 }
444 }
445
446 return 0;
447}
448int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
449 struct dm_block_validator *v,
450 struct dm_block **result)
451{
452 struct buffer_aux *aux;
453 void *p;
454 int r;
455
456 p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
457 if (unlikely(IS_ERR(p)))
458 return PTR_ERR(p);
459
460 aux = dm_bufio_get_aux_data(to_buffer(*result));
461 r = bl_down_read(&aux->lock);
462 if (unlikely(r)) {
463 dm_bufio_release(to_buffer(*result));
464 report_recursive_bug(b, r);
465 return r;
466 }
467
468 aux->write_locked = 0;
469
470 r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
471 if (unlikely(r)) {
472 bl_up_read(&aux->lock);
473 dm_bufio_release(to_buffer(*result));
474 return r;
475 }
476
477 return 0;
478}
479EXPORT_SYMBOL_GPL(dm_bm_read_lock);
480
481int dm_bm_write_lock(struct dm_block_manager *bm,
482 dm_block_t b, struct dm_block_validator *v,
483 struct dm_block **result)
484{
485 struct buffer_aux *aux;
486 void *p;
487 int r;
488
489 if (bm->read_only)
490 return -EPERM;
491
492 p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
493 if (unlikely(IS_ERR(p)))
494 return PTR_ERR(p);
495
496 aux = dm_bufio_get_aux_data(to_buffer(*result));
497 r = bl_down_write(&aux->lock);
498 if (r) {
499 dm_bufio_release(to_buffer(*result));
500 report_recursive_bug(b, r);
501 return r;
502 }
503
504 aux->write_locked = 1;
505
506 r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
507 if (unlikely(r)) {
508 bl_up_write(&aux->lock);
509 dm_bufio_release(to_buffer(*result));
510 return r;
511 }
512
513 return 0;
514}
515EXPORT_SYMBOL_GPL(dm_bm_write_lock);
516
517int dm_bm_read_try_lock(struct dm_block_manager *bm,
518 dm_block_t b, struct dm_block_validator *v,
519 struct dm_block **result)
520{
521 struct buffer_aux *aux;
522 void *p;
523 int r;
524
525 p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result);
526 if (unlikely(IS_ERR(p)))
527 return PTR_ERR(p);
528 if (unlikely(!p))
529 return -EWOULDBLOCK;
530
531 aux = dm_bufio_get_aux_data(to_buffer(*result));
532 r = bl_down_read_nonblock(&aux->lock);
533 if (r < 0) {
534 dm_bufio_release(to_buffer(*result));
535 report_recursive_bug(b, r);
536 return r;
537 }
538 aux->write_locked = 0;
539
540 r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
541 if (unlikely(r)) {
542 bl_up_read(&aux->lock);
543 dm_bufio_release(to_buffer(*result));
544 return r;
545 }
546
547 return 0;
548}
549
550int dm_bm_write_lock_zero(struct dm_block_manager *bm,
551 dm_block_t b, struct dm_block_validator *v,
552 struct dm_block **result)
553{
554 int r;
555 struct buffer_aux *aux;
556 void *p;
557
558 if (bm->read_only)
559 return -EPERM;
560
561 p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result);
562 if (unlikely(IS_ERR(p)))
563 return PTR_ERR(p);
564
565 memset(p, 0, dm_bm_block_size(bm));
566
567 aux = dm_bufio_get_aux_data(to_buffer(*result));
568 r = bl_down_write(&aux->lock);
569 if (r) {
570 dm_bufio_release(to_buffer(*result));
571 return r;
572 }
573
574 aux->write_locked = 1;
575 aux->validator = v;
576
577 return 0;
578}
579EXPORT_SYMBOL_GPL(dm_bm_write_lock_zero);
580
581int dm_bm_unlock(struct dm_block *b)
582{
583 struct buffer_aux *aux;
584 aux = dm_bufio_get_aux_data(to_buffer(b));
585
586 if (aux->write_locked) {
587 dm_bufio_mark_buffer_dirty(to_buffer(b));
588 bl_up_write(&aux->lock);
589 } else
590 bl_up_read(&aux->lock);
591
592 dm_bufio_release(to_buffer(b));
593
594 return 0;
595}
596EXPORT_SYMBOL_GPL(dm_bm_unlock);
597
598int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
599 struct dm_block *superblock)
600{
601 int r;
602
603 if (bm->read_only)
604 return -EPERM;
605
606 r = dm_bufio_write_dirty_buffers(bm->bufio);
607 if (unlikely(r)) {
608 dm_bm_unlock(superblock);
609 return r;
610 }
611
612 dm_bm_unlock(superblock);
613
614 return dm_bufio_write_dirty_buffers(bm->bufio);
615}
616
617void dm_bm_set_read_only(struct dm_block_manager *bm)
618{
619 bm->read_only = true;
620}
621EXPORT_SYMBOL_GPL(dm_bm_set_read_only);
622
623u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor)
624{
625 return crc32c(~(u32) 0, data, len) ^ init_xor;
626}
627EXPORT_SYMBOL_GPL(dm_bm_checksum);
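/*
 * Illustrative sketch only (not part of the original file): a standalone,
 * bit-at-a-time CRC32C (Castagnoli, reflected polynomial 0x82F63B78) plus the
 * same seed/xor convention used by dm_bm_checksum() above. This assumes the
 * kernel's crc32c() is the plain Castagnoli CRC with no extra final inversion;
 * it is a slow reference, not the kernel implementation.
 */
#include <stddef.h>
#include <stdint.h>

static uint32_t example_crc32c(uint32_t crc, const void *data, size_t len)
{
	const uint8_t *p = data;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1u)));
	}
	return crc;
}

static uint32_t example_bm_checksum(const void *data, size_t len, uint32_t init_xor)
{
	return example_crc32c(~(uint32_t) 0, data, len) ^ init_xor;
}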
628
629/*----------------------------------------------------------------*/
630
631MODULE_LICENSE("GPL");
632MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
633MODULE_DESCRIPTION("Immutable metadata library for dm");
634
635/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
deleted file mode 100644
index be5bff61be2..00000000000
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ /dev/null
@@ -1,128 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef _LINUX_DM_BLOCK_MANAGER_H
8#define _LINUX_DM_BLOCK_MANAGER_H
9
10#include <linux/types.h>
11#include <linux/blkdev.h>
12
13/*----------------------------------------------------------------*/
14
15/*
16 * Block number.
17 */
18typedef uint64_t dm_block_t;
19struct dm_block;
20
21dm_block_t dm_block_location(struct dm_block *b);
22void *dm_block_data(struct dm_block *b);
23
24/*----------------------------------------------------------------*/
25
26/*
27 * @name should be a unique identifier for the block manager, no longer
28 * than 32 chars.
29 *
30 * @max_held_per_thread should be the maximum number of locks, read or
31 * write, that an individual thread holds at any one time.
32 */
33struct dm_block_manager;
34struct dm_block_manager *dm_block_manager_create(
35 struct block_device *bdev, unsigned block_size,
36 unsigned cache_size, unsigned max_held_per_thread);
37void dm_block_manager_destroy(struct dm_block_manager *bm);
38
39unsigned dm_bm_block_size(struct dm_block_manager *bm);
40dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm);
41
42/*----------------------------------------------------------------*/
43
44/*
45 * The validator allows the caller to verify newly-read data and modify
46 * the data just before writing, e.g. to calculate checksums. It's
47 * important to be consistent with your use of validators. The only time
48 * you can change validators is if you call dm_bm_write_lock_zero.
49 */
50struct dm_block_validator {
51 const char *name;
52 void (*prepare_for_write)(struct dm_block_validator *v, struct dm_block *b, size_t block_size);
53
54 /*
55 * Return 0 if the checksum is valid or < 0 on error.
56 */
57 int (*check)(struct dm_block_validator *v, struct dm_block *b, size_t block_size);
58};
59
60/*----------------------------------------------------------------*/
61
62/*
63 * You can have multiple concurrent readers or a single writer holding a
64 * block lock.
65 */
66
67/*
 68 * dm_bm_read_lock() and dm_bm_write_lock() lock a block and return through
 69 * @result a pointer to memory holding a copy of that block. If you have
 70 * write-locked the block, any changes you make to that memory will be
 71 * written back to the disk sometime after dm_bm_unlock() is called.
72 */
73int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
74 struct dm_block_validator *v,
75 struct dm_block **result);
76
77int dm_bm_write_lock(struct dm_block_manager *bm, dm_block_t b,
78 struct dm_block_validator *v,
79 struct dm_block **result);
80
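/*
 * Illustrative sketch only (not part of the original header): a typical
 * read-lock / inspect / unlock sequence against the API declared above.
 * The caller and the validator ("my_validator") are hypothetical.
 */
static int example_read_block(struct dm_block_manager *bm, dm_block_t where,
			      struct dm_block_validator *my_validator)
{
	struct dm_block *blk;
	int r;

	r = dm_bm_read_lock(bm, where, my_validator, &blk);
	if (r)
		return r;

	/* dm_block_data(blk) gives a pointer to an in-core copy of the block. */
	/* ... inspect the data here ... */

	return dm_bm_unlock(blk);
}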
81/*
82 * The *_try_lock variants return -EWOULDBLOCK if the block isn't
83 * available immediately.
84 */
85int dm_bm_read_try_lock(struct dm_block_manager *bm, dm_block_t b,
86 struct dm_block_validator *v,
87 struct dm_block **result);
88
89/*
90 * Use dm_bm_write_lock_zero() when you know you're going to
91 * overwrite the block completely. It saves a disk read.
92 */
93int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b,
94 struct dm_block_validator *v,
95 struct dm_block **result);
96
97int dm_bm_unlock(struct dm_block *b);
98
99/*
100 * It's a common idiom to have a superblock that should be committed last.
101 *
102 * @superblock should be write-locked on entry. It will be unlocked during
103 * this function. All dirty blocks are guaranteed to be written and flushed
104 * before the superblock.
105 *
106 * This method always blocks.
107 */
108int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
109 struct dm_block *superblock);
110
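/*
 * Illustrative sketch only: the superblock commit idiom described above.
 * The superblock location, validator and update step are hypothetical;
 * error handling is abbreviated.
 */
static int example_commit(struct dm_block_manager *bm, dm_block_t sb_location,
			  struct dm_block_validator *sb_validator)
{
	struct dm_block *sblock;
	int r;

	r = dm_bm_write_lock(bm, sb_location, sb_validator, &sblock);
	if (r)
		return r;

	/* ... update the in-core superblock via dm_block_data(sblock) ... */

	/* Flushes all other dirty blocks, then writes the superblock last. */
	return dm_bm_flush_and_unlock(bm, sblock);
}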
111/*
 112 * Switches the bm to read-only mode. Once read-only mode
 113 * has been entered, the following functions will return -EPERM.
114 *
115 * dm_bm_write_lock
116 * dm_bm_write_lock_zero
117 * dm_bm_flush_and_unlock
118 *
 119 * Additionally, you should not use dm_bm_unlock_move(); however, no error
 120 * will be returned if you do.
121 */
122void dm_bm_set_read_only(struct dm_block_manager *bm);
123
124u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor);
125
126/*----------------------------------------------------------------*/
127
128#endif /* _LINUX_DM_BLOCK_MANAGER_H */
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
deleted file mode 100644
index accbb05f17b..00000000000
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ /dev/null
@@ -1,134 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_BTREE_INTERNAL_H
8#define DM_BTREE_INTERNAL_H
9
10#include "dm-btree.h"
11
12/*----------------------------------------------------------------*/
13
14/*
 15 * We'll need two accessor functions for n->csum and n->blocknr
 16 * to support dm-btree-spine.c.
17 */
18
19enum node_flags {
20 INTERNAL_NODE = 1,
21 LEAF_NODE = 1 << 1
22};
23
24/*
25 * Every btree node begins with this structure. Make sure it's a multiple
 26 * of 8 bytes in size; otherwise the 64-bit keys will be misaligned.
27 */
28struct node_header {
29 __le32 csum;
30 __le32 flags;
31 __le64 blocknr; /* Block this node is supposed to live in. */
32
33 __le32 nr_entries;
34 __le32 max_entries;
35 __le32 value_size;
36 __le32 padding;
37} __packed;
38
39struct btree_node {
40 struct node_header header;
41 __le64 keys[0];
42} __packed;
43
44
45void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
46 struct dm_btree_value_type *vt);
47
48int new_block(struct dm_btree_info *info, struct dm_block **result);
49int unlock_block(struct dm_btree_info *info, struct dm_block *b);
50
51/*
 52 * Spines keep track of the rolling locks. There are two variants: read-only
 53 * and one that uses shadowing. These are separate structs to allow the
54 * type checker to spot misuse, for example accidentally calling read_lock
55 * on a shadow spine.
56 */
57struct ro_spine {
58 struct dm_btree_info *info;
59
60 int count;
61 struct dm_block *nodes[2];
62};
63
64void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info);
65int exit_ro_spine(struct ro_spine *s);
66int ro_step(struct ro_spine *s, dm_block_t new_child);
67struct btree_node *ro_node(struct ro_spine *s);
68
69struct shadow_spine {
70 struct dm_btree_info *info;
71
72 int count;
73 struct dm_block *nodes[2];
74
75 dm_block_t root;
76};
77
78void init_shadow_spine(struct shadow_spine *s, struct dm_btree_info *info);
79int exit_shadow_spine(struct shadow_spine *s);
80
81int shadow_step(struct shadow_spine *s, dm_block_t b,
82 struct dm_btree_value_type *vt);
83
84/*
85 * The spine must have at least one entry before calling this.
86 */
87struct dm_block *shadow_current(struct shadow_spine *s);
88
89/*
90 * The spine must have at least two entries before calling this.
91 */
92struct dm_block *shadow_parent(struct shadow_spine *s);
93
94int shadow_has_parent(struct shadow_spine *s);
95
96int shadow_root(struct shadow_spine *s);
97
98/*
99 * Some inlines.
100 */
101static inline __le64 *key_ptr(struct btree_node *n, uint32_t index)
102{
103 return n->keys + index;
104}
105
106static inline void *value_base(struct btree_node *n)
107{
108 return &n->keys[le32_to_cpu(n->header.max_entries)];
109}
110
111static inline void *value_ptr(struct btree_node *n, uint32_t index)
112{
113 uint32_t value_size = le32_to_cpu(n->header.value_size);
114 return value_base(n) + (value_size * index);
115}
116
117/*
118 * Assumes the values are suitably-aligned and converts to core format.
119 */
120static inline uint64_t value64(struct btree_node *n, uint32_t index)
121{
122 __le64 *values_le = value_base(n);
123
124 return le64_to_cpu(values_le[index]);
125}
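/*
 * Illustrative sketch only: the on-disk node layout implied by the inlines
 * above. After the 32-byte header come max_entries 8-byte keys, then the
 * values packed back to back, value_size bytes each. The numbers below are
 * just an example (max_entries = 252, value_size = 8).
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint32_t header_size = 32;	/* sizeof(struct node_header) */
	const uint32_t max_entries = 252;
	const uint32_t value_size = 8;
	uint32_t index = 10;

	uint32_t key_off = header_size + index * 8;
	uint32_t value_base_off = header_size + max_entries * 8;
	uint32_t value_off = value_base_off + index * value_size;

	printf("key[%u] at byte %u, value[%u] at byte %u\n",
	       index, key_off, index, value_off);
	return 0;
}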
126
127/*
128 * Searching for a key within a single node.
129 */
130int lower_bound(struct btree_node *n, uint64_t key);
131
132extern struct dm_block_validator btree_node_validator;
133
134#endif /* DM_BTREE_INTERNAL_H */
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
deleted file mode 100644
index c4f28133ef8..00000000000
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ /dev/null
@@ -1,590 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-btree.h"
8#include "dm-btree-internal.h"
9#include "dm-transaction-manager.h"
10
11#include <linux/export.h>
12
13/*
14 * Removing an entry from a btree
15 * ==============================
16 *
17 * A very important constraint for our btree is that no node, except the
18 * root, may have fewer than a certain number of entries.
19 * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
20 *
21 * Ensuring this is complicated by the way we want to only ever hold the
22 * locks on 2 nodes concurrently, and only change nodes in a top to bottom
23 * fashion.
24 *
 25 * Each node may have a left or right sibling. When descending the spine,
26 * if a node contains only MIN_ENTRIES then we try and increase this to at
27 * least MIN_ENTRIES + 1. We do this in the following ways:
28 *
29 * [A] No siblings => this can only happen if the node is the root, in which
 30 * case we copy the child's contents over the root.
31 *
32 * [B] No left sibling
33 * ==> rebalance(node, right sibling)
34 *
35 * [C] No right sibling
36 * ==> rebalance(left sibling, node)
37 *
38 * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
 39 * ==> delete the node, adding its contents to left and right
40 *
41 * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
42 * ==> rebalance(left, node, right)
43 *
44 * After these operations it's possible that the our original node no
45 * longer contains the desired sub tree. For this reason this rebalancing
46 * is performed on the children of the current node. This also avoids
47 * having a special case for the root.
48 *
49 * Once this rebalancing has occurred we can then step into the child node
50 * for internal nodes. Or delete the entry for leaf nodes.
51 */
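/*
 * Illustrative sketch only: how the thresholds further down in this file
 * decide between merging and redistributing. merge_threshold() is
 * max_entries / 3; two siblings merge when their combined entry count is
 * below 2 * merge_threshold + 1 (see __rebalance2), and the middle of three
 * siblings is deleted when the combined count is below 4 * merge_threshold + 1
 * (see __rebalance3). The helper names here are hypothetical.
 */
#include <stdint.h>

static unsigned example_merge_threshold(uint32_t max_entries)
{
	return max_entries / 3;
}

/* Returns 1 if two adjacent siblings should be merged rather than rebalanced. */
static int example_should_merge2(uint32_t nr_left, uint32_t nr_right,
				 uint32_t max_entries)
{
	return nr_left + nr_right < 2 * example_merge_threshold(max_entries) + 1;
}

/* Returns 1 if the middle of three siblings should be deleted outright. */
static int example_should_merge3(uint32_t nr_left, uint32_t nr_center,
				 uint32_t nr_right, uint32_t max_entries)
{
	return nr_left + nr_center + nr_right <
	       4 * example_merge_threshold(max_entries) + 1;
}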
52
53/*
54 * Some little utilities for moving node data around.
55 */
56static void node_shift(struct btree_node *n, int shift)
57{
58 uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
59 uint32_t value_size = le32_to_cpu(n->header.value_size);
60
61 if (shift < 0) {
62 shift = -shift;
63 BUG_ON(shift > nr_entries);
64 BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
65 memmove(key_ptr(n, 0),
66 key_ptr(n, shift),
67 (nr_entries - shift) * sizeof(__le64));
68 memmove(value_ptr(n, 0),
69 value_ptr(n, shift),
70 (nr_entries - shift) * value_size);
71 } else {
72 BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
73 memmove(key_ptr(n, shift),
74 key_ptr(n, 0),
75 nr_entries * sizeof(__le64));
76 memmove(value_ptr(n, shift),
77 value_ptr(n, 0),
78 nr_entries * value_size);
79 }
80}
81
82static void node_copy(struct btree_node *left, struct btree_node *right, int shift)
83{
84 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
85 uint32_t value_size = le32_to_cpu(left->header.value_size);
86 BUG_ON(value_size != le32_to_cpu(right->header.value_size));
87
88 if (shift < 0) {
89 shift = -shift;
90 BUG_ON(nr_left + shift > le32_to_cpu(left->header.max_entries));
91 memcpy(key_ptr(left, nr_left),
92 key_ptr(right, 0),
93 shift * sizeof(__le64));
94 memcpy(value_ptr(left, nr_left),
95 value_ptr(right, 0),
96 shift * value_size);
97 } else {
98 BUG_ON(shift > le32_to_cpu(right->header.max_entries));
99 memcpy(key_ptr(right, 0),
100 key_ptr(left, nr_left - shift),
101 shift * sizeof(__le64));
102 memcpy(value_ptr(right, 0),
103 value_ptr(left, nr_left - shift),
104 shift * value_size);
105 }
106}
107
108/*
109 * Delete a specific entry from a leaf node.
110 */
111static void delete_at(struct btree_node *n, unsigned index)
112{
113 unsigned nr_entries = le32_to_cpu(n->header.nr_entries);
114 unsigned nr_to_copy = nr_entries - (index + 1);
115 uint32_t value_size = le32_to_cpu(n->header.value_size);
116 BUG_ON(index >= nr_entries);
117
118 if (nr_to_copy) {
119 memmove(key_ptr(n, index),
120 key_ptr(n, index + 1),
121 nr_to_copy * sizeof(__le64));
122
123 memmove(value_ptr(n, index),
124 value_ptr(n, index + 1),
125 nr_to_copy * value_size);
126 }
127
128 n->header.nr_entries = cpu_to_le32(nr_entries - 1);
129}
130
131static unsigned merge_threshold(struct btree_node *n)
132{
133 return le32_to_cpu(n->header.max_entries) / 3;
134}
135
136struct child {
137 unsigned index;
138 struct dm_block *block;
139 struct btree_node *n;
140};
141
142static struct dm_btree_value_type le64_type = {
143 .context = NULL,
144 .size = sizeof(__le64),
145 .inc = NULL,
146 .dec = NULL,
147 .equal = NULL
148};
149
150static int init_child(struct dm_btree_info *info, struct btree_node *parent,
151 unsigned index, struct child *result)
152{
153 int r, inc;
154 dm_block_t root;
155
156 result->index = index;
157 root = value64(parent, index);
158
159 r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
160 &result->block, &inc);
161 if (r)
162 return r;
163
164 result->n = dm_block_data(result->block);
165
166 if (inc)
167 inc_children(info->tm, result->n, &le64_type);
168
169 *((__le64 *) value_ptr(parent, index)) =
170 cpu_to_le64(dm_block_location(result->block));
171
172 return 0;
173}
174
175static int exit_child(struct dm_btree_info *info, struct child *c)
176{
177 return dm_tm_unlock(info->tm, c->block);
178}
179
180static void shift(struct btree_node *left, struct btree_node *right, int count)
181{
182 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
183 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
184 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
185 uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);
186
187 BUG_ON(max_entries != r_max_entries);
188 BUG_ON(nr_left - count > max_entries);
189 BUG_ON(nr_right + count > max_entries);
190
191 if (!count)
192 return;
193
194 if (count > 0) {
195 node_shift(right, count);
196 node_copy(left, right, count);
197 } else {
198 node_copy(left, right, count);
199 node_shift(right, count);
200 }
201
202 left->header.nr_entries = cpu_to_le32(nr_left - count);
203 right->header.nr_entries = cpu_to_le32(nr_right + count);
204}
205
206static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
207 struct child *l, struct child *r)
208{
209 struct btree_node *left = l->n;
210 struct btree_node *right = r->n;
211 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
212 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
213 unsigned threshold = 2 * merge_threshold(left) + 1;
214
215 if (nr_left + nr_right < threshold) {
216 /*
217 * Merge
218 */
219 node_copy(left, right, -nr_right);
220 left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
221 delete_at(parent, r->index);
222
223 /*
 224 * We need to decrement the right block, but not its
225 * children, since they're still referenced by left.
226 */
227 dm_tm_dec(info->tm, dm_block_location(r->block));
228 } else {
229 /*
230 * Rebalance.
231 */
232 unsigned target_left = (nr_left + nr_right) / 2;
233 shift(left, right, nr_left - target_left);
234 *key_ptr(parent, r->index) = right->keys[0];
235 }
236}
237
238static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
239 unsigned left_index)
240{
241 int r;
242 struct btree_node *parent;
243 struct child left, right;
244
245 parent = dm_block_data(shadow_current(s));
246
247 r = init_child(info, parent, left_index, &left);
248 if (r)
249 return r;
250
251 r = init_child(info, parent, left_index + 1, &right);
252 if (r) {
253 exit_child(info, &left);
254 return r;
255 }
256
257 __rebalance2(info, parent, &left, &right);
258
259 r = exit_child(info, &left);
260 if (r) {
261 exit_child(info, &right);
262 return r;
263 }
264
265 return exit_child(info, &right);
266}
267
268/*
269 * We dump as many entries from center as possible into left, then the rest
270 * in right, then rebalance2. This wastes some cpu, but I want something
271 * simple atm.
272 */
273static void delete_center_node(struct dm_btree_info *info, struct btree_node *parent,
274 struct child *l, struct child *c, struct child *r,
275 struct btree_node *left, struct btree_node *center, struct btree_node *right,
276 uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
277{
278 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
279 unsigned shift = min(max_entries - nr_left, nr_center);
280
281 BUG_ON(nr_left + shift > max_entries);
282 node_copy(left, center, -shift);
283 left->header.nr_entries = cpu_to_le32(nr_left + shift);
284
285 if (shift != nr_center) {
286 shift = nr_center - shift;
287 BUG_ON((nr_right + shift) > max_entries);
288 node_shift(right, shift);
289 node_copy(center, right, shift);
290 right->header.nr_entries = cpu_to_le32(nr_right + shift);
291 }
292 *key_ptr(parent, r->index) = right->keys[0];
293
294 delete_at(parent, c->index);
295 r->index--;
296
297 dm_tm_dec(info->tm, dm_block_location(c->block));
298 __rebalance2(info, parent, l, r);
299}
300
301/*
302 * Redistributes entries among 3 sibling nodes.
303 */
304static void redistribute3(struct dm_btree_info *info, struct btree_node *parent,
305 struct child *l, struct child *c, struct child *r,
306 struct btree_node *left, struct btree_node *center, struct btree_node *right,
307 uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
308{
309 int s;
310 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
311 unsigned target = (nr_left + nr_center + nr_right) / 3;
312 BUG_ON(target > max_entries);
313
314 if (nr_left < nr_right) {
315 s = nr_left - target;
316
317 if (s < 0 && nr_center < -s) {
318 /* not enough in central node */
319 shift(left, center, nr_center);
320 s = nr_center - target;
321 shift(left, right, s);
322 nr_right += s;
323 } else
324 shift(left, center, s);
325
326 shift(center, right, target - nr_right);
327
328 } else {
329 s = target - nr_right;
330 if (s > 0 && nr_center < s) {
331 /* not enough in central node */
332 shift(center, right, nr_center);
333 s = target - nr_center;
334 shift(left, right, s);
335 nr_left -= s;
336 } else
337 shift(center, right, s);
338
339 shift(left, center, nr_left - target);
340 }
341
342 *key_ptr(parent, c->index) = center->keys[0];
343 *key_ptr(parent, r->index) = right->keys[0];
344}
345
346static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
347 struct child *l, struct child *c, struct child *r)
348{
349 struct btree_node *left = l->n;
350 struct btree_node *center = c->n;
351 struct btree_node *right = r->n;
352
353 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
354 uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
355 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
356
357 unsigned threshold = merge_threshold(left) * 4 + 1;
358
359 BUG_ON(left->header.max_entries != center->header.max_entries);
360 BUG_ON(center->header.max_entries != right->header.max_entries);
361
362 if ((nr_left + nr_center + nr_right) < threshold)
363 delete_center_node(info, parent, l, c, r, left, center, right,
364 nr_left, nr_center, nr_right);
365 else
366 redistribute3(info, parent, l, c, r, left, center, right,
367 nr_left, nr_center, nr_right);
368}
369
370static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
371 unsigned left_index)
372{
373 int r;
374 struct btree_node *parent = dm_block_data(shadow_current(s));
375 struct child left, center, right;
376
377 /*
378 * FIXME: fill out an array?
379 */
380 r = init_child(info, parent, left_index, &left);
381 if (r)
382 return r;
383
384 r = init_child(info, parent, left_index + 1, &center);
385 if (r) {
386 exit_child(info, &left);
387 return r;
388 }
389
390 r = init_child(info, parent, left_index + 2, &right);
391 if (r) {
392 exit_child(info, &left);
393 exit_child(info, &center);
394 return r;
395 }
396
397 __rebalance3(info, parent, &left, &center, &right);
398
399 r = exit_child(info, &left);
400 if (r) {
401 exit_child(info, &center);
402 exit_child(info, &right);
403 return r;
404 }
405
406 r = exit_child(info, &center);
407 if (r) {
408 exit_child(info, &right);
409 return r;
410 }
411
412 r = exit_child(info, &right);
413 if (r)
414 return r;
415
416 return 0;
417}
418
419static int get_nr_entries(struct dm_transaction_manager *tm,
420 dm_block_t b, uint32_t *result)
421{
422 int r;
423 struct dm_block *block;
424 struct btree_node *n;
425
426 r = dm_tm_read_lock(tm, b, &btree_node_validator, &block);
427 if (r)
428 return r;
429
430 n = dm_block_data(block);
431 *result = le32_to_cpu(n->header.nr_entries);
432
433 return dm_tm_unlock(tm, block);
434}
435
436static int rebalance_children(struct shadow_spine *s,
437 struct dm_btree_info *info, uint64_t key)
438{
439 int i, r, has_left_sibling, has_right_sibling;
440 uint32_t child_entries;
441 struct btree_node *n;
442
443 n = dm_block_data(shadow_current(s));
444
445 if (le32_to_cpu(n->header.nr_entries) == 1) {
446 struct dm_block *child;
447 dm_block_t b = value64(n, 0);
448
449 r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
450 if (r)
451 return r;
452
453 memcpy(n, dm_block_data(child),
454 dm_bm_block_size(dm_tm_get_bm(info->tm)));
455 r = dm_tm_unlock(info->tm, child);
456 if (r)
457 return r;
458
459 dm_tm_dec(info->tm, dm_block_location(child));
460 return 0;
461 }
462
463 i = lower_bound(n, key);
464 if (i < 0)
465 return -ENODATA;
466
467 r = get_nr_entries(info->tm, value64(n, i), &child_entries);
468 if (r)
469 return r;
470
471 has_left_sibling = i > 0;
472 has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
473
474 if (!has_left_sibling)
475 r = rebalance2(s, info, i);
476
477 else if (!has_right_sibling)
478 r = rebalance2(s, info, i - 1);
479
480 else
481 r = rebalance3(s, info, i - 1);
482
483 return r;
484}
485
486static int do_leaf(struct btree_node *n, uint64_t key, unsigned *index)
487{
488 int i = lower_bound(n, key);
489
490 if ((i < 0) ||
491 (i >= le32_to_cpu(n->header.nr_entries)) ||
492 (le64_to_cpu(n->keys[i]) != key))
493 return -ENODATA;
494
495 *index = i;
496
497 return 0;
498}
499
500/*
501 * Prepares for removal from one level of the hierarchy. The caller must
502 * call delete_at() to remove the entry at index.
503 */
504static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
505 struct dm_btree_value_type *vt, dm_block_t root,
506 uint64_t key, unsigned *index)
507{
508 int i = *index, r;
509 struct btree_node *n;
510
511 for (;;) {
512 r = shadow_step(s, root, vt);
513 if (r < 0)
514 break;
515
516 /*
 517 * We have to patch up the parent node. This is ugly, but I don't
 518 * see a way to do it automatically as part of the spine
 519 * op.
520 */
521 if (shadow_has_parent(s)) {
522 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
523 memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
524 &location, sizeof(__le64));
525 }
526
527 n = dm_block_data(shadow_current(s));
528
529 if (le32_to_cpu(n->header.flags) & LEAF_NODE)
530 return do_leaf(n, key, index);
531
532 r = rebalance_children(s, info, key);
533 if (r)
534 break;
535
536 n = dm_block_data(shadow_current(s));
537 if (le32_to_cpu(n->header.flags) & LEAF_NODE)
538 return do_leaf(n, key, index);
539
540 i = lower_bound(n, key);
541
542 /*
 543 * We know the key is present; otherwise
 544 * rebalance_children would have returned
 545 * -ENODATA.
546 */
547 root = value64(n, i);
548 }
549
550 return r;
551}
552
553int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
554 uint64_t *keys, dm_block_t *new_root)
555{
556 unsigned level, last_level = info->levels - 1;
557 int index = 0, r = 0;
558 struct shadow_spine spine;
559 struct btree_node *n;
560
561 init_shadow_spine(&spine, info);
562 for (level = 0; level < info->levels; level++) {
563 r = remove_raw(&spine, info,
564 (level == last_level ?
565 &info->value_type : &le64_type),
566 root, keys[level], (unsigned *)&index);
567 if (r < 0)
568 break;
569
570 n = dm_block_data(shadow_current(&spine));
571 if (level != last_level) {
572 root = value64(n, index);
573 continue;
574 }
575
576 BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));
577
578 if (info->value_type.dec)
579 info->value_type.dec(info->value_type.context,
580 value_ptr(n, index));
581
582 delete_at(n, index);
583 }
584
585 *new_root = shadow_root(&spine);
586 exit_shadow_spine(&spine);
587
588 return r;
589}
590EXPORT_SYMBOL_GPL(dm_btree_remove);
diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c
deleted file mode 100644
index f199a0c4ed0..00000000000
--- a/drivers/md/persistent-data/dm-btree-spine.c
+++ /dev/null
@@ -1,244 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-btree-internal.h"
8#include "dm-transaction-manager.h"
9
10#include <linux/device-mapper.h>
11
12#define DM_MSG_PREFIX "btree spine"
13
14/*----------------------------------------------------------------*/
15
16#define BTREE_CSUM_XOR 121107
17
18static int node_check(struct dm_block_validator *v,
19 struct dm_block *b,
20 size_t block_size);
21
22static void node_prepare_for_write(struct dm_block_validator *v,
23 struct dm_block *b,
24 size_t block_size)
25{
26 struct btree_node *n = dm_block_data(b);
27 struct node_header *h = &n->header;
28
29 h->blocknr = cpu_to_le64(dm_block_location(b));
30 h->csum = cpu_to_le32(dm_bm_checksum(&h->flags,
31 block_size - sizeof(__le32),
32 BTREE_CSUM_XOR));
33
34 BUG_ON(node_check(v, b, 4096));
35}
36
37static int node_check(struct dm_block_validator *v,
38 struct dm_block *b,
39 size_t block_size)
40{
41 struct btree_node *n = dm_block_data(b);
42 struct node_header *h = &n->header;
43 size_t value_size;
44 __le32 csum_disk;
45 uint32_t flags;
46
47 if (dm_block_location(b) != le64_to_cpu(h->blocknr)) {
48 DMERR_LIMIT("node_check failed: blocknr %llu != wanted %llu",
49 le64_to_cpu(h->blocknr), dm_block_location(b));
50 return -ENOTBLK;
51 }
52
53 csum_disk = cpu_to_le32(dm_bm_checksum(&h->flags,
54 block_size - sizeof(__le32),
55 BTREE_CSUM_XOR));
56 if (csum_disk != h->csum) {
57 DMERR_LIMIT("node_check failed: csum %u != wanted %u",
58 le32_to_cpu(csum_disk), le32_to_cpu(h->csum));
59 return -EILSEQ;
60 }
61
62 value_size = le32_to_cpu(h->value_size);
63
64 if (sizeof(struct node_header) +
65 (sizeof(__le64) + value_size) * le32_to_cpu(h->max_entries) > block_size) {
66 DMERR_LIMIT("node_check failed: max_entries too large");
67 return -EILSEQ;
68 }
69
70 if (le32_to_cpu(h->nr_entries) > le32_to_cpu(h->max_entries)) {
71 DMERR_LIMIT("node_check failed: too many entries");
72 return -EILSEQ;
73 }
74
75 /*
76 * The node must be either INTERNAL or LEAF.
77 */
78 flags = le32_to_cpu(h->flags);
79 if (!(flags & INTERNAL_NODE) && !(flags & LEAF_NODE)) {
80 DMERR_LIMIT("node_check failed: node is neither INTERNAL or LEAF");
81 return -EILSEQ;
82 }
83
84 return 0;
85}
86
87struct dm_block_validator btree_node_validator = {
88 .name = "btree_node",
89 .prepare_for_write = node_prepare_for_write,
90 .check = node_check
91};
92
93/*----------------------------------------------------------------*/
94
95static int bn_read_lock(struct dm_btree_info *info, dm_block_t b,
96 struct dm_block **result)
97{
98 return dm_tm_read_lock(info->tm, b, &btree_node_validator, result);
99}
100
101static int bn_shadow(struct dm_btree_info *info, dm_block_t orig,
102 struct dm_btree_value_type *vt,
103 struct dm_block **result)
104{
105 int r, inc;
106
107 r = dm_tm_shadow_block(info->tm, orig, &btree_node_validator,
108 result, &inc);
109 if (!r && inc)
110 inc_children(info->tm, dm_block_data(*result), vt);
111
112 return r;
113}
114
115int new_block(struct dm_btree_info *info, struct dm_block **result)
116{
117 return dm_tm_new_block(info->tm, &btree_node_validator, result);
118}
119
120int unlock_block(struct dm_btree_info *info, struct dm_block *b)
121{
122 return dm_tm_unlock(info->tm, b);
123}
124
125/*----------------------------------------------------------------*/
126
127void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info)
128{
129 s->info = info;
130 s->count = 0;
131 s->nodes[0] = NULL;
132 s->nodes[1] = NULL;
133}
134
135int exit_ro_spine(struct ro_spine *s)
136{
137 int r = 0, i;
138
139 for (i = 0; i < s->count; i++) {
140 int r2 = unlock_block(s->info, s->nodes[i]);
141 if (r2 < 0)
142 r = r2;
143 }
144
145 return r;
146}
147
148int ro_step(struct ro_spine *s, dm_block_t new_child)
149{
150 int r;
151
152 if (s->count == 2) {
153 r = unlock_block(s->info, s->nodes[0]);
154 if (r < 0)
155 return r;
156 s->nodes[0] = s->nodes[1];
157 s->count--;
158 }
159
160 r = bn_read_lock(s->info, new_child, s->nodes + s->count);
161 if (!r)
162 s->count++;
163
164 return r;
165}
166
167struct btree_node *ro_node(struct ro_spine *s)
168{
169 struct dm_block *block;
170
171 BUG_ON(!s->count);
172 block = s->nodes[s->count - 1];
173
174 return dm_block_data(block);
175}
176
177/*----------------------------------------------------------------*/
178
179void init_shadow_spine(struct shadow_spine *s, struct dm_btree_info *info)
180{
181 s->info = info;
182 s->count = 0;
183}
184
185int exit_shadow_spine(struct shadow_spine *s)
186{
187 int r = 0, i;
188
189 for (i = 0; i < s->count; i++) {
190 int r2 = unlock_block(s->info, s->nodes[i]);
191 if (r2 < 0)
192 r = r2;
193 }
194
195 return r;
196}
197
198int shadow_step(struct shadow_spine *s, dm_block_t b,
199 struct dm_btree_value_type *vt)
200{
201 int r;
202
203 if (s->count == 2) {
204 r = unlock_block(s->info, s->nodes[0]);
205 if (r < 0)
206 return r;
207 s->nodes[0] = s->nodes[1];
208 s->count--;
209 }
210
211 r = bn_shadow(s->info, b, vt, s->nodes + s->count);
212 if (!r) {
213 if (!s->count)
214 s->root = dm_block_location(s->nodes[0]);
215
216 s->count++;
217 }
218
219 return r;
220}
221
222struct dm_block *shadow_current(struct shadow_spine *s)
223{
224 BUG_ON(!s->count);
225
226 return s->nodes[s->count - 1];
227}
228
229struct dm_block *shadow_parent(struct shadow_spine *s)
230{
231 BUG_ON(s->count != 2);
232
233 return s->count == 2 ? s->nodes[0] : NULL;
234}
235
236int shadow_has_parent(struct shadow_spine *s)
237{
238 return s->count >= 2;
239}
240
241int shadow_root(struct shadow_spine *s)
242{
243 return s->root;
244}
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
deleted file mode 100644
index 4caf66918cd..00000000000
--- a/drivers/md/persistent-data/dm-btree.c
+++ /dev/null
@@ -1,809 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-btree-internal.h"
8#include "dm-space-map.h"
9#include "dm-transaction-manager.h"
10
11#include <linux/export.h>
12#include <linux/device-mapper.h>
13
14#define DM_MSG_PREFIX "btree"
15
16/*----------------------------------------------------------------
17 * Array manipulation
18 *--------------------------------------------------------------*/
19static void memcpy_disk(void *dest, const void *src, size_t len)
20 __dm_written_to_disk(src)
21{
22 memcpy(dest, src, len);
23 __dm_unbless_for_disk(src);
24}
25
26static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
27 unsigned index, void *elt)
28 __dm_written_to_disk(elt)
29{
30 if (index < nr_elts)
31 memmove(base + (elt_size * (index + 1)),
32 base + (elt_size * index),
33 (nr_elts - index) * elt_size);
34
35 memcpy_disk(base + (elt_size * index), elt, elt_size);
36}
37
38/*----------------------------------------------------------------*/
39
40/* makes the assumption that no two keys are the same. */
41static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
42{
43 int lo = -1, hi = le32_to_cpu(n->header.nr_entries);
44
45 while (hi - lo > 1) {
46 int mid = lo + ((hi - lo) / 2);
47 uint64_t mid_key = le64_to_cpu(n->keys[mid]);
48
49 if (mid_key == key)
50 return mid;
51
52 if (mid_key < key)
53 lo = mid;
54 else
55 hi = mid;
56 }
57
58 return want_hi ? hi : lo;
59}
60
61int lower_bound(struct btree_node *n, uint64_t key)
62{
63 return bsearch(n, key, 0);
64}
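/*
 * Illustrative sketch only: the semantics of lower_bound()/bsearch() above,
 * shown on a plain uint64_t array. It returns the index of the greatest key
 * that is <= the search key, or -1 if every key is greater.
 */
#include <stdint.h>
#include <stdio.h>

static int example_lower_bound(const uint64_t *keys, int nr, uint64_t key)
{
	int lo = -1, hi = nr;

	while (hi - lo > 1) {
		int mid = lo + (hi - lo) / 2;

		if (keys[mid] == key)
			return mid;
		if (keys[mid] < key)
			lo = mid;
		else
			hi = mid;
	}
	return lo;
}

int main(void)
{
	uint64_t keys[] = { 10, 20, 30 };

	printf("%d %d %d\n",
	       example_lower_bound(keys, 3, 5),		/* -1: below all keys    */
	       example_lower_bound(keys, 3, 20),	/*  1: exact match       */
	       example_lower_bound(keys, 3, 25));	/*  1: greatest key <= 25 */
	return 0;
}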
65
66void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
67 struct dm_btree_value_type *vt)
68{
69 unsigned i;
70 uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
71
72 if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
73 for (i = 0; i < nr_entries; i++)
74 dm_tm_inc(tm, value64(n, i));
75 else if (vt->inc)
76 for (i = 0; i < nr_entries; i++)
77 vt->inc(vt->context, value_ptr(n, i));
78}
79
80static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
81 uint64_t key, void *value)
82 __dm_written_to_disk(value)
83{
84 uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
85 __le64 key_le = cpu_to_le64(key);
86
87 if (index > nr_entries ||
88 index >= le32_to_cpu(node->header.max_entries)) {
89 DMERR("too many entries in btree node for insert");
90 __dm_unbless_for_disk(value);
91 return -ENOMEM;
92 }
93
94 __dm_bless_for_disk(&key_le);
95
96 array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
97 array_insert(value_base(node), value_size, nr_entries, index, value);
98 node->header.nr_entries = cpu_to_le32(nr_entries + 1);
99
100 return 0;
101}
102
103/*----------------------------------------------------------------*/
104
105/*
106 * We want 3n entries (for some n). This works more nicely for repeated
 107 * insert/remove loops than (2n + 1).
108 */
109static uint32_t calc_max_entries(size_t value_size, size_t block_size)
110{
111 uint32_t total, n;
112 size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */
113
114 block_size -= sizeof(struct node_header);
115 total = block_size / elt_size;
116 n = total / 3; /* rounds down */
117
118 return 3 * n;
119}
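/*
 * Illustrative sketch only: the arithmetic performed by calc_max_entries()
 * for a 4096-byte block and 8-byte values. sizeof(struct node_header) is 32,
 * and each entry costs an 8-byte key plus the value.
 */
#include <stdio.h>

int main(void)
{
	unsigned block_size = 4096;
	unsigned value_size = 8;
	unsigned elt_size = 8 + value_size;		/* key + value = 16   */
	unsigned total = (block_size - 32) / elt_size;	/* 4064 / 16 = 254    */
	unsigned max_entries = 3 * (total / 3);		/* rounds down to 252 */

	printf("max_entries = %u\n", max_entries);
	return 0;
}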
120
121int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
122{
123 int r;
124 struct dm_block *b;
125 struct btree_node *n;
126 size_t block_size;
127 uint32_t max_entries;
128
129 r = new_block(info, &b);
130 if (r < 0)
131 return r;
132
133 block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
134 max_entries = calc_max_entries(info->value_type.size, block_size);
135
136 n = dm_block_data(b);
137 memset(n, 0, block_size);
138 n->header.flags = cpu_to_le32(LEAF_NODE);
139 n->header.nr_entries = cpu_to_le32(0);
140 n->header.max_entries = cpu_to_le32(max_entries);
141 n->header.value_size = cpu_to_le32(info->value_type.size);
142
143 *root = dm_block_location(b);
144 return unlock_block(info, b);
145}
146EXPORT_SYMBOL_GPL(dm_btree_empty);
147
148/*----------------------------------------------------------------*/
149
150/*
 151 * Deletion uses a recursive algorithm. Since we have limited stack space,
 152 * we explicitly manage our own stack on the heap.
153 */
154#define MAX_SPINE_DEPTH 64
155struct frame {
156 struct dm_block *b;
157 struct btree_node *n;
158 unsigned level;
159 unsigned nr_children;
160 unsigned current_child;
161};
162
163struct del_stack {
164 struct dm_transaction_manager *tm;
165 int top;
166 struct frame spine[MAX_SPINE_DEPTH];
167};
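/*
 * Illustrative sketch only: the general pattern used below, i.e. replacing
 * recursion with an explicit, bounded stack of frames on the heap. This toy
 * version walks a binary tree of ints; the dm code does the same with btree
 * nodes, reference counts and read locks. All names here are hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_node {
	int value;
	struct toy_node *left, *right;
};

#define TOY_MAX_DEPTH 64

int toy_visit_all(struct toy_node *root)
{
	struct toy_node **stack;
	int top = -1;

	stack = malloc(TOY_MAX_DEPTH * sizeof(*stack));
	if (!stack)
		return -1;

	if (root)
		stack[++top] = root;

	while (top >= 0) {
		struct toy_node *n = stack[top--];	/* pop the current frame */

		printf("%d\n", n->value);

		if (top + 2 >= TOY_MAX_DEPTH) {		/* bounded, like MAX_SPINE_DEPTH */
			free(stack);
			return -1;
		}
		if (n->left)
			stack[++top] = n->left;
		if (n->right)
			stack[++top] = n->right;
	}

	free(stack);
	return 0;
}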
168
169static int top_frame(struct del_stack *s, struct frame **f)
170{
171 if (s->top < 0) {
172 DMERR("btree deletion stack empty");
173 return -EINVAL;
174 }
175
176 *f = s->spine + s->top;
177
178 return 0;
179}
180
181static int unprocessed_frames(struct del_stack *s)
182{
183 return s->top >= 0;
184}
185
186static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
187{
188 int r;
189 uint32_t ref_count;
190
191 if (s->top >= MAX_SPINE_DEPTH - 1) {
192 DMERR("btree deletion stack out of memory");
193 return -ENOMEM;
194 }
195
196 r = dm_tm_ref(s->tm, b, &ref_count);
197 if (r)
198 return r;
199
200 if (ref_count > 1)
201 /*
 202		 * This is a shared node, so we can just decrement its
203 * reference counter and leave the children.
204 */
205 dm_tm_dec(s->tm, b);
206
207 else {
208 struct frame *f = s->spine + ++s->top;
209
210 r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
211 if (r) {
212 s->top--;
213 return r;
214 }
215
216 f->n = dm_block_data(f->b);
217 f->level = level;
218 f->nr_children = le32_to_cpu(f->n->header.nr_entries);
219 f->current_child = 0;
220 }
221
222 return 0;
223}
224
225static void pop_frame(struct del_stack *s)
226{
227 struct frame *f = s->spine + s->top--;
228
229 dm_tm_dec(s->tm, dm_block_location(f->b));
230 dm_tm_unlock(s->tm, f->b);
231}
232
233static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
234{
235 return f->level < (info->levels - 1);
236}
237
238int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
239{
240 int r;
241 struct del_stack *s;
242
243 s = kmalloc(sizeof(*s), GFP_KERNEL);
244 if (!s)
245 return -ENOMEM;
246 s->tm = info->tm;
247 s->top = -1;
248
249 r = push_frame(s, root, 0);
250 if (r)
251 goto out;
252
253 while (unprocessed_frames(s)) {
254 uint32_t flags;
255 struct frame *f;
256 dm_block_t b;
257
258 r = top_frame(s, &f);
259 if (r)
260 goto out;
261
262 if (f->current_child >= f->nr_children) {
263 pop_frame(s);
264 continue;
265 }
266
267 flags = le32_to_cpu(f->n->header.flags);
268 if (flags & INTERNAL_NODE) {
269 b = value64(f->n, f->current_child);
270 f->current_child++;
271 r = push_frame(s, b, f->level);
272 if (r)
273 goto out;
274
275 } else if (is_internal_level(info, f)) {
276 b = value64(f->n, f->current_child);
277 f->current_child++;
278 r = push_frame(s, b, f->level + 1);
279 if (r)
280 goto out;
281
282 } else {
283 if (info->value_type.dec) {
284 unsigned i;
285
286 for (i = 0; i < f->nr_children; i++)
287 info->value_type.dec(info->value_type.context,
288 value_ptr(f->n, i));
289 }
290 f->current_child = f->nr_children;
291 }
292 }
293
294out:
295 kfree(s);
296 return r;
297}
298EXPORT_SYMBOL_GPL(dm_btree_del);
299
300/*----------------------------------------------------------------*/
301
302static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
303 int (*search_fn)(struct btree_node *, uint64_t),
304 uint64_t *result_key, void *v, size_t value_size)
305{
306 int i, r;
307 uint32_t flags, nr_entries;
308
309 do {
310 r = ro_step(s, block);
311 if (r < 0)
312 return r;
313
314 i = search_fn(ro_node(s), key);
315
316 flags = le32_to_cpu(ro_node(s)->header.flags);
317 nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
318 if (i < 0 || i >= nr_entries)
319 return -ENODATA;
320
321 if (flags & INTERNAL_NODE)
322 block = value64(ro_node(s), i);
323
324 } while (!(flags & LEAF_NODE));
325
326 *result_key = le64_to_cpu(ro_node(s)->keys[i]);
327 memcpy(v, value_ptr(ro_node(s), i), value_size);
328
329 return 0;
330}
331
332int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
333 uint64_t *keys, void *value_le)
334{
335 unsigned level, last_level = info->levels - 1;
336 int r = -ENODATA;
337 uint64_t rkey;
338 __le64 internal_value_le;
339 struct ro_spine spine;
340
341 init_ro_spine(&spine, info);
342 for (level = 0; level < info->levels; level++) {
343 size_t size;
344 void *value_p;
345
346 if (level == last_level) {
347 value_p = value_le;
348 size = info->value_type.size;
349
350 } else {
351 value_p = &internal_value_le;
352 size = sizeof(uint64_t);
353 }
354
355 r = btree_lookup_raw(&spine, root, keys[level],
356 lower_bound, &rkey,
357 value_p, size);
358
359 if (!r) {
360 if (rkey != keys[level]) {
361 exit_ro_spine(&spine);
362 return -ENODATA;
363 }
364 } else {
365 exit_ro_spine(&spine);
366 return r;
367 }
368
369 root = le64_to_cpu(internal_value_le);
370 }
371 exit_ro_spine(&spine);
372
373 return r;
374}
375EXPORT_SYMBOL_GPL(dm_btree_lookup);
376
377/*
 378 * Splits a node by creating a sibling node and shifting half the node's
 379 * contents across. Assumes there is a parent node, and that it has room for
380 * another child.
381 *
382 * Before:
383 * +--------+
384 * | Parent |
385 * +--------+
386 * |
387 * v
388 * +----------+
389 * | A ++++++ |
390 * +----------+
391 *
392 *
393 * After:
394 * +--------+
395 * | Parent |
396 * +--------+
397 * | |
398 * v +------+
399 * +---------+ |
400 * | A* +++ | v
401 * +---------+ +-------+
402 * | B +++ |
403 * +-------+
404 *
405 * Where A* is a shadow of A.
406 */
407static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
408 unsigned parent_index, uint64_t key)
409{
410 int r;
411 size_t size;
412 unsigned nr_left, nr_right;
413 struct dm_block *left, *right, *parent;
414 struct btree_node *ln, *rn, *pn;
415 __le64 location;
416
417 left = shadow_current(s);
418
419 r = new_block(s->info, &right);
420 if (r < 0)
421 return r;
422
423 ln = dm_block_data(left);
424 rn = dm_block_data(right);
425
426 nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
427 nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;
428
429 ln->header.nr_entries = cpu_to_le32(nr_left);
430
431 rn->header.flags = ln->header.flags;
432 rn->header.nr_entries = cpu_to_le32(nr_right);
433 rn->header.max_entries = ln->header.max_entries;
434 rn->header.value_size = ln->header.value_size;
435 memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));
436
437 size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
438 sizeof(uint64_t) : s->info->value_type.size;
439 memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
440 size * nr_right);
441
442 /*
443 * Patch up the parent
444 */
445 parent = shadow_parent(s);
446
447 pn = dm_block_data(parent);
448 location = cpu_to_le64(dm_block_location(left));
449 __dm_bless_for_disk(&location);
450 memcpy_disk(value_ptr(pn, parent_index),
451 &location, sizeof(__le64));
452
453 location = cpu_to_le64(dm_block_location(right));
454 __dm_bless_for_disk(&location);
455
456 r = insert_at(sizeof(__le64), pn, parent_index + 1,
457 le64_to_cpu(rn->keys[0]), &location);
458 if (r)
459 return r;
460
461 if (key < le64_to_cpu(rn->keys[0])) {
462 unlock_block(s->info, right);
463 s->nodes[1] = left;
464 } else {
465 unlock_block(s->info, left);
466 s->nodes[1] = right;
467 }
468
469 return 0;
470}
471
472/*
473 * Splits a node by creating two new children beneath the given node.
474 *
475 * Before:
476 * +----------+
477 * | A ++++++ |
478 * +----------+
479 *
480 *
481 * After:
482 * +------------+
483 * | A (shadow) |
484 * +------------+
485 * | |
486 * +------+ +----+
487 * | |
488 * v v
489 * +-------+ +-------+
490 * | B +++ | | C +++ |
491 * +-------+ +-------+
492 */
493static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
494{
495 int r;
496 size_t size;
497 unsigned nr_left, nr_right;
498 struct dm_block *left, *right, *new_parent;
499 struct btree_node *pn, *ln, *rn;
500 __le64 val;
501
502 new_parent = shadow_current(s);
503
504 r = new_block(s->info, &left);
505 if (r < 0)
506 return r;
507
508 r = new_block(s->info, &right);
509 if (r < 0) {
510 /* FIXME: put left */
511 return r;
512 }
513
514 pn = dm_block_data(new_parent);
515 ln = dm_block_data(left);
516 rn = dm_block_data(right);
517
518 nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
519 nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;
520
521 ln->header.flags = pn->header.flags;
522 ln->header.nr_entries = cpu_to_le32(nr_left);
523 ln->header.max_entries = pn->header.max_entries;
524 ln->header.value_size = pn->header.value_size;
525
526 rn->header.flags = pn->header.flags;
527 rn->header.nr_entries = cpu_to_le32(nr_right);
528 rn->header.max_entries = pn->header.max_entries;
529 rn->header.value_size = pn->header.value_size;
530
531 memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
532 memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
533
534 size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
535 sizeof(__le64) : s->info->value_type.size;
536 memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
537 memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
538 nr_right * size);
539
540 /* new_parent should just point to l and r now */
541 pn->header.flags = cpu_to_le32(INTERNAL_NODE);
542 pn->header.nr_entries = cpu_to_le32(2);
543 pn->header.max_entries = cpu_to_le32(
544 calc_max_entries(sizeof(__le64),
545 dm_bm_block_size(
546 dm_tm_get_bm(s->info->tm))));
547 pn->header.value_size = cpu_to_le32(sizeof(__le64));
548
549 val = cpu_to_le64(dm_block_location(left));
550 __dm_bless_for_disk(&val);
551 pn->keys[0] = ln->keys[0];
552 memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));
553
554 val = cpu_to_le64(dm_block_location(right));
555 __dm_bless_for_disk(&val);
556 pn->keys[1] = rn->keys[0];
557 memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));
558
559 /*
 560	 * Rejig the spine. This is ugly, since it knows too
 561	 * much about the spine.
562 */
563 if (s->nodes[0] != new_parent) {
564 unlock_block(s->info, s->nodes[0]);
565 s->nodes[0] = new_parent;
566 }
567 if (key < le64_to_cpu(rn->keys[0])) {
568 unlock_block(s->info, right);
569 s->nodes[1] = left;
570 } else {
571 unlock_block(s->info, left);
572 s->nodes[1] = right;
573 }
574 s->count = 2;
575
576 return 0;
577}
578
579static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
580 struct dm_btree_value_type *vt,
581 uint64_t key, unsigned *index)
582{
583 int r, i = *index, top = 1;
584 struct btree_node *node;
585
586 for (;;) {
587 r = shadow_step(s, root, vt);
588 if (r < 0)
589 return r;
590
591 node = dm_block_data(shadow_current(s));
592
593 /*
 594		 * We have to patch up the parent node. This is ugly, but I don't
 595		 * see a way to do it automatically as part of the spine
 596		 * op.
597 */
 598		if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause is unnecessary. */
599 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
600
601 __dm_bless_for_disk(&location);
602 memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
603 &location, sizeof(__le64));
604 }
605
606 node = dm_block_data(shadow_current(s));
607
608 if (node->header.nr_entries == node->header.max_entries) {
609 if (top)
610 r = btree_split_beneath(s, key);
611 else
612 r = btree_split_sibling(s, root, i, key);
613
614 if (r < 0)
615 return r;
616 }
617
618 node = dm_block_data(shadow_current(s));
619
620 i = lower_bound(node, key);
621
622 if (le32_to_cpu(node->header.flags) & LEAF_NODE)
623 break;
624
625 if (i < 0) {
626 /* change the bounds on the lowest key */
627 node->keys[0] = cpu_to_le64(key);
628 i = 0;
629 }
630
631 root = value64(node, i);
632 top = 0;
633 }
634
635 if (i < 0 || le64_to_cpu(node->keys[i]) != key)
636 i++;
637
638 *index = i;
639 return 0;
640}
641
642static int insert(struct dm_btree_info *info, dm_block_t root,
643 uint64_t *keys, void *value, dm_block_t *new_root,
644 int *inserted)
645 __dm_written_to_disk(value)
646{
647 int r, need_insert;
648 unsigned level, index = -1, last_level = info->levels - 1;
649 dm_block_t block = root;
650 struct shadow_spine spine;
651 struct btree_node *n;
652 struct dm_btree_value_type le64_type;
653
654 le64_type.context = NULL;
655 le64_type.size = sizeof(__le64);
656 le64_type.inc = NULL;
657 le64_type.dec = NULL;
658 le64_type.equal = NULL;
659
660 init_shadow_spine(&spine, info);
661
662 for (level = 0; level < (info->levels - 1); level++) {
663 r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
664 if (r < 0)
665 goto bad;
666
667 n = dm_block_data(shadow_current(&spine));
668 need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
669 (le64_to_cpu(n->keys[index]) != keys[level]));
670
671 if (need_insert) {
672 dm_block_t new_tree;
673 __le64 new_le;
674
675 r = dm_btree_empty(info, &new_tree);
676 if (r < 0)
677 goto bad;
678
679 new_le = cpu_to_le64(new_tree);
680 __dm_bless_for_disk(&new_le);
681
682 r = insert_at(sizeof(uint64_t), n, index,
683 keys[level], &new_le);
684 if (r)
685 goto bad;
686 }
687
688 if (level < last_level)
689 block = value64(n, index);
690 }
691
692 r = btree_insert_raw(&spine, block, &info->value_type,
693 keys[level], &index);
694 if (r < 0)
695 goto bad;
696
697 n = dm_block_data(shadow_current(&spine));
698 need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
699 (le64_to_cpu(n->keys[index]) != keys[level]));
700
701 if (need_insert) {
702 if (inserted)
703 *inserted = 1;
704
705 r = insert_at(info->value_type.size, n, index,
706 keys[level], value);
707 if (r)
708 goto bad_unblessed;
709 } else {
710 if (inserted)
711 *inserted = 0;
712
713 if (info->value_type.dec &&
714 (!info->value_type.equal ||
715 !info->value_type.equal(
716 info->value_type.context,
717 value_ptr(n, index),
718 value))) {
719 info->value_type.dec(info->value_type.context,
720 value_ptr(n, index));
721 }
722 memcpy_disk(value_ptr(n, index),
723 value, info->value_type.size);
724 }
725
726 *new_root = shadow_root(&spine);
727 exit_shadow_spine(&spine);
728
729 return 0;
730
731bad:
732 __dm_unbless_for_disk(value);
733bad_unblessed:
734 exit_shadow_spine(&spine);
735 return r;
736}
737
738int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
739 uint64_t *keys, void *value, dm_block_t *new_root)
740 __dm_written_to_disk(value)
741{
742 return insert(info, root, keys, value, new_root, NULL);
743}
744EXPORT_SYMBOL_GPL(dm_btree_insert);
745
746int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
747 uint64_t *keys, void *value, dm_block_t *new_root,
748 int *inserted)
749 __dm_written_to_disk(value)
750{
751 return insert(info, root, keys, value, new_root, inserted);
752}
753EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
754
755/*----------------------------------------------------------------*/
756
757static int find_highest_key(struct ro_spine *s, dm_block_t block,
758 uint64_t *result_key, dm_block_t *next_block)
759{
760 int i, r;
761 uint32_t flags;
762
763 do {
764 r = ro_step(s, block);
765 if (r < 0)
766 return r;
767
768 flags = le32_to_cpu(ro_node(s)->header.flags);
769 i = le32_to_cpu(ro_node(s)->header.nr_entries);
770 if (!i)
771 return -ENODATA;
772 else
773 i--;
774
775 *result_key = le64_to_cpu(ro_node(s)->keys[i]);
776 if (next_block || flags & INTERNAL_NODE)
777 block = value64(ro_node(s), i);
778
779 } while (flags & INTERNAL_NODE);
780
781 if (next_block)
782 *next_block = block;
783 return 0;
784}
785
786int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
787 uint64_t *result_keys)
788{
789 int r = 0, count = 0, level;
790 struct ro_spine spine;
791
792 init_ro_spine(&spine, info);
793 for (level = 0; level < info->levels; level++) {
794 r = find_highest_key(&spine, root, result_keys + level,
795 level == info->levels - 1 ? NULL : &root);
796 if (r == -ENODATA) {
797 r = 0;
798 break;
799
800 } else if (r)
801 break;
802
803 count++;
804 }
805 exit_ro_spine(&spine);
806
807 return r ? r : count;
808}
809EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
deleted file mode 100644
index a2cd50441ca..00000000000
--- a/drivers/md/persistent-data/dm-btree.h
+++ /dev/null
@@ -1,145 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6#ifndef _LINUX_DM_BTREE_H
7#define _LINUX_DM_BTREE_H
8
9#include "dm-block-manager.h"
10
11struct dm_transaction_manager;
12
13/*----------------------------------------------------------------*/
14
15/*
16 * Annotations used to check on-disk metadata is handled as little-endian.
17 */
18#ifdef __CHECKER__
19# define __dm_written_to_disk(x) __releases(x)
20# define __dm_reads_from_disk(x) __acquires(x)
21# define __dm_bless_for_disk(x) __acquire(x)
22# define __dm_unbless_for_disk(x) __release(x)
23#else
24# define __dm_written_to_disk(x)
25# define __dm_reads_from_disk(x)
26# define __dm_bless_for_disk(x)
27# define __dm_unbless_for_disk(x)
28#endif
29
30/*----------------------------------------------------------------*/
31
32/*
33 * Manipulates hierarchical B+ trees with 64-bit keys and arbitrary-sized
34 * values.
35 */
36
37/*
38 * Information about the values stored within the btree.
39 */
40struct dm_btree_value_type {
41 void *context;
42
43 /*
44 * The size in bytes of each value.
45 */
46 uint32_t size;
47
48 /*
49 * Any of these methods can be safely set to NULL if you do not
50 * need the corresponding feature.
51 */
52
53 /*
54 * The btree is making a duplicate of the value, for instance
55 * because previously-shared btree nodes have now diverged.
 56	 * The @value argument is the new copy, which this method may modify.
57 * (Probably it just wants to increment a reference count
58 * somewhere.) This method is _not_ called for insertion of a new
59 * value: It is assumed the ref count is already 1.
60 */
61 void (*inc)(void *context, void *value);
62
63 /*
64 * This value is being deleted. The btree takes care of freeing
 65	 * the memory pointed to by @value. Often the dec method just
66 * needs to decrement a reference count somewhere.
67 */
68 void (*dec)(void *context, void *value);
69
70 /*
71 * A test for equality between two values. When a value is
72 * overwritten with a new one, the old one has the dec method
73 * called _unless_ the new and old value are deemed equal.
74 */
75 int (*equal)(void *context, void *value1, void *value2);
76};
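/*
 * Illustrative sketch only (not part of the original header): a value type
 * for 8-byte little-endian block pointers whose reference counts live in a
 * transaction manager. The callback names are hypothetical; dm_tm_inc() and
 * dm_tm_dec() are the transaction-manager helpers used elsewhere in this
 * patch. Values may be unaligned, hence the memcpy.
 */
static void example_block_inc(void *context, void *value_le)
{
	struct dm_transaction_manager *tm = context;
	__le64 v;

	memcpy(&v, value_le, sizeof(v));
	dm_tm_inc(tm, le64_to_cpu(v));
}

static void example_block_dec(void *context, void *value_le)
{
	struct dm_transaction_manager *tm = context;
	__le64 v;

	memcpy(&v, value_le, sizeof(v));
	dm_tm_dec(tm, le64_to_cpu(v));
}

/*
 * struct dm_btree_value_type example_vt = {
 *	.context = tm,
 *	.size    = sizeof(__le64),
 *	.inc     = example_block_inc,
 *	.dec     = example_block_dec,
 *	.equal   = NULL,
 * };
 */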
77
78/*
79 * The shape and contents of a btree.
80 */
81struct dm_btree_info {
82 struct dm_transaction_manager *tm;
83
84 /*
85 * Number of nested btrees. (Not the depth of a single tree.)
86 */
87 unsigned levels;
88 struct dm_btree_value_type value_type;
89};
90
91/*
92 * Set up an empty tree. O(1).
93 */
94int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root);
95
96/*
97 * Delete a tree. O(n) - this is the slow one! It can also block, so
98 * please don't call it on an IO path.
99 */
100int dm_btree_del(struct dm_btree_info *info, dm_block_t root);
101
102/*
103 * All the lookup functions return -ENODATA if the key cannot be found.
104 */
105
106/*
107 * Tries to find a key that matches exactly. O(ln(n))
108 */
109int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
110 uint64_t *keys, void *value_le);
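/*
 * Illustrative sketch only: looking up a value in a two-level btree
 * (info->levels == 2), one key per level. The caller, key names and the
 * assumption of an 8-byte value are hypothetical.
 */
static int example_lookup(struct dm_btree_info *info, dm_block_t root,
			  uint64_t dev_id, uint64_t block, __le64 *result_le)
{
	uint64_t keys[2];

	keys[0] = dev_id;
	keys[1] = block;

	/* Returns -ENODATA if either level lacks the key. */
	return dm_btree_lookup(info, root, keys, result_le);
}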
111
112/*
113 * Insertion (or overwrite an existing value). O(ln(n))
114 */
115int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
116 uint64_t *keys, void *value, dm_block_t *new_root)
117 __dm_written_to_disk(value);
118
119/*
120 * A variant of insert that indicates whether it actually inserted or just
121 * overwrote. Useful if you're keeping track of the number of entries in a
122 * tree.
123 */
124int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
125 uint64_t *keys, void *value, dm_block_t *new_root,
126 int *inserted)
127 __dm_written_to_disk(value);
128
129/*
 130 * Remove a key if present. This doesn't remove empty subtrees. Normally
131 * subtrees represent a separate entity, like a snapshot map, so this is
132 * correct behaviour. O(ln(n)).
133 */
134int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
135 uint64_t *keys, dm_block_t *new_root);
136
137/*
138 * Returns < 0 on failure. Otherwise the number of key entries that have
139 * been filled out. Remember trees can have zero entries, and as such have
140 * no highest key.
141 */
142int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
143 uint64_t *result_keys);
144
145#endif /* _LINUX_DM_BTREE_H */
diff --git a/drivers/md/persistent-data/dm-persistent-data-internal.h b/drivers/md/persistent-data/dm-persistent-data-internal.h
deleted file mode 100644
index c49e26fff36..00000000000
--- a/drivers/md/persistent-data/dm-persistent-data-internal.h
+++ /dev/null
@@ -1,19 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef _DM_PERSISTENT_DATA_INTERNAL_H
8#define _DM_PERSISTENT_DATA_INTERNAL_H
9
10#include "dm-block-manager.h"
11
12static inline unsigned dm_hash_block(dm_block_t b, unsigned hash_mask)
13{
14 const unsigned BIG_PRIME = 4294967291UL;
15
16 return (((unsigned) b) * BIG_PRIME) & hash_mask;
17}
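/*
 * Illustrative sketch only: using the multiplicative hash above with a
 * power-of-two table. With 256 buckets the mask is 255, so the bucket index
 * is always 0..255. The bucket count is a hypothetical example.
 */
static inline unsigned example_bucket_for(dm_block_t b)
{
	const unsigned nr_buckets = 256;	/* must be a power of two */

	return dm_hash_block(b, nr_buckets - 1);
}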
18
 19#endif /* _DM_PERSISTENT_DATA_INTERNAL_H */
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
deleted file mode 100644
index 3e7a88d99eb..00000000000
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ /dev/null
@@ -1,712 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-space-map-common.h"
8#include "dm-transaction-manager.h"
9
10#include <linux/bitops.h>
11#include <linux/device-mapper.h>
12
13#define DM_MSG_PREFIX "space map common"
14
15/*----------------------------------------------------------------*/
16
17/*
18 * Index validator.
19 */
20#define INDEX_CSUM_XOR 160478
21
22static void index_prepare_for_write(struct dm_block_validator *v,
23 struct dm_block *b,
24 size_t block_size)
25{
26 struct disk_metadata_index *mi_le = dm_block_data(b);
27
28 mi_le->blocknr = cpu_to_le64(dm_block_location(b));
29 mi_le->csum = cpu_to_le32(dm_bm_checksum(&mi_le->padding,
30 block_size - sizeof(__le32),
31 INDEX_CSUM_XOR));
32}
33
34static int index_check(struct dm_block_validator *v,
35 struct dm_block *b,
36 size_t block_size)
37{
38 struct disk_metadata_index *mi_le = dm_block_data(b);
39 __le32 csum_disk;
40
41 if (dm_block_location(b) != le64_to_cpu(mi_le->blocknr)) {
42 DMERR_LIMIT("index_check failed: blocknr %llu != wanted %llu",
43 le64_to_cpu(mi_le->blocknr), dm_block_location(b));
44 return -ENOTBLK;
45 }
46
47 csum_disk = cpu_to_le32(dm_bm_checksum(&mi_le->padding,
48 block_size - sizeof(__le32),
49 INDEX_CSUM_XOR));
50 if (csum_disk != mi_le->csum) {
51 DMERR_LIMIT("index_check failed: csum %u != wanted %u",
52 le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum));
53 return -EILSEQ;
54 }
55
56 return 0;
57}
58
59static struct dm_block_validator index_validator = {
60 .name = "index",
61 .prepare_for_write = index_prepare_for_write,
62 .check = index_check
63};
64
65/*----------------------------------------------------------------*/
66
67/*
68 * Bitmap validator
69 */
70#define BITMAP_CSUM_XOR 240779
71
72static void bitmap_prepare_for_write(struct dm_block_validator *v,
73 struct dm_block *b,
74 size_t block_size)
75{
76 struct disk_bitmap_header *disk_header = dm_block_data(b);
77
78 disk_header->blocknr = cpu_to_le64(dm_block_location(b));
79 disk_header->csum = cpu_to_le32(dm_bm_checksum(&disk_header->not_used,
80 block_size - sizeof(__le32),
81 BITMAP_CSUM_XOR));
82}
83
84static int bitmap_check(struct dm_block_validator *v,
85 struct dm_block *b,
86 size_t block_size)
87{
88 struct disk_bitmap_header *disk_header = dm_block_data(b);
89 __le32 csum_disk;
90
91 if (dm_block_location(b) != le64_to_cpu(disk_header->blocknr)) {
92 DMERR_LIMIT("bitmap check failed: blocknr %llu != wanted %llu",
93 le64_to_cpu(disk_header->blocknr), dm_block_location(b));
94 return -ENOTBLK;
95 }
96
97 csum_disk = cpu_to_le32(dm_bm_checksum(&disk_header->not_used,
98 block_size - sizeof(__le32),
99 BITMAP_CSUM_XOR));
100 if (csum_disk != disk_header->csum) {
101 DMERR_LIMIT("bitmap check failed: csum %u != wanted %u",
102 le32_to_cpu(csum_disk), le32_to_cpu(disk_header->csum));
103 return -EILSEQ;
104 }
105
106 return 0;
107}
108
109static struct dm_block_validator dm_sm_bitmap_validator = {
110 .name = "sm_bitmap",
111 .prepare_for_write = bitmap_prepare_for_write,
112 .check = bitmap_check
113};
114
115/*----------------------------------------------------------------*/
116
117#define ENTRIES_PER_WORD 32
118#define ENTRIES_SHIFT 5
119
120static void *dm_bitmap_data(struct dm_block *b)
121{
122 return dm_block_data(b) + sizeof(struct disk_bitmap_header);
123}
124
125#define WORD_MASK_HIGH 0xAAAAAAAAAAAAAAAAULL
126
127static unsigned bitmap_word_used(void *addr, unsigned b)
128{
129 __le64 *words_le = addr;
130 __le64 *w_le = words_le + (b >> ENTRIES_SHIFT);
131
132 uint64_t bits = le64_to_cpu(*w_le);
133 uint64_t mask = (bits + WORD_MASK_HIGH + 1) & WORD_MASK_HIGH;
134
135 return !(~bits & mask);
136}
137
138static unsigned sm_lookup_bitmap(void *addr, unsigned b)
139{
140 __le64 *words_le = addr;
141 __le64 *w_le = words_le + (b >> ENTRIES_SHIFT);
142 unsigned hi, lo;
143
144 b = (b & (ENTRIES_PER_WORD - 1)) << 1;
145 hi = !!test_bit_le(b, (void *) w_le);
146 lo = !!test_bit_le(b + 1, (void *) w_le);
147 return (hi << 1) | lo;
148}
149
150static void sm_set_bitmap(void *addr, unsigned b, unsigned val)
151{
152 __le64 *words_le = addr;
153 __le64 *w_le = words_le + (b >> ENTRIES_SHIFT);
154
155 b = (b & (ENTRIES_PER_WORD - 1)) << 1;
156
157 if (val & 2)
158 __set_bit_le(b, (void *) w_le);
159 else
160 __clear_bit_le(b, (void *) w_le);
161
162 if (val & 1)
163 __set_bit_le(b + 1, (void *) w_le);
164 else
165 __clear_bit_le(b + 1, (void *) w_le);
166}
167
168static int sm_find_free(void *addr, unsigned begin, unsigned end,
169 unsigned *result)
170{
171 while (begin < end) {
172 if (!(begin & (ENTRIES_PER_WORD - 1)) &&
173 bitmap_word_used(addr, begin)) {
174 begin += ENTRIES_PER_WORD;
175 continue;
176 }
177
178 if (!sm_lookup_bitmap(addr, begin)) {
179 *result = begin;
180 return 0;
181 }
182
183 begin++;
184 }
185
186 return -ENOSPC;
187}
188
189/*----------------------------------------------------------------*/
190
191static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm)
192{
193 ll->tm = tm;
194
195 ll->bitmap_info.tm = tm;
196 ll->bitmap_info.levels = 1;
197
198 /*
199 * Because the new bitmap blocks are created via a shadow
200 * operation, the old entry has already had its reference count
201 * decremented and we don't need the btree to do any bookkeeping.
202 */
203 ll->bitmap_info.value_type.size = sizeof(struct disk_index_entry);
204 ll->bitmap_info.value_type.inc = NULL;
205 ll->bitmap_info.value_type.dec = NULL;
206 ll->bitmap_info.value_type.equal = NULL;
207
208 ll->ref_count_info.tm = tm;
209 ll->ref_count_info.levels = 1;
210 ll->ref_count_info.value_type.size = sizeof(uint32_t);
211 ll->ref_count_info.value_type.inc = NULL;
212 ll->ref_count_info.value_type.dec = NULL;
213 ll->ref_count_info.value_type.equal = NULL;
214
215 ll->block_size = dm_bm_block_size(dm_tm_get_bm(tm));
216
217 if (ll->block_size > (1 << 30)) {
218 DMERR("block size too big to hold bitmaps");
219 return -EINVAL;
220 }
221
222 ll->entries_per_block = (ll->block_size - sizeof(struct disk_bitmap_header)) *
223 ENTRIES_PER_BYTE;
224 ll->nr_blocks = 0;
225 ll->bitmap_root = 0;
226 ll->ref_count_root = 0;
227 ll->bitmap_index_changed = false;
228
229 return 0;
230}
231
232int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
233{
234 int r;
235 dm_block_t i, nr_blocks, nr_indexes;
236 unsigned old_blocks, blocks;
237
238 nr_blocks = ll->nr_blocks + extra_blocks;
239 old_blocks = dm_sector_div_up(ll->nr_blocks, ll->entries_per_block);
240 blocks = dm_sector_div_up(nr_blocks, ll->entries_per_block);
241
242 nr_indexes = dm_sector_div_up(nr_blocks, ll->entries_per_block);
243 if (nr_indexes > ll->max_entries(ll)) {
244 DMERR("space map too large");
245 return -EINVAL;
246 }
247
248 for (i = old_blocks; i < blocks; i++) {
249 struct dm_block *b;
250 struct disk_index_entry idx;
251
252 r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b);
253 if (r < 0)
254 return r;
255 idx.blocknr = cpu_to_le64(dm_block_location(b));
256
257 r = dm_tm_unlock(ll->tm, b);
258 if (r < 0)
259 return r;
260
261 idx.nr_free = cpu_to_le32(ll->entries_per_block);
262 idx.none_free_before = 0;
263
264 r = ll->save_ie(ll, i, &idx);
265 if (r < 0)
266 return r;
267 }
268
269 ll->nr_blocks = nr_blocks;
270 return 0;
271}
272
273int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result)
274{
275 int r;
276 dm_block_t index = b;
277 struct disk_index_entry ie_disk;
278 struct dm_block *blk;
279
280 b = do_div(index, ll->entries_per_block);
281 r = ll->load_ie(ll, index, &ie_disk);
282 if (r < 0)
283 return r;
284
285 r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk.blocknr),
286 &dm_sm_bitmap_validator, &blk);
287 if (r < 0)
288 return r;
289
290 *result = sm_lookup_bitmap(dm_bitmap_data(blk), b);
291
292 return dm_tm_unlock(ll->tm, blk);
293}
294
295int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result)
296{
297 __le32 le_rc;
298 int r = sm_ll_lookup_bitmap(ll, b, result);
299
300 if (r)
301 return r;
302
303 if (*result != 3)
304 return r;
305
306 r = dm_btree_lookup(&ll->ref_count_info, ll->ref_count_root, &b, &le_rc);
307 if (r < 0)
308 return r;
309
310 *result = le32_to_cpu(le_rc);
311
312 return r;
313}
314
315int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
316 dm_block_t end, dm_block_t *result)
317{
318 int r;
319 struct disk_index_entry ie_disk;
320 dm_block_t i, index_begin = begin;
321 dm_block_t index_end = dm_sector_div_up(end, ll->entries_per_block);
322
323 /*
324 * FIXME: Use shifts
325 */
326 begin = do_div(index_begin, ll->entries_per_block);
327 end = do_div(end, ll->entries_per_block);
328
329 for (i = index_begin; i < index_end; i++, begin = 0) {
330 struct dm_block *blk;
331 unsigned position;
332 uint32_t bit_end;
333
334 r = ll->load_ie(ll, i, &ie_disk);
335 if (r < 0)
336 return r;
337
338 if (le32_to_cpu(ie_disk.nr_free) == 0)
339 continue;
340
341 r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk.blocknr),
342 &dm_sm_bitmap_validator, &blk);
343 if (r < 0)
344 return r;
345
346 bit_end = (i == index_end - 1) ? end : ll->entries_per_block;
347
348 r = sm_find_free(dm_bitmap_data(blk),
349 max_t(unsigned, begin, le32_to_cpu(ie_disk.none_free_before)),
350 bit_end, &position);
351 if (r == -ENOSPC) {
352 /*
353 * This might happen because we started searching
354 * part way through the bitmap.
355 */
356 dm_tm_unlock(ll->tm, blk);
357 continue;
358
359 } else if (r < 0) {
360 dm_tm_unlock(ll->tm, blk);
361 return r;
362 }
363
364 r = dm_tm_unlock(ll->tm, blk);
365 if (r < 0)
366 return r;
367
368 *result = i * ll->entries_per_block + (dm_block_t) position;
369 return 0;
370 }
371
372 return -ENOSPC;
373}
374
375int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
376 uint32_t ref_count, enum allocation_event *ev)
377{
378 int r;
379 uint32_t bit, old;
380 struct dm_block *nb;
381 dm_block_t index = b;
382 struct disk_index_entry ie_disk;
383 void *bm_le;
384 int inc;
385
386 bit = do_div(index, ll->entries_per_block);
387 r = ll->load_ie(ll, index, &ie_disk);
388 if (r < 0)
389 return r;
390
391 r = dm_tm_shadow_block(ll->tm, le64_to_cpu(ie_disk.blocknr),
392 &dm_sm_bitmap_validator, &nb, &inc);
393 if (r < 0) {
394 DMERR("dm_tm_shadow_block() failed");
395 return r;
396 }
397 ie_disk.blocknr = cpu_to_le64(dm_block_location(nb));
398
399 bm_le = dm_bitmap_data(nb);
400 old = sm_lookup_bitmap(bm_le, bit);
401
402 if (ref_count <= 2) {
403 sm_set_bitmap(bm_le, bit, ref_count);
404
405 r = dm_tm_unlock(ll->tm, nb);
406 if (r < 0)
407 return r;
408
409 if (old > 2) {
410 r = dm_btree_remove(&ll->ref_count_info,
411 ll->ref_count_root,
412 &b, &ll->ref_count_root);
413 if (r)
414 return r;
415 }
416
417 } else {
418 __le32 le_rc = cpu_to_le32(ref_count);
419
420 sm_set_bitmap(bm_le, bit, 3);
421 r = dm_tm_unlock(ll->tm, nb);
422 if (r < 0)
423 return r;
424
425 __dm_bless_for_disk(&le_rc);
426 r = dm_btree_insert(&ll->ref_count_info, ll->ref_count_root,
427 &b, &le_rc, &ll->ref_count_root);
428 if (r < 0) {
429 DMERR("ref count insert failed");
430 return r;
431 }
432 }
433
434 if (ref_count && !old) {
435 *ev = SM_ALLOC;
436 ll->nr_allocated++;
437 le32_add_cpu(&ie_disk.nr_free, -1);
438 if (le32_to_cpu(ie_disk.none_free_before) == bit)
439 ie_disk.none_free_before = cpu_to_le32(bit + 1);
440
441 } else if (old && !ref_count) {
442 *ev = SM_FREE;
443 ll->nr_allocated--;
444 le32_add_cpu(&ie_disk.nr_free, 1);
445 ie_disk.none_free_before = cpu_to_le32(min(le32_to_cpu(ie_disk.none_free_before), bit));
446 }
447
448 return ll->save_ie(ll, index, &ie_disk);
449}
450
451int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
452{
453 int r;
454 uint32_t rc;
455
456 r = sm_ll_lookup(ll, b, &rc);
457 if (r)
458 return r;
459
460 return sm_ll_insert(ll, b, rc + 1, ev);
461}
462
463int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
464{
465 int r;
466 uint32_t rc;
467
468 r = sm_ll_lookup(ll, b, &rc);
469 if (r)
470 return r;
471
472 if (!rc)
473 return -EINVAL;
474
475 return sm_ll_insert(ll, b, rc - 1, ev);
476}
477
478int sm_ll_commit(struct ll_disk *ll)
479{
480 int r = 0;
481
482 if (ll->bitmap_index_changed) {
483 r = ll->commit(ll);
484 if (!r)
485 ll->bitmap_index_changed = false;
486 }
487
488 return r;
489}
490
491/*----------------------------------------------------------------*/
492
493static int metadata_ll_load_ie(struct ll_disk *ll, dm_block_t index,
494 struct disk_index_entry *ie)
495{
496 memcpy(ie, ll->mi_le.index + index, sizeof(*ie));
497 return 0;
498}
499
500static int metadata_ll_save_ie(struct ll_disk *ll, dm_block_t index,
501 struct disk_index_entry *ie)
502{
503 ll->bitmap_index_changed = true;
504 memcpy(ll->mi_le.index + index, ie, sizeof(*ie));
505 return 0;
506}
507
508static int metadata_ll_init_index(struct ll_disk *ll)
509{
510 int r;
511 struct dm_block *b;
512
513 r = dm_tm_new_block(ll->tm, &index_validator, &b);
514 if (r < 0)
515 return r;
516
517 memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le));
518 ll->bitmap_root = dm_block_location(b);
519
520 return dm_tm_unlock(ll->tm, b);
521}
522
523static int metadata_ll_open(struct ll_disk *ll)
524{
525 int r;
526 struct dm_block *block;
527
528 r = dm_tm_read_lock(ll->tm, ll->bitmap_root,
529 &index_validator, &block);
530 if (r)
531 return r;
532
533 memcpy(&ll->mi_le, dm_block_data(block), sizeof(ll->mi_le));
534 return dm_tm_unlock(ll->tm, block);
535}
536
537static dm_block_t metadata_ll_max_entries(struct ll_disk *ll)
538{
539 return MAX_METADATA_BITMAPS;
540}
541
542static int metadata_ll_commit(struct ll_disk *ll)
543{
544 int r, inc;
545 struct dm_block *b;
546
547 r = dm_tm_shadow_block(ll->tm, ll->bitmap_root, &index_validator, &b, &inc);
548 if (r)
549 return r;
550
551 memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le));
552 ll->bitmap_root = dm_block_location(b);
553
554 return dm_tm_unlock(ll->tm, b);
555}
556
557int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm)
558{
559 int r;
560
561 r = sm_ll_init(ll, tm);
562 if (r < 0)
563 return r;
564
565 ll->load_ie = metadata_ll_load_ie;
566 ll->save_ie = metadata_ll_save_ie;
567 ll->init_index = metadata_ll_init_index;
568 ll->open_index = metadata_ll_open;
569 ll->max_entries = metadata_ll_max_entries;
570 ll->commit = metadata_ll_commit;
571
572 ll->nr_blocks = 0;
573 ll->nr_allocated = 0;
574
575 r = ll->init_index(ll);
576 if (r < 0)
577 return r;
578
579 r = dm_btree_empty(&ll->ref_count_info, &ll->ref_count_root);
580 if (r < 0)
581 return r;
582
583 return 0;
584}
585
586int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm,
587 void *root_le, size_t len)
588{
589 int r;
590 struct disk_sm_root *smr = root_le;
591
592 if (len < sizeof(struct disk_sm_root)) {
593 DMERR("sm_metadata root too small");
594 return -ENOMEM;
595 }
596
597 r = sm_ll_init(ll, tm);
598 if (r < 0)
599 return r;
600
601 ll->load_ie = metadata_ll_load_ie;
602 ll->save_ie = metadata_ll_save_ie;
603 ll->init_index = metadata_ll_init_index;
604 ll->open_index = metadata_ll_open;
605 ll->max_entries = metadata_ll_max_entries;
606 ll->commit = metadata_ll_commit;
607
608 ll->nr_blocks = le64_to_cpu(smr->nr_blocks);
609 ll->nr_allocated = le64_to_cpu(smr->nr_allocated);
610 ll->bitmap_root = le64_to_cpu(smr->bitmap_root);
611 ll->ref_count_root = le64_to_cpu(smr->ref_count_root);
612
613 return ll->open_index(ll);
614}
615
616/*----------------------------------------------------------------*/
617
618static int disk_ll_load_ie(struct ll_disk *ll, dm_block_t index,
619 struct disk_index_entry *ie)
620{
621 return dm_btree_lookup(&ll->bitmap_info, ll->bitmap_root, &index, ie);
622}
623
624static int disk_ll_save_ie(struct ll_disk *ll, dm_block_t index,
625 struct disk_index_entry *ie)
626{
627 __dm_bless_for_disk(ie);
628 return dm_btree_insert(&ll->bitmap_info, ll->bitmap_root,
629 &index, ie, &ll->bitmap_root);
630}
631
632static int disk_ll_init_index(struct ll_disk *ll)
633{
634 return dm_btree_empty(&ll->bitmap_info, &ll->bitmap_root);
635}
636
637static int disk_ll_open(struct ll_disk *ll)
638{
639 /* nothing to do */
640 return 0;
641}
642
643static dm_block_t disk_ll_max_entries(struct ll_disk *ll)
644{
645 return -1ULL;
646}
647
648static int disk_ll_commit(struct ll_disk *ll)
649{
650 return 0;
651}
652
653int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm)
654{
655 int r;
656
657 r = sm_ll_init(ll, tm);
658 if (r < 0)
659 return r;
660
661 ll->load_ie = disk_ll_load_ie;
662 ll->save_ie = disk_ll_save_ie;
663 ll->init_index = disk_ll_init_index;
664 ll->open_index = disk_ll_open;
665 ll->max_entries = disk_ll_max_entries;
666 ll->commit = disk_ll_commit;
667
668 ll->nr_blocks = 0;
669 ll->nr_allocated = 0;
670
671 r = ll->init_index(ll);
672 if (r < 0)
673 return r;
674
675 r = dm_btree_empty(&ll->ref_count_info, &ll->ref_count_root);
676 if (r < 0)
677 return r;
678
679 return 0;
680}
681
682int sm_ll_open_disk(struct ll_disk *ll, struct dm_transaction_manager *tm,
683 void *root_le, size_t len)
684{
685 int r;
686 struct disk_sm_root *smr = root_le;
687
688 if (len < sizeof(struct disk_sm_root)) {
689		DMERR("sm_disk root too small");
690 return -ENOMEM;
691 }
692
693 r = sm_ll_init(ll, tm);
694 if (r < 0)
695 return r;
696
697 ll->load_ie = disk_ll_load_ie;
698 ll->save_ie = disk_ll_save_ie;
699 ll->init_index = disk_ll_init_index;
700 ll->open_index = disk_ll_open;
701 ll->max_entries = disk_ll_max_entries;
702 ll->commit = disk_ll_commit;
703
704 ll->nr_blocks = le64_to_cpu(smr->nr_blocks);
705 ll->nr_allocated = le64_to_cpu(smr->nr_allocated);
706 ll->bitmap_root = le64_to_cpu(smr->bitmap_root);
707 ll->ref_count_root = le64_to_cpu(smr->ref_count_root);
708
709 return ll->open_index(ll);
710}
711
712/*----------------------------------------------------------------*/
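
The heart of the file above is the two-bit packing handled by sm_lookup_bitmap() and sm_set_bitmap(): counts 0, 1 and 2 live directly in the bitmap, and the value 3 means "look the real count up in the ref-count btree". The sketch below restates that packing on a plain uint64_t word; it is an illustration only and deliberately ignores the little-endian bit ordering that the kernel helpers take care of.

/* Userspace-style illustration of 32 two-bit entries per 64-bit word. */
#include <stdint.h>

static unsigned lookup2(uint64_t word, unsigned entry)
{
	unsigned shift = (entry & 31) * 2;

	return (word >> shift) & 3;		/* 3 == overflow to ref-count btree */
}

static uint64_t set2(uint64_t word, unsigned entry, unsigned count)
{
	unsigned shift = (entry & 31) * 2;

	word &= ~((uint64_t)3 << shift);	/* clear the old two bits */
	return word | ((uint64_t)(count & 3) << shift);
}
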
diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h
deleted file mode 100644
index b3078d5eda0..00000000000
--- a/drivers/md/persistent-data/dm-space-map-common.h
+++ /dev/null
@@ -1,127 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_SPACE_MAP_COMMON_H
8#define DM_SPACE_MAP_COMMON_H
9
10#include "dm-btree.h"
11
12/*----------------------------------------------------------------*/
13
14/*
15 * Low level disk format
16 *
17 * Bitmap btree
18 * ------------
19 *
20 * Each value stored in the btree is an index_entry. This points to a
21 * block that is used as a bitmap. Within the bitmap we hold 2 bits per
22 * entry, which represent UNUSED = 0, REF_COUNT = 1, REF_COUNT = 2 and
23 * REF_COUNT = many.
24 *
25 * Refcount btree
26 * --------------
27 *
28 * Any entry that has a ref count higher than 2 gets entered in the ref
29 * count tree. The leaf value for this tree is the 32-bit ref count.
30 */
31
32struct disk_index_entry {
33 __le64 blocknr;
34 __le32 nr_free;
35 __le32 none_free_before;
36} __packed;
37
38
39#define MAX_METADATA_BITMAPS 255
40struct disk_metadata_index {
41 __le32 csum;
42 __le32 padding;
43 __le64 blocknr;
44
45 struct disk_index_entry index[MAX_METADATA_BITMAPS];
46} __packed;
47
48struct ll_disk;
49
50typedef int (*load_ie_fn)(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *result);
51typedef int (*save_ie_fn)(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *ie);
52typedef int (*init_index_fn)(struct ll_disk *ll);
53typedef int (*open_index_fn)(struct ll_disk *ll);
54typedef dm_block_t (*max_index_entries_fn)(struct ll_disk *ll);
55typedef int (*commit_fn)(struct ll_disk *ll);
56
57struct ll_disk {
58 struct dm_transaction_manager *tm;
59 struct dm_btree_info bitmap_info;
60 struct dm_btree_info ref_count_info;
61
62 uint32_t block_size;
63 uint32_t entries_per_block;
64 dm_block_t nr_blocks;
65 dm_block_t nr_allocated;
66
67 /*
68 * bitmap_root may be a btree root or a simple index.
69 */
70 dm_block_t bitmap_root;
71
72 dm_block_t ref_count_root;
73
74 struct disk_metadata_index mi_le;
75 load_ie_fn load_ie;
76 save_ie_fn save_ie;
77 init_index_fn init_index;
78 open_index_fn open_index;
79 max_index_entries_fn max_entries;
80 commit_fn commit;
81 bool bitmap_index_changed:1;
82};
83
84struct disk_sm_root {
85 __le64 nr_blocks;
86 __le64 nr_allocated;
87 __le64 bitmap_root;
88 __le64 ref_count_root;
89} __packed;
90
91#define ENTRIES_PER_BYTE 4
92
93struct disk_bitmap_header {
94 __le32 csum;
95 __le32 not_used;
96 __le64 blocknr;
97} __packed;
98
99enum allocation_event {
100 SM_NONE,
101 SM_ALLOC,
102 SM_FREE,
103};
104
105/*----------------------------------------------------------------*/
106
107int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks);
108int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result);
109int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result);
110int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
111 dm_block_t end, dm_block_t *result);
112int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev);
113int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
114int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
115int sm_ll_commit(struct ll_disk *ll);
116
117int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm);
118int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm,
119 void *root_le, size_t len);
120
121int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm);
122int sm_ll_open_disk(struct ll_disk *ll, struct dm_transaction_manager *tm,
123 void *root_le, size_t len);
124
125/*----------------------------------------------------------------*/
126
127#endif /* DM_SPACE_MAP_COMMON_H */
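
With ENTRIES_PER_BYTE fixed at 4, the number of reference counts a single bitmap block can hold follows directly from the block size minus the disk_bitmap_header, which is exactly the calculation sm_ll_init() performs. A worked example, with the 4096-byte block size as an assumption:

/* Assumed 4096-byte blocks: the 16-byte header leaves 4080 bytes,
 * i.e. 4080 * ENTRIES_PER_BYTE = 16320 counts per bitmap block. */
uint32_t block_size = 4096;
uint32_t entries_per_block =
	(block_size - sizeof(struct disk_bitmap_header)) * ENTRIES_PER_BYTE;
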
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
deleted file mode 100644
index f6d29e614ab..00000000000
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ /dev/null
@@ -1,318 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-space-map-common.h"
8#include "dm-space-map-disk.h"
9#include "dm-space-map.h"
10#include "dm-transaction-manager.h"
11
12#include <linux/list.h>
13#include <linux/slab.h>
14#include <linux/export.h>
15#include <linux/device-mapper.h>
16
17#define DM_MSG_PREFIX "space map disk"
18
19/*----------------------------------------------------------------*/
20
21/*
22 * Space map interface.
23 */
24struct sm_disk {
25 struct dm_space_map sm;
26
27 struct ll_disk ll;
28 struct ll_disk old_ll;
29
30 dm_block_t begin;
31 dm_block_t nr_allocated_this_transaction;
32};
33
34static void sm_disk_destroy(struct dm_space_map *sm)
35{
36 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
37
38 kfree(smd);
39}
40
41static int sm_disk_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
42{
43 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
44
45 return sm_ll_extend(&smd->ll, extra_blocks);
46}
47
48static int sm_disk_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
49{
50 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
51 *count = smd->old_ll.nr_blocks;
52
53 return 0;
54}
55
56static int sm_disk_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
57{
58 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
59 *count = (smd->old_ll.nr_blocks - smd->old_ll.nr_allocated) - smd->nr_allocated_this_transaction;
60
61 return 0;
62}
63
64static int sm_disk_get_count(struct dm_space_map *sm, dm_block_t b,
65 uint32_t *result)
66{
67 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
68 return sm_ll_lookup(&smd->ll, b, result);
69}
70
71static int sm_disk_count_is_more_than_one(struct dm_space_map *sm, dm_block_t b,
72 int *result)
73{
74 int r;
75 uint32_t count;
76
77 r = sm_disk_get_count(sm, b, &count);
78 if (r)
79 return r;
80
81 return count > 1;
82}
83
84static int sm_disk_set_count(struct dm_space_map *sm, dm_block_t b,
85 uint32_t count)
86{
87 int r;
88 uint32_t old_count;
89 enum allocation_event ev;
90 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
91
92 r = sm_ll_insert(&smd->ll, b, count, &ev);
93 if (!r) {
94 switch (ev) {
95 case SM_NONE:
96 break;
97
98 case SM_ALLOC:
99 /*
100 * This _must_ be free in the prior transaction
101 * otherwise we've lost atomicity.
102 */
103 smd->nr_allocated_this_transaction++;
104 break;
105
106 case SM_FREE:
107 /*
108 * It's only free if it's also free in the last
109 * transaction.
110 */
111 r = sm_ll_lookup(&smd->old_ll, b, &old_count);
112 if (r)
113 return r;
114
115 if (!old_count)
116 smd->nr_allocated_this_transaction--;
117 break;
118 }
119 }
120
121 return r;
122}
123
124static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b)
125{
126 int r;
127 enum allocation_event ev;
128 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
129
130 r = sm_ll_inc(&smd->ll, b, &ev);
131 if (!r && (ev == SM_ALLOC))
132 /*
133 * This _must_ be free in the prior transaction
134 * otherwise we've lost atomicity.
135 */
136 smd->nr_allocated_this_transaction++;
137
138 return r;
139}
140
141static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b)
142{
143 int r;
144 uint32_t old_count;
145 enum allocation_event ev;
146 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
147
148 r = sm_ll_dec(&smd->ll, b, &ev);
149 if (!r && (ev == SM_FREE)) {
150 /*
151 * It's only free if it's also free in the last
152 * transaction.
153 */
154 r = sm_ll_lookup(&smd->old_ll, b, &old_count);
155 if (r)
156 return r;
157
158 if (!old_count)
159 smd->nr_allocated_this_transaction--;
160 }
161
162 return r;
163}
164
165static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
166{
167 int r;
168 enum allocation_event ev;
169 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
170
171 /* FIXME: we should loop round a couple of times */
172 r = sm_ll_find_free_block(&smd->old_ll, smd->begin, smd->old_ll.nr_blocks, b);
173 if (r)
174 return r;
175
176 smd->begin = *b + 1;
177 r = sm_ll_inc(&smd->ll, *b, &ev);
178 if (!r) {
179 BUG_ON(ev != SM_ALLOC);
180 smd->nr_allocated_this_transaction++;
181 }
182
183 return r;
184}
185
186static int sm_disk_commit(struct dm_space_map *sm)
187{
188 int r;
189 dm_block_t nr_free;
190 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
191
192 r = sm_disk_get_nr_free(sm, &nr_free);
193 if (r)
194 return r;
195
196 r = sm_ll_commit(&smd->ll);
197 if (r)
198 return r;
199
200 memcpy(&smd->old_ll, &smd->ll, sizeof(smd->old_ll));
201 smd->begin = 0;
202 smd->nr_allocated_this_transaction = 0;
203
204 r = sm_disk_get_nr_free(sm, &nr_free);
205 if (r)
206 return r;
207
208 return 0;
209}
210
211static int sm_disk_root_size(struct dm_space_map *sm, size_t *result)
212{
213 *result = sizeof(struct disk_sm_root);
214
215 return 0;
216}
217
218static int sm_disk_copy_root(struct dm_space_map *sm, void *where_le, size_t max)
219{
220 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
221 struct disk_sm_root root_le;
222
223 root_le.nr_blocks = cpu_to_le64(smd->ll.nr_blocks);
224 root_le.nr_allocated = cpu_to_le64(smd->ll.nr_allocated);
225 root_le.bitmap_root = cpu_to_le64(smd->ll.bitmap_root);
226 root_le.ref_count_root = cpu_to_le64(smd->ll.ref_count_root);
227
228 if (max < sizeof(root_le))
229 return -ENOSPC;
230
231 memcpy(where_le, &root_le, sizeof(root_le));
232
233 return 0;
234}
235
236/*----------------------------------------------------------------*/
237
238static struct dm_space_map ops = {
239 .destroy = sm_disk_destroy,
240 .extend = sm_disk_extend,
241 .get_nr_blocks = sm_disk_get_nr_blocks,
242 .get_nr_free = sm_disk_get_nr_free,
243 .get_count = sm_disk_get_count,
244 .count_is_more_than_one = sm_disk_count_is_more_than_one,
245 .set_count = sm_disk_set_count,
246 .inc_block = sm_disk_inc_block,
247 .dec_block = sm_disk_dec_block,
248 .new_block = sm_disk_new_block,
249 .commit = sm_disk_commit,
250 .root_size = sm_disk_root_size,
251 .copy_root = sm_disk_copy_root
252};
253
254struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
255 dm_block_t nr_blocks)
256{
257 int r;
258 struct sm_disk *smd;
259
260 smd = kmalloc(sizeof(*smd), GFP_KERNEL);
261 if (!smd)
262 return ERR_PTR(-ENOMEM);
263
264 smd->begin = 0;
265 smd->nr_allocated_this_transaction = 0;
266 memcpy(&smd->sm, &ops, sizeof(smd->sm));
267
268 r = sm_ll_new_disk(&smd->ll, tm);
269 if (r)
270 goto bad;
271
272 r = sm_ll_extend(&smd->ll, nr_blocks);
273 if (r)
274 goto bad;
275
276 r = sm_disk_commit(&smd->sm);
277 if (r)
278 goto bad;
279
280 return &smd->sm;
281
282bad:
283 kfree(smd);
284 return ERR_PTR(r);
285}
286EXPORT_SYMBOL_GPL(dm_sm_disk_create);
287
288struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm,
289 void *root_le, size_t len)
290{
291 int r;
292 struct sm_disk *smd;
293
294 smd = kmalloc(sizeof(*smd), GFP_KERNEL);
295 if (!smd)
296 return ERR_PTR(-ENOMEM);
297
298 smd->begin = 0;
299 smd->nr_allocated_this_transaction = 0;
300 memcpy(&smd->sm, &ops, sizeof(smd->sm));
301
302 r = sm_ll_open_disk(&smd->ll, tm, root_le, len);
303 if (r)
304 goto bad;
305
306 r = sm_disk_commit(&smd->sm);
307 if (r)
308 goto bad;
309
310 return &smd->sm;
311
312bad:
313 kfree(smd);
314 return ERR_PTR(r);
315}
316EXPORT_SYMBOL_GPL(dm_sm_disk_open);
317
318/*----------------------------------------------------------------*/
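
A hedged sketch of the lifecycle the disk space map above expects: create it against an existing transaction manager, allocate, then commit so the allocation is reflected in the next transaction's free count. The function name and the 128-block size are invented for the example, and error handling is trimmed.

static int disk_sm_example(struct dm_transaction_manager *tm)
{
	struct dm_space_map *sm;
	dm_block_t b;
	int r;

	sm = dm_sm_disk_create(tm, 128);	/* arbitrary example size */
	if (IS_ERR(sm))
		return PTR_ERR(sm);

	r = dm_sm_new_block(sm, &b);		/* allocates and takes the first ref */
	if (!r)
		r = dm_sm_commit(sm);		/* allocations only settle at commit */

	dm_sm_destroy(sm);
	return r;
}
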
diff --git a/drivers/md/persistent-data/dm-space-map-disk.h b/drivers/md/persistent-data/dm-space-map-disk.h
deleted file mode 100644
index 447a0a9a2d9..00000000000
--- a/drivers/md/persistent-data/dm-space-map-disk.h
+++ /dev/null
@@ -1,25 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef _LINUX_DM_SPACE_MAP_DISK_H
8#define _LINUX_DM_SPACE_MAP_DISK_H
9
10#include "dm-block-manager.h"
11
12struct dm_space_map;
13struct dm_transaction_manager;
14
15/*
16 * Unfortunately we have to use two-phase construction due to the cycle
17 * between the tm and sm.
18 */
19struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
20 dm_block_t nr_blocks);
21
22struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm,
23 void *root, size_t len);
24
25#endif /* _LINUX_DM_SPACE_MAP_DISK_H */
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
deleted file mode 100644
index 906cf3df71a..00000000000
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ /dev/null
@@ -1,596 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-space-map.h"
8#include "dm-space-map-common.h"
9#include "dm-space-map-metadata.h"
10
11#include <linux/list.h>
12#include <linux/slab.h>
13#include <linux/device-mapper.h>
14
15#define DM_MSG_PREFIX "space map metadata"
16
17/*----------------------------------------------------------------*/
18
19/*
20 * Space map interface.
21 *
22 * The low level disk format is written using the standard btree and
23 * transaction manager. This means that performing disk operations may
24 * cause us to recurse into the space map in order to allocate new blocks.
25 * For this reason we have a pool of pre-allocated blocks large enough to
26 * service any metadata_ll_disk operation.
27 */
28
29/*
30 * FIXME: we should calculate this based on the size of the device.
31 * Only the metadata space map needs this functionality.
32 */
33#define MAX_RECURSIVE_ALLOCATIONS 1024
34
35enum block_op_type {
36 BOP_INC,
37 BOP_DEC
38};
39
40struct block_op {
41 enum block_op_type type;
42 dm_block_t block;
43};
44
45struct sm_metadata {
46 struct dm_space_map sm;
47
48 struct ll_disk ll;
49 struct ll_disk old_ll;
50
51 dm_block_t begin;
52
53 unsigned recursion_count;
54 unsigned allocated_this_transaction;
55 unsigned nr_uncommitted;
56 struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS];
57};
58
59static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b)
60{
61 struct block_op *op;
62
63 if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) {
64 DMERR("too many recursive allocations");
65 return -ENOMEM;
66 }
67
68 op = smm->uncommitted + smm->nr_uncommitted++;
69 op->type = type;
70 op->block = b;
71
72 return 0;
73}
74
75static int commit_bop(struct sm_metadata *smm, struct block_op *op)
76{
77 int r = 0;
78 enum allocation_event ev;
79
80 switch (op->type) {
81 case BOP_INC:
82 r = sm_ll_inc(&smm->ll, op->block, &ev);
83 break;
84
85 case BOP_DEC:
86 r = sm_ll_dec(&smm->ll, op->block, &ev);
87 break;
88 }
89
90 return r;
91}
92
93static void in(struct sm_metadata *smm)
94{
95 smm->recursion_count++;
96}
97
98static int out(struct sm_metadata *smm)
99{
100 int r = 0;
101
102 /*
103 * If we're not recursing then very bad things are happening.
104 */
105 if (!smm->recursion_count) {
106 DMERR("lost track of recursion depth");
107 return -ENOMEM;
108 }
109
110 if (smm->recursion_count == 1 && smm->nr_uncommitted) {
111 while (smm->nr_uncommitted && !r) {
112 smm->nr_uncommitted--;
113 r = commit_bop(smm, smm->uncommitted +
114 smm->nr_uncommitted);
115 if (r)
116 break;
117 }
118 }
119
120 smm->recursion_count--;
121
122 return r;
123}
124
125/*
126 * When using the out() function above, we often want to combine an error
127 * code for the operation run in the recursive context with that from
128 * out().
129 */
130static int combine_errors(int r1, int r2)
131{
132 return r1 ? r1 : r2;
133}
134
135static int recursing(struct sm_metadata *smm)
136{
137 return smm->recursion_count;
138}
139
140static void sm_metadata_destroy(struct dm_space_map *sm)
141{
142 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
143
144 kfree(smm);
145}
146
147static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
148{
149 DMERR("doesn't support extend");
150 return -EINVAL;
151}
152
153static int sm_metadata_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
154{
155 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
156
157 *count = smm->ll.nr_blocks;
158
159 return 0;
160}
161
162static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
163{
164 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
165
166 *count = smm->old_ll.nr_blocks - smm->old_ll.nr_allocated -
167 smm->allocated_this_transaction;
168
169 return 0;
170}
171
172static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
173 uint32_t *result)
174{
175 int r, i;
176 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
177 unsigned adjustment = 0;
178
179 /*
180 * We may have some uncommitted adjustments to add. This list
181 * should always be really short.
182 */
183 for (i = 0; i < smm->nr_uncommitted; i++) {
184 struct block_op *op = smm->uncommitted + i;
185
186 if (op->block != b)
187 continue;
188
189 switch (op->type) {
190 case BOP_INC:
191 adjustment++;
192 break;
193
194 case BOP_DEC:
195 adjustment--;
196 break;
197 }
198 }
199
200 r = sm_ll_lookup(&smm->ll, b, result);
201 if (r)
202 return r;
203
204 *result += adjustment;
205
206 return 0;
207}
208
209static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
210 dm_block_t b, int *result)
211{
212 int r, i, adjustment = 0;
213 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
214 uint32_t rc;
215
216 /*
217 * We may have some uncommitted adjustments to add. This list
218 * should always be really short.
219 */
220 for (i = 0; i < smm->nr_uncommitted; i++) {
221 struct block_op *op = smm->uncommitted + i;
222
223 if (op->block != b)
224 continue;
225
226 switch (op->type) {
227 case BOP_INC:
228 adjustment++;
229 break;
230
231 case BOP_DEC:
232 adjustment--;
233 break;
234 }
235 }
236
237 if (adjustment > 1) {
238 *result = 1;
239 return 0;
240 }
241
242 r = sm_ll_lookup_bitmap(&smm->ll, b, &rc);
243 if (r)
244 return r;
245
246 if (rc == 3)
247 /*
248 * We err on the side of caution, and always return true.
249 */
250 *result = 1;
251 else
252 *result = rc + adjustment > 1;
253
254 return 0;
255}
256
257static int sm_metadata_set_count(struct dm_space_map *sm, dm_block_t b,
258 uint32_t count)
259{
260 int r, r2;
261 enum allocation_event ev;
262 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
263
264 if (smm->recursion_count) {
265 DMERR("cannot recurse set_count()");
266 return -EINVAL;
267 }
268
269 in(smm);
270 r = sm_ll_insert(&smm->ll, b, count, &ev);
271 r2 = out(smm);
272
273 return combine_errors(r, r2);
274}
275
276static int sm_metadata_inc_block(struct dm_space_map *sm, dm_block_t b)
277{
278 int r, r2 = 0;
279 enum allocation_event ev;
280 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
281
282 if (recursing(smm))
283 r = add_bop(smm, BOP_INC, b);
284 else {
285 in(smm);
286 r = sm_ll_inc(&smm->ll, b, &ev);
287 r2 = out(smm);
288 }
289
290 return combine_errors(r, r2);
291}
292
293static int sm_metadata_dec_block(struct dm_space_map *sm, dm_block_t b)
294{
295 int r, r2 = 0;
296 enum allocation_event ev;
297 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
298
299 if (recursing(smm))
300 r = add_bop(smm, BOP_DEC, b);
301 else {
302 in(smm);
303 r = sm_ll_dec(&smm->ll, b, &ev);
304 r2 = out(smm);
305 }
306
307 return combine_errors(r, r2);
308}
309
310static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b)
311{
312 int r, r2 = 0;
313 enum allocation_event ev;
314 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
315
316 r = sm_ll_find_free_block(&smm->old_ll, smm->begin, smm->old_ll.nr_blocks, b);
317 if (r)
318 return r;
319
320 smm->begin = *b + 1;
321
322 if (recursing(smm))
323 r = add_bop(smm, BOP_INC, *b);
324 else {
325 in(smm);
326 r = sm_ll_inc(&smm->ll, *b, &ev);
327 r2 = out(smm);
328 }
329
330 if (!r)
331 smm->allocated_this_transaction++;
332
333 return combine_errors(r, r2);
334}
335
336static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
337{
338 int r = sm_metadata_new_block_(sm, b);
339 if (r)
340 DMERR("unable to allocate new metadata block");
341 return r;
342}
343
344static int sm_metadata_commit(struct dm_space_map *sm)
345{
346 int r;
347 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
348
349 r = sm_ll_commit(&smm->ll);
350 if (r)
351 return r;
352
353 memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
354 smm->begin = 0;
355 smm->allocated_this_transaction = 0;
356
357 return 0;
358}
359
360static int sm_metadata_root_size(struct dm_space_map *sm, size_t *result)
361{
362 *result = sizeof(struct disk_sm_root);
363
364 return 0;
365}
366
367static int sm_metadata_copy_root(struct dm_space_map *sm, void *where_le, size_t max)
368{
369 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
370 struct disk_sm_root root_le;
371
372 root_le.nr_blocks = cpu_to_le64(smm->ll.nr_blocks);
373 root_le.nr_allocated = cpu_to_le64(smm->ll.nr_allocated);
374 root_le.bitmap_root = cpu_to_le64(smm->ll.bitmap_root);
375 root_le.ref_count_root = cpu_to_le64(smm->ll.ref_count_root);
376
377 if (max < sizeof(root_le))
378 return -ENOSPC;
379
380 memcpy(where_le, &root_le, sizeof(root_le));
381
382 return 0;
383}
384
385static struct dm_space_map ops = {
386 .destroy = sm_metadata_destroy,
387 .extend = sm_metadata_extend,
388 .get_nr_blocks = sm_metadata_get_nr_blocks,
389 .get_nr_free = sm_metadata_get_nr_free,
390 .get_count = sm_metadata_get_count,
391 .count_is_more_than_one = sm_metadata_count_is_more_than_one,
392 .set_count = sm_metadata_set_count,
393 .inc_block = sm_metadata_inc_block,
394 .dec_block = sm_metadata_dec_block,
395 .new_block = sm_metadata_new_block,
396 .commit = sm_metadata_commit,
397 .root_size = sm_metadata_root_size,
398 .copy_root = sm_metadata_copy_root
399};
400
401/*----------------------------------------------------------------*/
402
403/*
404 * When a new space map is created that manages its own space, we use
405 * this tiny bootstrap allocator.
406 */
407static void sm_bootstrap_destroy(struct dm_space_map *sm)
408{
409}
410
411static int sm_bootstrap_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
412{
413	DMERR("bootstrap doesn't support extend");
414
415 return -EINVAL;
416}
417
418static int sm_bootstrap_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
419{
420 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
421
422 return smm->ll.nr_blocks;
423}
424
425static int sm_bootstrap_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
426{
427 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
428
429 *count = smm->ll.nr_blocks - smm->begin;
430
431 return 0;
432}
433
434static int sm_bootstrap_get_count(struct dm_space_map *sm, dm_block_t b,
435 uint32_t *result)
436{
437 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
438
439 return b < smm->begin ? 1 : 0;
440}
441
442static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm,
443 dm_block_t b, int *result)
444{
445 *result = 0;
446
447 return 0;
448}
449
450static int sm_bootstrap_set_count(struct dm_space_map *sm, dm_block_t b,
451 uint32_t count)
452{
453	DMERR("bootstrap doesn't support set_count");
454
455 return -EINVAL;
456}
457
458static int sm_bootstrap_new_block(struct dm_space_map *sm, dm_block_t *b)
459{
460 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
461
462 /*
463 * We know the entire device is unused.
464 */
465 if (smm->begin == smm->ll.nr_blocks)
466 return -ENOSPC;
467
468 *b = smm->begin++;
469
470 return 0;
471}
472
473static int sm_bootstrap_inc_block(struct dm_space_map *sm, dm_block_t b)
474{
475 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
476
477 return add_bop(smm, BOP_INC, b);
478}
479
480static int sm_bootstrap_dec_block(struct dm_space_map *sm, dm_block_t b)
481{
482 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
483
484 return add_bop(smm, BOP_DEC, b);
485}
486
487static int sm_bootstrap_commit(struct dm_space_map *sm)
488{
489 return 0;
490}
491
492static int sm_bootstrap_root_size(struct dm_space_map *sm, size_t *result)
493{
494	DMERR("bootstrap doesn't support root_size");
495
496 return -EINVAL;
497}
498
499static int sm_bootstrap_copy_root(struct dm_space_map *sm, void *where,
500 size_t max)
501{
502	DMERR("bootstrap doesn't support copy_root");
503
504 return -EINVAL;
505}
506
507static struct dm_space_map bootstrap_ops = {
508 .destroy = sm_bootstrap_destroy,
509 .extend = sm_bootstrap_extend,
510 .get_nr_blocks = sm_bootstrap_get_nr_blocks,
511 .get_nr_free = sm_bootstrap_get_nr_free,
512 .get_count = sm_bootstrap_get_count,
513 .count_is_more_than_one = sm_bootstrap_count_is_more_than_one,
514 .set_count = sm_bootstrap_set_count,
515 .inc_block = sm_bootstrap_inc_block,
516 .dec_block = sm_bootstrap_dec_block,
517 .new_block = sm_bootstrap_new_block,
518 .commit = sm_bootstrap_commit,
519 .root_size = sm_bootstrap_root_size,
520 .copy_root = sm_bootstrap_copy_root
521};
522
523/*----------------------------------------------------------------*/
524
525struct dm_space_map *dm_sm_metadata_init(void)
526{
527 struct sm_metadata *smm;
528
529 smm = kmalloc(sizeof(*smm), GFP_KERNEL);
530 if (!smm)
531 return ERR_PTR(-ENOMEM);
532
533 memcpy(&smm->sm, &ops, sizeof(smm->sm));
534
535 return &smm->sm;
536}
537
538int dm_sm_metadata_create(struct dm_space_map *sm,
539 struct dm_transaction_manager *tm,
540 dm_block_t nr_blocks,
541 dm_block_t superblock)
542{
543 int r;
544 dm_block_t i;
545 enum allocation_event ev;
546 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
547
548 smm->begin = superblock + 1;
549 smm->recursion_count = 0;
550 smm->allocated_this_transaction = 0;
551 smm->nr_uncommitted = 0;
552
553 memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
554
555 r = sm_ll_new_metadata(&smm->ll, tm);
556 if (r)
557 return r;
558
559 r = sm_ll_extend(&smm->ll, nr_blocks);
560 if (r)
561 return r;
562
563 memcpy(&smm->sm, &ops, sizeof(smm->sm));
564
565 /*
566 * Now we need to update the newly created data structures with the
567 * allocated blocks that they were built from.
568 */
569 for (i = superblock; !r && i < smm->begin; i++)
570 r = sm_ll_inc(&smm->ll, i, &ev);
571
572 if (r)
573 return r;
574
575 return sm_metadata_commit(sm);
576}
577
578int dm_sm_metadata_open(struct dm_space_map *sm,
579 struct dm_transaction_manager *tm,
580 void *root_le, size_t len)
581{
582 int r;
583 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
584
585 r = sm_ll_open_metadata(&smm->ll, tm, root_le, len);
586 if (r)
587 return r;
588
589 smm->begin = 0;
590 smm->recursion_count = 0;
591 smm->allocated_this_transaction = 0;
592 smm->nr_uncommitted = 0;
593
594 memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
595 return 0;
596}
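
The mechanism worth calling out above is the recursion guard: in() and out() bracket every top-level operation, and while the space map is recursing into itself it only queues BOP_INC/BOP_DEC entries, which the outermost out() then replays. Below is a distilled restatement of that pattern; every name in it is invented for the illustration, and the fixed-size queue stands in for MAX_RECURSIVE_ALLOCATIONS.

/* Defer-while-recursing, in miniature. */
struct deferred {
	int depth;
	int nr_ops;
	int ops[16];			/* the real code bounds this and errors out */
};

static void run_op(struct deferred *d, int op);

static void apply_op(struct deferred *d, int op)
{
	/* stand-in for sm_ll_inc()/sm_ll_dec(); may call run_op() again */
}

static void run_op(struct deferred *d, int op)
{
	if (d->depth) {			/* recursing: just remember the op */
		d->ops[d->nr_ops++] = op;
		return;
	}

	d->depth++;			/* in() */
	apply_op(d, op);		/* may recurse and queue more ops */
	while (d->nr_ops)		/* out(): replay what recursion queued */
		apply_op(d, d->ops[--d->nr_ops]);
	d->depth--;
}
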
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.h b/drivers/md/persistent-data/dm-space-map-metadata.h
deleted file mode 100644
index 39bba0801cf..00000000000
--- a/drivers/md/persistent-data/dm-space-map-metadata.h
+++ /dev/null
@@ -1,33 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_SPACE_MAP_METADATA_H
8#define DM_SPACE_MAP_METADATA_H
9
10#include "dm-transaction-manager.h"
11
12/*
13 * Unfortunately we have to use two-phase construction due to the cycle
14 * between the tm and sm.
15 */
16struct dm_space_map *dm_sm_metadata_init(void);
17
18/*
19 * Create a fresh space map.
20 */
21int dm_sm_metadata_create(struct dm_space_map *sm,
22 struct dm_transaction_manager *tm,
23 dm_block_t nr_blocks,
24 dm_block_t superblock);
25
26/*
27 * Open from a previously-recorded root.
28 */
29int dm_sm_metadata_open(struct dm_space_map *sm,
30 struct dm_transaction_manager *tm,
31 void *root_le, size_t len);
32
33#endif /* DM_SPACE_MAP_METADATA_H */
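
The split between dm_sm_metadata_init() and dm_sm_metadata_create() exists because of the cycle mentioned above: the transaction manager needs a space map to be constructed, and the metadata space map needs a transaction manager before it can allocate anything. The fragment below is a simplified restatement of what dm_tm_create_internal() (later in this patch) does; dm_tm_create() is file-local there, so real callers use dm_tm_create_with_sm() instead, and 'bm' and 'sb_location' are assumed to exist.

/* Simplified two-phase wiring, error handling trimmed. */
struct dm_space_map *sm;
struct dm_transaction_manager *tm;
int r;

sm = dm_sm_metadata_init();		/* phase 1: a shell with no tm yet */
tm = dm_tm_create(bm, sm);		/* static helper in dm-transaction-manager.c */

/* phase 2: the sm can now allocate through tm and build its btrees */
r = dm_sm_metadata_create(sm, tm, dm_bm_nr_blocks(bm), sb_location);
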
diff --git a/drivers/md/persistent-data/dm-space-map.h b/drivers/md/persistent-data/dm-space-map.h
deleted file mode 100644
index 1cbfc6b1638..00000000000
--- a/drivers/md/persistent-data/dm-space-map.h
+++ /dev/null
@@ -1,134 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef _LINUX_DM_SPACE_MAP_H
8#define _LINUX_DM_SPACE_MAP_H
9
10#include "dm-block-manager.h"
11
12/*
13 * struct dm_space_map keeps a record of how many times each block in a device
14 * is referenced. It needs to be fixed on disk as part of the transaction.
15 */
16struct dm_space_map {
17 void (*destroy)(struct dm_space_map *sm);
18
19 /*
20 * You must commit before allocating the newly added space.
21 */
22 int (*extend)(struct dm_space_map *sm, dm_block_t extra_blocks);
23
24 /*
25 * Extensions do not appear in this count until after commit has
26 * been called.
27 */
28 int (*get_nr_blocks)(struct dm_space_map *sm, dm_block_t *count);
29
30 /*
31 * Space maps must never allocate a block from the previous
32 * transaction, in case we need to roll back. This complicates the
33 * semantics of get_nr_free(); it should return the number of blocks
34 * that are available for allocation _now_. For instance you may
35 * have blocks with a zero reference count that will not be
36 * available for allocation until after the next commit.
37 */
38 int (*get_nr_free)(struct dm_space_map *sm, dm_block_t *count);
39
40 int (*get_count)(struct dm_space_map *sm, dm_block_t b, uint32_t *result);
41 int (*count_is_more_than_one)(struct dm_space_map *sm, dm_block_t b,
42 int *result);
43 int (*set_count)(struct dm_space_map *sm, dm_block_t b, uint32_t count);
44
45 int (*commit)(struct dm_space_map *sm);
46
47 int (*inc_block)(struct dm_space_map *sm, dm_block_t b);
48 int (*dec_block)(struct dm_space_map *sm, dm_block_t b);
49
50 /*
51 * new_block will increment the returned block.
52 */
53 int (*new_block)(struct dm_space_map *sm, dm_block_t *b);
54
55 /*
56 * The root contains all the information needed to fix the space map.
57 * Generally this info is small, so squirrel it away in a disk block
58 * along with other info.
59 */
60 int (*root_size)(struct dm_space_map *sm, size_t *result);
61 int (*copy_root)(struct dm_space_map *sm, void *copy_to_here_le, size_t len);
62};
63
64/*----------------------------------------------------------------*/
65
66static inline void dm_sm_destroy(struct dm_space_map *sm)
67{
68 sm->destroy(sm);
69}
70
71static inline int dm_sm_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
72{
73 return sm->extend(sm, extra_blocks);
74}
75
76static inline int dm_sm_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
77{
78 return sm->get_nr_blocks(sm, count);
79}
80
81static inline int dm_sm_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
82{
83 return sm->get_nr_free(sm, count);
84}
85
86static inline int dm_sm_get_count(struct dm_space_map *sm, dm_block_t b,
87 uint32_t *result)
88{
89 return sm->get_count(sm, b, result);
90}
91
92static inline int dm_sm_count_is_more_than_one(struct dm_space_map *sm,
93 dm_block_t b, int *result)
94{
95 return sm->count_is_more_than_one(sm, b, result);
96}
97
98static inline int dm_sm_set_count(struct dm_space_map *sm, dm_block_t b,
99 uint32_t count)
100{
101 return sm->set_count(sm, b, count);
102}
103
104static inline int dm_sm_commit(struct dm_space_map *sm)
105{
106 return sm->commit(sm);
107}
108
109static inline int dm_sm_inc_block(struct dm_space_map *sm, dm_block_t b)
110{
111 return sm->inc_block(sm, b);
112}
113
114static inline int dm_sm_dec_block(struct dm_space_map *sm, dm_block_t b)
115{
116 return sm->dec_block(sm, b);
117}
118
119static inline int dm_sm_new_block(struct dm_space_map *sm, dm_block_t *b)
120{
121 return sm->new_block(sm, b);
122}
123
124static inline int dm_sm_root_size(struct dm_space_map *sm, size_t *result)
125{
126 return sm->root_size(sm, result);
127}
128
129static inline int dm_sm_copy_root(struct dm_space_map *sm, void *copy_to_here_le, size_t len)
130{
131 return sm->copy_root(sm, copy_to_here_le, len);
132}
133
134#endif /* _LINUX_DM_SPACE_MAP_H */
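
root_size() and copy_root() are how a space map's state gets squirrelled into a superblock, as the comment above suggests. Below is a hedged sketch of that serialisation step; the destination buffer and the size reserved for it are assumptions of the example, not part of this interface.

static int save_sm_root(struct dm_space_map *sm, void *sb_area, size_t reserved)
{
	int r;
	size_t len;

	r = dm_sm_root_size(sm, &len);
	if (r < 0)
		return r;

	if (len > reserved)		/* space set aside in the superblock */
		return -ENOSPC;

	return dm_sm_copy_root(sm, sb_area, len);
}
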
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
deleted file mode 100644
index d247a35da3c..00000000000
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ /dev/null
@@ -1,382 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6#include "dm-transaction-manager.h"
7#include "dm-space-map.h"
8#include "dm-space-map-disk.h"
9#include "dm-space-map-metadata.h"
10#include "dm-persistent-data-internal.h"
11
12#include <linux/export.h>
13#include <linux/slab.h>
14#include <linux/device-mapper.h>
15
16#define DM_MSG_PREFIX "transaction manager"
17
18/*----------------------------------------------------------------*/
19
20struct shadow_info {
21 struct hlist_node hlist;
22 dm_block_t where;
23};
24
25/*
26 * It would be nice if we scaled with the size of the transaction.
27 */
28#define HASH_SIZE 256
29#define HASH_MASK (HASH_SIZE - 1)
30
31struct dm_transaction_manager {
32 int is_clone;
33 struct dm_transaction_manager *real;
34
35 struct dm_block_manager *bm;
36 struct dm_space_map *sm;
37
38 spinlock_t lock;
39 struct hlist_head buckets[HASH_SIZE];
40};
41
42/*----------------------------------------------------------------*/
43
44static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b)
45{
46 int r = 0;
47 unsigned bucket = dm_hash_block(b, HASH_MASK);
48 struct shadow_info *si;
49 struct hlist_node *n;
50
51 spin_lock(&tm->lock);
52 hlist_for_each_entry(si, n, tm->buckets + bucket, hlist)
53 if (si->where == b) {
54 r = 1;
55 break;
56 }
57 spin_unlock(&tm->lock);
58
59 return r;
60}
61
62/*
63 * This can silently fail if there's no memory. We're ok with this since
64 * creating redundant shadows causes no harm.
65 */
66static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b)
67{
68 unsigned bucket;
69 struct shadow_info *si;
70
71 si = kmalloc(sizeof(*si), GFP_NOIO);
72 if (si) {
73 si->where = b;
74 bucket = dm_hash_block(b, HASH_MASK);
75 spin_lock(&tm->lock);
76 hlist_add_head(&si->hlist, tm->buckets + bucket);
77 spin_unlock(&tm->lock);
78 }
79}
80
81static void wipe_shadow_table(struct dm_transaction_manager *tm)
82{
83 struct shadow_info *si;
84 struct hlist_node *n, *tmp;
85 struct hlist_head *bucket;
86 int i;
87
88 spin_lock(&tm->lock);
89 for (i = 0; i < HASH_SIZE; i++) {
90 bucket = tm->buckets + i;
91 hlist_for_each_entry_safe(si, n, tmp, bucket, hlist)
92 kfree(si);
93
94 INIT_HLIST_HEAD(bucket);
95 }
96
97 spin_unlock(&tm->lock);
98}
99
100/*----------------------------------------------------------------*/
101
102static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm,
103 struct dm_space_map *sm)
104{
105 int i;
106 struct dm_transaction_manager *tm;
107
108 tm = kmalloc(sizeof(*tm), GFP_KERNEL);
109 if (!tm)
110 return ERR_PTR(-ENOMEM);
111
112 tm->is_clone = 0;
113 tm->real = NULL;
114 tm->bm = bm;
115 tm->sm = sm;
116
117 spin_lock_init(&tm->lock);
118 for (i = 0; i < HASH_SIZE; i++)
119 INIT_HLIST_HEAD(tm->buckets + i);
120
121 return tm;
122}
123
124struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transaction_manager *real)
125{
126 struct dm_transaction_manager *tm;
127
128 tm = kmalloc(sizeof(*tm), GFP_KERNEL);
129 if (tm) {
130 tm->is_clone = 1;
131 tm->real = real;
132 }
133
134 return tm;
135}
136EXPORT_SYMBOL_GPL(dm_tm_create_non_blocking_clone);
137
138void dm_tm_destroy(struct dm_transaction_manager *tm)
139{
140 if (!tm->is_clone)
141 wipe_shadow_table(tm);
142
143 kfree(tm);
144}
145EXPORT_SYMBOL_GPL(dm_tm_destroy);
146
147int dm_tm_pre_commit(struct dm_transaction_manager *tm)
148{
149 int r;
150
151 if (tm->is_clone)
152 return -EWOULDBLOCK;
153
154 r = dm_sm_commit(tm->sm);
155 if (r < 0)
156 return r;
157
158 return 0;
159}
160EXPORT_SYMBOL_GPL(dm_tm_pre_commit);
161
162int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root)
163{
164 if (tm->is_clone)
165 return -EWOULDBLOCK;
166
167 wipe_shadow_table(tm);
168
169 return dm_bm_flush_and_unlock(tm->bm, root);
170}
171EXPORT_SYMBOL_GPL(dm_tm_commit);
172
173int dm_tm_new_block(struct dm_transaction_manager *tm,
174 struct dm_block_validator *v,
175 struct dm_block **result)
176{
177 int r;
178 dm_block_t new_block;
179
180 if (tm->is_clone)
181 return -EWOULDBLOCK;
182
183 r = dm_sm_new_block(tm->sm, &new_block);
184 if (r < 0)
185 return r;
186
187 r = dm_bm_write_lock_zero(tm->bm, new_block, v, result);
188 if (r < 0) {
189 dm_sm_dec_block(tm->sm, new_block);
190 return r;
191 }
192
193 /*
194 * New blocks count as shadows in that they don't need to be
195 * shadowed again.
196 */
197 insert_shadow(tm, new_block);
198
199 return 0;
200}
201
202static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
203 struct dm_block_validator *v,
204 struct dm_block **result)
205{
206 int r;
207 dm_block_t new;
208 struct dm_block *orig_block;
209
210 r = dm_sm_new_block(tm->sm, &new);
211 if (r < 0)
212 return r;
213
214 r = dm_sm_dec_block(tm->sm, orig);
215 if (r < 0)
216 return r;
217
218 r = dm_bm_read_lock(tm->bm, orig, v, &orig_block);
219 if (r < 0)
220 return r;
221
222 /*
223 * It would be tempting to use dm_bm_unlock_move here, but some
224 * code, such as the space maps, keeps using the old data structures
225 * secure in the knowledge they won't be changed until the next
226 * transaction. Using unlock_move would force a synchronous read
227 * since the old block would no longer be in the cache.
228 */
229 r = dm_bm_write_lock_zero(tm->bm, new, v, result);
230 if (r) {
231 dm_bm_unlock(orig_block);
232 return r;
233 }
234
235 memcpy(dm_block_data(*result), dm_block_data(orig_block),
236 dm_bm_block_size(tm->bm));
237
238 dm_bm_unlock(orig_block);
239 return r;
240}
241
242int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
243 struct dm_block_validator *v, struct dm_block **result,
244 int *inc_children)
245{
246 int r;
247
248 if (tm->is_clone)
249 return -EWOULDBLOCK;
250
251 r = dm_sm_count_is_more_than_one(tm->sm, orig, inc_children);
252 if (r < 0)
253 return r;
254
255 if (is_shadow(tm, orig) && !*inc_children)
256 return dm_bm_write_lock(tm->bm, orig, v, result);
257
258 r = __shadow_block(tm, orig, v, result);
259 if (r < 0)
260 return r;
261 insert_shadow(tm, dm_block_location(*result));
262
263 return r;
264}
265EXPORT_SYMBOL_GPL(dm_tm_shadow_block);
266
267int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
268 struct dm_block_validator *v,
269 struct dm_block **blk)
270{
271 if (tm->is_clone)
272 return dm_bm_read_try_lock(tm->real->bm, b, v, blk);
273
274 return dm_bm_read_lock(tm->bm, b, v, blk);
275}
276EXPORT_SYMBOL_GPL(dm_tm_read_lock);
277
278int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b)
279{
280 return dm_bm_unlock(b);
281}
282EXPORT_SYMBOL_GPL(dm_tm_unlock);
283
284void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b)
285{
286 /*
287 * The non-blocking clone doesn't support this.
288 */
289 BUG_ON(tm->is_clone);
290
291 dm_sm_inc_block(tm->sm, b);
292}
293EXPORT_SYMBOL_GPL(dm_tm_inc);
294
295void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b)
296{
297 /*
298 * The non-blocking clone doesn't support this.
299 */
300 BUG_ON(tm->is_clone);
301
302 dm_sm_dec_block(tm->sm, b);
303}
304EXPORT_SYMBOL_GPL(dm_tm_dec);
305
306int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
307 uint32_t *result)
308{
309 if (tm->is_clone)
310 return -EWOULDBLOCK;
311
312 return dm_sm_get_count(tm->sm, b, result);
313}
314
315struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm)
316{
317 return tm->bm;
318}
319
320/*----------------------------------------------------------------*/
321
322static int dm_tm_create_internal(struct dm_block_manager *bm,
323 dm_block_t sb_location,
324 struct dm_transaction_manager **tm,
325 struct dm_space_map **sm,
326 int create,
327 void *sm_root, size_t sm_len)
328{
329 int r;
330
331 *sm = dm_sm_metadata_init();
332 if (IS_ERR(*sm))
333 return PTR_ERR(*sm);
334
335 *tm = dm_tm_create(bm, *sm);
336 if (IS_ERR(*tm)) {
337 dm_sm_destroy(*sm);
338 return PTR_ERR(*tm);
339 }
340
341 if (create) {
342 r = dm_sm_metadata_create(*sm, *tm, dm_bm_nr_blocks(bm),
343 sb_location);
344 if (r) {
345 DMERR("couldn't create metadata space map");
346 goto bad;
347 }
348
349 } else {
350 r = dm_sm_metadata_open(*sm, *tm, sm_root, sm_len);
351 if (r) {
352 DMERR("couldn't open metadata space map");
353 goto bad;
354 }
355 }
356
357 return 0;
358
359bad:
360 dm_tm_destroy(*tm);
361 dm_sm_destroy(*sm);
362 return r;
363}
364
365int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
366 struct dm_transaction_manager **tm,
367 struct dm_space_map **sm)
368{
369 return dm_tm_create_internal(bm, sb_location, tm, sm, 1, NULL, 0);
370}
371EXPORT_SYMBOL_GPL(dm_tm_create_with_sm);
372
373int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
374 void *sm_root, size_t root_len,
375 struct dm_transaction_manager **tm,
376 struct dm_space_map **sm)
377{
378 return dm_tm_create_internal(bm, sb_location, tm, sm, 0, sm_root, root_len);
379}
380EXPORT_SYMBOL_GPL(dm_tm_open_with_sm);
381
382/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h
deleted file mode 100644
index b5b139076ca..00000000000
--- a/drivers/md/persistent-data/dm-transaction-manager.h
+++ /dev/null
@@ -1,131 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef _LINUX_DM_TRANSACTION_MANAGER_H
8#define _LINUX_DM_TRANSACTION_MANAGER_H
9
10#include "dm-block-manager.h"
11
12struct dm_transaction_manager;
13struct dm_space_map;
14
15/*----------------------------------------------------------------*/
16
17/*
18 * This manages the scope of a transaction. It also enforces immutability
19 * of the on-disk data structures by limiting access to writeable blocks.
20 *
21 * Clients should not fiddle with the block manager directly.
22 */
23
24void dm_tm_destroy(struct dm_transaction_manager *tm);
25
26/*
27 * The non-blocking version of a transaction manager is intended for use in
28 * fast path code that needs to do lookups e.g. a dm mapping function.
29 * You create the non-blocking variant from a normal tm. The interface is
30 * the same, except that most functions will just return -EWOULDBLOCK.
31 * Methods that return void yet may block, viz. dm_tm_inc and dm_tm_dec,
32 * must not be called on a clone. Call dm_tm_destroy() as you would with a
33 * normal tm when you've finished with it. The original must not be
34 * destroyed before its clones.
35 */
36struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transaction_manager *real);
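
As a rough illustration of this contract (a hedged sketch, not code from this patch; the validator, block number and helper name are invented), a fast-path lookup through an already-created clone might look like:

#include "dm-transaction-manager.h"

/*
 * Hypothetical fast-path helper: @nb_tm is a clone made once, up front,
 * with dm_tm_create_non_blocking_clone() (check IS_ERR() on creation,
 * as with dm_tm_create()).
 */
static int fast_path_read(struct dm_transaction_manager *nb_tm,
			  struct dm_block_validator *v, dm_block_t block)
{
	struct dm_block *b;
	int r;

	r = dm_tm_read_lock(nb_tm, block, v, &b);
	if (r == -EWOULDBLOCK)
		/* Not in the cache: defer to a worker that may sleep. */
		return r;
	if (r < 0)
		return r;

	/* ... consume dm_block_data(b) without blocking ... */

	return dm_tm_unlock(nb_tm, b);
}

When the fast path is torn down, the clone is destroyed with dm_tm_destroy() before the real tm, per the rule above.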
37
38/*
39 * We use a 2-phase commit here.
40 *
41 * i) In the first phase the block manager is told to start flushing, and
42 * the changes to the space map are written to disk. You should interrogate
43 * your particular space map to get detail of its root node etc. to be
44 * included in your superblock.
45 *
46 * ii) @root will be committed last. You shouldn't use more than the
47 * first 512 bytes of @root if you wish the transaction to survive a power
48 * failure. You *must* have a write lock held on @root for both stages (i)
49 * and (ii). The commit will drop the write lock.
50 */
51int dm_tm_pre_commit(struct dm_transaction_manager *tm);
52int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root);
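
To make the two phases concrete, here is a hedged sketch of a commit sequence as a metadata client might issue it. The superblock layout, validator and SUPERBLOCK_LOCATION are invented for illustration; only the dm_tm_*/dm_sm_*/dm_bm_* calls are the real interface.

#include "dm-transaction-manager.h"
#include "dm-space-map.h"

#define SUPERBLOCK_LOCATION 0	/* hypothetical: block 0 holds the superblock */

/* Hypothetical on-disk superblock; only the space map root matters here. */
struct sb_disk {
	__le64 magic;
	__u8 sm_root[128];
};

static int commit_transaction(struct dm_transaction_manager *tm,
			      struct dm_space_map *sm,
			      struct dm_block_validator *sb_validator)
{
	struct dm_block *sb;
	struct sb_disk *disk;
	size_t root_len;
	int r;

	/* The write lock on @root (the superblock) spans both phases. */
	r = dm_bm_write_lock(dm_tm_get_bm(tm), SUPERBLOCK_LOCATION,
			     sb_validator, &sb);
	if (r < 0)
		return r;

	/* Phase (i): flush dirty blocks and commit the space map. */
	r = dm_tm_pre_commit(tm);
	if (r < 0)
		goto bad;

	/* Record the sm root inside the superblock (first 512 bytes). */
	disk = dm_block_data(sb);
	r = dm_sm_root_size(sm, &root_len);
	if (r < 0)
		goto bad;
	if (root_len > sizeof(disk->sm_root)) {
		r = -ENOSPC;
		goto bad;
	}
	r = dm_sm_copy_root(sm, disk->sm_root, root_len);
	if (r < 0)
		goto bad;

	/* Phase (ii): the superblock is written last; this drops its lock. */
	return dm_tm_commit(tm, sb);

bad:
	dm_bm_unlock(sb);
	return r;
}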
53
54/*
55 * These methods are the only way to get hold of a writeable block.
56 */
57
58/*
59 * dm_tm_new_block() is pretty self-explanatory. Make sure you do actually
60 * write to the whole of @data before you unlock, otherwise you could get
61 * a data leak. (The other option is for tm_new_block() to zero new blocks
62 * before handing them out, which will be redundant in most, if not all,
63 * cases).
64 * Zeroes the new block and returns with write lock held.
65 */
66int dm_tm_new_block(struct dm_transaction_manager *tm,
67 struct dm_block_validator *v,
68 struct dm_block **result);
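
A hedged sketch of the allocation pattern this implies; the validator and the 0xff fill stand in for whatever the caller's on-disk format actually needs:

#include <linux/string.h>
#include "dm-transaction-manager.h"

/* Sketch: allocate a block, initialise all of it, then unlock. */
static int alloc_and_init_block(struct dm_transaction_manager *tm,
				struct dm_block_validator *v,
				dm_block_t *loc)
{
	struct dm_block *b;
	int r;

	r = dm_tm_new_block(tm, v, &b);	/* returns zeroed + write locked */
	if (r < 0)
		return r;

	/* Write the whole block, as the comment above asks of callers. */
	memset(dm_block_data(b), 0xff, dm_bm_block_size(dm_tm_get_bm(tm)));

	*loc = dm_block_location(b);
	return dm_tm_unlock(tm, b);
}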
69
70/*
71 * dm_tm_shadow_block() allocates a new block and copies the data from @orig
72 * to it. It then decrements the reference count on the original block. Use
73 * this to update the contents of a block in a data structure; don't
74 * confuse it with a clone - you shouldn't access the orig block after
75 * this operation. Because the tm knows the scope of the transaction it
76 * can optimise requests for a shadow of a shadow to a no-op. Don't forget
77 * to unlock when you've finished with the shadow.
78 *
79 * The @inc_children flag is used to tell the caller whether it needs to
80 * adjust reference counts for children. (Data in the block may refer to
81 * other blocks.)
82 *
83 * Shadowing implicitly drops a reference on @orig so you must not have
84 * it locked when you call this.
85 */
86int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
87 struct dm_block_validator *v,
88 struct dm_block **result, int *inc_children);
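
A hedged caller-side sketch of the inc_children handshake. The node layout and field names are invented for illustration; the real users are the btree and space map code.

#include "dm-transaction-manager.h"

/* Hypothetical on-disk node holding references to child blocks. */
struct demo_node {
	__le32 nr_children;
	__le64 children[64];
};

static int shadow_and_update(struct dm_transaction_manager *tm,
			     struct dm_block_validator *v,
			     dm_block_t *node_loc)
{
	struct dm_block *b;
	struct demo_node *n;
	uint32_t i;
	int r, inc;

	/* @*node_loc must not be locked here; shadowing drops a ref on it. */
	r = dm_tm_shadow_block(tm, *node_loc, v, &b, &inc);
	if (r < 0)
		return r;

	n = dm_block_data(b);
	if (inc) {
		/* The block was shared, so the copy re-references the children. */
		for (i = 0; i < le32_to_cpu(n->nr_children); i++)
			dm_tm_inc(tm, le64_to_cpu(n->children[i]));
	}

	/* ... modify the shadow through 'n' ... */

	*node_loc = dm_block_location(b);	/* the caller now points at the shadow */
	return dm_tm_unlock(tm, b);
}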
89
90/*
91 * Read access. You can lock any block you want. If there's a write lock
92 * on it outstanding then it'll block.
93 */
94int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
95 struct dm_block_validator *v,
96 struct dm_block **result);
97
98int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b);
99
100/*
101 * Functions for altering the reference count of a block directly.
102 */
103void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b);
104
105void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b);
106
107int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
108 uint32_t *result);
109
110struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm);
111
112/*
113 * A little utility that ties the knot by producing a transaction manager
114 * that has a space map managed by the transaction manager...
115 *
116 * Returns a tm that has an open transaction to write the new disk sm.
117 * Caller should store the new sm root and commit.
118 *
119 * The superblock location is passed so the metadata space map knows it
120 * shouldn't be used.
121 */
122int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
123 struct dm_transaction_manager **tm,
124 struct dm_space_map **sm);
125
126int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
127 void *sm_root, size_t root_len,
128 struct dm_transaction_manager **tm,
129 struct dm_space_map **sm);
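
A hedged sketch of how a metadata client ties these together at format time versus reopen time; SUPERBLOCK_LOCATION and the saved-root arguments are illustrative:

#include "dm-transaction-manager.h"

#define SUPERBLOCK_LOCATION 0	/* hypothetical */

static int tm_open_or_format(struct dm_block_manager *bm, int format,
			     void *saved_sm_root, size_t saved_root_len,
			     struct dm_transaction_manager **tm,
			     struct dm_space_map **sm)
{
	if (format)
		/* Fresh device: build a new metadata sm covering all of bm. */
		return dm_tm_create_with_sm(bm, SUPERBLOCK_LOCATION, tm, sm);

	/* Existing device: reopen from the sm root saved in the superblock. */
	return dm_tm_open_with_sm(bm, SUPERBLOCK_LOCATION,
				  saved_sm_root, saved_root_len, tm, sm);
}

Either way the returned tm has an open transaction, so the caller's next step is normally to fill in its superblock (including the new sm root) and commit, as sketched after the two-phase commit comment above.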
130
131#endif /* _LINUX_DM_TRANSACTION_MANAGER_H */
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 24b359717a7..e86bf3682e1 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -20,7 +20,6 @@
20 20
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/seq_file.h> 22#include <linux/seq_file.h>
23#include <linux/module.h>
24#include <linux/slab.h> 23#include <linux/slab.h>
25#include "md.h" 24#include "md.h"
26#include "raid0.h" 25#include "raid0.h"
@@ -28,9 +27,9 @@
28 27
29static int raid0_congested(void *data, int bits) 28static int raid0_congested(void *data, int bits)
30{ 29{
31 struct mddev *mddev = data; 30 mddev_t *mddev = data;
32 struct r0conf *conf = mddev->private; 31 raid0_conf_t *conf = mddev->private;
33 struct md_rdev **devlist = conf->devlist; 32 mdk_rdev_t **devlist = conf->devlist;
34 int raid_disks = conf->strip_zone[0].nb_dev; 33 int raid_disks = conf->strip_zone[0].nb_dev;
35 int i, ret = 0; 34 int i, ret = 0;
36 35
@@ -48,54 +47,52 @@ static int raid0_congested(void *data, int bits)
48/* 47/*
49 * inform the user of the raid configuration 48 * inform the user of the raid configuration
50*/ 49*/
51static void dump_zones(struct mddev *mddev) 50static void dump_zones(mddev_t *mddev)
52{ 51{
53 int j, k; 52 int j, k, h;
54 sector_t zone_size = 0; 53 sector_t zone_size = 0;
55 sector_t zone_start = 0; 54 sector_t zone_start = 0;
56 char b[BDEVNAME_SIZE]; 55 char b[BDEVNAME_SIZE];
57 struct r0conf *conf = mddev->private; 56 raid0_conf_t *conf = mddev->private;
58 int raid_disks = conf->strip_zone[0].nb_dev; 57 int raid_disks = conf->strip_zone[0].nb_dev;
59 printk(KERN_INFO "md: RAID0 configuration for %s - %d zone%s\n", 58 printk(KERN_INFO "******* %s configuration *********\n",
60 mdname(mddev), 59 mdname(mddev));
61 conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s"); 60 h = 0;
62 for (j = 0; j < conf->nr_strip_zones; j++) { 61 for (j = 0; j < conf->nr_strip_zones; j++) {
63 printk(KERN_INFO "md: zone%d=[", j); 62 printk(KERN_INFO "zone%d=[", j);
64 for (k = 0; k < conf->strip_zone[j].nb_dev; k++) 63 for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
65 printk(KERN_CONT "%s%s", k?"/":"", 64 printk(KERN_CONT "%s/",
66 bdevname(conf->devlist[j*raid_disks 65 bdevname(conf->devlist[j*raid_disks
67 + k]->bdev, b)); 66 + k]->bdev, b));
68 printk(KERN_CONT "]\n"); 67 printk(KERN_CONT "]\n");
69 68
70 zone_size = conf->strip_zone[j].zone_end - zone_start; 69 zone_size = conf->strip_zone[j].zone_end - zone_start;
71 printk(KERN_INFO " zone-offset=%10lluKB, " 70 printk(KERN_INFO " zone offset=%llukb "
72 "device-offset=%10lluKB, size=%10lluKB\n", 71 "device offset=%llukb size=%llukb\n",
73 (unsigned long long)zone_start>>1, 72 (unsigned long long)zone_start>>1,
74 (unsigned long long)conf->strip_zone[j].dev_start>>1, 73 (unsigned long long)conf->strip_zone[j].dev_start>>1,
75 (unsigned long long)zone_size>>1); 74 (unsigned long long)zone_size>>1);
76 zone_start = conf->strip_zone[j].zone_end; 75 zone_start = conf->strip_zone[j].zone_end;
77 } 76 }
78 printk(KERN_INFO "\n"); 77 printk(KERN_INFO "**********************************\n\n");
79} 78}
80 79
81static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) 80static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
82{ 81{
83 int i, c, err; 82 int i, c, err;
84 sector_t curr_zone_end, sectors; 83 sector_t curr_zone_end, sectors;
85 struct md_rdev *smallest, *rdev1, *rdev2, *rdev, **dev; 84 mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev, **dev;
86 struct strip_zone *zone; 85 struct strip_zone *zone;
87 int cnt; 86 int cnt;
88 char b[BDEVNAME_SIZE]; 87 char b[BDEVNAME_SIZE];
89 char b2[BDEVNAME_SIZE]; 88 raid0_conf_t *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
90 struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
91 bool discard_supported = false;
92 89
93 if (!conf) 90 if (!conf)
94 return -ENOMEM; 91 return -ENOMEM;
95 rdev_for_each(rdev1, mddev) { 92 list_for_each_entry(rdev1, &mddev->disks, same_set) {
96 pr_debug("md/raid0:%s: looking at %s\n", 93 printk(KERN_INFO "md/raid0:%s: looking at %s\n",
97 mdname(mddev), 94 mdname(mddev),
98 bdevname(rdev1->bdev, b)); 95 bdevname(rdev1->bdev, b));
99 c = 0; 96 c = 0;
100 97
101 /* round size to chunk_size */ 98 /* round size to chunk_size */
@@ -103,17 +100,17 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
103 sector_div(sectors, mddev->chunk_sectors); 100 sector_div(sectors, mddev->chunk_sectors);
104 rdev1->sectors = sectors * mddev->chunk_sectors; 101 rdev1->sectors = sectors * mddev->chunk_sectors;
105 102
106 rdev_for_each(rdev2, mddev) { 103 list_for_each_entry(rdev2, &mddev->disks, same_set) {
107 pr_debug("md/raid0:%s: comparing %s(%llu)" 104 printk(KERN_INFO "md/raid0:%s: comparing %s(%llu)",
108 " with %s(%llu)\n", 105 mdname(mddev),
109 mdname(mddev), 106 bdevname(rdev1->bdev,b),
110 bdevname(rdev1->bdev,b), 107 (unsigned long long)rdev1->sectors);
111 (unsigned long long)rdev1->sectors, 108 printk(KERN_CONT " with %s(%llu)\n",
112 bdevname(rdev2->bdev,b2), 109 bdevname(rdev2->bdev,b),
113 (unsigned long long)rdev2->sectors); 110 (unsigned long long)rdev2->sectors);
114 if (rdev2 == rdev1) { 111 if (rdev2 == rdev1) {
115 pr_debug("md/raid0:%s: END\n", 112 printk(KERN_INFO "md/raid0:%s: END\n",
116 mdname(mddev)); 113 mdname(mddev));
117 break; 114 break;
118 } 115 }
119 if (rdev2->sectors == rdev1->sectors) { 116 if (rdev2->sectors == rdev1->sectors) {
@@ -121,30 +118,30 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
121 * Not unique, don't count it as a new 118 * Not unique, don't count it as a new
122 * group 119 * group
123 */ 120 */
124 pr_debug("md/raid0:%s: EQUAL\n", 121 printk(KERN_INFO "md/raid0:%s: EQUAL\n",
125 mdname(mddev)); 122 mdname(mddev));
126 c = 1; 123 c = 1;
127 break; 124 break;
128 } 125 }
129 pr_debug("md/raid0:%s: NOT EQUAL\n", 126 printk(KERN_INFO "md/raid0:%s: NOT EQUAL\n",
130 mdname(mddev)); 127 mdname(mddev));
131 } 128 }
132 if (!c) { 129 if (!c) {
133 pr_debug("md/raid0:%s: ==> UNIQUE\n", 130 printk(KERN_INFO "md/raid0:%s: ==> UNIQUE\n",
134 mdname(mddev)); 131 mdname(mddev));
135 conf->nr_strip_zones++; 132 conf->nr_strip_zones++;
136 pr_debug("md/raid0:%s: %d zones\n", 133 printk(KERN_INFO "md/raid0:%s: %d zones\n",
137 mdname(mddev), conf->nr_strip_zones); 134 mdname(mddev), conf->nr_strip_zones);
138 } 135 }
139 } 136 }
140 pr_debug("md/raid0:%s: FINAL %d zones\n", 137 printk(KERN_INFO "md/raid0:%s: FINAL %d zones\n",
141 mdname(mddev), conf->nr_strip_zones); 138 mdname(mddev), conf->nr_strip_zones);
142 err = -ENOMEM; 139 err = -ENOMEM;
143 conf->strip_zone = kzalloc(sizeof(struct strip_zone)* 140 conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
144 conf->nr_strip_zones, GFP_KERNEL); 141 conf->nr_strip_zones, GFP_KERNEL);
145 if (!conf->strip_zone) 142 if (!conf->strip_zone)
146 goto abort; 143 goto abort;
147 conf->devlist = kzalloc(sizeof(struct md_rdev*)* 144 conf->devlist = kzalloc(sizeof(mdk_rdev_t*)*
148 conf->nr_strip_zones*mddev->raid_disks, 145 conf->nr_strip_zones*mddev->raid_disks,
149 GFP_KERNEL); 146 GFP_KERNEL);
150 if (!conf->devlist) 147 if (!conf->devlist)
@@ -158,7 +155,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
158 smallest = NULL; 155 smallest = NULL;
159 dev = conf->devlist; 156 dev = conf->devlist;
160 err = -EINVAL; 157 err = -EINVAL;
161 rdev_for_each(rdev1, mddev) { 158 list_for_each_entry(rdev1, &mddev->disks, same_set) {
162 int j = rdev1->raid_disk; 159 int j = rdev1->raid_disk;
163 160
164 if (mddev->level == 10) { 161 if (mddev->level == 10) {
@@ -189,16 +186,19 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
189 186
190 disk_stack_limits(mddev->gendisk, rdev1->bdev, 187 disk_stack_limits(mddev->gendisk, rdev1->bdev,
191 rdev1->data_offset << 9); 188 rdev1->data_offset << 9);
189 /* as we don't honour merge_bvec_fn, we must never risk
190 * violating it, so limit ->max_segments to 1, lying within
191 * a single page.
192 */
192 193
193 if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) 194 if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) {
194 conf->has_merge_bvec = 1; 195 blk_queue_max_segments(mddev->queue, 1);
195 196 blk_queue_segment_boundary(mddev->queue,
197 PAGE_CACHE_SIZE - 1);
198 }
196 if (!smallest || (rdev1->sectors < smallest->sectors)) 199 if (!smallest || (rdev1->sectors < smallest->sectors))
197 smallest = rdev1; 200 smallest = rdev1;
198 cnt++; 201 cnt++;
199
200 if (blk_queue_discard(bdev_get_queue(rdev1->bdev)))
201 discard_supported = true;
202 } 202 }
203 if (cnt != mddev->raid_disks) { 203 if (cnt != mddev->raid_disks) {
204 printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - " 204 printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - "
@@ -218,45 +218,44 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
218 zone = conf->strip_zone + i; 218 zone = conf->strip_zone + i;
219 dev = conf->devlist + i * mddev->raid_disks; 219 dev = conf->devlist + i * mddev->raid_disks;
220 220
221 pr_debug("md/raid0:%s: zone %d\n", mdname(mddev), i); 221 printk(KERN_INFO "md/raid0:%s: zone %d\n",
222 mdname(mddev), i);
222 zone->dev_start = smallest->sectors; 223 zone->dev_start = smallest->sectors;
223 smallest = NULL; 224 smallest = NULL;
224 c = 0; 225 c = 0;
225 226
226 for (j=0; j<cnt; j++) { 227 for (j=0; j<cnt; j++) {
227 rdev = conf->devlist[j]; 228 rdev = conf->devlist[j];
229 printk(KERN_INFO "md/raid0:%s: checking %s ...",
230 mdname(mddev),
231 bdevname(rdev->bdev, b));
228 if (rdev->sectors <= zone->dev_start) { 232 if (rdev->sectors <= zone->dev_start) {
229 pr_debug("md/raid0:%s: checking %s ... nope\n", 233 printk(KERN_CONT " nope.\n");
230 mdname(mddev),
231 bdevname(rdev->bdev, b));
232 continue; 234 continue;
233 } 235 }
234 pr_debug("md/raid0:%s: checking %s ..." 236 printk(KERN_CONT " contained as device %d\n", c);
235 " contained as device %d\n",
236 mdname(mddev),
237 bdevname(rdev->bdev, b), c);
238 dev[c] = rdev; 237 dev[c] = rdev;
239 c++; 238 c++;
240 if (!smallest || rdev->sectors < smallest->sectors) { 239 if (!smallest || rdev->sectors < smallest->sectors) {
241 smallest = rdev; 240 smallest = rdev;
242 pr_debug("md/raid0:%s: (%llu) is smallest!.\n", 241 printk(KERN_INFO "md/raid0:%s: (%llu) is smallest!.\n",
243 mdname(mddev), 242 mdname(mddev),
244 (unsigned long long)rdev->sectors); 243 (unsigned long long)rdev->sectors);
245 } 244 }
246 } 245 }
247 246
248 zone->nb_dev = c; 247 zone->nb_dev = c;
249 sectors = (smallest->sectors - zone->dev_start) * c; 248 sectors = (smallest->sectors - zone->dev_start) * c;
250 pr_debug("md/raid0:%s: zone->nb_dev: %d, sectors: %llu\n", 249 printk(KERN_INFO "md/raid0:%s: zone->nb_dev: %d, sectors: %llu\n",
251 mdname(mddev), 250 mdname(mddev),
252 zone->nb_dev, (unsigned long long)sectors); 251 zone->nb_dev, (unsigned long long)sectors);
253 252
254 curr_zone_end += sectors; 253 curr_zone_end += sectors;
255 zone->zone_end = curr_zone_end; 254 zone->zone_end = curr_zone_end;
256 255
257 pr_debug("md/raid0:%s: current zone start: %llu\n", 256 printk(KERN_INFO "md/raid0:%s: current zone start: %llu\n",
258 mdname(mddev), 257 mdname(mddev),
259 (unsigned long long)smallest->sectors); 258 (unsigned long long)smallest->sectors);
260 } 259 }
261 mddev->queue->backing_dev_info.congested_fn = raid0_congested; 260 mddev->queue->backing_dev_info.congested_fn = raid0_congested;
262 mddev->queue->backing_dev_info.congested_data = mddev; 261 mddev->queue->backing_dev_info.congested_data = mddev;
@@ -276,12 +275,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
276 blk_queue_io_opt(mddev->queue, 275 blk_queue_io_opt(mddev->queue,
277 (mddev->chunk_sectors << 9) * mddev->raid_disks); 276 (mddev->chunk_sectors << 9) * mddev->raid_disks);
278 277
279 if (!discard_supported) 278 printk(KERN_INFO "md/raid0:%s: done.\n", mdname(mddev));
280 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
281 else
282 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
283
284 pr_debug("md/raid0:%s: done.\n", mdname(mddev));
285 *private_conf = conf; 279 *private_conf = conf;
286 280
287 return 0; 281 return 0;
@@ -293,64 +287,8 @@ abort:
293 return err; 287 return err;
294} 288}
295 289
296/* Find the zone which holds a particular offset
297 * Update *sectorp to be an offset in that zone
298 */
299static struct strip_zone *find_zone(struct r0conf *conf,
300 sector_t *sectorp)
301{
302 int i;
303 struct strip_zone *z = conf->strip_zone;
304 sector_t sector = *sectorp;
305
306 for (i = 0; i < conf->nr_strip_zones; i++)
307 if (sector < z[i].zone_end) {
308 if (i)
309 *sectorp = sector - z[i-1].zone_end;
310 return z + i;
311 }
312 BUG();
313}
314
315/*
316 * remaps the bio to the target device. We separate two flows:
317 * a power-of-2 flow and a general flow, for the sake of performance.
318*/
319static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
320 sector_t sector, sector_t *sector_offset)
321{
322 unsigned int sect_in_chunk;
323 sector_t chunk;
324 struct r0conf *conf = mddev->private;
325 int raid_disks = conf->strip_zone[0].nb_dev;
326 unsigned int chunk_sects = mddev->chunk_sectors;
327
328 if (is_power_of_2(chunk_sects)) {
329 int chunksect_bits = ffz(~chunk_sects);
330 /* find the sector offset inside the chunk */
331 sect_in_chunk = sector & (chunk_sects - 1);
332 sector >>= chunksect_bits;
333 /* chunk in zone */
334 chunk = *sector_offset;
335 /* quotient is the chunk in real device*/
336 sector_div(chunk, zone->nb_dev << chunksect_bits);
337 } else{
338 sect_in_chunk = sector_div(sector, chunk_sects);
339 chunk = *sector_offset;
340 sector_div(chunk, chunk_sects * zone->nb_dev);
341 }
342 /*
343 * position the bio over the real device
344 * real sector = chunk in device + starting of zone
345 * + the position in the chunk
346 */
347 *sector_offset = (chunk * chunk_sects) + sect_in_chunk;
348 return conf->devlist[(zone - conf->strip_zone)*raid_disks
349 + sector_div(sector, zone->nb_dev)];
350}
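
The arithmetic in find_zone() and map_sector() above (the same helpers reappear lower in this hunk on the other side of the diff) is easier to follow with numbers. A hedged, userspace-only worked example for the power-of-2 branch, assuming a single zone of 2 disks, 128-sector (64 KiB) chunks and dev_start = 0:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned long long sector = 1000;	/* absolute bio sector */
	unsigned long long zone_offset = 1000;	/* offset within the zone */
	unsigned chunk_sects = 128, nb_dev = 2;

	unsigned sect_in_chunk = sector & (chunk_sects - 1);	/* 1000 % 128 = 104 */
	unsigned long long stripe = zone_offset / (chunk_sects * nb_dev); /* 1000 / 256 = 3 */
	unsigned dev = (sector / chunk_sects) % nb_dev;		/* chunk 7 -> disk 1 */
	unsigned long long dev_sector = stripe * chunk_sects + sect_in_chunk;

	assert(dev == 1 && dev_sector == 488);
	printf("disk %u, sector %llu relative to zone->dev_start\n", dev, dev_sector);
	return 0;
}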
351
352/** 290/**
353 * raid0_mergeable_bvec -- tell bio layer if two requests can be merged 291 * raid0_mergeable_bvec -- tell bio layer if two requests can be merged
354 * @q: request queue 292 * @q: request queue
355 * @bvm: properties of new bio 293 * @bvm: properties of new bio
356 * @biovec: the request that could be merged to it. 294 * @biovec: the request that could be merged to it.
@@ -361,16 +299,11 @@ static int raid0_mergeable_bvec(struct request_queue *q,
361 struct bvec_merge_data *bvm, 299 struct bvec_merge_data *bvm,
362 struct bio_vec *biovec) 300 struct bio_vec *biovec)
363{ 301{
364 struct mddev *mddev = q->queuedata; 302 mddev_t *mddev = q->queuedata;
365 struct r0conf *conf = mddev->private;
366 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 303 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
367 sector_t sector_offset = sector;
368 int max; 304 int max;
369 unsigned int chunk_sectors = mddev->chunk_sectors; 305 unsigned int chunk_sectors = mddev->chunk_sectors;
370 unsigned int bio_sectors = bvm->bi_size >> 9; 306 unsigned int bio_sectors = bvm->bi_size >> 9;
371 struct strip_zone *zone;
372 struct md_rdev *rdev;
373 struct request_queue *subq;
374 307
375 if (is_power_of_2(chunk_sectors)) 308 if (is_power_of_2(chunk_sectors))
376 max = (chunk_sectors - ((sector & (chunk_sectors-1)) 309 max = (chunk_sectors - ((sector & (chunk_sectors-1))
@@ -378,49 +311,30 @@ static int raid0_mergeable_bvec(struct request_queue *q,
378 else 311 else
379 max = (chunk_sectors - (sector_div(sector, chunk_sectors) 312 max = (chunk_sectors - (sector_div(sector, chunk_sectors)
380 + bio_sectors)) << 9; 313 + bio_sectors)) << 9;
381 if (max < 0) 314 if (max < 0) max = 0; /* bio_add cannot handle a negative return */
382 max = 0; /* bio_add cannot handle a negative return */
383 if (max <= biovec->bv_len && bio_sectors == 0) 315 if (max <= biovec->bv_len && bio_sectors == 0)
384 return biovec->bv_len; 316 return biovec->bv_len;
385 if (max < biovec->bv_len) 317 else
386 /* too small already, no need to check further */
387 return max;
388 if (!conf->has_merge_bvec)
389 return max;
390
391 /* May need to check subordinate device */
392 sector = sector_offset;
393 zone = find_zone(mddev->private, &sector_offset);
394 rdev = map_sector(mddev, zone, sector, &sector_offset);
395 subq = bdev_get_queue(rdev->bdev);
396 if (subq->merge_bvec_fn) {
397 bvm->bi_bdev = rdev->bdev;
398 bvm->bi_sector = sector_offset + zone->dev_start +
399 rdev->data_offset;
400 return min(max, subq->merge_bvec_fn(subq, bvm, biovec));
401 } else
402 return max; 318 return max;
403} 319}
404 320
405static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks) 321static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
406{ 322{
407 sector_t array_sectors = 0; 323 sector_t array_sectors = 0;
408 struct md_rdev *rdev; 324 mdk_rdev_t *rdev;
409 325
410 WARN_ONCE(sectors || raid_disks, 326 WARN_ONCE(sectors || raid_disks,
411 "%s does not support generic reshape\n", __func__); 327 "%s does not support generic reshape\n", __func__);
412 328
413 rdev_for_each(rdev, mddev) 329 list_for_each_entry(rdev, &mddev->disks, same_set)
414 array_sectors += rdev->sectors; 330 array_sectors += rdev->sectors;
415 331
416 return array_sectors; 332 return array_sectors;
417} 333}
418 334
419static int raid0_stop(struct mddev *mddev); 335static int raid0_run(mddev_t *mddev)
420
421static int raid0_run(struct mddev *mddev)
422{ 336{
423 struct r0conf *conf; 337 raid0_conf_t *conf;
424 int ret; 338 int ret;
425 339
426 if (mddev->chunk_sectors == 0) { 340 if (mddev->chunk_sectors == 0) {
@@ -431,8 +345,6 @@ static int raid0_run(struct mddev *mddev)
431 if (md_check_no_bitmap(mddev)) 345 if (md_check_no_bitmap(mddev))
432 return -EINVAL; 346 return -EINVAL;
433 blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); 347 blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
434 blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
435 blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
436 348
437 /* if private is not null, we are here after takeover */ 349 /* if private is not null, we are here after takeover */
438 if (mddev->private == NULL) { 350 if (mddev->private == NULL) {
@@ -467,17 +379,12 @@ static int raid0_run(struct mddev *mddev)
467 379
468 blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); 380 blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
469 dump_zones(mddev); 381 dump_zones(mddev);
470 382 return md_integrity_register(mddev);
471 ret = md_integrity_register(mddev);
472 if (ret)
473 raid0_stop(mddev);
474
475 return ret;
476} 383}
477 384
478static int raid0_stop(struct mddev *mddev) 385static int raid0_stop(mddev_t *mddev)
479{ 386{
480 struct r0conf *conf = mddev->private; 387 raid0_conf_t *conf = mddev->private;
481 388
482 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 389 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
483 kfree(conf->strip_zone); 390 kfree(conf->strip_zone);
@@ -487,10 +394,66 @@ static int raid0_stop(struct mddev *mddev)
487 return 0; 394 return 0;
488} 395}
489 396
397/* Find the zone which holds a particular offset
398 * Update *sectorp to be an offset in that zone
399 */
400static struct strip_zone *find_zone(struct raid0_private_data *conf,
401 sector_t *sectorp)
402{
403 int i;
404 struct strip_zone *z = conf->strip_zone;
405 sector_t sector = *sectorp;
406
407 for (i = 0; i < conf->nr_strip_zones; i++)
408 if (sector < z[i].zone_end) {
409 if (i)
410 *sectorp = sector - z[i-1].zone_end;
411 return z + i;
412 }
413 BUG();
414}
415
416/*
417 * remaps the bio to the target device. We separate two flows:
418 * a power-of-2 flow and a general flow, for the sake of performance.
419*/
420static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone,
421 sector_t sector, sector_t *sector_offset)
422{
423 unsigned int sect_in_chunk;
424 sector_t chunk;
425 raid0_conf_t *conf = mddev->private;
426 int raid_disks = conf->strip_zone[0].nb_dev;
427 unsigned int chunk_sects = mddev->chunk_sectors;
428
429 if (is_power_of_2(chunk_sects)) {
430 int chunksect_bits = ffz(~chunk_sects);
431 /* find the sector offset inside the chunk */
432 sect_in_chunk = sector & (chunk_sects - 1);
433 sector >>= chunksect_bits;
434 /* chunk in zone */
435 chunk = *sector_offset;
436 /* quotient is the chunk in real device*/
437 sector_div(chunk, zone->nb_dev << chunksect_bits);
438 } else{
439 sect_in_chunk = sector_div(sector, chunk_sects);
440 chunk = *sector_offset;
441 sector_div(chunk, chunk_sects * zone->nb_dev);
442 }
443 /*
444 * position the bio over the real device
445 * real sector = chunk in device + starting of zone
446 * + the position in the chunk
447 */
448 *sector_offset = (chunk * chunk_sects) + sect_in_chunk;
449 return conf->devlist[(zone - conf->strip_zone)*raid_disks
450 + sector_div(sector, zone->nb_dev)];
451}
452
490/* 453/*
491 * Is io distribute over 1 or more chunks ? 454 * Is io distribute over 1 or more chunks ?
492*/ 455*/
493static inline int is_io_in_chunk_boundary(struct mddev *mddev, 456static inline int is_io_in_chunk_boundary(mddev_t *mddev,
494 unsigned int chunk_sects, struct bio *bio) 457 unsigned int chunk_sects, struct bio *bio)
495{ 458{
496 if (likely(is_power_of_2(chunk_sects))) { 459 if (likely(is_power_of_2(chunk_sects))) {
@@ -503,16 +466,16 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
503 } 466 }
504} 467}
505 468
506static void raid0_make_request(struct mddev *mddev, struct bio *bio) 469static int raid0_make_request(mddev_t *mddev, struct bio *bio)
507{ 470{
508 unsigned int chunk_sects; 471 unsigned int chunk_sects;
509 sector_t sector_offset; 472 sector_t sector_offset;
510 struct strip_zone *zone; 473 struct strip_zone *zone;
511 struct md_rdev *tmp_dev; 474 mdk_rdev_t *tmp_dev;
512 475
513 if (unlikely(bio->bi_rw & REQ_FLUSH)) { 476 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
514 md_flush_request(mddev, bio); 477 md_flush_request(mddev, bio);
515 return; 478 return 0;
516 } 479 }
517 480
518 chunk_sects = mddev->chunk_sectors; 481 chunk_sects = mddev->chunk_sectors;
@@ -520,7 +483,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
520 sector_t sector = bio->bi_sector; 483 sector_t sector = bio->bi_sector;
521 struct bio_pair *bp; 484 struct bio_pair *bp;
522 /* Sanity check -- queue functions should prevent this happening */ 485 /* Sanity check -- queue functions should prevent this happening */
523 if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || 486 if (bio->bi_vcnt != 1 ||
524 bio->bi_idx != 0) 487 bio->bi_idx != 0)
525 goto bad_map; 488 goto bad_map;
526 /* This is a one page bio that upper layers 489 /* This is a one page bio that upper layers
@@ -532,29 +495,26 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
532 else 495 else
533 bp = bio_split(bio, chunk_sects - 496 bp = bio_split(bio, chunk_sects -
534 sector_div(sector, chunk_sects)); 497 sector_div(sector, chunk_sects));
535 raid0_make_request(mddev, &bp->bio1); 498 if (raid0_make_request(mddev, &bp->bio1))
536 raid0_make_request(mddev, &bp->bio2); 499 generic_make_request(&bp->bio1);
500 if (raid0_make_request(mddev, &bp->bio2))
501 generic_make_request(&bp->bio2);
502
537 bio_pair_release(bp); 503 bio_pair_release(bp);
538 return; 504 return 0;
539 } 505 }
540 506
541 sector_offset = bio->bi_sector; 507 sector_offset = bio->bi_sector;
542 zone = find_zone(mddev->private, &sector_offset); 508 zone = find_zone(mddev->private, &sector_offset);
543 tmp_dev = map_sector(mddev, zone, bio->bi_sector, 509 tmp_dev = map_sector(mddev, zone, bio->bi_sector,
544 &sector_offset); 510 &sector_offset);
545 bio->bi_bdev = tmp_dev->bdev; 511 bio->bi_bdev = tmp_dev->bdev;
546 bio->bi_sector = sector_offset + zone->dev_start + 512 bio->bi_sector = sector_offset + zone->dev_start +
547 tmp_dev->data_offset; 513 tmp_dev->data_offset;
548 514 /*
549 if (unlikely((bio->bi_rw & REQ_DISCARD) && 515 * Let the main block layer submit the IO and resolve recursion:
550 !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { 516 */
551 /* Just ignore it */ 517 return 1;
552 bio_endio(bio, 0);
553 return;
554 }
555
556 generic_make_request(bio);
557 return;
558 518
559bad_map: 519bad_map:
560 printk("md/raid0:%s: make_request bug: can't convert block across chunks" 520 printk("md/raid0:%s: make_request bug: can't convert block across chunks"
@@ -563,19 +523,46 @@ bad_map:
563 (unsigned long long)bio->bi_sector, bio->bi_size >> 10); 523 (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
564 524
565 bio_io_error(bio); 525 bio_io_error(bio);
566 return; 526 return 0;
567} 527}
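
Both versions above split a bio that straddles a chunk boundary at "distance to the next boundary". A hedged, userspace illustration of that split arithmetic for the power-of-2 case:

#include <assert.h>

/* Size in sectors of the first half of the split (bp->bio1). */
static unsigned split_sectors(unsigned long long sector, unsigned chunk_sects)
{
	return chunk_sects - (sector & (chunk_sects - 1));
}

int main(void)
{
	/* A 16-sector bio at sector 120 with 128-sector chunks: split 8 + 8. */
	assert(split_sectors(120, 128) == 8);
	return 0;
}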
568 528
569static void raid0_status(struct seq_file *seq, struct mddev *mddev) 529static void raid0_status(struct seq_file *seq, mddev_t *mddev)
570{ 530{
531#undef MD_DEBUG
532#ifdef MD_DEBUG
533 int j, k, h;
534 char b[BDEVNAME_SIZE];
535 raid0_conf_t *conf = mddev->private;
536 int raid_disks = conf->strip_zone[0].nb_dev;
537
538 sector_t zone_size;
539 sector_t zone_start = 0;
540 h = 0;
541
542 for (j = 0; j < conf->nr_strip_zones; j++) {
543 seq_printf(seq, " z%d", j);
544 seq_printf(seq, "=[");
545 for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
546 seq_printf(seq, "%s/", bdevname(
547 conf->devlist[j*raid_disks + k]
548 ->bdev, b));
549
550 zone_size = conf->strip_zone[j].zone_end - zone_start;
551 seq_printf(seq, "] ze=%lld ds=%lld s=%lld\n",
552 (unsigned long long)zone_start>>1,
553 (unsigned long long)conf->strip_zone[j].dev_start>>1,
554 (unsigned long long)zone_size>>1);
555 zone_start = conf->strip_zone[j].zone_end;
556 }
557#endif
571 seq_printf(seq, " %dk chunks", mddev->chunk_sectors / 2); 558 seq_printf(seq, " %dk chunks", mddev->chunk_sectors / 2);
572 return; 559 return;
573} 560}
574 561
575static void *raid0_takeover_raid45(struct mddev *mddev) 562static void *raid0_takeover_raid45(mddev_t *mddev)
576{ 563{
577 struct md_rdev *rdev; 564 mdk_rdev_t *rdev;
578 struct r0conf *priv_conf; 565 raid0_conf_t *priv_conf;
579 566
580 if (mddev->degraded != 1) { 567 if (mddev->degraded != 1) {
581 printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n", 568 printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n",
@@ -584,7 +571,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
584 return ERR_PTR(-EINVAL); 571 return ERR_PTR(-EINVAL);
585 } 572 }
586 573
587 rdev_for_each(rdev, mddev) { 574 list_for_each_entry(rdev, &mddev->disks, same_set) {
588 /* check slot number for a disk */ 575 /* check slot number for a disk */
589 if (rdev->raid_disk == mddev->raid_disks-1) { 576 if (rdev->raid_disk == mddev->raid_disks-1) {
590 printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", 577 printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n",
@@ -606,9 +593,9 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
606 return priv_conf; 593 return priv_conf;
607} 594}
608 595
609static void *raid0_takeover_raid10(struct mddev *mddev) 596static void *raid0_takeover_raid10(mddev_t *mddev)
610{ 597{
611 struct r0conf *priv_conf; 598 raid0_conf_t *priv_conf;
612 599
613 /* Check layout: 600 /* Check layout:
614 * - far_copies must be 1 601 * - far_copies must be 1
@@ -647,10 +634,9 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
647 return priv_conf; 634 return priv_conf;
648} 635}
649 636
650static void *raid0_takeover_raid1(struct mddev *mddev) 637static void *raid0_takeover_raid1(mddev_t *mddev)
651{ 638{
652 struct r0conf *priv_conf; 639 raid0_conf_t *priv_conf;
653 int chunksect;
654 640
655 /* Check layout: 641 /* Check layout:
656 * - (N - 1) mirror drives must be already faulty 642 * - (N - 1) mirror drives must be already faulty
@@ -661,25 +647,10 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
661 return ERR_PTR(-EINVAL); 647 return ERR_PTR(-EINVAL);
662 } 648 }
663 649
664 /*
665 * a raid1 doesn't have the notion of chunk size, so
666 * figure out the largest suitable size we can use.
667 */
668 chunksect = 64 * 2; /* 64K by default */
669
670 /* The array must be an exact multiple of chunksize */
671 while (chunksect && (mddev->array_sectors & (chunksect - 1)))
672 chunksect >>= 1;
673
674 if ((chunksect << 9) < PAGE_SIZE)
675 /* array size does not allow a suitable chunk size */
676 return ERR_PTR(-EINVAL);
677
678 /* Set new parameters */ 650 /* Set new parameters */
679 mddev->new_level = 0; 651 mddev->new_level = 0;
680 mddev->new_layout = 0; 652 mddev->new_layout = 0;
681 mddev->new_chunk_sectors = chunksect; 653 mddev->new_chunk_sectors = 128; /* by default set chunk size to 64k */
682 mddev->chunk_sectors = chunksect;
683 mddev->delta_disks = 1 - mddev->raid_disks; 654 mddev->delta_disks = 1 - mddev->raid_disks;
684 mddev->raid_disks = 1; 655 mddev->raid_disks = 1;
685 /* make sure it will be not marked as dirty */ 656 /* make sure it will be not marked as dirty */
@@ -689,7 +660,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
689 return priv_conf; 660 return priv_conf;
690} 661}
691 662
692static void *raid0_takeover(struct mddev *mddev) 663static void *raid0_takeover(mddev_t *mddev)
693{ 664{
694 /* raid0 can take over: 665 /* raid0 can take over:
695 * raid4 - if all data disks are active. 666 * raid4 - if all data disks are active.
@@ -720,11 +691,11 @@ static void *raid0_takeover(struct mddev *mddev)
720 return ERR_PTR(-EINVAL); 691 return ERR_PTR(-EINVAL);
721} 692}
722 693
723static void raid0_quiesce(struct mddev *mddev, int state) 694static void raid0_quiesce(mddev_t *mddev, int state)
724{ 695{
725} 696}
726 697
727static struct md_personality raid0_personality= 698static struct mdk_personality raid0_personality=
728{ 699{
729 .name = "raid0", 700 .name = "raid0",
730 .level = 0, 701 .level = 0,
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 05539d9c97f..91f8e876ee6 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -1,19 +1,20 @@
1#ifndef _RAID0_H 1#ifndef _RAID0_H
2#define _RAID0_H 2#define _RAID0_H
3 3
4struct strip_zone { 4struct strip_zone
5{
5 sector_t zone_end; /* Start of the next zone (in sectors) */ 6 sector_t zone_end; /* Start of the next zone (in sectors) */
6 sector_t dev_start; /* Zone offset in real dev (in sectors) */ 7 sector_t dev_start; /* Zone offset in real dev (in sectors) */
7 int nb_dev; /* # of devices attached to the zone */ 8 int nb_dev; /* # of devices attached to the zone */
8}; 9};
9 10
10struct r0conf { 11struct raid0_private_data
11 struct strip_zone *strip_zone; 12{
12 struct md_rdev **devlist; /* lists of rdevs, pointed to 13 struct strip_zone *strip_zone;
13 * by strip_zone->dev */ 14 mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
14 int nr_strip_zones; 15 int nr_strip_zones;
15 int has_merge_bvec; /* at least one member has
16 * a merge_bvec_fn */
17}; 16};
18 17
18typedef struct raid0_private_data raid0_conf_t;
19
19#endif 20#endif
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d5bddfc4010..606fc04fd76 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -34,45 +34,28 @@
34#include <linux/slab.h> 34#include <linux/slab.h>
35#include <linux/delay.h> 35#include <linux/delay.h>
36#include <linux/blkdev.h> 36#include <linux/blkdev.h>
37#include <linux/module.h>
38#include <linux/seq_file.h> 37#include <linux/seq_file.h>
39#include <linux/ratelimit.h> 38#include <linux/ratelimit.h>
40#include "md.h" 39#include "md.h"
41#include "raid1.h" 40#include "raid1.h"
42#include "bitmap.h" 41#include "bitmap.h"
43 42
43#define DEBUG 0
44#define PRINTK(x...) do { if (DEBUG) printk(x); } while (0)
45
44/* 46/*
45 * Number of guaranteed r1bios in case of extreme VM load: 47 * Number of guaranteed r1bios in case of extreme VM load:
46 */ 48 */
47#define NR_RAID1_BIOS 256 49#define NR_RAID1_BIOS 256
48 50
49/* when we get a read error on a read-only array, we redirect to another
50 * device without failing the first device, or trying to over-write to
51 * correct the read error. To keep track of bad blocks on a per-bio
52 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
53 */
54#define IO_BLOCKED ((struct bio *)1)
55/* When we successfully write to a known bad-block, we need to remove the
56 * bad-block marking which must be done from process context. So we record
57 * the success by setting devs[n].bio to IO_MADE_GOOD
58 */
59#define IO_MADE_GOOD ((struct bio *)2)
60
61#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
62
63/* When there are this many requests queue to be written by
64 * the raid1 thread, we become 'congested' to provide back-pressure
65 * for writeback.
66 */
67static int max_queued_requests = 1024;
68 51
69static void allow_barrier(struct r1conf *conf); 52static void allow_barrier(conf_t *conf);
70static void lower_barrier(struct r1conf *conf); 53static void lower_barrier(conf_t *conf);
71 54
72static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) 55static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
73{ 56{
74 struct pool_info *pi = data; 57 struct pool_info *pi = data;
75 int size = offsetof(struct r1bio, bios[pi->raid_disks]); 58 int size = offsetof(r1bio_t, bios[pi->raid_disks]);
76 59
77 /* allocate a r1bio with room for raid_disks entries in the bios array */ 60 /* allocate a r1bio with room for raid_disks entries in the bios array */
78 return kzalloc(size, gfp_flags); 61 return kzalloc(size, gfp_flags);
@@ -93,7 +76,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
93{ 76{
94 struct pool_info *pi = data; 77 struct pool_info *pi = data;
95 struct page *page; 78 struct page *page;
96 struct r1bio *r1_bio; 79 r1bio_t *r1_bio;
97 struct bio *bio; 80 struct bio *bio;
98 int i, j; 81 int i, j;
99 82
@@ -149,7 +132,7 @@ out_free_pages:
149 put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page); 132 put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
150 j = -1; 133 j = -1;
151out_free_bio: 134out_free_bio:
152 while (++j < pi->raid_disks) 135 while ( ++j < pi->raid_disks )
153 bio_put(r1_bio->bios[j]); 136 bio_put(r1_bio->bios[j]);
154 r1bio_pool_free(r1_bio, data); 137 r1bio_pool_free(r1_bio, data);
155 return NULL; 138 return NULL;
@@ -159,7 +142,7 @@ static void r1buf_pool_free(void *__r1_bio, void *data)
159{ 142{
160 struct pool_info *pi = data; 143 struct pool_info *pi = data;
161 int i,j; 144 int i,j;
162 struct r1bio *r1bio = __r1_bio; 145 r1bio_t *r1bio = __r1_bio;
163 146
164 for (i = 0; i < RESYNC_PAGES; i++) 147 for (i = 0; i < RESYNC_PAGES; i++)
165 for (j = pi->raid_disks; j-- ;) { 148 for (j = pi->raid_disks; j-- ;) {
@@ -174,11 +157,11 @@ static void r1buf_pool_free(void *__r1_bio, void *data)
174 r1bio_pool_free(r1bio, data); 157 r1bio_pool_free(r1bio, data);
175} 158}
176 159
177static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio) 160static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
178{ 161{
179 int i; 162 int i;
180 163
181 for (i = 0; i < conf->raid_disks * 2; i++) { 164 for (i = 0; i < conf->raid_disks; i++) {
182 struct bio **bio = r1_bio->bios + i; 165 struct bio **bio = r1_bio->bios + i;
183 if (!BIO_SPECIAL(*bio)) 166 if (!BIO_SPECIAL(*bio))
184 bio_put(*bio); 167 bio_put(*bio);
@@ -186,20 +169,20 @@ static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio)
186 } 169 }
187} 170}
188 171
189static void free_r1bio(struct r1bio *r1_bio) 172static void free_r1bio(r1bio_t *r1_bio)
190{ 173{
191 struct r1conf *conf = r1_bio->mddev->private; 174 conf_t *conf = r1_bio->mddev->private;
192 175
193 put_all_bios(conf, r1_bio); 176 put_all_bios(conf, r1_bio);
194 mempool_free(r1_bio, conf->r1bio_pool); 177 mempool_free(r1_bio, conf->r1bio_pool);
195} 178}
196 179
197static void put_buf(struct r1bio *r1_bio) 180static void put_buf(r1bio_t *r1_bio)
198{ 181{
199 struct r1conf *conf = r1_bio->mddev->private; 182 conf_t *conf = r1_bio->mddev->private;
200 int i; 183 int i;
201 184
202 for (i = 0; i < conf->raid_disks * 2; i++) { 185 for (i=0; i<conf->raid_disks; i++) {
203 struct bio *bio = r1_bio->bios[i]; 186 struct bio *bio = r1_bio->bios[i];
204 if (bio->bi_end_io) 187 if (bio->bi_end_io)
205 rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev); 188 rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
@@ -210,11 +193,11 @@ static void put_buf(struct r1bio *r1_bio)
210 lower_barrier(conf); 193 lower_barrier(conf);
211} 194}
212 195
213static void reschedule_retry(struct r1bio *r1_bio) 196static void reschedule_retry(r1bio_t *r1_bio)
214{ 197{
215 unsigned long flags; 198 unsigned long flags;
216 struct mddev *mddev = r1_bio->mddev; 199 mddev_t *mddev = r1_bio->mddev;
217 struct r1conf *conf = mddev->private; 200 conf_t *conf = mddev->private;
218 201
219 spin_lock_irqsave(&conf->device_lock, flags); 202 spin_lock_irqsave(&conf->device_lock, flags);
220 list_add(&r1_bio->retry_list, &conf->retry_list); 203 list_add(&r1_bio->retry_list, &conf->retry_list);
@@ -230,11 +213,11 @@ static void reschedule_retry(struct r1bio *r1_bio)
230 * operation and are ready to return a success/failure code to the buffer 213 * operation and are ready to return a success/failure code to the buffer
231 * cache layer. 214 * cache layer.
232 */ 215 */
233static void call_bio_endio(struct r1bio *r1_bio) 216static void call_bio_endio(r1bio_t *r1_bio)
234{ 217{
235 struct bio *bio = r1_bio->master_bio; 218 struct bio *bio = r1_bio->master_bio;
236 int done; 219 int done;
237 struct r1conf *conf = r1_bio->mddev->private; 220 conf_t *conf = r1_bio->mddev->private;
238 221
239 if (bio->bi_phys_segments) { 222 if (bio->bi_phys_segments) {
240 unsigned long flags; 223 unsigned long flags;
@@ -257,17 +240,17 @@ static void call_bio_endio(struct r1bio *r1_bio)
257 } 240 }
258} 241}
259 242
260static void raid_end_bio_io(struct r1bio *r1_bio) 243static void raid_end_bio_io(r1bio_t *r1_bio)
261{ 244{
262 struct bio *bio = r1_bio->master_bio; 245 struct bio *bio = r1_bio->master_bio;
263 246
264 /* if nobody has done the final endio yet, do it now */ 247 /* if nobody has done the final endio yet, do it now */
265 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { 248 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
266 pr_debug("raid1: sync end %s on sectors %llu-%llu\n", 249 PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n",
267 (bio_data_dir(bio) == WRITE) ? "write" : "read", 250 (bio_data_dir(bio) == WRITE) ? "write" : "read",
268 (unsigned long long) bio->bi_sector, 251 (unsigned long long) bio->bi_sector,
269 (unsigned long long) bio->bi_sector + 252 (unsigned long long) bio->bi_sector +
270 (bio->bi_size >> 9) - 1); 253 (bio->bi_size >> 9) - 1);
271 254
272 call_bio_endio(r1_bio); 255 call_bio_endio(r1_bio);
273 } 256 }
@@ -277,39 +260,20 @@ static void raid_end_bio_io(struct r1bio *r1_bio)
277/* 260/*
278 * Update disk head position estimator based on IRQ completion info. 261 * Update disk head position estimator based on IRQ completion info.
279 */ 262 */
280static inline void update_head_pos(int disk, struct r1bio *r1_bio) 263static inline void update_head_pos(int disk, r1bio_t *r1_bio)
281{ 264{
282 struct r1conf *conf = r1_bio->mddev->private; 265 conf_t *conf = r1_bio->mddev->private;
283 266
284 conf->mirrors[disk].head_position = 267 conf->mirrors[disk].head_position =
285 r1_bio->sector + (r1_bio->sectors); 268 r1_bio->sector + (r1_bio->sectors);
286} 269}
287 270
288/*
289 * Find the disk number which triggered given bio
290 */
291static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
292{
293 int mirror;
294 struct r1conf *conf = r1_bio->mddev->private;
295 int raid_disks = conf->raid_disks;
296
297 for (mirror = 0; mirror < raid_disks * 2; mirror++)
298 if (r1_bio->bios[mirror] == bio)
299 break;
300
301 BUG_ON(mirror == raid_disks * 2);
302 update_head_pos(mirror, r1_bio);
303
304 return mirror;
305}
306
307static void raid1_end_read_request(struct bio *bio, int error) 271static void raid1_end_read_request(struct bio *bio, int error)
308{ 272{
309 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 273 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
310 struct r1bio *r1_bio = bio->bi_private; 274 r1bio_t *r1_bio = bio->bi_private;
311 int mirror; 275 int mirror;
312 struct r1conf *conf = r1_bio->mddev->private; 276 conf_t *conf = r1_bio->mddev->private;
313 277
314 mirror = r1_bio->read_disk; 278 mirror = r1_bio->read_disk;
315 /* 279 /*
@@ -333,10 +297,9 @@ static void raid1_end_read_request(struct bio *bio, int error)
333 spin_unlock_irqrestore(&conf->device_lock, flags); 297 spin_unlock_irqrestore(&conf->device_lock, flags);
334 } 298 }
335 299
336 if (uptodate) { 300 if (uptodate)
337 raid_end_bio_io(r1_bio); 301 raid_end_bio_io(r1_bio);
338 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); 302 else {
339 } else {
340 /* 303 /*
341 * oops, read error: 304 * oops, read error:
342 */ 305 */
@@ -350,11 +313,12 @@ static void raid1_end_read_request(struct bio *bio, int error)
350 (unsigned long long)r1_bio->sector); 313 (unsigned long long)r1_bio->sector);
351 set_bit(R1BIO_ReadError, &r1_bio->state); 314 set_bit(R1BIO_ReadError, &r1_bio->state);
352 reschedule_retry(r1_bio); 315 reschedule_retry(r1_bio);
353 /* don't drop the reference on read_disk yet */
354 } 316 }
317
318 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
355} 319}
356 320
357static void close_write(struct r1bio *r1_bio) 321static void close_write(r1bio_t *r1_bio)
358{ 322{
359 /* it really is the end of this request */ 323 /* it really is the end of this request */
360 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 324 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
@@ -373,7 +337,7 @@ static void close_write(struct r1bio *r1_bio)
373 md_write_end(r1_bio->mddev); 337 md_write_end(r1_bio->mddev);
374} 338}
375 339
376static void r1_bio_write_done(struct r1bio *r1_bio) 340static void r1_bio_write_done(r1bio_t *r1_bio)
377{ 341{
378 if (!atomic_dec_and_test(&r1_bio->remaining)) 342 if (!atomic_dec_and_test(&r1_bio->remaining))
379 return; 343 return;
@@ -392,12 +356,15 @@ static void r1_bio_write_done(struct r1bio *r1_bio)
392static void raid1_end_write_request(struct bio *bio, int error) 356static void raid1_end_write_request(struct bio *bio, int error)
393{ 357{
394 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 358 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
395 struct r1bio *r1_bio = bio->bi_private; 359 r1bio_t *r1_bio = bio->bi_private;
396 int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); 360 int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
397 struct r1conf *conf = r1_bio->mddev->private; 361 conf_t *conf = r1_bio->mddev->private;
398 struct bio *to_put = NULL; 362 struct bio *to_put = NULL;
399 363
400 mirror = find_bio_disk(r1_bio, bio); 364
365 for (mirror = 0; mirror < conf->raid_disks; mirror++)
366 if (r1_bio->bios[mirror] == bio)
367 break;
401 368
402 /* 369 /*
403 * 'one mirror IO has finished' event handler: 370 * 'one mirror IO has finished' event handler:
@@ -405,11 +372,6 @@ static void raid1_end_write_request(struct bio *bio, int error)
405 if (!uptodate) { 372 if (!uptodate) {
406 set_bit(WriteErrorSeen, 373 set_bit(WriteErrorSeen,
407 &conf->mirrors[mirror].rdev->flags); 374 &conf->mirrors[mirror].rdev->flags);
408 if (!test_and_set_bit(WantReplacement,
409 &conf->mirrors[mirror].rdev->flags))
410 set_bit(MD_RECOVERY_NEEDED, &
411 conf->mddev->recovery);
412
413 set_bit(R1BIO_WriteError, &r1_bio->state); 375 set_bit(R1BIO_WriteError, &r1_bio->state);
414 } else { 376 } else {
415 /* 377 /*
@@ -438,6 +400,8 @@ static void raid1_end_write_request(struct bio *bio, int error)
438 } 400 }
439 } 401 }
440 402
403 update_head_pos(mirror, r1_bio);
404
441 if (behind) { 405 if (behind) {
442 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) 406 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
443 atomic_dec(&r1_bio->behind_remaining); 407 atomic_dec(&r1_bio->behind_remaining);
@@ -454,11 +418,10 @@ static void raid1_end_write_request(struct bio *bio, int error)
454 /* Maybe we can return now */ 418 /* Maybe we can return now */
455 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { 419 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
456 struct bio *mbio = r1_bio->master_bio; 420 struct bio *mbio = r1_bio->master_bio;
457 pr_debug("raid1: behind end write sectors" 421 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
458 " %llu-%llu\n", 422 (unsigned long long) mbio->bi_sector,
459 (unsigned long long) mbio->bi_sector, 423 (unsigned long long) mbio->bi_sector +
460 (unsigned long long) mbio->bi_sector + 424 (mbio->bi_size >> 9) - 1);
461 (mbio->bi_size >> 9) - 1);
462 call_bio_endio(r1_bio); 425 call_bio_endio(r1_bio);
463 } 426 }
464 } 427 }
@@ -492,19 +455,17 @@ static void raid1_end_write_request(struct bio *bio, int error)
492 * 455 *
493 * The rdev for the device selected will have nr_pending incremented. 456 * The rdev for the device selected will have nr_pending incremented.
494 */ 457 */
495static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors) 458static int read_balance(conf_t *conf, r1bio_t *r1_bio, int *max_sectors)
496{ 459{
497 const sector_t this_sector = r1_bio->sector; 460 const sector_t this_sector = r1_bio->sector;
498 int sectors; 461 int sectors;
499 int best_good_sectors; 462 int best_good_sectors;
500 int best_disk, best_dist_disk, best_pending_disk; 463 int start_disk;
501 int has_nonrot_disk; 464 int best_disk;
502 int disk; 465 int i;
503 sector_t best_dist; 466 sector_t best_dist;
504 unsigned int min_pending; 467 mdk_rdev_t *rdev;
505 struct md_rdev *rdev;
506 int choose_first; 468 int choose_first;
507 int choose_next_idle;
508 469
509 rcu_read_lock(); 470 rcu_read_lock();
510 /* 471 /*
@@ -515,31 +476,30 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
515 retry: 476 retry:
516 sectors = r1_bio->sectors; 477 sectors = r1_bio->sectors;
517 best_disk = -1; 478 best_disk = -1;
518 best_dist_disk = -1;
519 best_dist = MaxSector; 479 best_dist = MaxSector;
520 best_pending_disk = -1;
521 min_pending = UINT_MAX;
522 best_good_sectors = 0; 480 best_good_sectors = 0;
523 has_nonrot_disk = 0;
524 choose_next_idle = 0;
525 481
526 if (conf->mddev->recovery_cp < MaxSector && 482 if (conf->mddev->recovery_cp < MaxSector &&
527 (this_sector + sectors >= conf->next_resync)) 483 (this_sector + sectors >= conf->next_resync)) {
528 choose_first = 1; 484 choose_first = 1;
529 else 485 start_disk = 0;
486 } else {
530 choose_first = 0; 487 choose_first = 0;
488 start_disk = conf->last_used;
489 }
531 490
532 for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { 491 for (i = 0 ; i < conf->raid_disks ; i++) {
533 sector_t dist; 492 sector_t dist;
534 sector_t first_bad; 493 sector_t first_bad;
535 int bad_sectors; 494 int bad_sectors;
536 unsigned int pending; 495
537 bool nonrot; 496 int disk = start_disk + i;
497 if (disk >= conf->raid_disks)
498 disk -= conf->raid_disks;
538 499
539 rdev = rcu_dereference(conf->mirrors[disk].rdev); 500 rdev = rcu_dereference(conf->mirrors[disk].rdev);
540 if (r1_bio->bios[disk] == IO_BLOCKED 501 if (r1_bio->bios[disk] == IO_BLOCKED
541 || rdev == NULL 502 || rdev == NULL
542 || test_bit(Unmerged, &rdev->flags)
543 || test_bit(Faulty, &rdev->flags)) 503 || test_bit(Faulty, &rdev->flags))
544 continue; 504 continue;
545 if (!test_bit(In_sync, &rdev->flags) && 505 if (!test_bit(In_sync, &rdev->flags) &&
@@ -593,77 +553,22 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
593 } else 553 } else
594 best_good_sectors = sectors; 554 best_good_sectors = sectors;
595 555
596 nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
597 has_nonrot_disk |= nonrot;
598 pending = atomic_read(&rdev->nr_pending);
599 dist = abs(this_sector - conf->mirrors[disk].head_position); 556 dist = abs(this_sector - conf->mirrors[disk].head_position);
600 if (choose_first) { 557 if (choose_first
601 best_disk = disk; 558 /* Don't change to another disk for sequential reads */
602 break; 559 || conf->next_seq_sect == this_sector
603 } 560 || dist == 0
604 /* Don't change to another disk for sequential reads */ 561 /* If device is idle, use it */
605 if (conf->mirrors[disk].next_seq_sect == this_sector 562 || atomic_read(&rdev->nr_pending) == 0) {
606 || dist == 0) {
607 int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
608 struct raid1_info *mirror = &conf->mirrors[disk];
609
610 best_disk = disk; 563 best_disk = disk;
611 /*
612 * If the buffered sequential IO size exceeds the optimal
613 * iosize, check whether there is an idle disk and, if so,
614 * choose it. read_balance could already have chosen an
615 * idle disk before noticing that this is sequential IO on
616 * this disk. That doesn't matter: this disk will go idle
617 * and be used again once the first disk's IO size exceeds
618 * the optimal iosize. This way the first disk's iosize is
619 * at least the optimal iosize. The second disk's iosize
620 * might be small, but that's not a big deal since, by the
621 * time the second disk starts IO, the first disk is
622 * likely still busy.
623 */
624 if (nonrot && opt_iosize > 0 &&
625 mirror->seq_start != MaxSector &&
626 mirror->next_seq_sect > opt_iosize &&
627 mirror->next_seq_sect - opt_iosize >=
628 mirror->seq_start) {
629 choose_next_idle = 1;
630 continue;
631 }
632 break; 564 break;
633 } 565 }
634 /* If device is idle, use it */
635 if (pending == 0) {
636 best_disk = disk;
637 break;
638 }
639
640 if (choose_next_idle)
641 continue;
642
643 if (min_pending > pending) {
644 min_pending = pending;
645 best_pending_disk = disk;
646 }
647
648 if (dist < best_dist) { 566 if (dist < best_dist) {
649 best_dist = dist; 567 best_dist = dist;
650 best_dist_disk = disk; 568 best_disk = disk;
651 } 569 }
652 } 570 }
653 571
654 /*
655 * If all disks are rotational, choose the closest disk. If any disk is
656 * non-rotational, choose the disk with the fewest pending requests even if
657 * that disk is rotational, which may or may not be optimal for arrays with
658 * mixed rotational/non-rotational disks depending on workload.
659 */
660 if (best_disk == -1) {
661 if (has_nonrot_disk)
662 best_disk = best_pending_disk;
663 else
664 best_disk = best_dist_disk;
665 }
666
667 if (best_disk >= 0) { 572 if (best_disk >= 0) {
668 rdev = rcu_dereference(conf->mirrors[best_disk].rdev); 573 rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
669 if (!rdev) 574 if (!rdev)
@@ -677,11 +582,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
677 goto retry; 582 goto retry;
678 } 583 }
679 sectors = best_good_sectors; 584 sectors = best_good_sectors;
680 585 conf->next_seq_sect = this_sector + sectors;
681 if (conf->mirrors[best_disk].next_seq_sect != this_sector) 586 conf->last_used = best_disk;
682 conf->mirrors[best_disk].seq_start = this_sector;
683
684 conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
685 } 587 }
686 rcu_read_unlock(); 588 rcu_read_unlock();
687 *max_sectors = sectors; 589 *max_sectors = sectors;
@@ -689,51 +591,14 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
689 return best_disk; 591 return best_disk;
690} 592}
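
For reference, the selection loop above reduces to a few rules: stay on the same disk for a strictly sequential read, grab an idle disk if one exists, otherwise take the shortest seek distance, and (on the newer side of this hunk) prefer the least-loaded member once any non-rotational device is present. The following is a minimal userspace sketch of that heuristic only; struct mirror_state, pick_read_mirror() and every field are invented for illustration and are not kernel API. The bad-block handling and the choose_next_idle/optimal-iosize refinement removed here are deliberately left out.

    #include <limits.h>
    #include <stdint.h>

    struct mirror_state {
        int      present;        /* device exists and is usable          */
        int      nonrot;         /* SSD-like, no seek penalty            */
        uint64_t head_position;  /* sector where the last request ended  */
        uint64_t next_seq_sect;  /* expected start of a sequential read  */
        unsigned pending;        /* requests currently in flight         */
    };

    /* Pick a mirror for a read at 'sector'. Returns an index or -1. */
    static int pick_read_mirror(const struct mirror_state *m, int nmirrors,
                                uint64_t sector)
    {
        int best_dist_disk = -1, best_pending_disk = -1;
        uint64_t best_dist = UINT64_MAX;
        unsigned min_pending = UINT_MAX;
        int has_nonrot = 0;

        for (int i = 0; i < nmirrors; i++) {
            uint64_t dist;

            if (!m[i].present)
                continue;
            /* sequential continuation or a completely idle disk wins */
            if (m[i].next_seq_sect == sector || m[i].pending == 0)
                return i;

            has_nonrot |= m[i].nonrot;
            dist = sector > m[i].head_position ?
                   sector - m[i].head_position :
                   m[i].head_position - sector;
            if (dist < best_dist) {
                best_dist = dist;
                best_dist_disk = i;
            }
            if (m[i].pending < min_pending) {
                min_pending = m[i].pending;
                best_pending_disk = i;
            }
        }
        /* any SSD present: balance by load; all rotational: by distance */
        return has_nonrot ? best_pending_disk : best_dist_disk;
    }
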
691 593
692static int raid1_mergeable_bvec(struct request_queue *q, 594int md_raid1_congested(mddev_t *mddev, int bits)
693 struct bvec_merge_data *bvm,
694 struct bio_vec *biovec)
695{
696 struct mddev *mddev = q->queuedata;
697 struct r1conf *conf = mddev->private;
698 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
699 int max = biovec->bv_len;
700
701 if (mddev->merge_check_needed) {
702 int disk;
703 rcu_read_lock();
704 for (disk = 0; disk < conf->raid_disks * 2; disk++) {
705 struct md_rdev *rdev = rcu_dereference(
706 conf->mirrors[disk].rdev);
707 if (rdev && !test_bit(Faulty, &rdev->flags)) {
708 struct request_queue *q =
709 bdev_get_queue(rdev->bdev);
710 if (q->merge_bvec_fn) {
711 bvm->bi_sector = sector +
712 rdev->data_offset;
713 bvm->bi_bdev = rdev->bdev;
714 max = min(max, q->merge_bvec_fn(
715 q, bvm, biovec));
716 }
717 }
718 }
719 rcu_read_unlock();
720 }
721 return max;
722
723}
724
725int md_raid1_congested(struct mddev *mddev, int bits)
726{ 595{
727 struct r1conf *conf = mddev->private; 596 conf_t *conf = mddev->private;
728 int i, ret = 0; 597 int i, ret = 0;
729 598
730 if ((bits & (1 << BDI_async_congested)) &&
731 conf->pending_count >= max_queued_requests)
732 return 1;
733
734 rcu_read_lock(); 599 rcu_read_lock();
735 for (i = 0; i < conf->raid_disks * 2; i++) { 600 for (i = 0; i < mddev->raid_disks; i++) {
736 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 601 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
737 if (rdev && !test_bit(Faulty, &rdev->flags)) { 602 if (rdev && !test_bit(Faulty, &rdev->flags)) {
738 struct request_queue *q = bdev_get_queue(rdev->bdev); 603 struct request_queue *q = bdev_get_queue(rdev->bdev);
739 604
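
md_raid1_congested() above polls each member's backing queue and folds the results into one answer. One way to think about congestion for a mirror set: writes go to every member, so any congested member counts; a read can be served by any single member, so only an all-congested set should block reads. A small sketch of that aggregation with invented names; the kernel function is more conservative and may treat reads like writes.

    #include <stdbool.h>

    /* One flag per member, as a lower-level congestion query would report. */
    struct member { bool congested; bool usable; };

    static bool raid1_set_congested(const struct member *m, int n, bool for_write)
    {
        bool ret = for_write ? false : true;

        for (int i = 0; i < n; i++) {
            if (!m[i].usable)
                continue;
            if (for_write)
                ret |= m[i].congested;   /* any congested member blocks writes */
            else
                ret &= m[i].congested;   /* one free member is enough for reads */
        }
        return ret;
    }
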
@@ -755,13 +620,13 @@ EXPORT_SYMBOL_GPL(md_raid1_congested);
755 620
756static int raid1_congested(void *data, int bits) 621static int raid1_congested(void *data, int bits)
757{ 622{
758 struct mddev *mddev = data; 623 mddev_t *mddev = data;
759 624
760 return mddev_congested(mddev, bits) || 625 return mddev_congested(mddev, bits) ||
761 md_raid1_congested(mddev, bits); 626 md_raid1_congested(mddev, bits);
762} 627}
763 628
764static void flush_pending_writes(struct r1conf *conf) 629static void flush_pending_writes(conf_t *conf)
765{ 630{
766 /* Any writes that have been queued but are awaiting 631 /* Any writes that have been queued but are awaiting
767 * bitmap updates get flushed here. 632 * bitmap updates get flushed here.
@@ -771,22 +636,15 @@ static void flush_pending_writes(struct r1conf *conf)
771 if (conf->pending_bio_list.head) { 636 if (conf->pending_bio_list.head) {
772 struct bio *bio; 637 struct bio *bio;
773 bio = bio_list_get(&conf->pending_bio_list); 638 bio = bio_list_get(&conf->pending_bio_list);
774 conf->pending_count = 0;
775 spin_unlock_irq(&conf->device_lock); 639 spin_unlock_irq(&conf->device_lock);
776 /* flush any pending bitmap writes to 640 /* flush any pending bitmap writes to
777 * disk before proceeding w/ I/O */ 641 * disk before proceeding w/ I/O */
778 bitmap_unplug(conf->mddev->bitmap); 642 bitmap_unplug(conf->mddev->bitmap);
779 wake_up(&conf->wait_barrier);
780 643
781 while (bio) { /* submit pending writes */ 644 while (bio) { /* submit pending writes */
782 struct bio *next = bio->bi_next; 645 struct bio *next = bio->bi_next;
783 bio->bi_next = NULL; 646 bio->bi_next = NULL;
784 if (unlikely((bio->bi_rw & REQ_DISCARD) && 647 generic_make_request(bio);
785 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
786 /* Just ignore it */
787 bio_endio(bio, 0);
788 else
789 generic_make_request(bio);
790 bio = next; 648 bio = next;
791 } 649 }
792 } else 650 } else
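
flush_pending_writes() is an instance of a common pattern: detach the entire queued list while holding the lock, release the lock, then walk the detached list and submit each entry. A small pthread-based sketch of the same pattern, with all names invented:

    #include <pthread.h>
    #include <stddef.h>

    struct work { struct work *next; void (*fn)(struct work *); };

    static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct work *pending_head;   /* singly linked list of queued work */

    static void flush_pending(void)
    {
        struct work *w;

        pthread_mutex_lock(&pending_lock);
        w = pending_head;                /* steal the whole list...          */
        pending_head = NULL;
        pthread_mutex_unlock(&pending_lock);

        while (w) {                      /* ...and run it with the lock free */
            struct work *next = w->next;
            w->next = NULL;
            w->fn(w);
            w = next;
        }
    }

Detaching the whole list keeps the critical section short and lets the submissions run without the lock held, which is why the kernel code drops device_lock before its while loop.
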
@@ -816,13 +674,13 @@ static void flush_pending_writes(struct r1conf *conf)
816 */ 674 */
817#define RESYNC_DEPTH 32 675#define RESYNC_DEPTH 32
818 676
819static void raise_barrier(struct r1conf *conf) 677static void raise_barrier(conf_t *conf)
820{ 678{
821 spin_lock_irq(&conf->resync_lock); 679 spin_lock_irq(&conf->resync_lock);
822 680
823 /* Wait until no block IO is waiting */ 681 /* Wait until no block IO is waiting */
824 wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, 682 wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
825 conf->resync_lock); 683 conf->resync_lock, );
826 684
827 /* block any new IO from starting */ 685 /* block any new IO from starting */
828 conf->barrier++; 686 conf->barrier++;
@@ -830,12 +688,12 @@ static void raise_barrier(struct r1conf *conf)
830 /* Now wait for all pending IO to complete */ 688 /* Now wait for all pending IO to complete */
831 wait_event_lock_irq(conf->wait_barrier, 689 wait_event_lock_irq(conf->wait_barrier,
832 !conf->nr_pending && conf->barrier < RESYNC_DEPTH, 690 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
833 conf->resync_lock); 691 conf->resync_lock, );
834 692
835 spin_unlock_irq(&conf->resync_lock); 693 spin_unlock_irq(&conf->resync_lock);
836} 694}
837 695
838static void lower_barrier(struct r1conf *conf) 696static void lower_barrier(conf_t *conf)
839{ 697{
840 unsigned long flags; 698 unsigned long flags;
841 BUG_ON(conf->barrier <= 0); 699 BUG_ON(conf->barrier <= 0);
@@ -845,33 +703,21 @@ static void lower_barrier(struct r1conf *conf)
845 wake_up(&conf->wait_barrier); 703 wake_up(&conf->wait_barrier);
846} 704}
847 705
848static void wait_barrier(struct r1conf *conf) 706static void wait_barrier(conf_t *conf)
849{ 707{
850 spin_lock_irq(&conf->resync_lock); 708 spin_lock_irq(&conf->resync_lock);
851 if (conf->barrier) { 709 if (conf->barrier) {
852 conf->nr_waiting++; 710 conf->nr_waiting++;
853 /* Wait for the barrier to drop. 711 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
854 * However if there are already pending 712 conf->resync_lock,
855 * requests (preventing the barrier from 713 );
856 * rising completely), and the
857 * pre-process bio queue isn't empty,
858 * then don't wait, as we need to empty
859 * that queue to get the nr_pending
860 * count down.
861 */
862 wait_event_lock_irq(conf->wait_barrier,
863 !conf->barrier ||
864 (conf->nr_pending &&
865 current->bio_list &&
866 !bio_list_empty(current->bio_list)),
867 conf->resync_lock);
868 conf->nr_waiting--; 714 conf->nr_waiting--;
869 } 715 }
870 conf->nr_pending++; 716 conf->nr_pending++;
871 spin_unlock_irq(&conf->resync_lock); 717 spin_unlock_irq(&conf->resync_lock);
872} 718}
873 719
874static void allow_barrier(struct r1conf *conf) 720static void allow_barrier(conf_t *conf)
875{ 721{
876 unsigned long flags; 722 unsigned long flags;
877 spin_lock_irqsave(&conf->resync_lock, flags); 723 spin_lock_irqsave(&conf->resync_lock, flags);
@@ -880,7 +726,7 @@ static void allow_barrier(struct r1conf *conf)
880 wake_up(&conf->wait_barrier); 726 wake_up(&conf->wait_barrier);
881} 727}
882 728
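
The raise_barrier()/wait_barrier()/allow_barrier() trio is a gate between resync and normal I/O: resync raises the barrier and waits for in-flight requests to drain, while normal I/O waits whenever the barrier is up and otherwise counts itself in and out. A condensed pthread sketch of that protocol follows; names are invented, and the nr_waiting bookkeeping and RESYNC_DEPTH throttling of the real code are omitted.

    #include <pthread.h>

    static pthread_mutex_t resync_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  wait_barrier_cv = PTHREAD_COND_INITIALIZER;
    static int barrier;      /* number of active resync barriers     */
    static int nr_pending;   /* normal I/O requests currently active */

    static void raise_barrier(void)
    {
        pthread_mutex_lock(&resync_lock);
        barrier++;                            /* block new normal I/O */
        while (nr_pending > 0)                /* wait for it to drain */
            pthread_cond_wait(&wait_barrier_cv, &resync_lock);
        pthread_mutex_unlock(&resync_lock);
    }

    static void lower_barrier(void)
    {
        pthread_mutex_lock(&resync_lock);
        barrier--;
        pthread_cond_broadcast(&wait_barrier_cv);
        pthread_mutex_unlock(&resync_lock);
    }

    static void wait_barrier(void)            /* called before normal I/O */
    {
        pthread_mutex_lock(&resync_lock);
        while (barrier > 0)
            pthread_cond_wait(&wait_barrier_cv, &resync_lock);
        nr_pending++;
        pthread_mutex_unlock(&resync_lock);
    }

    static void allow_barrier(void)           /* called when the I/O ends */
    {
        pthread_mutex_lock(&resync_lock);
        nr_pending--;
        pthread_cond_broadcast(&wait_barrier_cv);
        pthread_mutex_unlock(&resync_lock);
    }
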
883static void freeze_array(struct r1conf *conf) 729static void freeze_array(conf_t *conf)
884{ 730{
885 /* stop syncio and normal IO and wait for everything to 731 /* stop syncio and normal IO and wait for everything to
886 * go quiet. 732 * go quiet.
@@ -897,13 +743,13 @@ static void freeze_array(struct r1conf *conf)
897 spin_lock_irq(&conf->resync_lock); 743 spin_lock_irq(&conf->resync_lock);
898 conf->barrier++; 744 conf->barrier++;
899 conf->nr_waiting++; 745 conf->nr_waiting++;
900 wait_event_lock_irq_cmd(conf->wait_barrier, 746 wait_event_lock_irq(conf->wait_barrier,
901 conf->nr_pending == conf->nr_queued+1, 747 conf->nr_pending == conf->nr_queued+1,
902 conf->resync_lock, 748 conf->resync_lock,
903 flush_pending_writes(conf)); 749 flush_pending_writes(conf));
904 spin_unlock_irq(&conf->resync_lock); 750 spin_unlock_irq(&conf->resync_lock);
905} 751}
906static void unfreeze_array(struct r1conf *conf) 752static void unfreeze_array(conf_t *conf)
907{ 753{
908 /* reverse the effect of the freeze */ 754 /* reverse the effect of the freeze */
909 spin_lock_irq(&conf->resync_lock); 755 spin_lock_irq(&conf->resync_lock);
@@ -916,7 +762,7 @@ static void unfreeze_array(struct r1conf *conf)
916 762
917/* duplicate the data pages for behind I/O 763/* duplicate the data pages for behind I/O
918 */ 764 */
919static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio) 765static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio)
920{ 766{
921 int i; 767 int i;
922 struct bio_vec *bvec; 768 struct bio_vec *bvec;
@@ -945,52 +791,14 @@ do_sync_io:
945 if (bvecs[i].bv_page) 791 if (bvecs[i].bv_page)
946 put_page(bvecs[i].bv_page); 792 put_page(bvecs[i].bv_page);
947 kfree(bvecs); 793 kfree(bvecs);
948 pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); 794 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
949}
950
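
alloc_behind_pages() copies the write payload into private pages so the original bio can complete while the write-behind copy drains to write-mostly members; if any allocation fails, everything is freed and the caller falls back to ordinary synchronous writes (the do_sync_io path above). A rough userspace analogue, with invented names:

    #include <stdlib.h>
    #include <string.h>

    /* Duplicate 'nvecs' payload buffers of len[i] bytes each. Returns an
     * array of copies, or NULL if any allocation fails (the caller then
     * writes synchronously from the original buffers instead). */
    static void **dup_behind_pages(void *const *vecs, const size_t *len, int nvecs)
    {
        void **copy = calloc(nvecs, sizeof(*copy));
        if (!copy)
            return NULL;

        for (int i = 0; i < nvecs; i++) {
            copy[i] = malloc(len[i]);
            if (!copy[i])
                goto do_sync_io;          /* mirror the fallback above */
            memcpy(copy[i], vecs[i], len[i]);
        }
        return copy;

    do_sync_io:
        for (int i = 0; i < nvecs; i++)
            free(copy[i]);                /* free(NULL) is harmless */
        free(copy);
        return NULL;
    }
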
951struct raid1_plug_cb {
952 struct blk_plug_cb cb;
953 struct bio_list pending;
954 int pending_cnt;
955};
956
957static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
958{
959 struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb,
960 cb);
961 struct mddev *mddev = plug->cb.data;
962 struct r1conf *conf = mddev->private;
963 struct bio *bio;
964
965 if (from_schedule || current->bio_list) {
966 spin_lock_irq(&conf->device_lock);
967 bio_list_merge(&conf->pending_bio_list, &plug->pending);
968 conf->pending_count += plug->pending_cnt;
969 spin_unlock_irq(&conf->device_lock);
970 md_wakeup_thread(mddev->thread);
971 kfree(plug);
972 return;
973 }
974
975 /* we aren't scheduling, so we can do the write-out directly. */
976 bio = bio_list_get(&plug->pending);
977 bitmap_unplug(mddev->bitmap);
978 wake_up(&conf->wait_barrier);
979
980 while (bio) { /* submit pending writes */
981 struct bio *next = bio->bi_next;
982 bio->bi_next = NULL;
983 generic_make_request(bio);
984 bio = next;
985 }
986 kfree(plug);
987} 795}
988 796
989static void make_request(struct mddev *mddev, struct bio * bio) 797static int make_request(mddev_t *mddev, struct bio * bio)
990{ 798{
991 struct r1conf *conf = mddev->private; 799 conf_t *conf = mddev->private;
992 struct raid1_info *mirror; 800 mirror_info_t *mirror;
993 struct r1bio *r1_bio; 801 r1bio_t *r1_bio;
994 struct bio *read_bio; 802 struct bio *read_bio;
995 int i, disks; 803 int i, disks;
996 struct bitmap *bitmap; 804 struct bitmap *bitmap;
@@ -998,11 +806,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
998 const int rw = bio_data_dir(bio); 806 const int rw = bio_data_dir(bio);
999 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 807 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
1000 const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); 808 const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
1001 const unsigned long do_discard = (bio->bi_rw 809 mdk_rdev_t *blocked_rdev;
1002 & (REQ_DISCARD | REQ_SECURE)); 810 int plugged;
1003 struct md_rdev *blocked_rdev;
1004 struct blk_plug_cb *cb;
1005 struct raid1_plug_cb *plug = NULL;
1006 int first_clone; 811 int first_clone;
1007 int sectors_handled; 812 int sectors_handled;
1008 int max_sectors; 813 int max_sectors;
@@ -1074,7 +879,7 @@ read_again:
1074 if (rdisk < 0) { 879 if (rdisk < 0) {
1075 /* couldn't find anywhere to read from */ 880 /* couldn't find anywhere to read from */
1076 raid_end_bio_io(r1_bio); 881 raid_end_bio_io(r1_bio);
1077 return; 882 return 0;
1078 } 883 }
1079 mirror = conf->mirrors + rdisk; 884 mirror = conf->mirrors + rdisk;
1080 885
@@ -1132,17 +937,12 @@ read_again:
1132 goto read_again; 937 goto read_again;
1133 } else 938 } else
1134 generic_make_request(read_bio); 939 generic_make_request(read_bio);
1135 return; 940 return 0;
1136 } 941 }
1137 942
1138 /* 943 /*
1139 * WRITE: 944 * WRITE:
1140 */ 945 */
1141 if (conf->pending_count >= max_queued_requests) {
1142 md_wakeup_thread(mddev->thread);
1143 wait_event(conf->wait_barrier,
1144 conf->pending_count < max_queued_requests);
1145 }
1146 /* first select target devices under rcu_lock and 946 /* first select target devices under rcu_lock and
1147 * inc refcount on their rdev. Record them by setting 947 * inc refcount on their rdev. Record them by setting
1148 * bios[x] to bio 948 * bios[x] to bio
@@ -1153,24 +953,23 @@ read_again:
1153 * the bad blocks. Each set of writes gets it's own r1bio 953 * the bad blocks. Each set of writes gets it's own r1bio
1154 * with a set of bios attached. 954 * with a set of bios attached.
1155 */ 955 */
956 plugged = mddev_check_plugged(mddev);
1156 957
1157 disks = conf->raid_disks * 2; 958 disks = conf->raid_disks;
1158 retry_write: 959 retry_write:
1159 blocked_rdev = NULL; 960 blocked_rdev = NULL;
1160 rcu_read_lock(); 961 rcu_read_lock();
1161 max_sectors = r1_bio->sectors; 962 max_sectors = r1_bio->sectors;
1162 for (i = 0; i < disks; i++) { 963 for (i = 0; i < disks; i++) {
1163 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 964 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
1164 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 965 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1165 atomic_inc(&rdev->nr_pending); 966 atomic_inc(&rdev->nr_pending);
1166 blocked_rdev = rdev; 967 blocked_rdev = rdev;
1167 break; 968 break;
1168 } 969 }
1169 r1_bio->bios[i] = NULL; 970 r1_bio->bios[i] = NULL;
1170 if (!rdev || test_bit(Faulty, &rdev->flags) 971 if (!rdev || test_bit(Faulty, &rdev->flags)) {
1171 || test_bit(Unmerged, &rdev->flags)) { 972 set_bit(R1BIO_Degraded, &r1_bio->state);
1172 if (i < conf->raid_disks)
1173 set_bit(R1BIO_Degraded, &r1_bio->state);
1174 continue; 973 continue;
1175 } 974 }
1176 975
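
In outline, the write path walks every mirror slot under the RCU read lock, records the usable targets, marks the r1bio degraded for any missing or faulty slot, and, if it meets a device flagged Blocked, drops what it has and redoes the selection once the device is unblocked. A plain sketch of that control flow; the enum and function are invented for illustration.

    #include <stdbool.h>

    enum slot_state { SLOT_EMPTY, SLOT_FAULTY, SLOT_BLOCKED, SLOT_OK };

    /* Fill use[i] with the slots a write must go to. Returns false if a
     * blocked device was hit, in which case the caller waits for it to
     * clear and retries the selection from scratch. */
    static bool select_write_targets(const enum slot_state *slot, int nslots,
                                     bool *use, bool *degraded)
    {
        *degraded = false;
        for (int i = 0; i < nslots; i++) {
            use[i] = false;
            if (slot[i] == SLOT_BLOCKED)
                return false;             /* drop everything and retry */
            if (slot[i] == SLOT_EMPTY || slot[i] == SLOT_FAULTY) {
                *degraded = true;         /* the write proceeds without it */
                continue;
            }
            use[i] = true;
        }
        return true;
    }

A caller would loop on select_write_targets() until it returns true, which roughly mirrors the retry_write label above.
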
@@ -1301,27 +1100,13 @@ read_again:
1301 conf->mirrors[i].rdev->data_offset); 1100 conf->mirrors[i].rdev->data_offset);
1302 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 1101 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1303 mbio->bi_end_io = raid1_end_write_request; 1102 mbio->bi_end_io = raid1_end_write_request;
1304 mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard; 1103 mbio->bi_rw = WRITE | do_flush_fua | do_sync;
1305 mbio->bi_private = r1_bio; 1104 mbio->bi_private = r1_bio;
1306 1105
1307 atomic_inc(&r1_bio->remaining); 1106 atomic_inc(&r1_bio->remaining);
1308
1309 cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug));
1310 if (cb)
1311 plug = container_of(cb, struct raid1_plug_cb, cb);
1312 else
1313 plug = NULL;
1314 spin_lock_irqsave(&conf->device_lock, flags); 1107 spin_lock_irqsave(&conf->device_lock, flags);
1315 if (plug) { 1108 bio_list_add(&conf->pending_bio_list, mbio);
1316 bio_list_add(&plug->pending, mbio);
1317 plug->pending_cnt++;
1318 } else {
1319 bio_list_add(&conf->pending_bio_list, mbio);
1320 conf->pending_count++;
1321 }
1322 spin_unlock_irqrestore(&conf->device_lock, flags); 1109 spin_unlock_irqrestore(&conf->device_lock, flags);
1323 if (!plug)
1324 md_wakeup_thread(mddev->thread);
1325 } 1110 }
1326 /* Mustn't call r1_bio_write_done before this next test, 1111 /* Mustn't call r1_bio_write_done before this next test,
1327 * as it could result in the bio being freed. 1112 * as it could result in the bio being freed.
@@ -1344,18 +1129,23 @@ read_again:
1344 1129
1345 /* In case raid1d snuck in to freeze_array */ 1130 /* In case raid1d snuck in to freeze_array */
1346 wake_up(&conf->wait_barrier); 1131 wake_up(&conf->wait_barrier);
1132
1133 if (do_sync || !bitmap || !plugged)
1134 md_wakeup_thread(mddev->thread);
1135
1136 return 0;
1347} 1137}
1348 1138
1349static void status(struct seq_file *seq, struct mddev *mddev) 1139static void status(struct seq_file *seq, mddev_t *mddev)
1350{ 1140{
1351 struct r1conf *conf = mddev->private; 1141 conf_t *conf = mddev->private;
1352 int i; 1142 int i;
1353 1143
1354 seq_printf(seq, " [%d/%d] [", conf->raid_disks, 1144 seq_printf(seq, " [%d/%d] [", conf->raid_disks,
1355 conf->raid_disks - mddev->degraded); 1145 conf->raid_disks - mddev->degraded);
1356 rcu_read_lock(); 1146 rcu_read_lock();
1357 for (i = 0; i < conf->raid_disks; i++) { 1147 for (i = 0; i < conf->raid_disks; i++) {
1358 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 1148 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
1359 seq_printf(seq, "%s", 1149 seq_printf(seq, "%s",
1360 rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); 1150 rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
1361 } 1151 }
@@ -1364,10 +1154,10 @@ static void status(struct seq_file *seq, struct mddev *mddev)
1364} 1154}
1365 1155
1366 1156
1367static void error(struct mddev *mddev, struct md_rdev *rdev) 1157static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1368{ 1158{
1369 char b[BDEVNAME_SIZE]; 1159 char b[BDEVNAME_SIZE];
1370 struct r1conf *conf = mddev->private; 1160 conf_t *conf = mddev->private;
1371 1161
1372 /* 1162 /*
1373 * If it is not operational, then we have already marked it as dead 1163 * If it is not operational, then we have already marked it as dead
@@ -1407,7 +1197,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
1407 mdname(mddev), conf->raid_disks - mddev->degraded); 1197 mdname(mddev), conf->raid_disks - mddev->degraded);
1408} 1198}
1409 1199
1410static void print_conf(struct r1conf *conf) 1200static void print_conf(conf_t *conf)
1411{ 1201{
1412 int i; 1202 int i;
1413 1203
@@ -1422,7 +1212,7 @@ static void print_conf(struct r1conf *conf)
1422 rcu_read_lock(); 1212 rcu_read_lock();
1423 for (i = 0; i < conf->raid_disks; i++) { 1213 for (i = 0; i < conf->raid_disks; i++) {
1424 char b[BDEVNAME_SIZE]; 1214 char b[BDEVNAME_SIZE];
1425 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 1215 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
1426 if (rdev) 1216 if (rdev)
1427 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", 1217 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1428 i, !test_bit(In_sync, &rdev->flags), 1218 i, !test_bit(In_sync, &rdev->flags),
@@ -1432,7 +1222,7 @@ static void print_conf(struct r1conf *conf)
1432 rcu_read_unlock(); 1222 rcu_read_unlock();
1433} 1223}
1434 1224
1435static void close_sync(struct r1conf *conf) 1225static void close_sync(conf_t *conf)
1436{ 1226{
1437 wait_barrier(conf); 1227 wait_barrier(conf);
1438 allow_barrier(conf); 1228 allow_barrier(conf);
@@ -1441,10 +1231,10 @@ static void close_sync(struct r1conf *conf)
1441 conf->r1buf_pool = NULL; 1231 conf->r1buf_pool = NULL;
1442} 1232}
1443 1233
1444static int raid1_spare_active(struct mddev *mddev) 1234static int raid1_spare_active(mddev_t *mddev)
1445{ 1235{
1446 int i; 1236 int i;
1447 struct r1conf *conf = mddev->private; 1237 conf_t *conf = mddev->private;
1448 int count = 0; 1238 int count = 0;
1449 unsigned long flags; 1239 unsigned long flags;
1450 1240
@@ -1454,26 +1244,7 @@ static int raid1_spare_active(struct mddev *mddev)
1454 * Called under mddev lock, so rcu protection not needed. 1244 * Called under mddev lock, so rcu protection not needed.
1455 */ 1245 */
1456 for (i = 0; i < conf->raid_disks; i++) { 1246 for (i = 0; i < conf->raid_disks; i++) {
1457 struct md_rdev *rdev = conf->mirrors[i].rdev; 1247 mdk_rdev_t *rdev = conf->mirrors[i].rdev;
1458 struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
1459 if (repl
1460 && repl->recovery_offset == MaxSector
1461 && !test_bit(Faulty, &repl->flags)
1462 && !test_and_set_bit(In_sync, &repl->flags)) {
1463 /* replacement has just become active */
1464 if (!rdev ||
1465 !test_and_clear_bit(In_sync, &rdev->flags))
1466 count++;
1467 if (rdev) {
1468 /* Replaced device not technically
1469 * faulty, but we need to be sure
1470 * it gets removed and never re-added
1471 */
1472 set_bit(Faulty, &rdev->flags);
1473 sysfs_notify_dirent_safe(
1474 rdev->sysfs_state);
1475 }
1476 }
1477 if (rdev 1248 if (rdev
1478 && !test_bit(Faulty, &rdev->flags) 1249 && !test_bit(Faulty, &rdev->flags)
1479 && !test_and_set_bit(In_sync, &rdev->flags)) { 1250 && !test_and_set_bit(In_sync, &rdev->flags)) {
@@ -1490,15 +1261,14 @@ static int raid1_spare_active(struct mddev *mddev)
1490} 1261}
1491 1262
1492 1263
1493static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) 1264static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1494{ 1265{
1495 struct r1conf *conf = mddev->private; 1266 conf_t *conf = mddev->private;
1496 int err = -EEXIST; 1267 int err = -EEXIST;
1497 int mirror = 0; 1268 int mirror = 0;
1498 struct raid1_info *p; 1269 mirror_info_t *p;
1499 int first = 0; 1270 int first = 0;
1500 int last = conf->raid_disks - 1; 1271 int last = mddev->raid_disks - 1;
1501 struct request_queue *q = bdev_get_queue(rdev->bdev);
1502 1272
1503 if (mddev->recovery_disabled == conf->recovery_disabled) 1273 if (mddev->recovery_disabled == conf->recovery_disabled)
1504 return -EBUSY; 1274 return -EBUSY;
@@ -1506,17 +1276,22 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1506 if (rdev->raid_disk >= 0) 1276 if (rdev->raid_disk >= 0)
1507 first = last = rdev->raid_disk; 1277 first = last = rdev->raid_disk;
1508 1278
1509 if (q->merge_bvec_fn) { 1279 for (mirror = first; mirror <= last; mirror++)
1510 set_bit(Unmerged, &rdev->flags); 1280 if ( !(p=conf->mirrors+mirror)->rdev) {
1511 mddev->merge_check_needed = 1;
1512 }
1513
1514 for (mirror = first; mirror <= last; mirror++) {
1515 p = conf->mirrors+mirror;
1516 if (!p->rdev) {
1517 1281
1518 disk_stack_limits(mddev->gendisk, rdev->bdev, 1282 disk_stack_limits(mddev->gendisk, rdev->bdev,
1519 rdev->data_offset << 9); 1283 rdev->data_offset << 9);
1284 /* as we don't honour merge_bvec_fn, we must
1285 * never risk violating it, so limit
1286 * ->max_segments to one lying with a single
1287 * page, as a one page request is never in
1288 * violation.
1289 */
1290 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1291 blk_queue_max_segments(mddev->queue, 1);
1292 blk_queue_segment_boundary(mddev->queue,
1293 PAGE_CACHE_SIZE - 1);
1294 }
1520 1295
1521 p->head_position = 0; 1296 p->head_position = 0;
1522 rdev->raid_disk = mirror; 1297 rdev->raid_disk = mirror;
@@ -1529,50 +1304,21 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1529 rcu_assign_pointer(p->rdev, rdev); 1304 rcu_assign_pointer(p->rdev, rdev);
1530 break; 1305 break;
1531 } 1306 }
1532 if (test_bit(WantReplacement, &p->rdev->flags) &&
1533 p[conf->raid_disks].rdev == NULL) {
1534 /* Add this device as a replacement */
1535 clear_bit(In_sync, &rdev->flags);
1536 set_bit(Replacement, &rdev->flags);
1537 rdev->raid_disk = mirror;
1538 err = 0;
1539 conf->fullsync = 1;
1540 rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
1541 break;
1542 }
1543 }
1544 if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
1545 /* Some requests might not have seen this new
1546 * merge_bvec_fn. We must wait for them to complete
1547 * before merging the device fully.
1548 * First we make sure any code which has tested
1549 * our function has submitted the request, then
1550 * we wait for all outstanding requests to complete.
1551 */
1552 synchronize_sched();
1553 raise_barrier(conf);
1554 lower_barrier(conf);
1555 clear_bit(Unmerged, &rdev->flags);
1556 }
1557 md_integrity_add_rdev(rdev, mddev); 1307 md_integrity_add_rdev(rdev, mddev);
1558 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
1559 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
1560 print_conf(conf); 1308 print_conf(conf);
1561 return err; 1309 return err;
1562} 1310}
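
Stripped of the queue-limit and integrity details, raid1_add_disk() is a slot search: walk mirrors[first..last] (a single slot when a device is re-added at its previous raid_disk) and install the new device in the first empty position. A tiny sketch with invented types:

    struct slot { void *rdev; };   /* NULL means the mirror position is free */

    /* Returns the index where 'rdev' was installed, or -1 if no slot in
     * [first, last] is free. */
    static int add_disk_slot(struct slot *mirrors, int first, int last, void *rdev)
    {
        for (int mirror = first; mirror <= last; mirror++) {
            if (!mirrors[mirror].rdev) {
                mirrors[mirror].rdev = rdev;
                return mirror;
            }
        }
        return -1;
    }
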
1563 1311
1564static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 1312static int raid1_remove_disk(mddev_t *mddev, int number)
1565{ 1313{
1566 struct r1conf *conf = mddev->private; 1314 conf_t *conf = mddev->private;
1567 int err = 0; 1315 int err = 0;
1568 int number = rdev->raid_disk; 1316 mdk_rdev_t *rdev;
1569 struct raid1_info *p = conf->mirrors + number; 1317 mirror_info_t *p = conf->mirrors+ number;
1570
1571 if (rdev != p->rdev)
1572 p = conf->mirrors + conf->raid_disks + number;
1573 1318
1574 print_conf(conf); 1319 print_conf(conf);
1575 if (rdev == p->rdev) { 1320 rdev = p->rdev;
1321 if (rdev) {
1576 if (test_bit(In_sync, &rdev->flags) || 1322 if (test_bit(In_sync, &rdev->flags) ||
1577 atomic_read(&rdev->nr_pending)) { 1323 atomic_read(&rdev->nr_pending)) {
1578 err = -EBUSY; 1324 err = -EBUSY;
@@ -1594,21 +1340,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1594 err = -EBUSY; 1340 err = -EBUSY;
1595 p->rdev = rdev; 1341 p->rdev = rdev;
1596 goto abort; 1342 goto abort;
1597 } else if (conf->mirrors[conf->raid_disks + number].rdev) { 1343 }
1598 /* We just removed a device that is being replaced.
1599 * Move down the replacement. We drain all IO before
1600 * doing this to avoid confusion.
1601 */
1602 struct md_rdev *repl =
1603 conf->mirrors[conf->raid_disks + number].rdev;
1604 raise_barrier(conf);
1605 clear_bit(Replacement, &repl->flags);
1606 p->rdev = repl;
1607 conf->mirrors[conf->raid_disks + number].rdev = NULL;
1608 lower_barrier(conf);
1609 clear_bit(WantReplacement, &rdev->flags);
1610 } else
1611 clear_bit(WantReplacement, &rdev->flags);
1612 err = md_integrity_register(mddev); 1344 err = md_integrity_register(mddev);
1613 } 1345 }
1614abort: 1346abort:
@@ -1620,10 +1352,14 @@ abort:
1620 1352
1621static void end_sync_read(struct bio *bio, int error) 1353static void end_sync_read(struct bio *bio, int error)
1622{ 1354{
1623 struct r1bio *r1_bio = bio->bi_private; 1355 r1bio_t *r1_bio = bio->bi_private;
1624 1356 int i;
1625 update_head_pos(r1_bio->read_disk, r1_bio);
1626 1357
1358 for (i=r1_bio->mddev->raid_disks; i--; )
1359 if (r1_bio->bios[i] == bio)
1360 break;
1361 BUG_ON(i < 0);
1362 update_head_pos(i, r1_bio);
1627 /* 1363 /*
1628 * we have read a block, now it needs to be re-written, 1364 * we have read a block, now it needs to be re-written,
1629 * or re-read if the read failed. 1365 * or re-read if the read failed.
@@ -1639,15 +1375,19 @@ static void end_sync_read(struct bio *bio, int error)
1639static void end_sync_write(struct bio *bio, int error) 1375static void end_sync_write(struct bio *bio, int error)
1640{ 1376{
1641 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1377 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1642 struct r1bio *r1_bio = bio->bi_private; 1378 r1bio_t *r1_bio = bio->bi_private;
1643 struct mddev *mddev = r1_bio->mddev; 1379 mddev_t *mddev = r1_bio->mddev;
1644 struct r1conf *conf = mddev->private; 1380 conf_t *conf = mddev->private;
1381 int i;
1645 int mirror=0; 1382 int mirror=0;
1646 sector_t first_bad; 1383 sector_t first_bad;
1647 int bad_sectors; 1384 int bad_sectors;
1648 1385
1649 mirror = find_bio_disk(r1_bio, bio); 1386 for (i = 0; i < conf->raid_disks; i++)
1650 1387 if (r1_bio->bios[i] == bio) {
1388 mirror = i;
1389 break;
1390 }
1651 if (!uptodate) { 1391 if (!uptodate) {
1652 sector_t sync_blocks = 0; 1392 sector_t sync_blocks = 0;
1653 sector_t s = r1_bio->sector; 1393 sector_t s = r1_bio->sector;
@@ -1661,10 +1401,6 @@ static void end_sync_write(struct bio *bio, int error)
1661 } while (sectors_to_go > 0); 1401 } while (sectors_to_go > 0);
1662 set_bit(WriteErrorSeen, 1402 set_bit(WriteErrorSeen,
1663 &conf->mirrors[mirror].rdev->flags); 1403 &conf->mirrors[mirror].rdev->flags);
1664 if (!test_and_set_bit(WantReplacement,
1665 &conf->mirrors[mirror].rdev->flags))
1666 set_bit(MD_RECOVERY_NEEDED, &
1667 mddev->recovery);
1668 set_bit(R1BIO_WriteError, &r1_bio->state); 1404 set_bit(R1BIO_WriteError, &r1_bio->state);
1669 } else if (is_badblock(conf->mirrors[mirror].rdev, 1405 } else if (is_badblock(conf->mirrors[mirror].rdev,
1670 r1_bio->sector, 1406 r1_bio->sector,
@@ -1677,6 +1413,8 @@ static void end_sync_write(struct bio *bio, int error)
1677 ) 1413 )
1678 set_bit(R1BIO_MadeGood, &r1_bio->state); 1414 set_bit(R1BIO_MadeGood, &r1_bio->state);
1679 1415
1416 update_head_pos(mirror, r1_bio);
1417
1680 if (atomic_dec_and_test(&r1_bio->remaining)) { 1418 if (atomic_dec_and_test(&r1_bio->remaining)) {
1681 int s = r1_bio->sectors; 1419 int s = r1_bio->sectors;
1682 if (test_bit(R1BIO_MadeGood, &r1_bio->state) || 1420 if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
@@ -1689,26 +1427,21 @@ static void end_sync_write(struct bio *bio, int error)
1689 } 1427 }
1690} 1428}
1691 1429
1692static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector, 1430static int r1_sync_page_io(mdk_rdev_t *rdev, sector_t sector,
1693 int sectors, struct page *page, int rw) 1431 int sectors, struct page *page, int rw)
1694{ 1432{
1695 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) 1433 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
1696 /* success */ 1434 /* success */
1697 return 1; 1435 return 1;
1698 if (rw == WRITE) { 1436 if (rw == WRITE)
1699 set_bit(WriteErrorSeen, &rdev->flags); 1437 set_bit(WriteErrorSeen, &rdev->flags);
1700 if (!test_and_set_bit(WantReplacement,
1701 &rdev->flags))
1702 set_bit(MD_RECOVERY_NEEDED, &
1703 rdev->mddev->recovery);
1704 }
1705 /* need to record an error - either for the block or the device */ 1438 /* need to record an error - either for the block or the device */
1706 if (!rdev_set_badblocks(rdev, sector, sectors, 0)) 1439 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
1707 md_error(rdev->mddev, rdev); 1440 md_error(rdev->mddev, rdev);
1708 return 0; 1441 return 0;
1709} 1442}
1710 1443
1711static int fix_sync_read_error(struct r1bio *r1_bio) 1444static int fix_sync_read_error(r1bio_t *r1_bio)
1712{ 1445{
1713 /* Try some synchronous reads of other devices to get 1446 /* Try some synchronous reads of other devices to get
1714 * good data, much like with normal read errors. Only 1447 * good data, much like with normal read errors. Only
@@ -1721,8 +1454,8 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
1721 * made sure that anything with a bad block in range 1454 * made sure that anything with a bad block in range
1722 * will have bi_end_io clear. 1455 * will have bi_end_io clear.
1723 */ 1456 */
1724 struct mddev *mddev = r1_bio->mddev; 1457 mddev_t *mddev = r1_bio->mddev;
1725 struct r1conf *conf = mddev->private; 1458 conf_t *conf = mddev->private;
1726 struct bio *bio = r1_bio->bios[r1_bio->read_disk]; 1459 struct bio *bio = r1_bio->bios[r1_bio->read_disk];
1727 sector_t sect = r1_bio->sector; 1460 sector_t sect = r1_bio->sector;
1728 int sectors = r1_bio->sectors; 1461 int sectors = r1_bio->sectors;
@@ -1732,7 +1465,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
1732 int s = sectors; 1465 int s = sectors;
1733 int d = r1_bio->read_disk; 1466 int d = r1_bio->read_disk;
1734 int success = 0; 1467 int success = 0;
1735 struct md_rdev *rdev; 1468 mdk_rdev_t *rdev;
1736 int start; 1469 int start;
1737 1470
1738 if (s > (PAGE_SIZE>>9)) 1471 if (s > (PAGE_SIZE>>9))
@@ -1752,7 +1485,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
1752 } 1485 }
1753 } 1486 }
1754 d++; 1487 d++;
1755 if (d == conf->raid_disks * 2) 1488 if (d == conf->raid_disks)
1756 d = 0; 1489 d = 0;
1757 } while (!success && d != r1_bio->read_disk); 1490 } while (!success && d != r1_bio->read_disk);
1758 1491
@@ -1769,7 +1502,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
1769 mdname(mddev), 1502 mdname(mddev),
1770 bdevname(bio->bi_bdev, b), 1503 bdevname(bio->bi_bdev, b),
1771 (unsigned long long)r1_bio->sector); 1504 (unsigned long long)r1_bio->sector);
1772 for (d = 0; d < conf->raid_disks * 2; d++) { 1505 for (d = 0; d < conf->raid_disks; d++) {
1773 rdev = conf->mirrors[d].rdev; 1506 rdev = conf->mirrors[d].rdev;
1774 if (!rdev || test_bit(Faulty, &rdev->flags)) 1507 if (!rdev || test_bit(Faulty, &rdev->flags))
1775 continue; 1508 continue;
@@ -1777,8 +1510,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
1777 abort = 1; 1510 abort = 1;
1778 } 1511 }
1779 if (abort) { 1512 if (abort) {
1780 conf->recovery_disabled = 1513 mddev->recovery_disabled = 1;
1781 mddev->recovery_disabled;
1782 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1514 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1783 md_done_sync(mddev, r1_bio->sectors, 0); 1515 md_done_sync(mddev, r1_bio->sectors, 0);
1784 put_buf(r1_bio); 1516 put_buf(r1_bio);
@@ -1795,7 +1527,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
1795 /* write it back and re-read */ 1527 /* write it back and re-read */
1796 while (d != r1_bio->read_disk) { 1528 while (d != r1_bio->read_disk) {
1797 if (d == 0) 1529 if (d == 0)
1798 d = conf->raid_disks * 2; 1530 d = conf->raid_disks;
1799 d--; 1531 d--;
1800 if (r1_bio->bios[d]->bi_end_io != end_sync_read) 1532 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1801 continue; 1533 continue;
@@ -1810,7 +1542,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
1810 d = start; 1542 d = start;
1811 while (d != r1_bio->read_disk) { 1543 while (d != r1_bio->read_disk) {
1812 if (d == 0) 1544 if (d == 0)
1813 d = conf->raid_disks * 2; 1545 d = conf->raid_disks;
1814 d--; 1546 d--;
1815 if (r1_bio->bios[d]->bi_end_io != end_sync_read) 1547 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1816 continue; 1548 continue;
@@ -1829,7 +1561,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
1829 return 1; 1561 return 1;
1830} 1562}
1831 1563
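
The heart of fix_sync_read_error() is a wrap-around walk: try the block on successive mirrors, wrapping past the end of the array, and give up only when the walk comes back to the disk the failed read came from. A minimal sketch of just that walk; read_fn and retry_other_mirrors() are invented, and unlike the kernel code the sketch does not retry the original disk first.

    #include <stdbool.h>

    typedef bool (*read_fn)(int disk, unsigned long long sector, void *buf);

    /* Try the other mirrors in order, wrapping around, until one returns
     * the data. Returns the successful disk index, or -1 if none did. */
    static int retry_other_mirrors(read_fn try_read, int read_disk, int ndisks,
                                   unsigned long long sector, void *buf)
    {
        int d = read_disk;

        do {
            d++;
            if (d == ndisks)
                d = 0;
            if (d == read_disk)
                return -1;                /* wrapped all the way: give up */
        } while (!try_read(d, sector, buf));

        return d;
    }

On success the kernel code then writes the recovered data back over the bad copies and re-reads it, walking the same ring in the opposite direction, as the following hunks show.
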
1832static int process_checks(struct r1bio *r1_bio) 1564static int process_checks(r1bio_t *r1_bio)
1833{ 1565{
1834 /* We have read all readable devices. If we haven't 1566 /* We have read all readable devices. If we haven't
1835 * got the block, then there is no hope left. 1567 * got the block, then there is no hope left.
@@ -1838,13 +1570,12 @@ static int process_checks(struct r1bio *r1_bio)
1838 * If any blocks failed to read, then we need to 1570 * If any blocks failed to read, then we need to
1839 * attempt an over-write 1571 * attempt an over-write
1840 */ 1572 */
1841 struct mddev *mddev = r1_bio->mddev; 1573 mddev_t *mddev = r1_bio->mddev;
1842 struct r1conf *conf = mddev->private; 1574 conf_t *conf = mddev->private;
1843 int primary; 1575 int primary;
1844 int i; 1576 int i;
1845 int vcnt;
1846 1577
1847 for (primary = 0; primary < conf->raid_disks * 2; primary++) 1578 for (primary = 0; primary < conf->raid_disks; primary++)
1848 if (r1_bio->bios[primary]->bi_end_io == end_sync_read && 1579 if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
1849 test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { 1580 test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
1850 r1_bio->bios[primary]->bi_end_io = NULL; 1581 r1_bio->bios[primary]->bi_end_io = NULL;
@@ -1852,9 +1583,9 @@ static int process_checks(struct r1bio *r1_bio)
1852 break; 1583 break;
1853 } 1584 }
1854 r1_bio->read_disk = primary; 1585 r1_bio->read_disk = primary;
1855 vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9); 1586 for (i = 0; i < conf->raid_disks; i++) {
1856 for (i = 0; i < conf->raid_disks * 2; i++) {
1857 int j; 1587 int j;
1588 int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
1858 struct bio *pbio = r1_bio->bios[primary]; 1589 struct bio *pbio = r1_bio->bios[primary];
1859 struct bio *sbio = r1_bio->bios[i]; 1590 struct bio *sbio = r1_bio->bios[i];
1860 int size; 1591 int size;
@@ -1869,13 +1600,13 @@ static int process_checks(struct r1bio *r1_bio)
1869 s = sbio->bi_io_vec[j].bv_page; 1600 s = sbio->bi_io_vec[j].bv_page;
1870 if (memcmp(page_address(p), 1601 if (memcmp(page_address(p),
1871 page_address(s), 1602 page_address(s),
1872 sbio->bi_io_vec[j].bv_len)) 1603 PAGE_SIZE))
1873 break; 1604 break;
1874 } 1605 }
1875 } else 1606 } else
1876 j = 0; 1607 j = 0;
1877 if (j >= 0) 1608 if (j >= 0)
1878 atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); 1609 mddev->resync_mismatches += r1_bio->sectors;
1879 if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) 1610 if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
1880 && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { 1611 && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
1881 /* No need to write to this device. */ 1612 /* No need to write to this device. */
@@ -1912,11 +1643,11 @@ static int process_checks(struct r1bio *r1_bio)
1912 return 0; 1643 return 0;
1913} 1644}
1914 1645
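
During a check or repair pass, process_checks() picks the first successful read as the primary and compares every other successful read against it page by page; any difference bumps the mismatch counter and marks that copy for over-write. The comparison itself is a chunked memcmp, sketched here with invented names and a fixed 4 KiB stand-in for PAGE_SIZE:

    #include <stddef.h>
    #include <string.h>

    #define CHUNK 4096   /* stand-in for PAGE_SIZE */

    /* Compare two equally sized buffers chunk by chunk. Returns the index
     * of the first differing chunk, or -1 if the buffers match. */
    static int first_mismatched_chunk(const void *primary, const void *secondary,
                                      size_t bytes)
    {
        size_t nchunks = bytes / CHUNK;

        for (size_t i = 0; i < nchunks; i++) {
            const char *p = (const char *)primary + i * CHUNK;
            const char *s = (const char *)secondary + i * CHUNK;
            if (memcmp(p, s, CHUNK))
                return (int)i;   /* caller rewrites this copy and counts
                                  * the whole range as mismatched        */
        }
        return -1;
    }
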
1915static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) 1646static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1916{ 1647{
1917 struct r1conf *conf = mddev->private; 1648 conf_t *conf = mddev->private;
1918 int i; 1649 int i;
1919 int disks = conf->raid_disks * 2; 1650 int disks = conf->raid_disks;
1920 struct bio *bio, *wbio; 1651 struct bio *bio, *wbio;
1921 1652
1922 bio = r1_bio->bios[r1_bio->read_disk]; 1653 bio = r1_bio->bios[r1_bio->read_disk];
@@ -1951,14 +1682,8 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
1951 1682
1952 if (atomic_dec_and_test(&r1_bio->remaining)) { 1683 if (atomic_dec_and_test(&r1_bio->remaining)) {
1953 /* if we're here, all write(s) have completed, so clean up */ 1684 /* if we're here, all write(s) have completed, so clean up */
1954 int s = r1_bio->sectors; 1685 md_done_sync(mddev, r1_bio->sectors, 1);
1955 if (test_bit(R1BIO_MadeGood, &r1_bio->state) || 1686 put_buf(r1_bio);
1956 test_bit(R1BIO_WriteError, &r1_bio->state))
1957 reschedule_retry(r1_bio);
1958 else {
1959 put_buf(r1_bio);
1960 md_done_sync(mddev, s, 1);
1961 }
1962 } 1687 }
1963} 1688}
1964 1689
@@ -1970,16 +1695,16 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
1970 * 3. Performs writes following reads for array synchronising. 1695 * 3. Performs writes following reads for array synchronising.
1971 */ 1696 */
1972 1697
1973static void fix_read_error(struct r1conf *conf, int read_disk, 1698static void fix_read_error(conf_t *conf, int read_disk,
1974 sector_t sect, int sectors) 1699 sector_t sect, int sectors)
1975{ 1700{
1976 struct mddev *mddev = conf->mddev; 1701 mddev_t *mddev = conf->mddev;
1977 while(sectors) { 1702 while(sectors) {
1978 int s = sectors; 1703 int s = sectors;
1979 int d = read_disk; 1704 int d = read_disk;
1980 int success = 0; 1705 int success = 0;
1981 int start; 1706 int start;
1982 struct md_rdev *rdev; 1707 mdk_rdev_t *rdev;
1983 1708
1984 if (s > (PAGE_SIZE>>9)) 1709 if (s > (PAGE_SIZE>>9))
1985 s = PAGE_SIZE >> 9; 1710 s = PAGE_SIZE >> 9;
@@ -1995,9 +1720,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
1995 1720
1996 rdev = conf->mirrors[d].rdev; 1721 rdev = conf->mirrors[d].rdev;
1997 if (rdev && 1722 if (rdev &&
1998 (test_bit(In_sync, &rdev->flags) || 1723 test_bit(In_sync, &rdev->flags) &&
1999 (!test_bit(Faulty, &rdev->flags) &&
2000 rdev->recovery_offset >= sect + s)) &&
2001 is_badblock(rdev, sect, s, 1724 is_badblock(rdev, sect, s,
2002 &first_bad, &bad_sectors) == 0 && 1725 &first_bad, &bad_sectors) == 0 &&
2003 sync_page_io(rdev, sect, s<<9, 1726 sync_page_io(rdev, sect, s<<9,
@@ -2005,14 +1728,14 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
2005 success = 1; 1728 success = 1;
2006 else { 1729 else {
2007 d++; 1730 d++;
2008 if (d == conf->raid_disks * 2) 1731 if (d == conf->raid_disks)
2009 d = 0; 1732 d = 0;
2010 } 1733 }
2011 } while (!success && d != read_disk); 1734 } while (!success && d != read_disk);
2012 1735
2013 if (!success) { 1736 if (!success) {
2014 /* Cannot read from anywhere - mark it bad */ 1737 /* Cannot read from anywhere - mark it bad */
2015 struct md_rdev *rdev = conf->mirrors[read_disk].rdev; 1738 mdk_rdev_t *rdev = conf->mirrors[read_disk].rdev;
2016 if (!rdev_set_badblocks(rdev, sect, s, 0)) 1739 if (!rdev_set_badblocks(rdev, sect, s, 0))
2017 md_error(mddev, rdev); 1740 md_error(mddev, rdev);
2018 break; 1741 break;
@@ -2021,7 +1744,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
2021 start = d; 1744 start = d;
2022 while (d != read_disk) { 1745 while (d != read_disk) {
2023 if (d==0) 1746 if (d==0)
2024 d = conf->raid_disks * 2; 1747 d = conf->raid_disks;
2025 d--; 1748 d--;
2026 rdev = conf->mirrors[d].rdev; 1749 rdev = conf->mirrors[d].rdev;
2027 if (rdev && 1750 if (rdev &&
@@ -2033,7 +1756,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
2033 while (d != read_disk) { 1756 while (d != read_disk) {
2034 char b[BDEVNAME_SIZE]; 1757 char b[BDEVNAME_SIZE];
2035 if (d==0) 1758 if (d==0)
2036 d = conf->raid_disks * 2; 1759 d = conf->raid_disks;
2037 d--; 1760 d--;
2038 rdev = conf->mirrors[d].rdev; 1761 rdev = conf->mirrors[d].rdev;
2039 if (rdev && 1762 if (rdev &&
@@ -2075,11 +1798,11 @@ static int submit_bio_wait(int rw, struct bio *bio)
2075 return test_bit(BIO_UPTODATE, &bio->bi_flags); 1798 return test_bit(BIO_UPTODATE, &bio->bi_flags);
2076} 1799}
2077 1800
2078static int narrow_write_error(struct r1bio *r1_bio, int i) 1801static int narrow_write_error(r1bio_t *r1_bio, int i)
2079{ 1802{
2080 struct mddev *mddev = r1_bio->mddev; 1803 mddev_t *mddev = r1_bio->mddev;
2081 struct r1conf *conf = mddev->private; 1804 conf_t *conf = mddev->private;
2082 struct md_rdev *rdev = conf->mirrors[i].rdev; 1805 mdk_rdev_t *rdev = conf->mirrors[i].rdev;
2083 int vcnt, idx; 1806 int vcnt, idx;
2084 struct bio_vec *vec; 1807 struct bio_vec *vec;
2085 1808
@@ -2151,18 +1874,18 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
2151 return ok; 1874 return ok;
2152} 1875}
2153 1876
2154static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio) 1877static void handle_sync_write_finished(conf_t *conf, r1bio_t *r1_bio)
2155{ 1878{
2156 int m; 1879 int m;
2157 int s = r1_bio->sectors; 1880 int s = r1_bio->sectors;
2158 for (m = 0; m < conf->raid_disks * 2 ; m++) { 1881 for (m = 0; m < conf->raid_disks ; m++) {
2159 struct md_rdev *rdev = conf->mirrors[m].rdev; 1882 mdk_rdev_t *rdev = conf->mirrors[m].rdev;
2160 struct bio *bio = r1_bio->bios[m]; 1883 struct bio *bio = r1_bio->bios[m];
2161 if (bio->bi_end_io == NULL) 1884 if (bio->bi_end_io == NULL)
2162 continue; 1885 continue;
2163 if (test_bit(BIO_UPTODATE, &bio->bi_flags) && 1886 if (test_bit(BIO_UPTODATE, &bio->bi_flags) &&
2164 test_bit(R1BIO_MadeGood, &r1_bio->state)) { 1887 test_bit(R1BIO_MadeGood, &r1_bio->state)) {
2165 rdev_clear_badblocks(rdev, r1_bio->sector, s, 0); 1888 rdev_clear_badblocks(rdev, r1_bio->sector, s);
2166 } 1889 }
2167 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && 1890 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
2168 test_bit(R1BIO_WriteError, &r1_bio->state)) { 1891 test_bit(R1BIO_WriteError, &r1_bio->state)) {
@@ -2174,15 +1897,15 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
2174 md_done_sync(conf->mddev, s, 1); 1897 md_done_sync(conf->mddev, s, 1);
2175} 1898}
2176 1899
2177static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) 1900static void handle_write_finished(conf_t *conf, r1bio_t *r1_bio)
2178{ 1901{
2179 int m; 1902 int m;
2180 for (m = 0; m < conf->raid_disks * 2 ; m++) 1903 for (m = 0; m < conf->raid_disks ; m++)
2181 if (r1_bio->bios[m] == IO_MADE_GOOD) { 1904 if (r1_bio->bios[m] == IO_MADE_GOOD) {
2182 struct md_rdev *rdev = conf->mirrors[m].rdev; 1905 mdk_rdev_t *rdev = conf->mirrors[m].rdev;
2183 rdev_clear_badblocks(rdev, 1906 rdev_clear_badblocks(rdev,
2184 r1_bio->sector, 1907 r1_bio->sector,
2185 r1_bio->sectors, 0); 1908 r1_bio->sectors);
2186 rdev_dec_pending(rdev, conf->mddev); 1909 rdev_dec_pending(rdev, conf->mddev);
2187 } else if (r1_bio->bios[m] != NULL) { 1910 } else if (r1_bio->bios[m] != NULL) {
2188 /* This drive got a write error. We need to 1911 /* This drive got a write error. We need to
@@ -2203,14 +1926,14 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
2203 raid_end_bio_io(r1_bio); 1926 raid_end_bio_io(r1_bio);
2204} 1927}
2205 1928
2206static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) 1929static void handle_read_error(conf_t *conf, r1bio_t *r1_bio)
2207{ 1930{
2208 int disk; 1931 int disk;
2209 int max_sectors; 1932 int max_sectors;
2210 struct mddev *mddev = conf->mddev; 1933 mddev_t *mddev = conf->mddev;
2211 struct bio *bio; 1934 struct bio *bio;
2212 char b[BDEVNAME_SIZE]; 1935 char b[BDEVNAME_SIZE];
2213 struct md_rdev *rdev; 1936 mdk_rdev_t *rdev;
2214 1937
2215 clear_bit(R1BIO_ReadError, &r1_bio->state); 1938 clear_bit(R1BIO_ReadError, &r1_bio->state);
2216 /* we got a read error. Maybe the drive is bad. Maybe just 1939 /* we got a read error. Maybe the drive is bad. Maybe just
@@ -2228,7 +1951,6 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
2228 unfreeze_array(conf); 1951 unfreeze_array(conf);
2229 } else 1952 } else
2230 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); 1953 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
2231 rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev);
2232 1954
2233 bio = r1_bio->bios[r1_bio->read_disk]; 1955 bio = r1_bio->bios[r1_bio->read_disk];
2234 bdevname(bio->bi_bdev, b); 1956 bdevname(bio->bi_bdev, b);
@@ -2294,12 +2016,11 @@ read_more:
2294 } 2016 }
2295} 2017}
2296 2018
2297static void raid1d(struct md_thread *thread) 2019static void raid1d(mddev_t *mddev)
2298{ 2020{
2299 struct mddev *mddev = thread->mddev; 2021 r1bio_t *r1_bio;
2300 struct r1bio *r1_bio;
2301 unsigned long flags; 2022 unsigned long flags;
2302 struct r1conf *conf = mddev->private; 2023 conf_t *conf = mddev->private;
2303 struct list_head *head = &conf->retry_list; 2024 struct list_head *head = &conf->retry_list;
2304 struct blk_plug plug; 2025 struct blk_plug plug;
2305 2026
@@ -2308,14 +2029,15 @@ static void raid1d(struct md_thread *thread)
2308 blk_start_plug(&plug); 2029 blk_start_plug(&plug);
2309 for (;;) { 2030 for (;;) {
2310 2031
2311 flush_pending_writes(conf); 2032 if (atomic_read(&mddev->plug_cnt) == 0)
2033 flush_pending_writes(conf);
2312 2034
2313 spin_lock_irqsave(&conf->device_lock, flags); 2035 spin_lock_irqsave(&conf->device_lock, flags);
2314 if (list_empty(head)) { 2036 if (list_empty(head)) {
2315 spin_unlock_irqrestore(&conf->device_lock, flags); 2037 spin_unlock_irqrestore(&conf->device_lock, flags);
2316 break; 2038 break;
2317 } 2039 }
2318 r1_bio = list_entry(head->prev, struct r1bio, retry_list); 2040 r1_bio = list_entry(head->prev, r1bio_t, retry_list);
2319 list_del(head->prev); 2041 list_del(head->prev);
2320 conf->nr_queued--; 2042 conf->nr_queued--;
2321 spin_unlock_irqrestore(&conf->device_lock, flags); 2043 spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -2347,7 +2069,7 @@ static void raid1d(struct md_thread *thread)
2347} 2069}
2348 2070
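
raid1d() is a standard retry daemon: flush any pending writes, then repeatedly take the oldest r1bio off retry_list under device_lock, drop the lock, and handle it according to its state bits. A stripped-down pthread sketch of that loop, with invented types and a caller-supplied handler:

    #include <pthread.h>
    #include <stddef.h>

    struct retry_item { struct retry_item *next; int state; };

    static pthread_mutex_t device_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct retry_item *retry_list;   /* newest entries at the head */
    static int nr_queued;

    static void retry_daemon_once(void (*handle)(struct retry_item *))
    {
        for (;;) {
            struct retry_item *item, **pp;

            pthread_mutex_lock(&device_lock);
            if (!retry_list) {
                pthread_mutex_unlock(&device_lock);
                break;
            }
            /* detach the oldest entry, i.e. the tail of the list */
            for (pp = &retry_list; (*pp)->next; pp = &(*pp)->next)
                ;
            item = *pp;
            *pp = NULL;
            nr_queued--;
            pthread_mutex_unlock(&device_lock);

            handle(item);                /* runs without the lock held */
        }
    }
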
2349 2071
2350static int init_resync(struct r1conf *conf) 2072static int init_resync(conf_t *conf)
2351{ 2073{
2352 int buffs; 2074 int buffs;
2353 2075
@@ -2371,10 +2093,10 @@ static int init_resync(struct r1conf *conf)
2371 * that can be installed to exclude normal IO requests. 2093 * that can be installed to exclude normal IO requests.
2372 */ 2094 */
2373 2095
2374static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) 2096static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
2375{ 2097{
2376 struct r1conf *conf = mddev->private; 2098 conf_t *conf = mddev->private;
2377 struct r1bio *r1_bio; 2099 r1bio_t *r1_bio;
2378 struct bio *bio; 2100 struct bio *bio;
2379 sector_t max_sector, nr_sectors; 2101 sector_t max_sector, nr_sectors;
2380 int disk = -1; 2102 int disk = -1;
@@ -2453,14 +2175,15 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2453 r1_bio->state = 0; 2175 r1_bio->state = 0;
2454 set_bit(R1BIO_IsSync, &r1_bio->state); 2176 set_bit(R1BIO_IsSync, &r1_bio->state);
2455 2177
2456 for (i = 0; i < conf->raid_disks * 2; i++) { 2178 for (i=0; i < conf->raid_disks; i++) {
2457 struct md_rdev *rdev; 2179 mdk_rdev_t *rdev;
2458 bio = r1_bio->bios[i]; 2180 bio = r1_bio->bios[i];
2459 2181
2460 /* take from bio_init */ 2182 /* take from bio_init */
2461 bio->bi_next = NULL; 2183 bio->bi_next = NULL;
2462 bio->bi_flags &= ~(BIO_POOL_MASK-1); 2184 bio->bi_flags &= ~(BIO_POOL_MASK-1);
2463 bio->bi_flags |= 1 << BIO_UPTODATE; 2185 bio->bi_flags |= 1 << BIO_UPTODATE;
2186 bio->bi_comp_cpu = -1;
2464 bio->bi_rw = READ; 2187 bio->bi_rw = READ;
2465 bio->bi_vcnt = 0; 2188 bio->bi_vcnt = 0;
2466 bio->bi_idx = 0; 2189 bio->bi_idx = 0;
@@ -2472,8 +2195,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2472 rdev = rcu_dereference(conf->mirrors[i].rdev); 2195 rdev = rcu_dereference(conf->mirrors[i].rdev);
2473 if (rdev == NULL || 2196 if (rdev == NULL ||
2474 test_bit(Faulty, &rdev->flags)) { 2197 test_bit(Faulty, &rdev->flags)) {
2475 if (i < conf->raid_disks) 2198 still_degraded = 1;
2476 still_degraded = 1;
2477 } else if (!test_bit(In_sync, &rdev->flags)) { 2199 } else if (!test_bit(In_sync, &rdev->flags)) {
2478 bio->bi_rw = WRITE; 2200 bio->bi_rw = WRITE;
2479 bio->bi_end_io = end_sync_write; 2201 bio->bi_end_io = end_sync_write;
@@ -2505,18 +2227,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2505 bio->bi_rw = READ; 2227 bio->bi_rw = READ;
2506 bio->bi_end_io = end_sync_read; 2228 bio->bi_end_io = end_sync_read;
2507 read_targets++; 2229 read_targets++;
2508 } else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
2509 test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
2510 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
2511 /*
2512 * The device is suitable for reading (InSync),
2513 * but has bad block(s) here. Let's try to correct them,
2514 * if we are doing resync or repair. Otherwise, leave
2515 * this device alone for this sync request.
2516 */
2517 bio->bi_rw = WRITE;
2518 bio->bi_end_io = end_sync_write;
2519 write_targets++;
2520 } 2230 }
2521 } 2231 }
2522 if (bio->bi_end_io) { 2232 if (bio->bi_end_io) {
@@ -2536,9 +2246,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2536 * need to mark them bad on all write targets 2246 * need to mark them bad on all write targets
2537 */ 2247 */
2538 int ok = 1; 2248 int ok = 1;
2539 for (i = 0 ; i < conf->raid_disks * 2 ; i++) 2249 for (i = 0 ; i < conf->raid_disks ; i++)
2540 if (r1_bio->bios[i]->bi_end_io == end_sync_write) { 2250 if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
2541 struct md_rdev *rdev = conf->mirrors[i].rdev; 2251 mdk_rdev_t *rdev =
2252 rcu_dereference(conf->mirrors[i].rdev);
2542 ok = rdev_set_badblocks(rdev, sector_nr, 2253 ok = rdev_set_badblocks(rdev, sector_nr,
2543 min_bad, 0 2254 min_bad, 0
2544 ) && ok; 2255 ) && ok;
@@ -2574,10 +2285,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2574 /* There is nowhere to write, so all non-sync 2285 /* There is nowhere to write, so all non-sync
2575 * drives must be failed - so we are finished 2286 * drives must be failed - so we are finished
2576 */ 2287 */
2577 sector_t rv; 2288 sector_t rv = max_sector - sector_nr;
2578 if (min_bad > 0)
2579 max_sector = sector_nr + min_bad;
2580 rv = max_sector - sector_nr;
2581 *skipped = 1; 2289 *skipped = 1;
2582 put_buf(r1_bio); 2290 put_buf(r1_bio);
2583 return rv; 2291 return rv;
@@ -2607,7 +2315,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2607 len = sync_blocks<<9; 2315 len = sync_blocks<<9;
2608 } 2316 }
2609 2317
2610 for (i = 0 ; i < conf->raid_disks * 2; i++) { 2318 for (i=0 ; i < conf->raid_disks; i++) {
2611 bio = r1_bio->bios[i]; 2319 bio = r1_bio->bios[i];
2612 if (bio->bi_end_io) { 2320 if (bio->bi_end_io) {
2613 page = bio->bi_io_vec[bio->bi_vcnt].bv_page; 2321 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
@@ -2640,10 +2348,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2640 */ 2348 */
2641 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 2349 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
2642 atomic_set(&r1_bio->remaining, read_targets); 2350 atomic_set(&r1_bio->remaining, read_targets);
2643 for (i = 0; i < conf->raid_disks * 2 && read_targets; i++) { 2351 for (i=0; i<conf->raid_disks; i++) {
2644 bio = r1_bio->bios[i]; 2352 bio = r1_bio->bios[i];
2645 if (bio->bi_end_io == end_sync_read) { 2353 if (bio->bi_end_io == end_sync_read) {
2646 read_targets--;
2647 md_sync_acct(bio->bi_bdev, nr_sectors); 2354 md_sync_acct(bio->bi_bdev, nr_sectors);
2648 generic_make_request(bio); 2355 generic_make_request(bio);
2649 } 2356 }
@@ -2658,7 +2365,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2658 return nr_sectors; 2365 return nr_sectors;
2659} 2366}
2660 2367
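
For each resync window, sync_request() classifies every mirror before building the bios: a missing or faulty device only leaves still_degraded set, a device that is present but not In_sync becomes a write target, and an In_sync device becomes a read target, with the first such device normally supplying the data. That classification, reduced to a sketch with an invented enum:

    enum mirror_cond { M_ABSENT, M_FAULTY, M_OUT_OF_SYNC, M_IN_SYNC };

    struct sync_plan {
        int read_targets;
        int write_targets;
        int still_degraded;
        int read_source;    /* first in-sync mirror, -1 if none */
    };

    static struct sync_plan plan_sync(const enum mirror_cond *m, int n)
    {
        struct sync_plan p = { 0, 0, 0, -1 };

        for (int i = 0; i < n; i++) {
            switch (m[i]) {
            case M_ABSENT:
            case M_FAULTY:
                p.still_degraded = 1;        /* cannot make the array whole */
                break;
            case M_OUT_OF_SYNC:
                p.write_targets++;           /* gets the data written to it */
                break;
            case M_IN_SYNC:
                if (p.read_source < 0)
                    p.read_source = i;       /* read the data from here */
                p.read_targets++;
                break;
            }
        }
        return p;
    }
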
2661static sector_t raid1_size(struct mddev *mddev, sector_t sectors, int raid_disks) 2368static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks)
2662{ 2369{
2663 if (sectors) 2370 if (sectors)
2664 return sectors; 2371 return sectors;
@@ -2666,20 +2373,19 @@ static sector_t raid1_size(struct mddev *mddev, sector_t sectors, int raid_disks
2666 return mddev->dev_sectors; 2373 return mddev->dev_sectors;
2667} 2374}
2668 2375
2669static struct r1conf *setup_conf(struct mddev *mddev) 2376static conf_t *setup_conf(mddev_t *mddev)
2670{ 2377{
2671 struct r1conf *conf; 2378 conf_t *conf;
2672 int i; 2379 int i;
2673 struct raid1_info *disk; 2380 mirror_info_t *disk;
2674 struct md_rdev *rdev; 2381 mdk_rdev_t *rdev;
2675 int err = -ENOMEM; 2382 int err = -ENOMEM;
2676 2383
2677 conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL); 2384 conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
2678 if (!conf) 2385 if (!conf)
2679 goto abort; 2386 goto abort;
2680 2387
2681 conf->mirrors = kzalloc(sizeof(struct raid1_info) 2388 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
2682 * mddev->raid_disks * 2,
2683 GFP_KERNEL); 2389 GFP_KERNEL);
2684 if (!conf->mirrors) 2390 if (!conf->mirrors)
2685 goto abort; 2391 goto abort;
@@ -2691,7 +2397,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2691 conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL); 2397 conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
2692 if (!conf->poolinfo) 2398 if (!conf->poolinfo)
2693 goto abort; 2399 goto abort;
2694 conf->poolinfo->raid_disks = mddev->raid_disks * 2; 2400 conf->poolinfo->raid_disks = mddev->raid_disks;
2695 conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, 2401 conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
2696 r1bio_pool_free, 2402 r1bio_pool_free,
2697 conf->poolinfo); 2403 conf->poolinfo);
@@ -2700,28 +2406,17 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2700 2406
2701 conf->poolinfo->mddev = mddev; 2407 conf->poolinfo->mddev = mddev;
2702 2408
2703 err = -EINVAL;
2704 spin_lock_init(&conf->device_lock); 2409 spin_lock_init(&conf->device_lock);
2705 rdev_for_each(rdev, mddev) { 2410 list_for_each_entry(rdev, &mddev->disks, same_set) {
2706 struct request_queue *q;
2707 int disk_idx = rdev->raid_disk; 2411 int disk_idx = rdev->raid_disk;
2708 if (disk_idx >= mddev->raid_disks 2412 if (disk_idx >= mddev->raid_disks
2709 || disk_idx < 0) 2413 || disk_idx < 0)
2710 continue; 2414 continue;
2711 if (test_bit(Replacement, &rdev->flags)) 2415 disk = conf->mirrors + disk_idx;
2712 disk = conf->mirrors + mddev->raid_disks + disk_idx;
2713 else
2714 disk = conf->mirrors + disk_idx;
2715 2416
2716 if (disk->rdev)
2717 goto abort;
2718 disk->rdev = rdev; 2417 disk->rdev = rdev;
2719 q = bdev_get_queue(rdev->bdev);
2720 if (q->merge_bvec_fn)
2721 mddev->merge_check_needed = 1;
2722 2418
2723 disk->head_position = 0; 2419 disk->head_position = 0;
2724 disk->seq_start = MaxSector;
2725 } 2420 }
2726 conf->raid_disks = mddev->raid_disks; 2421 conf->raid_disks = mddev->raid_disks;
2727 conf->mddev = mddev; 2422 conf->mddev = mddev;
@@ -2731,40 +2426,33 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2731 init_waitqueue_head(&conf->wait_barrier); 2426 init_waitqueue_head(&conf->wait_barrier);
2732 2427
2733 bio_list_init(&conf->pending_bio_list); 2428 bio_list_init(&conf->pending_bio_list);
2734 conf->pending_count = 0;
2735 conf->recovery_disabled = mddev->recovery_disabled - 1;
2736 2429
2737 err = -EIO; 2430 conf->last_used = -1;
2738 for (i = 0; i < conf->raid_disks * 2; i++) { 2431 for (i = 0; i < conf->raid_disks; i++) {
2739 2432
2740 disk = conf->mirrors + i; 2433 disk = conf->mirrors + i;
2741 2434
2742 if (i < conf->raid_disks &&
2743 disk[conf->raid_disks].rdev) {
2744 /* This slot has a replacement. */
2745 if (!disk->rdev) {
2746 /* No original, just make the replacement
2747 * a recovering spare
2748 */
2749 disk->rdev =
2750 disk[conf->raid_disks].rdev;
2751 disk[conf->raid_disks].rdev = NULL;
2752 } else if (!test_bit(In_sync, &disk->rdev->flags))
2753 /* Original is not in_sync - bad */
2754 goto abort;
2755 }
2756
2757 if (!disk->rdev || 2435 if (!disk->rdev ||
2758 !test_bit(In_sync, &disk->rdev->flags)) { 2436 !test_bit(In_sync, &disk->rdev->flags)) {
2759 disk->head_position = 0; 2437 disk->head_position = 0;
2760 if (disk->rdev && 2438 if (disk->rdev)
2761 (disk->rdev->saved_raid_disk < 0))
2762 conf->fullsync = 1; 2439 conf->fullsync = 1;
2763 } 2440 } else if (conf->last_used < 0)
2441 /*
2442 * The first working device is used as a
2443 * starting point to read balancing.
2444 */
2445 conf->last_used = i;
2764 } 2446 }
2765 2447
2448 err = -EIO;
2449 if (conf->last_used < 0) {
2450 printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
2451 mdname(mddev));
2452 goto abort;
2453 }
2766 err = -ENOMEM; 2454 err = -ENOMEM;
2767 conf->thread = md_register_thread(raid1d, mddev, "raid1"); 2455 conf->thread = md_register_thread(raid1d, mddev, NULL);
2768 if (!conf->thread) { 2456 if (!conf->thread) {
2769 printk(KERN_ERR 2457 printk(KERN_ERR
2770 "md/raid1:%s: couldn't allocate thread\n", 2458 "md/raid1:%s: couldn't allocate thread\n",
@@ -2786,14 +2474,11 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2786 return ERR_PTR(err); 2474 return ERR_PTR(err);
2787} 2475}
2788 2476
2789static int stop(struct mddev *mddev); 2477static int run(mddev_t *mddev)
2790static int run(struct mddev *mddev)
2791{ 2478{
2792 struct r1conf *conf; 2479 conf_t *conf;
2793 int i; 2480 int i;
2794 struct md_rdev *rdev; 2481 mdk_rdev_t *rdev;
2795 int ret;
2796 bool discard_supported = false;
2797 2482
2798 if (mddev->level != 1) { 2483 if (mddev->level != 1) {
2799 printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n", 2484 printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
@@ -2818,13 +2503,20 @@ static int run(struct mddev *mddev)
2818 if (IS_ERR(conf)) 2503 if (IS_ERR(conf))
2819 return PTR_ERR(conf); 2504 return PTR_ERR(conf);
2820 2505
2821 rdev_for_each(rdev, mddev) { 2506 list_for_each_entry(rdev, &mddev->disks, same_set) {
2822 if (!mddev->gendisk) 2507 if (!mddev->gendisk)
2823 continue; 2508 continue;
2824 disk_stack_limits(mddev->gendisk, rdev->bdev, 2509 disk_stack_limits(mddev->gendisk, rdev->bdev,
2825 rdev->data_offset << 9); 2510 rdev->data_offset << 9);
2826 if (blk_queue_discard(bdev_get_queue(rdev->bdev))) 2511 /* as we don't honour merge_bvec_fn, we must never risk
2827 discard_supported = true; 2512 * violating it, so limit ->max_segments to 1 lying within
2513 * a single page, as a one page request is never in violation.
2514 */
2515 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
2516 blk_queue_max_segments(mddev->queue, 1);
2517 blk_queue_segment_boundary(mddev->queue,
2518 PAGE_CACHE_SIZE - 1);
2519 }
2828 } 2520 }
2829 2521
2830 mddev->degraded = 0; 2522 mddev->degraded = 0;
@@ -2858,25 +2550,13 @@ static int run(struct mddev *mddev)
2858 if (mddev->queue) { 2550 if (mddev->queue) {
2859 mddev->queue->backing_dev_info.congested_fn = raid1_congested; 2551 mddev->queue->backing_dev_info.congested_fn = raid1_congested;
2860 mddev->queue->backing_dev_info.congested_data = mddev; 2552 mddev->queue->backing_dev_info.congested_data = mddev;
2861 blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
2862
2863 if (discard_supported)
2864 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
2865 mddev->queue);
2866 else
2867 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
2868 mddev->queue);
2869 } 2553 }
2870 2554 return md_integrity_register(mddev);
2871 ret = md_integrity_register(mddev);
2872 if (ret)
2873 stop(mddev);
2874 return ret;
2875} 2555}
2876 2556
2877static int stop(struct mddev *mddev) 2557static int stop(mddev_t *mddev)
2878{ 2558{
2879 struct r1conf *conf = mddev->private; 2559 conf_t *conf = mddev->private;
2880 struct bitmap *bitmap = mddev->bitmap; 2560 struct bitmap *bitmap = mddev->bitmap;
2881 2561
2882 /* wait for behind writes to complete */ 2562 /* wait for behind writes to complete */
@@ -2901,7 +2581,7 @@ static int stop(struct mddev *mddev)
2901 return 0; 2581 return 0;
2902} 2582}
2903 2583
2904static int raid1_resize(struct mddev *mddev, sector_t sectors) 2584static int raid1_resize(mddev_t *mddev, sector_t sectors)
2905{ 2585{
2906 /* no resync is happening, and there is enough space 2586 /* no resync is happening, and there is enough space
2907 * on all devices, so we can resize. 2587 * on all devices, so we can resize.
@@ -2910,16 +2590,9 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors)
2910 * any io in the removed space completes, but it hardly seems 2590 * any io in the removed space completes, but it hardly seems
2911 * worth it. 2591 * worth it.
2912 */ 2592 */
2913 sector_t newsize = raid1_size(mddev, sectors, 0); 2593 md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0));
2914 if (mddev->external_size && 2594 if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
2915 mddev->array_sectors > newsize)
2916 return -EINVAL; 2595 return -EINVAL;
2917 if (mddev->bitmap) {
2918 int ret = bitmap_resize(mddev->bitmap, newsize, 0, 0);
2919 if (ret)
2920 return ret;
2921 }
2922 md_set_array_sectors(mddev, newsize);
2923 set_capacity(mddev->gendisk, mddev->array_sectors); 2596 set_capacity(mddev->gendisk, mddev->array_sectors);
2924 revalidate_disk(mddev->gendisk); 2597 revalidate_disk(mddev->gendisk);
2925 if (sectors > mddev->dev_sectors && 2598 if (sectors > mddev->dev_sectors &&
@@ -2932,7 +2605,7 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors)
2932 return 0; 2605 return 0;
2933} 2606}
2934 2607
2935static int raid1_reshape(struct mddev *mddev) 2608static int raid1_reshape(mddev_t *mddev)
2936{ 2609{
2937 /* We need to: 2610 /* We need to:
2938 * 1/ resize the r1bio_pool 2611 * 1/ resize the r1bio_pool
@@ -2947,8 +2620,8 @@ static int raid1_reshape(struct mddev *mddev)
2947 */ 2620 */
2948 mempool_t *newpool, *oldpool; 2621 mempool_t *newpool, *oldpool;
2949 struct pool_info *newpoolinfo; 2622 struct pool_info *newpoolinfo;
2950 struct raid1_info *newmirrors; 2623 mirror_info_t *newmirrors;
2951 struct r1conf *conf = mddev->private; 2624 conf_t *conf = mddev->private;
2952 int cnt, raid_disks; 2625 int cnt, raid_disks;
2953 unsigned long flags; 2626 unsigned long flags;
2954 int d, d2, err; 2627 int d, d2, err;
@@ -2982,7 +2655,7 @@ static int raid1_reshape(struct mddev *mddev)
2982 if (!newpoolinfo) 2655 if (!newpoolinfo)
2983 return -ENOMEM; 2656 return -ENOMEM;
2984 newpoolinfo->mddev = mddev; 2657 newpoolinfo->mddev = mddev;
2985 newpoolinfo->raid_disks = raid_disks * 2; 2658 newpoolinfo->raid_disks = raid_disks;
2986 2659
2987 newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, 2660 newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
2988 r1bio_pool_free, newpoolinfo); 2661 r1bio_pool_free, newpoolinfo);
@@ -2990,8 +2663,7 @@ static int raid1_reshape(struct mddev *mddev)
2990 kfree(newpoolinfo); 2663 kfree(newpoolinfo);
2991 return -ENOMEM; 2664 return -ENOMEM;
2992 } 2665 }
2993 newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2, 2666 newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
2994 GFP_KERNEL);
2995 if (!newmirrors) { 2667 if (!newmirrors) {
2996 kfree(newpoolinfo); 2668 kfree(newpoolinfo);
2997 mempool_destroy(newpool); 2669 mempool_destroy(newpool);
@@ -3005,7 +2677,7 @@ static int raid1_reshape(struct mddev *mddev)
3005 conf->r1bio_pool = newpool; 2677 conf->r1bio_pool = newpool;
3006 2678
3007 for (d = d2 = 0; d < conf->raid_disks; d++) { 2679 for (d = d2 = 0; d < conf->raid_disks; d++) {
3008 struct md_rdev *rdev = conf->mirrors[d].rdev; 2680 mdk_rdev_t *rdev = conf->mirrors[d].rdev;
3009 if (rdev && rdev->raid_disk != d2) { 2681 if (rdev && rdev->raid_disk != d2) {
3010 sysfs_unlink_rdev(mddev, rdev); 2682 sysfs_unlink_rdev(mddev, rdev);
3011 rdev->raid_disk = d2; 2683 rdev->raid_disk = d2;
@@ -3029,6 +2701,7 @@ static int raid1_reshape(struct mddev *mddev)
3029 conf->raid_disks = mddev->raid_disks = raid_disks; 2701 conf->raid_disks = mddev->raid_disks = raid_disks;
3030 mddev->delta_disks = 0; 2702 mddev->delta_disks = 0;
3031 2703
2704 conf->last_used = 0; /* just make sure it is in-range */
3032 lower_barrier(conf); 2705 lower_barrier(conf);
3033 2706
3034 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2707 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -3038,9 +2711,9 @@ static int raid1_reshape(struct mddev *mddev)
3038 return 0; 2711 return 0;
3039} 2712}
3040 2713
3041static void raid1_quiesce(struct mddev *mddev, int state) 2714static void raid1_quiesce(mddev_t *mddev, int state)
3042{ 2715{
3043 struct r1conf *conf = mddev->private; 2716 conf_t *conf = mddev->private;
3044 2717
3045 switch(state) { 2718 switch(state) {
3046 case 2: /* wake for suspend */ 2719 case 2: /* wake for suspend */
@@ -3055,13 +2728,13 @@ static void raid1_quiesce(struct mddev *mddev, int state)
3055 } 2728 }
3056} 2729}
3057 2730
3058static void *raid1_takeover(struct mddev *mddev) 2731static void *raid1_takeover(mddev_t *mddev)
3059{ 2732{
3060 /* raid1 can take over: 2733 /* raid1 can take over:
3061 * raid5 with 2 devices, any layout or chunk size 2734 * raid5 with 2 devices, any layout or chunk size
3062 */ 2735 */
3063 if (mddev->level == 5 && mddev->raid_disks == 2) { 2736 if (mddev->level == 5 && mddev->raid_disks == 2) {
3064 struct r1conf *conf; 2737 conf_t *conf;
3065 mddev->new_level = 1; 2738 mddev->new_level = 1;
3066 mddev->new_layout = 0; 2739 mddev->new_layout = 0;
3067 mddev->new_chunk_sectors = 0; 2740 mddev->new_chunk_sectors = 0;
@@ -3073,7 +2746,7 @@ static void *raid1_takeover(struct mddev *mddev)
3073 return ERR_PTR(-EINVAL); 2746 return ERR_PTR(-EINVAL);
3074} 2747}
3075 2748
3076static struct md_personality raid1_personality = 2749static struct mdk_personality raid1_personality =
3077{ 2750{
3078 .name = "raid1", 2751 .name = "raid1",
3079 .level = 1, 2752 .level = 1,
@@ -3111,5 +2784,3 @@ MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
3111MODULE_ALIAS("md-personality-3"); /* RAID1 */ 2784MODULE_ALIAS("md-personality-3"); /* RAID1 */
3112MODULE_ALIAS("md-raid1"); 2785MODULE_ALIAS("md-raid1");
3113MODULE_ALIAS("md-level-1"); 2786MODULE_ALIAS("md-level-1");
3114
3115module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
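
Note on the raid1.c hunks above: on the removed ("-") side, conf->mirrors is allocated with 2 * raid_disks entries, and setup_conf() stores a device flagged Replacement at conf->mirrors + raid_disks + disk_idx, i.e. originals sit in the first half of the array and their replacements in the second half. The following is a minimal user-space sketch of that slot indexing only; the demo_* types and the values in main() are simplified stand-ins invented for illustration, not the kernel definitions.

/* Sketch of the doubled mirrors[] layout used by the removed raid1 code:
 * originals occupy slots 0..raid_disks-1, replacements occupy
 * slots raid_disks..2*raid_disks-1.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

struct demo_rdev {
	int raid_disk;     /* array slot this device serves */
	bool replacement;  /* stands in for the kernel's Replacement flag */
};

struct demo_mirror {
	struct demo_rdev *rdev;
};

/* Map a device to its index in mirrors[], the way the removed setup_conf()
 * does: a replacement lands raid_disks entries after its original slot. */
static int mirror_index(const struct demo_rdev *rdev, int raid_disks)
{
	int idx = rdev->raid_disk;

	if (rdev->replacement)
		idx += raid_disks;
	return idx;
}

int main(void)
{
	const int raid_disks = 2;
	struct demo_mirror *mirrors =
		calloc(2 * (size_t)raid_disks, sizeof(*mirrors));
	struct demo_rdev devs[] = {
		{ .raid_disk = 0, .replacement = false },
		{ .raid_disk = 1, .replacement = false },
		{ .raid_disk = 1, .replacement = true  }, /* hot replacement for slot 1 */
	};

	for (size_t i = 0; i < sizeof(devs) / sizeof(devs[0]); i++) {
		int idx = mirror_index(&devs[i], raid_disks);

		mirrors[idx].rdev = &devs[i];
		printf("device for slot %d%s -> mirrors[%d]\n",
		       devs[i].raid_disk,
		       devs[i].replacement ? " (replacement)" : "", idx);
	}
	free(mirrors);
	return 0;
}

Running this places the replacement for slot 1 at mirrors[3], which is the layout the removed setup_conf() and raid1_reshape() code sizes the array for (hence the "* 2" in the kzalloc and pool_info hunks above).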
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 0ff3715fb7e..e0d676b4897 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -1,15 +1,11 @@
1#ifndef _RAID1_H 1#ifndef _RAID1_H
2#define _RAID1_H 2#define _RAID1_H
3 3
4struct raid1_info { 4typedef struct mirror_info mirror_info_t;
5 struct md_rdev *rdev;
6 sector_t head_position;
7 5
8 /* When choose the best device for a read (read_balance()) 6struct mirror_info {
9 * we try to keep sequential reads one the same device 7 mdk_rdev_t *rdev;
10 */ 8 sector_t head_position;
11 sector_t next_seq_sect;
12 sector_t seq_start;
13}; 9};
14 10
15/* 11/*
@@ -18,84 +14,64 @@ struct raid1_info {
18 * pool was allocated for, so they know how much to allocate and free. 14 * pool was allocated for, so they know how much to allocate and free.
19 * mddev->raid_disks cannot be used, as it can change while a pool is active 15 * mddev->raid_disks cannot be used, as it can change while a pool is active
20 * These two datums are stored in a kmalloced struct. 16 * These two datums are stored in a kmalloced struct.
21 * The 'raid_disks' here is twice the raid_disks in r1conf.
22 * This allows space for each 'real' device can have a replacement in the
23 * second half of the array.
24 */ 17 */
25 18
26struct pool_info { 19struct pool_info {
27 struct mddev *mddev; 20 mddev_t *mddev;
28 int raid_disks; 21 int raid_disks;
29}; 22};
30 23
31struct r1conf {
32 struct mddev *mddev;
33 struct raid1_info *mirrors; /* twice 'raid_disks' to
34 * allow for replacements.
35 */
36 int raid_disks;
37 24
38 /* During resync, read_balancing is only allowed on the part 25typedef struct r1bio_s r1bio_t;
39 * of the array that has been resynced. 'next_resync' tells us
40 * where that is.
41 */
42 sector_t next_resync;
43 26
27struct r1_private_data_s {
28 mddev_t *mddev;
29 mirror_info_t *mirrors;
30 int raid_disks;
31 int last_used;
32 sector_t next_seq_sect;
44 spinlock_t device_lock; 33 spinlock_t device_lock;
45 34
46 /* list of 'struct r1bio' that need to be processed by raid1d,
47 * whether to retry a read, writeout a resync or recovery
48 * block, or anything else.
49 */
50 struct list_head retry_list; 35 struct list_head retry_list;
51 36 /* queue pending writes and submit them on unplug */
52 /* queue pending writes to be submitted on unplug */
53 struct bio_list pending_bio_list; 37 struct bio_list pending_bio_list;
54 int pending_count;
55 38
56 /* for use when syncing mirrors: 39 /* for use when syncing mirrors: */
57 * We don't allow both normal IO and resync/recovery IO at 40
58 * the same time - resync/recovery can only happen when there
59 * is no other IO. So when either is active, the other has to wait.
60 * See more details description in raid1.c near raise_barrier().
61 */
62 wait_queue_head_t wait_barrier;
63 spinlock_t resync_lock; 41 spinlock_t resync_lock;
64 int nr_pending; 42 int nr_pending;
65 int nr_waiting; 43 int nr_waiting;
66 int nr_queued; 44 int nr_queued;
67 int barrier; 45 int barrier;
46 sector_t next_resync;
47 int fullsync; /* set to 1 if a full sync is needed,
48 * (fresh device added).
49 * Cleared when a sync completes.
50 */
51 int recovery_disabled; /* when the same as
52 * mddev->recovery_disabled
53 * we don't allow recovery
54 * to be attempted as we
55 * expect a read error
56 */
68 57
69 /* Set to 1 if a full sync is needed, (fresh device added). 58 wait_queue_head_t wait_barrier;
70 * Cleared when a sync completes.
71 */
72 int fullsync;
73
74 /* When the same as mddev->recovery_disabled we don't allow
75 * recovery to be attempted as we expect a read error.
76 */
77 int recovery_disabled;
78
79 59
80 /* poolinfo contains information about the content of the
81 * mempools - it changes when the array grows or shrinks
82 */
83 struct pool_info *poolinfo; 60 struct pool_info *poolinfo;
84 mempool_t *r1bio_pool;
85 mempool_t *r1buf_pool;
86 61
87 /* temporary buffer to synchronous IO when attempting to repair
88 * a read error.
89 */
90 struct page *tmppage; 62 struct page *tmppage;
91 63
64 mempool_t *r1bio_pool;
65 mempool_t *r1buf_pool;
92 66
93 /* When taking over an array from a different personality, we store 67 /* When taking over an array from a different personality, we store
94 * the new thread here until we fully activate the array. 68 * the new thread here until we fully activate the array.
95 */ 69 */
96 struct md_thread *thread; 70 struct mdk_thread_s *thread;
97}; 71};
98 72
73typedef struct r1_private_data_s conf_t;
74
99/* 75/*
100 * this is our 'private' RAID1 bio. 76 * this is our 'private' RAID1 bio.
101 * 77 *
@@ -103,7 +79,7 @@ struct r1conf {
103 * for this RAID1 operation, and about their status: 79 * for this RAID1 operation, and about their status:
104 */ 80 */
105 81
106struct r1bio { 82struct r1bio_s {
107 atomic_t remaining; /* 'have we finished' count, 83 atomic_t remaining; /* 'have we finished' count,
108 * used from IRQ handlers 84 * used from IRQ handlers
109 */ 85 */
@@ -113,7 +89,7 @@ struct r1bio {
113 sector_t sector; 89 sector_t sector;
114 int sectors; 90 int sectors;
115 unsigned long state; 91 unsigned long state;
116 struct mddev *mddev; 92 mddev_t *mddev;
117 /* 93 /*
118 * original bio going to /dev/mdx 94 * original bio going to /dev/mdx
119 */ 95 */
@@ -135,6 +111,20 @@ struct r1bio {
135 /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ 111 /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/
136}; 112};
137 113
114/* when we get a read error on a read-only array, we redirect to another
115 * device without failing the first device, or trying to over-write to
116 * correct the read error. To keep track of bad blocks on a per-bio
117 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
118 */
119#define IO_BLOCKED ((struct bio *)1)
120/* When we successfully write to a known bad-block, we need to remove the
121 * bad-block marking which must be done from process context. So we record
122 * the success by setting bios[n] to IO_MADE_GOOD
123 */
124#define IO_MADE_GOOD ((struct bio *)2)
125
126#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
127
138/* bits for r1bio.state */ 128/* bits for r1bio.state */
139#define R1BIO_Uptodate 0 129#define R1BIO_Uptodate 0
140#define R1BIO_IsSync 1 130#define R1BIO_IsSync 1
@@ -158,6 +148,6 @@ struct r1bio {
158#define R1BIO_MadeGood 7 148#define R1BIO_MadeGood 7
159#define R1BIO_WriteError 8 149#define R1BIO_WriteError 8
160 150
161extern int md_raid1_congested(struct mddev *mddev, int bits); 151extern int md_raid1_congested(mddev_t *mddev, int bits);
162 152
163#endif 153#endif
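
Note on the raid1.h hunk above: the added ("+") side defines IO_BLOCKED ((struct bio *)1), IO_MADE_GOOD ((struct bio *)2) and BIO_SPECIAL() in the header, so that an entry of the bios[] array can encode per-device status without pointing at a real bio; any cleanup path must therefore test BIO_SPECIAL() before calling bio_put(), as put_all_bios() does in the raid10.c hunks below. The sketch that follows shows the sentinel-pointer idea in user space; struct bio, bio_put() and put_all() here are simplified stand-ins, not the kernel versions.

/* Sentinel-pointer sketch: slots may hold a real bio, NULL, or one of the
 * two small-integer sentinels, and only real bios are released. */
#include <stdio.h>
#include <stdlib.h>

struct bio { int refcount; };

#define IO_BLOCKED   ((struct bio *)1)
#define IO_MADE_GOOD ((struct bio *)2)
#define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)

static void bio_put(struct bio *bio)
{
	printf("bio_put(%p): dropping reference\n", (void *)bio);
	if (--bio->refcount == 0)
		free(bio);
}

/* Release every slot, skipping sentinels; note that NULL also satisfies
 * BIO_SPECIAL() since it casts to 0. */
static void put_all(struct bio **bios, int n)
{
	for (int i = 0; i < n; i++) {
		if (!BIO_SPECIAL(bios[i]))
			bio_put(bios[i]);
		bios[i] = NULL;
	}
}

int main(void)
{
	struct bio *real = malloc(sizeof(*real));

	real->refcount = 1;
	struct bio *bios[3] = { real, IO_BLOCKED, IO_MADE_GOOD };

	put_all(bios, 3);          /* only the real bio is put */
	printf("all slots cleared\n");
	return 0;
}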
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 64d48249c03..1d44228530a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -21,10 +21,8 @@
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/delay.h> 22#include <linux/delay.h>
23#include <linux/blkdev.h> 23#include <linux/blkdev.h>
24#include <linux/module.h>
25#include <linux/seq_file.h> 24#include <linux/seq_file.h>
26#include <linux/ratelimit.h> 25#include <linux/ratelimit.h>
27#include <linux/kthread.h>
28#include "md.h" 26#include "md.h"
29#include "raid10.h" 27#include "raid10.h"
30#include "raid0.h" 28#include "raid0.h"
@@ -60,42 +58,15 @@
60 */ 58 */
61#define NR_RAID10_BIOS 256 59#define NR_RAID10_BIOS 256
62 60
63/* when we get a read error on a read-only array, we redirect to another 61static void allow_barrier(conf_t *conf);
64 * device without failing the first device, or trying to over-write to 62static void lower_barrier(conf_t *conf);
65 * correct the read error. To keep track of bad blocks on a per-bio
66 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
67 */
68#define IO_BLOCKED ((struct bio *)1)
69/* When we successfully write to a known bad-block, we need to remove the
70 * bad-block marking which must be done from process context. So we record
71 * the success by setting devs[n].bio to IO_MADE_GOOD
72 */
73#define IO_MADE_GOOD ((struct bio *)2)
74
75#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
76
77/* When there are this many requests queued to be written by
78 * the raid10 thread, we become 'congested' to provide back-pressure
79 * for writeback.
80 */
81static int max_queued_requests = 1024;
82
83static void allow_barrier(struct r10conf *conf);
84static void lower_barrier(struct r10conf *conf);
85static int enough(struct r10conf *conf, int ignore);
86static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
87 int *skipped);
88static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
89static void end_reshape_write(struct bio *bio, int error);
90static void end_reshape(struct r10conf *conf);
91 63
92static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) 64static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
93{ 65{
94 struct r10conf *conf = data; 66 conf_t *conf = data;
95 int size = offsetof(struct r10bio, devs[conf->copies]); 67 int size = offsetof(struct r10bio_s, devs[conf->copies]);
96 68
97 /* allocate a r10bio with room for raid_disks entries in the 69 /* allocate a r10bio with room for raid_disks entries in the bios array */
98 * bios array */
99 return kzalloc(size, gfp_flags); 70 return kzalloc(size, gfp_flags);
100} 71}
101 72
@@ -121,9 +92,9 @@ static void r10bio_pool_free(void *r10_bio, void *data)
121 */ 92 */
122static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) 93static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
123{ 94{
124 struct r10conf *conf = data; 95 conf_t *conf = data;
125 struct page *page; 96 struct page *page;
126 struct r10bio *r10_bio; 97 r10bio_t *r10_bio;
127 struct bio *bio; 98 struct bio *bio;
128 int i, j; 99 int i, j;
129 int nalloc; 100 int nalloc;
@@ -132,8 +103,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
132 if (!r10_bio) 103 if (!r10_bio)
133 return NULL; 104 return NULL;
134 105
135 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) || 106 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
136 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
137 nalloc = conf->copies; /* resync */ 107 nalloc = conf->copies; /* resync */
138 else 108 else
139 nalloc = 2; /* recovery */ 109 nalloc = 2; /* recovery */
@@ -146,25 +116,17 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
146 if (!bio) 116 if (!bio)
147 goto out_free_bio; 117 goto out_free_bio;
148 r10_bio->devs[j].bio = bio; 118 r10_bio->devs[j].bio = bio;
149 if (!conf->have_replacement)
150 continue;
151 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
152 if (!bio)
153 goto out_free_bio;
154 r10_bio->devs[j].repl_bio = bio;
155 } 119 }
156 /* 120 /*
157 * Allocate RESYNC_PAGES data pages and attach them 121 * Allocate RESYNC_PAGES data pages and attach them
158 * where needed. 122 * where needed.
159 */ 123 */
160 for (j = 0 ; j < nalloc; j++) { 124 for (j = 0 ; j < nalloc; j++) {
161 struct bio *rbio = r10_bio->devs[j].repl_bio;
162 bio = r10_bio->devs[j].bio; 125 bio = r10_bio->devs[j].bio;
163 for (i = 0; i < RESYNC_PAGES; i++) { 126 for (i = 0; i < RESYNC_PAGES; i++) {
164 if (j > 0 && !test_bit(MD_RECOVERY_SYNC, 127 if (j == 1 && !test_bit(MD_RECOVERY_SYNC,
165 &conf->mddev->recovery)) { 128 &conf->mddev->recovery)) {
166 /* we can share bv_page's during recovery 129 /* we can share bv_page's during recovery */
167 * and reshape */
168 struct bio *rbio = r10_bio->devs[0].bio; 130 struct bio *rbio = r10_bio->devs[0].bio;
169 page = rbio->bi_io_vec[i].bv_page; 131 page = rbio->bi_io_vec[i].bv_page;
170 get_page(page); 132 get_page(page);
@@ -174,8 +136,6 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
174 goto out_free_pages; 136 goto out_free_pages;
175 137
176 bio->bi_io_vec[i].bv_page = page; 138 bio->bi_io_vec[i].bv_page = page;
177 if (rbio)
178 rbio->bi_io_vec[i].bv_page = page;
179 } 139 }
180 } 140 }
181 141
@@ -187,14 +147,10 @@ out_free_pages:
187 while (j--) 147 while (j--)
188 for (i = 0; i < RESYNC_PAGES ; i++) 148 for (i = 0; i < RESYNC_PAGES ; i++)
189 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); 149 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
190 j = 0; 150 j = -1;
191out_free_bio: 151out_free_bio:
192 for ( ; j < nalloc; j++) { 152 while ( ++j < nalloc )
193 if (r10_bio->devs[j].bio) 153 bio_put(r10_bio->devs[j].bio);
194 bio_put(r10_bio->devs[j].bio);
195 if (r10_bio->devs[j].repl_bio)
196 bio_put(r10_bio->devs[j].repl_bio);
197 }
198 r10bio_pool_free(r10_bio, conf); 154 r10bio_pool_free(r10_bio, conf);
199 return NULL; 155 return NULL;
200} 156}
@@ -202,8 +158,8 @@ out_free_bio:
202static void r10buf_pool_free(void *__r10_bio, void *data) 158static void r10buf_pool_free(void *__r10_bio, void *data)
203{ 159{
204 int i; 160 int i;
205 struct r10conf *conf = data; 161 conf_t *conf = data;
206 struct r10bio *r10bio = __r10_bio; 162 r10bio_t *r10bio = __r10_bio;
207 int j; 163 int j;
208 164
209 for (j=0; j < conf->copies; j++) { 165 for (j=0; j < conf->copies; j++) {
@@ -215,14 +171,11 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
215 } 171 }
216 bio_put(bio); 172 bio_put(bio);
217 } 173 }
218 bio = r10bio->devs[j].repl_bio;
219 if (bio)
220 bio_put(bio);
221 } 174 }
222 r10bio_pool_free(r10bio, conf); 175 r10bio_pool_free(r10bio, conf);
223} 176}
224 177
225static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) 178static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
226{ 179{
227 int i; 180 int i;
228 181
@@ -231,35 +184,31 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
231 if (!BIO_SPECIAL(*bio)) 184 if (!BIO_SPECIAL(*bio))
232 bio_put(*bio); 185 bio_put(*bio);
233 *bio = NULL; 186 *bio = NULL;
234 bio = &r10_bio->devs[i].repl_bio;
235 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
236 bio_put(*bio);
237 *bio = NULL;
238 } 187 }
239} 188}
240 189
241static void free_r10bio(struct r10bio *r10_bio) 190static void free_r10bio(r10bio_t *r10_bio)
242{ 191{
243 struct r10conf *conf = r10_bio->mddev->private; 192 conf_t *conf = r10_bio->mddev->private;
244 193
245 put_all_bios(conf, r10_bio); 194 put_all_bios(conf, r10_bio);
246 mempool_free(r10_bio, conf->r10bio_pool); 195 mempool_free(r10_bio, conf->r10bio_pool);
247} 196}
248 197
249static void put_buf(struct r10bio *r10_bio) 198static void put_buf(r10bio_t *r10_bio)
250{ 199{
251 struct r10conf *conf = r10_bio->mddev->private; 200 conf_t *conf = r10_bio->mddev->private;
252 201
253 mempool_free(r10_bio, conf->r10buf_pool); 202 mempool_free(r10_bio, conf->r10buf_pool);
254 203
255 lower_barrier(conf); 204 lower_barrier(conf);
256} 205}
257 206
258static void reschedule_retry(struct r10bio *r10_bio) 207static void reschedule_retry(r10bio_t *r10_bio)
259{ 208{
260 unsigned long flags; 209 unsigned long flags;
261 struct mddev *mddev = r10_bio->mddev; 210 mddev_t *mddev = r10_bio->mddev;
262 struct r10conf *conf = mddev->private; 211 conf_t *conf = mddev->private;
263 212
264 spin_lock_irqsave(&conf->device_lock, flags); 213 spin_lock_irqsave(&conf->device_lock, flags);
265 list_add(&r10_bio->retry_list, &conf->retry_list); 214 list_add(&r10_bio->retry_list, &conf->retry_list);
@@ -277,11 +226,11 @@ static void reschedule_retry(struct r10bio *r10_bio)
277 * operation and are ready to return a success/failure code to the buffer 226 * operation and are ready to return a success/failure code to the buffer
278 * cache layer. 227 * cache layer.
279 */ 228 */
280static void raid_end_bio_io(struct r10bio *r10_bio) 229static void raid_end_bio_io(r10bio_t *r10_bio)
281{ 230{
282 struct bio *bio = r10_bio->master_bio; 231 struct bio *bio = r10_bio->master_bio;
283 int done; 232 int done;
284 struct r10conf *conf = r10_bio->mddev->private; 233 conf_t *conf = r10_bio->mddev->private;
285 234
286 if (bio->bi_phys_segments) { 235 if (bio->bi_phys_segments) {
287 unsigned long flags; 236 unsigned long flags;
@@ -307,9 +256,9 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
307/* 256/*
308 * Update disk head position estimator based on IRQ completion info. 257 * Update disk head position estimator based on IRQ completion info.
309 */ 258 */
310static inline void update_head_pos(int slot, struct r10bio *r10_bio) 259static inline void update_head_pos(int slot, r10bio_t *r10_bio)
311{ 260{
312 struct r10conf *conf = r10_bio->mddev->private; 261 conf_t *conf = r10_bio->mddev->private;
313 262
314 conf->mirrors[r10_bio->devs[slot].devnum].head_position = 263 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
315 r10_bio->devs[slot].addr + (r10_bio->sectors); 264 r10_bio->devs[slot].addr + (r10_bio->sectors);
@@ -318,43 +267,33 @@ static inline void update_head_pos(int slot, struct r10bio *r10_bio)
318/* 267/*
319 * Find the disk number which triggered given bio 268 * Find the disk number which triggered given bio
320 */ 269 */
321static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, 270static int find_bio_disk(conf_t *conf, r10bio_t *r10_bio,
322 struct bio *bio, int *slotp, int *replp) 271 struct bio *bio, int *slotp)
323{ 272{
324 int slot; 273 int slot;
325 int repl = 0;
326 274
327 for (slot = 0; slot < conf->copies; slot++) { 275 for (slot = 0; slot < conf->copies; slot++)
328 if (r10_bio->devs[slot].bio == bio) 276 if (r10_bio->devs[slot].bio == bio)
329 break; 277 break;
330 if (r10_bio->devs[slot].repl_bio == bio) {
331 repl = 1;
332 break;
333 }
334 }
335 278
336 BUG_ON(slot == conf->copies); 279 BUG_ON(slot == conf->copies);
337 update_head_pos(slot, r10_bio); 280 update_head_pos(slot, r10_bio);
338 281
339 if (slotp) 282 if (slotp)
340 *slotp = slot; 283 *slotp = slot;
341 if (replp)
342 *replp = repl;
343 return r10_bio->devs[slot].devnum; 284 return r10_bio->devs[slot].devnum;
344} 285}
345 286
346static void raid10_end_read_request(struct bio *bio, int error) 287static void raid10_end_read_request(struct bio *bio, int error)
347{ 288{
348 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 289 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
349 struct r10bio *r10_bio = bio->bi_private; 290 r10bio_t *r10_bio = bio->bi_private;
350 int slot, dev; 291 int slot, dev;
351 struct md_rdev *rdev; 292 conf_t *conf = r10_bio->mddev->private;
352 struct r10conf *conf = r10_bio->mddev->private;
353 293
354 294
355 slot = r10_bio->read_slot; 295 slot = r10_bio->read_slot;
356 dev = r10_bio->devs[slot].devnum; 296 dev = r10_bio->devs[slot].devnum;
357 rdev = r10_bio->devs[slot].rdev;
358 /* 297 /*
359 * this branch is our 'one mirror IO has finished' event handler: 298 * this branch is our 'one mirror IO has finished' event handler:
360 */ 299 */
@@ -371,21 +310,8 @@ static void raid10_end_read_request(struct bio *bio, int error)
371 * wait for the 'master' bio. 310 * wait for the 'master' bio.
372 */ 311 */
373 set_bit(R10BIO_Uptodate, &r10_bio->state); 312 set_bit(R10BIO_Uptodate, &r10_bio->state);
374 } else {
375 /* If all other devices that store this block have
376 * failed, we want to return the error upwards rather
377 * than fail the last device. Here we redefine
378 * "uptodate" to mean "Don't want to retry"
379 */
380 unsigned long flags;
381 spin_lock_irqsave(&conf->device_lock, flags);
382 if (!enough(conf, rdev->raid_disk))
383 uptodate = 1;
384 spin_unlock_irqrestore(&conf->device_lock, flags);
385 }
386 if (uptodate) {
387 raid_end_bio_io(r10_bio); 313 raid_end_bio_io(r10_bio);
388 rdev_dec_pending(rdev, conf->mddev); 314 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
389 } else { 315 } else {
390 /* 316 /*
391 * oops, read error - keep the refcount on the rdev 317 * oops, read error - keep the refcount on the rdev
@@ -394,14 +320,14 @@ static void raid10_end_read_request(struct bio *bio, int error)
394 printk_ratelimited(KERN_ERR 320 printk_ratelimited(KERN_ERR
395 "md/raid10:%s: %s: rescheduling sector %llu\n", 321 "md/raid10:%s: %s: rescheduling sector %llu\n",
396 mdname(conf->mddev), 322 mdname(conf->mddev),
397 bdevname(rdev->bdev, b), 323 bdevname(conf->mirrors[dev].rdev->bdev, b),
398 (unsigned long long)r10_bio->sector); 324 (unsigned long long)r10_bio->sector);
399 set_bit(R10BIO_ReadError, &r10_bio->state); 325 set_bit(R10BIO_ReadError, &r10_bio->state);
400 reschedule_retry(r10_bio); 326 reschedule_retry(r10_bio);
401 } 327 }
402} 328}
403 329
404static void close_write(struct r10bio *r10_bio) 330static void close_write(r10bio_t *r10_bio)
405{ 331{
406 /* clear the bitmap if all writes complete successfully */ 332 /* clear the bitmap if all writes complete successfully */
407 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, 333 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
@@ -411,7 +337,7 @@ static void close_write(struct r10bio *r10_bio)
411 md_write_end(r10_bio->mddev); 337 md_write_end(r10_bio->mddev);
412} 338}
413 339
414static void one_write_done(struct r10bio *r10_bio) 340static void one_write_done(r10bio_t *r10_bio)
415{ 341{
416 if (atomic_dec_and_test(&r10_bio->remaining)) { 342 if (atomic_dec_and_test(&r10_bio->remaining)) {
417 if (test_bit(R10BIO_WriteError, &r10_bio->state)) 343 if (test_bit(R10BIO_WriteError, &r10_bio->state))
@@ -429,39 +355,21 @@ static void one_write_done(struct r10bio *r10_bio)
429static void raid10_end_write_request(struct bio *bio, int error) 355static void raid10_end_write_request(struct bio *bio, int error)
430{ 356{
431 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 357 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
432 struct r10bio *r10_bio = bio->bi_private; 358 r10bio_t *r10_bio = bio->bi_private;
433 int dev; 359 int dev;
434 int dec_rdev = 1; 360 int dec_rdev = 1;
435 struct r10conf *conf = r10_bio->mddev->private; 361 conf_t *conf = r10_bio->mddev->private;
436 int slot, repl; 362 int slot;
437 struct md_rdev *rdev = NULL; 363
438 364 dev = find_bio_disk(conf, r10_bio, bio, &slot);
439 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 365
440
441 if (repl)
442 rdev = conf->mirrors[dev].replacement;
443 if (!rdev) {
444 smp_rmb();
445 repl = 0;
446 rdev = conf->mirrors[dev].rdev;
447 }
448 /* 366 /*
449 * this branch is our 'one mirror IO has finished' event handler: 367 * this branch is our 'one mirror IO has finished' event handler:
450 */ 368 */
451 if (!uptodate) { 369 if (!uptodate) {
452 if (repl) 370 set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags);
453 /* Never record new bad blocks to replacement, 371 set_bit(R10BIO_WriteError, &r10_bio->state);
454 * just fail it. 372 dec_rdev = 0;
455 */
456 md_error(rdev->mddev, rdev);
457 else {
458 set_bit(WriteErrorSeen, &rdev->flags);
459 if (!test_and_set_bit(WantReplacement, &rdev->flags))
460 set_bit(MD_RECOVERY_NEEDED,
461 &rdev->mddev->recovery);
462 set_bit(R10BIO_WriteError, &r10_bio->state);
463 dec_rdev = 0;
464 }
465 } else { 373 } else {
466 /* 374 /*
467 * Set R10BIO_Uptodate in our master bio, so that 375 * Set R10BIO_Uptodate in our master bio, so that
@@ -478,15 +386,12 @@ static void raid10_end_write_request(struct bio *bio, int error)
478 set_bit(R10BIO_Uptodate, &r10_bio->state); 386 set_bit(R10BIO_Uptodate, &r10_bio->state);
479 387
480 /* Maybe we can clear some bad blocks. */ 388 /* Maybe we can clear some bad blocks. */
481 if (is_badblock(rdev, 389 if (is_badblock(conf->mirrors[dev].rdev,
482 r10_bio->devs[slot].addr, 390 r10_bio->devs[slot].addr,
483 r10_bio->sectors, 391 r10_bio->sectors,
484 &first_bad, &bad_sectors)) { 392 &first_bad, &bad_sectors)) {
485 bio_put(bio); 393 bio_put(bio);
486 if (repl) 394 r10_bio->devs[slot].bio = IO_MADE_GOOD;
487 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
488 else
489 r10_bio->devs[slot].bio = IO_MADE_GOOD;
490 dec_rdev = 0; 395 dec_rdev = 0;
491 set_bit(R10BIO_MadeGood, &r10_bio->state); 396 set_bit(R10BIO_MadeGood, &r10_bio->state);
492 } 397 }
@@ -499,9 +404,10 @@ static void raid10_end_write_request(struct bio *bio, int error)
499 */ 404 */
500 one_write_done(r10_bio); 405 one_write_done(r10_bio);
501 if (dec_rdev) 406 if (dec_rdev)
502 rdev_dec_pending(rdev, conf->mddev); 407 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
503} 408}
504 409
410
505/* 411/*
506 * RAID10 layout manager 412 * RAID10 layout manager
507 * As well as the chunksize and raid_disks count, there are two 413 * As well as the chunksize and raid_disks count, there are two
@@ -527,96 +433,79 @@ static void raid10_end_write_request(struct bio *bio, int error)
527 * sector offset to a virtual address 433 * sector offset to a virtual address
528 */ 434 */
529 435
530static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) 436static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio)
531{ 437{
532 int n,f; 438 int n,f;
533 sector_t sector; 439 sector_t sector;
534 sector_t chunk; 440 sector_t chunk;
535 sector_t stripe; 441 sector_t stripe;
536 int dev; 442 int dev;
443
537 int slot = 0; 444 int slot = 0;
538 445
539 /* now calculate first sector/dev */ 446 /* now calculate first sector/dev */
540 chunk = r10bio->sector >> geo->chunk_shift; 447 chunk = r10bio->sector >> conf->chunk_shift;
541 sector = r10bio->sector & geo->chunk_mask; 448 sector = r10bio->sector & conf->chunk_mask;
542 449
543 chunk *= geo->near_copies; 450 chunk *= conf->near_copies;
544 stripe = chunk; 451 stripe = chunk;
545 dev = sector_div(stripe, geo->raid_disks); 452 dev = sector_div(stripe, conf->raid_disks);
546 if (geo->far_offset) 453 if (conf->far_offset)
547 stripe *= geo->far_copies; 454 stripe *= conf->far_copies;
548 455
549 sector += stripe << geo->chunk_shift; 456 sector += stripe << conf->chunk_shift;
550 457
551 /* and calculate all the others */ 458 /* and calculate all the others */
552 for (n = 0; n < geo->near_copies; n++) { 459 for (n=0; n < conf->near_copies; n++) {
553 int d = dev; 460 int d = dev;
554 sector_t s = sector; 461 sector_t s = sector;
555 r10bio->devs[slot].addr = sector; 462 r10bio->devs[slot].addr = sector;
556 r10bio->devs[slot].devnum = d; 463 r10bio->devs[slot].devnum = d;
557 slot++; 464 slot++;
558 465
559 for (f = 1; f < geo->far_copies; f++) { 466 for (f = 1; f < conf->far_copies; f++) {
560 d += geo->near_copies; 467 d += conf->near_copies;
561 if (d >= geo->raid_disks) 468 if (d >= conf->raid_disks)
562 d -= geo->raid_disks; 469 d -= conf->raid_disks;
563 s += geo->stride; 470 s += conf->stride;
564 r10bio->devs[slot].devnum = d; 471 r10bio->devs[slot].devnum = d;
565 r10bio->devs[slot].addr = s; 472 r10bio->devs[slot].addr = s;
566 slot++; 473 slot++;
567 } 474 }
568 dev++; 475 dev++;
569 if (dev >= geo->raid_disks) { 476 if (dev >= conf->raid_disks) {
570 dev = 0; 477 dev = 0;
571 sector += (geo->chunk_mask + 1); 478 sector += (conf->chunk_mask + 1);
572 } 479 }
573 } 480 }
481 BUG_ON(slot != conf->copies);
574} 482}
575 483
576static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) 484static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev)
577{
578 struct geom *geo = &conf->geo;
579
580 if (conf->reshape_progress != MaxSector &&
581 ((r10bio->sector >= conf->reshape_progress) !=
582 conf->mddev->reshape_backwards)) {
583 set_bit(R10BIO_Previous, &r10bio->state);
584 geo = &conf->prev;
585 } else
586 clear_bit(R10BIO_Previous, &r10bio->state);
587
588 __raid10_find_phys(geo, r10bio);
589}
590
591static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
592{ 485{
593 sector_t offset, chunk, vchunk; 486 sector_t offset, chunk, vchunk;
594 /* Never use conf->prev as this is only called during resync
595 * or recovery, so reshape isn't happening
596 */
597 struct geom *geo = &conf->geo;
598 487
599 offset = sector & geo->chunk_mask; 488 offset = sector & conf->chunk_mask;
600 if (geo->far_offset) { 489 if (conf->far_offset) {
601 int fc; 490 int fc;
602 chunk = sector >> geo->chunk_shift; 491 chunk = sector >> conf->chunk_shift;
603 fc = sector_div(chunk, geo->far_copies); 492 fc = sector_div(chunk, conf->far_copies);
604 dev -= fc * geo->near_copies; 493 dev -= fc * conf->near_copies;
605 if (dev < 0) 494 if (dev < 0)
606 dev += geo->raid_disks; 495 dev += conf->raid_disks;
607 } else { 496 } else {
608 while (sector >= geo->stride) { 497 while (sector >= conf->stride) {
609 sector -= geo->stride; 498 sector -= conf->stride;
610 if (dev < geo->near_copies) 499 if (dev < conf->near_copies)
611 dev += geo->raid_disks - geo->near_copies; 500 dev += conf->raid_disks - conf->near_copies;
612 else 501 else
613 dev -= geo->near_copies; 502 dev -= conf->near_copies;
614 } 503 }
615 chunk = sector >> geo->chunk_shift; 504 chunk = sector >> conf->chunk_shift;
616 } 505 }
617 vchunk = chunk * geo->raid_disks + dev; 506 vchunk = chunk * conf->raid_disks + dev;
618 sector_div(vchunk, geo->near_copies); 507 sector_div(vchunk, conf->near_copies);
619 return (vchunk << geo->chunk_shift) + offset; 508 return (vchunk << conf->chunk_shift) + offset;
620} 509}
621 510
622/** 511/**
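
Note on the hunk ending here: it replaces the geo->near_copies / geo->far_copies form of raid10_find_phys() and raid10_find_virt() with the conf-> form; the address arithmetic itself is unchanged. Below is a hedged user-space sketch of that arithmetic which prints where each copy of a virtual sector lands; sector_div() is a simplified stand-in for the kernel helper (divide in place, return the remainder) and the geometry in main() is invented for illustration.

/* Sketch of the raid10 virtual-to-physical mapping in the hunk above. */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

struct demo_geo {
	int raid_disks;
	int near_copies;
	int far_copies;
	int far_offset;
	int chunk_shift;        /* chunk size is 1 << chunk_shift sectors */
	sector_t chunk_mask;    /* (1 << chunk_shift) - 1 */
	sector_t stride;        /* sector gap between far copies on a device */
};

static sector_t sector_div(sector_t *n, unsigned int base)
{
	sector_t rem = *n % base;

	*n /= base;
	return rem;
}

/* Print the (device, device-sector) pair for every copy of one array sector. */
static void find_phys(const struct demo_geo *g, sector_t virt)
{
	sector_t chunk = virt >> g->chunk_shift;
	sector_t sector = virt & g->chunk_mask;
	sector_t stripe;
	int dev, n, f;

	chunk *= g->near_copies;
	stripe = chunk;
	dev = (int)sector_div(&stripe, g->raid_disks);
	if (g->far_offset)
		stripe *= g->far_copies;
	sector += stripe << g->chunk_shift;

	for (n = 0; n < g->near_copies; n++) {
		int d = dev;
		sector_t s = sector;

		printf("virt %llu -> dev %d sector %llu\n",
		       (unsigned long long)virt, d, (unsigned long long)sector);
		for (f = 1; f < g->far_copies; f++) {
			d += g->near_copies;
			if (d >= g->raid_disks)
				d -= g->raid_disks;
			s += g->stride;
			printf("virt %llu -> dev %d sector %llu (far copy)\n",
			       (unsigned long long)virt, d, (unsigned long long)s);
		}
		dev++;
		if (dev >= g->raid_disks) {
			dev = 0;
			sector += g->chunk_mask + 1;
		}
	}
}

int main(void)
{
	/* Illustrative only: 4 devices, 2 near copies, no far copies,
	 * 64 KiB chunks (128 sectors). */
	struct demo_geo g = {
		.raid_disks = 4, .near_copies = 2, .far_copies = 1,
		.far_offset = 0, .chunk_shift = 7, .chunk_mask = 127,
		.stride = 0,
	};

	find_phys(&g, 0);    /* devs 0 and 1, sector 0 */
	find_phys(&g, 128);  /* devs 2 and 3, sector 0 */
	find_phys(&g, 300);  /* devs 0 and 1, sector 172 */
	return 0;
}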
@@ -626,85 +515,25 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
626 * @biovec: the request that could be merged to it. 515 * @biovec: the request that could be merged to it.
627 * 516 *
628 * Return amount of bytes we can accept at this offset 517 * Return amount of bytes we can accept at this offset
629 * This requires checking for end-of-chunk if near_copies != raid_disks, 518 * If near_copies == raid_disk, there are no striping issues,
630 * and for subordinate merge_bvec_fns if merge_check_needed. 519 * but in that case, the function isn't called at all.
631 */ 520 */
632static int raid10_mergeable_bvec(struct request_queue *q, 521static int raid10_mergeable_bvec(struct request_queue *q,
633 struct bvec_merge_data *bvm, 522 struct bvec_merge_data *bvm,
634 struct bio_vec *biovec) 523 struct bio_vec *biovec)
635{ 524{
636 struct mddev *mddev = q->queuedata; 525 mddev_t *mddev = q->queuedata;
637 struct r10conf *conf = mddev->private;
638 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 526 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
639 int max; 527 int max;
640 unsigned int chunk_sectors; 528 unsigned int chunk_sectors = mddev->chunk_sectors;
641 unsigned int bio_sectors = bvm->bi_size >> 9; 529 unsigned int bio_sectors = bvm->bi_size >> 9;
642 struct geom *geo = &conf->geo; 530
643 531 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
644 chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1; 532 if (max < 0) max = 0; /* bio_add cannot handle a negative return */
645 if (conf->reshape_progress != MaxSector && 533 if (max <= biovec->bv_len && bio_sectors == 0)
646 ((sector >= conf->reshape_progress) != 534 return biovec->bv_len;
647 conf->mddev->reshape_backwards)) 535 else
648 geo = &conf->prev; 536 return max;
649
650 if (geo->near_copies < geo->raid_disks) {
651 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
652 + bio_sectors)) << 9;
653 if (max < 0)
654 /* bio_add cannot handle a negative return */
655 max = 0;
656 if (max <= biovec->bv_len && bio_sectors == 0)
657 return biovec->bv_len;
658 } else
659 max = biovec->bv_len;
660
661 if (mddev->merge_check_needed) {
662 struct {
663 struct r10bio r10_bio;
664 struct r10dev devs[conf->copies];
665 } on_stack;
666 struct r10bio *r10_bio = &on_stack.r10_bio;
667 int s;
668 if (conf->reshape_progress != MaxSector) {
669 /* Cannot give any guidance during reshape */
670 if (max <= biovec->bv_len && bio_sectors == 0)
671 return biovec->bv_len;
672 return 0;
673 }
674 r10_bio->sector = sector;
675 raid10_find_phys(conf, r10_bio);
676 rcu_read_lock();
677 for (s = 0; s < conf->copies; s++) {
678 int disk = r10_bio->devs[s].devnum;
679 struct md_rdev *rdev = rcu_dereference(
680 conf->mirrors[disk].rdev);
681 if (rdev && !test_bit(Faulty, &rdev->flags)) {
682 struct request_queue *q =
683 bdev_get_queue(rdev->bdev);
684 if (q->merge_bvec_fn) {
685 bvm->bi_sector = r10_bio->devs[s].addr
686 + rdev->data_offset;
687 bvm->bi_bdev = rdev->bdev;
688 max = min(max, q->merge_bvec_fn(
689 q, bvm, biovec));
690 }
691 }
692 rdev = rcu_dereference(conf->mirrors[disk].replacement);
693 if (rdev && !test_bit(Faulty, &rdev->flags)) {
694 struct request_queue *q =
695 bdev_get_queue(rdev->bdev);
696 if (q->merge_bvec_fn) {
697 bvm->bi_sector = r10_bio->devs[s].addr
698 + rdev->data_offset;
699 bvm->bi_bdev = rdev->bdev;
700 max = min(max, q->merge_bvec_fn(
701 q, bvm, biovec));
702 }
703 }
704 }
705 rcu_read_unlock();
706 }
707 return max;
708} 537}
709 538
710/* 539/*
@@ -726,26 +555,22 @@ static int raid10_mergeable_bvec(struct request_queue *q,
726 * FIXME: possibly should rethink readbalancing and do it differently 555 * FIXME: possibly should rethink readbalancing and do it differently
727 * depending on near_copies / far_copies geometry. 556 * depending on near_copies / far_copies geometry.
728 */ 557 */
729static struct md_rdev *read_balance(struct r10conf *conf, 558static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors)
730 struct r10bio *r10_bio,
731 int *max_sectors)
732{ 559{
733 const sector_t this_sector = r10_bio->sector; 560 const sector_t this_sector = r10_bio->sector;
734 int disk, slot; 561 int disk, slot;
735 int sectors = r10_bio->sectors; 562 int sectors = r10_bio->sectors;
736 int best_good_sectors; 563 int best_good_sectors;
737 sector_t new_distance, best_dist; 564 sector_t new_distance, best_dist;
738 struct md_rdev *best_rdev, *rdev = NULL; 565 mdk_rdev_t *rdev;
739 int do_balance; 566 int do_balance;
740 int best_slot; 567 int best_slot;
741 struct geom *geo = &conf->geo;
742 568
743 raid10_find_phys(conf, r10_bio); 569 raid10_find_phys(conf, r10_bio);
744 rcu_read_lock(); 570 rcu_read_lock();
745retry: 571retry:
746 sectors = r10_bio->sectors; 572 sectors = r10_bio->sectors;
747 best_slot = -1; 573 best_slot = -1;
748 best_rdev = NULL;
749 best_dist = MaxSector; 574 best_dist = MaxSector;
750 best_good_sectors = 0; 575 best_good_sectors = 0;
751 do_balance = 1; 576 do_balance = 1;
@@ -767,17 +592,10 @@ retry:
767 if (r10_bio->devs[slot].bio == IO_BLOCKED) 592 if (r10_bio->devs[slot].bio == IO_BLOCKED)
768 continue; 593 continue;
769 disk = r10_bio->devs[slot].devnum; 594 disk = r10_bio->devs[slot].devnum;
770 rdev = rcu_dereference(conf->mirrors[disk].replacement); 595 rdev = rcu_dereference(conf->mirrors[disk].rdev);
771 if (rdev == NULL || test_bit(Faulty, &rdev->flags) || 596 if (rdev == NULL)
772 test_bit(Unmerged, &rdev->flags) ||
773 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
774 rdev = rcu_dereference(conf->mirrors[disk].rdev);
775 if (rdev == NULL ||
776 test_bit(Faulty, &rdev->flags) ||
777 test_bit(Unmerged, &rdev->flags))
778 continue; 597 continue;
779 if (!test_bit(In_sync, &rdev->flags) && 598 if (!test_bit(In_sync, &rdev->flags))
780 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
781 continue; 599 continue;
782 600
783 dev_sector = r10_bio->devs[slot].addr; 601 dev_sector = r10_bio->devs[slot].addr;
@@ -802,7 +620,6 @@ retry:
802 if (good_sectors > best_good_sectors) { 620 if (good_sectors > best_good_sectors) {
803 best_good_sectors = good_sectors; 621 best_good_sectors = good_sectors;
804 best_slot = slot; 622 best_slot = slot;
805 best_rdev = rdev;
806 } 623 }
807 if (!do_balance) 624 if (!do_balance)
808 /* Must read from here */ 625 /* Must read from here */
@@ -819,11 +636,11 @@ retry:
819 * sequential read speed for 'far copies' arrays. So only 636 * sequential read speed for 'far copies' arrays. So only
820 * keep it for 'near' arrays, and review those later. 637 * keep it for 'near' arrays, and review those later.
821 */ 638 */
822 if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending)) 639 if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending))
823 break; 640 break;
824 641
825 /* for far > 1 always use the lowest address */ 642 /* for far > 1 always use the lowest address */
826 if (geo->far_copies > 1) 643 if (conf->far_copies > 1)
827 new_distance = r10_bio->devs[slot].addr; 644 new_distance = r10_bio->devs[slot].addr;
828 else 645 else
829 new_distance = abs(r10_bio->devs[slot].addr - 646 new_distance = abs(r10_bio->devs[slot].addr -
@@ -831,15 +648,16 @@ retry:
831 if (new_distance < best_dist) { 648 if (new_distance < best_dist) {
832 best_dist = new_distance; 649 best_dist = new_distance;
833 best_slot = slot; 650 best_slot = slot;
834 best_rdev = rdev;
835 } 651 }
836 } 652 }
837 if (slot >= conf->copies) { 653 if (slot == conf->copies)
838 slot = best_slot; 654 slot = best_slot;
839 rdev = best_rdev;
840 }
841 655
842 if (slot >= 0) { 656 if (slot >= 0) {
657 disk = r10_bio->devs[slot].devnum;
658 rdev = rcu_dereference(conf->mirrors[disk].rdev);
659 if (!rdev)
660 goto retry;
843 atomic_inc(&rdev->nr_pending); 661 atomic_inc(&rdev->nr_pending);
844 if (test_bit(Faulty, &rdev->flags)) { 662 if (test_bit(Faulty, &rdev->flags)) {
845 /* Cannot risk returning a device that failed 663 /* Cannot risk returning a device that failed
@@ -850,28 +668,24 @@ retry:
850 } 668 }
851 r10_bio->read_slot = slot; 669 r10_bio->read_slot = slot;
852 } else 670 } else
853 rdev = NULL; 671 disk = -1;
854 rcu_read_unlock(); 672 rcu_read_unlock();
855 *max_sectors = best_good_sectors; 673 *max_sectors = best_good_sectors;
856 674
857 return rdev; 675 return disk;
858} 676}
859 677
860int md_raid10_congested(struct mddev *mddev, int bits) 678static int raid10_congested(void *data, int bits)
861{ 679{
862 struct r10conf *conf = mddev->private; 680 mddev_t *mddev = data;
681 conf_t *conf = mddev->private;
863 int i, ret = 0; 682 int i, ret = 0;
864 683
865 if ((bits & (1 << BDI_async_congested)) && 684 if (mddev_congested(mddev, bits))
866 conf->pending_count >= max_queued_requests)
867 return 1; 685 return 1;
868
869 rcu_read_lock(); 686 rcu_read_lock();
870 for (i = 0; 687 for (i = 0; i < conf->raid_disks && ret == 0; i++) {
871 (i < conf->geo.raid_disks || i < conf->prev.raid_disks) 688 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
872 && ret == 0;
873 i++) {
874 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
875 if (rdev && !test_bit(Faulty, &rdev->flags)) { 689 if (rdev && !test_bit(Faulty, &rdev->flags)) {
876 struct request_queue *q = bdev_get_queue(rdev->bdev); 690 struct request_queue *q = bdev_get_queue(rdev->bdev);
877 691
@@ -881,17 +695,8 @@ int md_raid10_congested(struct mddev *mddev, int bits)
881 rcu_read_unlock(); 695 rcu_read_unlock();
882 return ret; 696 return ret;
883} 697}
884EXPORT_SYMBOL_GPL(md_raid10_congested);
885
886static int raid10_congested(void *data, int bits)
887{
888 struct mddev *mddev = data;
889
890 return mddev_congested(mddev, bits) ||
891 md_raid10_congested(mddev, bits);
892}
893 698
894static void flush_pending_writes(struct r10conf *conf) 699static void flush_pending_writes(conf_t *conf)
895{ 700{
896 /* Any writes that have been queued but are awaiting 701 /* Any writes that have been queued but are awaiting
897 * bitmap updates get flushed here. 702 * bitmap updates get flushed here.
@@ -901,22 +706,15 @@ static void flush_pending_writes(struct r10conf *conf)
901 if (conf->pending_bio_list.head) { 706 if (conf->pending_bio_list.head) {
902 struct bio *bio; 707 struct bio *bio;
903 bio = bio_list_get(&conf->pending_bio_list); 708 bio = bio_list_get(&conf->pending_bio_list);
904 conf->pending_count = 0;
905 spin_unlock_irq(&conf->device_lock); 709 spin_unlock_irq(&conf->device_lock);
906 /* flush any pending bitmap writes to disk 710 /* flush any pending bitmap writes to disk
907 * before proceeding w/ I/O */ 711 * before proceeding w/ I/O */
908 bitmap_unplug(conf->mddev->bitmap); 712 bitmap_unplug(conf->mddev->bitmap);
909 wake_up(&conf->wait_barrier);
910 713
911 while (bio) { /* submit pending writes */ 714 while (bio) { /* submit pending writes */
912 struct bio *next = bio->bi_next; 715 struct bio *next = bio->bi_next;
913 bio->bi_next = NULL; 716 bio->bi_next = NULL;
914 if (unlikely((bio->bi_rw & REQ_DISCARD) && 717 generic_make_request(bio);
915 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
916 /* Just ignore it */
917 bio_endio(bio, 0);
918 else
919 generic_make_request(bio);
920 bio = next; 718 bio = next;
921 } 719 }
922 } else 720 } else
@@ -945,14 +743,14 @@ static void flush_pending_writes(struct r10conf *conf)
945 * lower_barrier when the particular background IO completes. 743 * lower_barrier when the particular background IO completes.
946 */ 744 */
947 745
948static void raise_barrier(struct r10conf *conf, int force) 746static void raise_barrier(conf_t *conf, int force)
949{ 747{
950 BUG_ON(force && !conf->barrier); 748 BUG_ON(force && !conf->barrier);
951 spin_lock_irq(&conf->resync_lock); 749 spin_lock_irq(&conf->resync_lock);
952 750
953 /* Wait until no block IO is waiting (unless 'force') */ 751 /* Wait until no block IO is waiting (unless 'force') */
954 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, 752 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
955 conf->resync_lock); 753 conf->resync_lock, );
956 754
957 /* block any new IO from starting */ 755 /* block any new IO from starting */
958 conf->barrier++; 756 conf->barrier++;
@@ -960,12 +758,12 @@ static void raise_barrier(struct r10conf *conf, int force)
960 /* Now wait for all pending IO to complete */ 758 /* Now wait for all pending IO to complete */
961 wait_event_lock_irq(conf->wait_barrier, 759 wait_event_lock_irq(conf->wait_barrier,
962 !conf->nr_pending && conf->barrier < RESYNC_DEPTH, 760 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
963 conf->resync_lock); 761 conf->resync_lock, );
964 762
965 spin_unlock_irq(&conf->resync_lock); 763 spin_unlock_irq(&conf->resync_lock);
966} 764}
967 765
968static void lower_barrier(struct r10conf *conf) 766static void lower_barrier(conf_t *conf)
969{ 767{
970 unsigned long flags; 768 unsigned long flags;
971 spin_lock_irqsave(&conf->resync_lock, flags); 769 spin_lock_irqsave(&conf->resync_lock, flags);
@@ -974,33 +772,21 @@ static void lower_barrier(struct r10conf *conf)
974 wake_up(&conf->wait_barrier); 772 wake_up(&conf->wait_barrier);
975} 773}
976 774
977static void wait_barrier(struct r10conf *conf) 775static void wait_barrier(conf_t *conf)
978{ 776{
979 spin_lock_irq(&conf->resync_lock); 777 spin_lock_irq(&conf->resync_lock);
980 if (conf->barrier) { 778 if (conf->barrier) {
981 conf->nr_waiting++; 779 conf->nr_waiting++;
982 /* Wait for the barrier to drop. 780 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
983 * However if there are already pending 781 conf->resync_lock,
984 * requests (preventing the barrier from 782 );
985 * rising completely), and the
986 * pre-process bio queue isn't empty,
987 * then don't wait, as we need to empty
988 * that queue to get the nr_pending
989 * count down.
990 */
991 wait_event_lock_irq(conf->wait_barrier,
992 !conf->barrier ||
993 (conf->nr_pending &&
994 current->bio_list &&
995 !bio_list_empty(current->bio_list)),
996 conf->resync_lock);
997 conf->nr_waiting--; 783 conf->nr_waiting--;
998 } 784 }
999 conf->nr_pending++; 785 conf->nr_pending++;
1000 spin_unlock_irq(&conf->resync_lock); 786 spin_unlock_irq(&conf->resync_lock);
1001} 787}
1002 788
1003static void allow_barrier(struct r10conf *conf) 789static void allow_barrier(conf_t *conf)
1004{ 790{
1005 unsigned long flags; 791 unsigned long flags;
1006 spin_lock_irqsave(&conf->resync_lock, flags); 792 spin_lock_irqsave(&conf->resync_lock, flags);
@@ -1009,7 +795,7 @@ static void allow_barrier(struct r10conf *conf)
1009 wake_up(&conf->wait_barrier); 795 wake_up(&conf->wait_barrier);
1010} 796}
1011 797
1012static void freeze_array(struct r10conf *conf) 798static void freeze_array(conf_t *conf)
1013{ 799{
1014 /* stop syncio and normal IO and wait for everything to 800 /* stop syncio and normal IO and wait for everything to
1015 * go quiet. 801 * go quiet.
@@ -1026,15 +812,15 @@ static void freeze_array(struct r10conf *conf)
1026 spin_lock_irq(&conf->resync_lock); 812 spin_lock_irq(&conf->resync_lock);
1027 conf->barrier++; 813 conf->barrier++;
1028 conf->nr_waiting++; 814 conf->nr_waiting++;
1029 wait_event_lock_irq_cmd(conf->wait_barrier, 815 wait_event_lock_irq(conf->wait_barrier,
1030 conf->nr_pending == conf->nr_queued+1, 816 conf->nr_pending == conf->nr_queued+1,
1031 conf->resync_lock, 817 conf->resync_lock,
1032 flush_pending_writes(conf)); 818 flush_pending_writes(conf));
1033 819
1034 spin_unlock_irq(&conf->resync_lock); 820 spin_unlock_irq(&conf->resync_lock);
1035} 821}
1036 822
1037static void unfreeze_array(struct r10conf *conf) 823static void unfreeze_array(conf_t *conf)
1038{ 824{
1039 /* reverse the effect of the freeze */ 825 /* reverse the effect of the freeze */
1040 spin_lock_irq(&conf->resync_lock); 826 spin_lock_irq(&conf->resync_lock);
@@ -1044,90 +830,37 @@ static void unfreeze_array(struct r10conf *conf)
1044 spin_unlock_irq(&conf->resync_lock); 830 spin_unlock_irq(&conf->resync_lock);
1045} 831}
1046 832
1047static sector_t choose_data_offset(struct r10bio *r10_bio, 833static int make_request(mddev_t *mddev, struct bio * bio)
1048 struct md_rdev *rdev)
1049{
1050 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1051 test_bit(R10BIO_Previous, &r10_bio->state))
1052 return rdev->data_offset;
1053 else
1054 return rdev->new_data_offset;
1055}
1056
1057struct raid10_plug_cb {
1058 struct blk_plug_cb cb;
1059 struct bio_list pending;
1060 int pending_cnt;
1061};
1062
1063static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1064{
1065 struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
1066 cb);
1067 struct mddev *mddev = plug->cb.data;
1068 struct r10conf *conf = mddev->private;
1069 struct bio *bio;
1070
1071 if (from_schedule || current->bio_list) {
1072 spin_lock_irq(&conf->device_lock);
1073 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1074 conf->pending_count += plug->pending_cnt;
1075 spin_unlock_irq(&conf->device_lock);
1076 md_wakeup_thread(mddev->thread);
1077 kfree(plug);
1078 return;
1079 }
1080
1081 /* we aren't scheduling, so we can do the write-out directly. */
1082 bio = bio_list_get(&plug->pending);
1083 bitmap_unplug(mddev->bitmap);
1084 wake_up(&conf->wait_barrier);
1085
1086 while (bio) { /* submit pending writes */
1087 struct bio *next = bio->bi_next;
1088 bio->bi_next = NULL;
1089 generic_make_request(bio);
1090 bio = next;
1091 }
1092 kfree(plug);
1093}
1094
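
The raid10_plug_cb/raid10_unplug code removed in this hunk batches writes on a per-task plug list and only pushes them to conf->pending_bio_list (or submits them directly) when the plug is released. Below is a loose userspace analogy of that batching pattern; struct plug, struct bio_item and the submit_write()/unplug()/dispatch() names are made up for the example and are not block-layer APIs.

/* Userspace analogy of the plugging removed above: while a task holds a
 * "plug", writes are collected on a private list and only dispatched in one
 * batch when the plug is released. */
#include <stdio.h>
#include <stdlib.h>

struct bio_item {
	int sector;
	struct bio_item *next;
};

struct plug {
	struct bio_item *pending;   /* collected but not yet submitted */
	int pending_cnt;
};

static void dispatch(struct bio_item *b)
{
	printf("submitting write at sector %d\n", b->sector);
	free(b);
}

/* Queue a write: batch it if a plug is active, else send it right away. */
static void submit_write(struct plug *plug, int sector)
{
	struct bio_item *b = malloc(sizeof(*b));

	b->sector = sector;
	b->next = NULL;
	if (plug) {
		b->next = plug->pending;
		plug->pending = b;
		plug->pending_cnt++;
	} else {
		dispatch(b);
	}
}

/* Releasing the plug flushes everything collected so far in one go, which
 * is what raid10_unplug() did for the pending bio list. */
static void unplug(struct plug *plug)
{
	while (plug->pending) {
		struct bio_item *b = plug->pending;

		plug->pending = b->next;
		dispatch(b);
	}
	plug->pending_cnt = 0;
}

int main(void)
{
	struct plug plug = { NULL, 0 };

	submit_write(&plug, 8);
	submit_write(&plug, 16);
	submit_write(&plug, 24);
	unplug(&plug);          /* all three go out as one batch */
	submit_write(NULL, 32); /* no plug: dispatched immediately */
	return 0;
}
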
1095static void make_request(struct mddev *mddev, struct bio * bio)
1096{ 834{
1097 struct r10conf *conf = mddev->private; 835 conf_t *conf = mddev->private;
1098 struct r10bio *r10_bio; 836 mirror_info_t *mirror;
837 r10bio_t *r10_bio;
1099 struct bio *read_bio; 838 struct bio *read_bio;
1100 int i; 839 int i;
1101 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); 840 int chunk_sects = conf->chunk_mask + 1;
1102 int chunk_sects = chunk_mask + 1;
1103 const int rw = bio_data_dir(bio); 841 const int rw = bio_data_dir(bio);
1104 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 842 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
1105 const unsigned long do_fua = (bio->bi_rw & REQ_FUA); 843 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
1106 const unsigned long do_discard = (bio->bi_rw
1107 & (REQ_DISCARD | REQ_SECURE));
1108 unsigned long flags; 844 unsigned long flags;
1109 struct md_rdev *blocked_rdev; 845 mdk_rdev_t *blocked_rdev;
1110 struct blk_plug_cb *cb; 846 int plugged;
1111 struct raid10_plug_cb *plug = NULL;
1112 int sectors_handled; 847 int sectors_handled;
1113 int max_sectors; 848 int max_sectors;
1114 int sectors;
1115 849
1116 if (unlikely(bio->bi_rw & REQ_FLUSH)) { 850 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
1117 md_flush_request(mddev, bio); 851 md_flush_request(mddev, bio);
1118 return; 852 return 0;
1119 } 853 }
1120 854
1121 /* If this request crosses a chunk boundary, we need to 855 /* If this request crosses a chunk boundary, we need to
1122 * split it. This will only happen for 1 PAGE (or less) requests. 856 * split it. This will only happen for 1 PAGE (or less) requests.
1123 */ 857 */
1124 if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9) 858 if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9)
1125 > chunk_sects 859 > chunk_sects &&
1126 && (conf->geo.near_copies < conf->geo.raid_disks 860 conf->near_copies < conf->raid_disks)) {
1127 || conf->prev.near_copies < conf->prev.raid_disks))) {
1128 struct bio_pair *bp; 861 struct bio_pair *bp;
1129 /* Sanity check -- queue functions should prevent this happening */ 862 /* Sanity check -- queue functions should prevent this happening */
1130 if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || 863 if (bio->bi_vcnt != 1 ||
1131 bio->bi_idx != 0) 864 bio->bi_idx != 0)
1132 goto bad_map; 865 goto bad_map;
1133 /* This is a one page bio that upper layers 866 /* This is a one page bio that upper layers
@@ -1148,8 +881,10 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1148 conf->nr_waiting++; 881 conf->nr_waiting++;
1149 spin_unlock_irq(&conf->resync_lock); 882 spin_unlock_irq(&conf->resync_lock);
1150 883
1151 make_request(mddev, &bp->bio1); 884 if (make_request(mddev, &bp->bio1))
1152 make_request(mddev, &bp->bio2); 885 generic_make_request(&bp->bio1);
886 if (make_request(mddev, &bp->bio2))
887 generic_make_request(&bp->bio2);
1153 888
1154 spin_lock_irq(&conf->resync_lock); 889 spin_lock_irq(&conf->resync_lock);
1155 conf->nr_waiting--; 890 conf->nr_waiting--;
@@ -1157,14 +892,14 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1157 spin_unlock_irq(&conf->resync_lock); 892 spin_unlock_irq(&conf->resync_lock);
1158 893
1159 bio_pair_release(bp); 894 bio_pair_release(bp);
1160 return; 895 return 0;
1161 bad_map: 896 bad_map:
1162 printk("md/raid10:%s: make_request bug: can't convert block across chunks" 897 printk("md/raid10:%s: make_request bug: can't convert block across chunks"
1163 " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2, 898 " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
1164 (unsigned long long)bio->bi_sector, bio->bi_size >> 10); 899 (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
1165 900
1166 bio_io_error(bio); 901 bio_io_error(bio);
1167 return; 902 return 0;
1168 } 903 }
1169 904
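
The test above flags a request that straddles a chunk boundary: the offset of bi_sector within its chunk plus the request length must not exceed chunk_sects, otherwise the bio is split into a bio_pair and each half is resubmitted. A standalone sketch of that arithmetic, assuming 512-byte sectors and a power-of-two chunk size (crosses_chunk() is an ad hoc name):

/* A request crosses a chunk if its offset within the chunk plus its length
 * exceeds the chunk size.  chunk_sects must be a power of two, as MD chunk
 * sizes are. */
#include <stdint.h>
#include <stdio.h>

static int crosses_chunk(uint64_t sector, unsigned int sectors,
			 unsigned int chunk_sects)
{
	uint64_t chunk_mask = chunk_sects - 1;    /* power of two */

	return (sector & chunk_mask) + sectors > chunk_sects;
}

int main(void)
{
	/* 64K chunk = 128 sectors of 512 bytes. */
	unsigned int chunk_sects = 128;

	/* 4K request (8 sectors) entirely inside the first chunk. */
	printf("sector 0,   8 sectors: %s\n",
	       crosses_chunk(0, 8, chunk_sects) ? "crosses" : "fits");

	/* 4K request starting 2 sectors before the chunk boundary. */
	printf("sector 126, 8 sectors: %s\n",
	       crosses_chunk(126, 8, chunk_sects) ? "crosses" : "fits");
	return 0;
}
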
1170 md_write_start(mddev, bio); 905 md_write_start(mddev, bio);
@@ -1176,41 +911,10 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1176 */ 911 */
1177 wait_barrier(conf); 912 wait_barrier(conf);
1178 913
1179 sectors = bio->bi_size >> 9;
1180 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1181 bio->bi_sector < conf->reshape_progress &&
1182 bio->bi_sector + sectors > conf->reshape_progress) {
1183 /* IO spans the reshape position. Need to wait for
1184 * reshape to pass
1185 */
1186 allow_barrier(conf);
1187 wait_event(conf->wait_barrier,
1188 conf->reshape_progress <= bio->bi_sector ||
1189 conf->reshape_progress >= bio->bi_sector + sectors);
1190 wait_barrier(conf);
1191 }
1192 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1193 bio_data_dir(bio) == WRITE &&
1194 (mddev->reshape_backwards
1195 ? (bio->bi_sector < conf->reshape_safe &&
1196 bio->bi_sector + sectors > conf->reshape_progress)
1197 : (bio->bi_sector + sectors > conf->reshape_safe &&
1198 bio->bi_sector < conf->reshape_progress))) {
1199 /* Need to update reshape_position in metadata */
1200 mddev->reshape_position = conf->reshape_progress;
1201 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1202 set_bit(MD_CHANGE_PENDING, &mddev->flags);
1203 md_wakeup_thread(mddev->thread);
1204 wait_event(mddev->sb_wait,
1205 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
1206
1207 conf->reshape_safe = mddev->reshape_position;
1208 }
1209
1210 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); 914 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1211 915
1212 r10_bio->master_bio = bio; 916 r10_bio->master_bio = bio;
1213 r10_bio->sectors = sectors; 917 r10_bio->sectors = bio->bi_size >> 9;
1214 918
1215 r10_bio->mddev = mddev; 919 r10_bio->mddev = mddev;
1216 r10_bio->sector = bio->bi_sector; 920 r10_bio->sector = bio->bi_sector;
@@ -1230,27 +934,27 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1230 /* 934 /*
1231 * read balancing logic: 935 * read balancing logic:
1232 */ 936 */
1233 struct md_rdev *rdev; 937 int disk;
1234 int slot; 938 int slot;
1235 939
1236read_again: 940read_again:
1237 rdev = read_balance(conf, r10_bio, &max_sectors); 941 disk = read_balance(conf, r10_bio, &max_sectors);
1238 if (!rdev) { 942 slot = r10_bio->read_slot;
943 if (disk < 0) {
1239 raid_end_bio_io(r10_bio); 944 raid_end_bio_io(r10_bio);
1240 return; 945 return 0;
1241 } 946 }
1242 slot = r10_bio->read_slot; 947 mirror = conf->mirrors + disk;
1243 948
1244 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); 949 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1245 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector, 950 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
1246 max_sectors); 951 max_sectors);
1247 952
1248 r10_bio->devs[slot].bio = read_bio; 953 r10_bio->devs[slot].bio = read_bio;
1249 r10_bio->devs[slot].rdev = rdev;
1250 954
1251 read_bio->bi_sector = r10_bio->devs[slot].addr + 955 read_bio->bi_sector = r10_bio->devs[slot].addr +
1252 choose_data_offset(r10_bio, rdev); 956 mirror->rdev->data_offset;
1253 read_bio->bi_bdev = rdev->bdev; 957 read_bio->bi_bdev = mirror->rdev->bdev;
1254 read_bio->bi_end_io = raid10_end_read_request; 958 read_bio->bi_end_io = raid10_end_read_request;
1255 read_bio->bi_rw = READ | do_sync; 959 read_bio->bi_rw = READ | do_sync;
1256 read_bio->bi_private = r10_bio; 960 read_bio->bi_private = r10_bio;
@@ -1286,17 +990,12 @@ read_again:
1286 goto read_again; 990 goto read_again;
1287 } else 991 } else
1288 generic_make_request(read_bio); 992 generic_make_request(read_bio);
1289 return; 993 return 0;
1290 } 994 }
1291 995
1292 /* 996 /*
1293 * WRITE: 997 * WRITE:
1294 */ 998 */
1295 if (conf->pending_count >= max_queued_requests) {
1296 md_wakeup_thread(mddev->thread);
1297 wait_event(conf->wait_barrier,
1298 conf->pending_count < max_queued_requests);
1299 }
1300 /* first select target devices under rcu_lock and 999 /* first select target devices under rcu_lock and
1301 * inc refcount on their rdev. Record them by setting 1000 * inc refcount on their rdev. Record them by setting
1302 * bios[x] to bio 1001 * bios[x] to bio
@@ -1308,8 +1007,8 @@ read_again:
1308 * of r10_bios is recored in bio->bi_phys_segments just as with 1007 * of r10_bios is recored in bio->bi_phys_segments just as with
1309 * the read case. 1008 * the read case.
1310 */ 1009 */
1010 plugged = mddev_check_plugged(mddev);
1311 1011
1312 r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
1313 raid10_find_phys(conf, r10_bio); 1012 raid10_find_phys(conf, r10_bio);
1314retry_write: 1013retry_write:
1315 blocked_rdev = NULL; 1014 blocked_rdev = NULL;
@@ -1318,36 +1017,18 @@ retry_write:
1318 1017
1319 for (i = 0; i < conf->copies; i++) { 1018 for (i = 0; i < conf->copies; i++) {
1320 int d = r10_bio->devs[i].devnum; 1019 int d = r10_bio->devs[i].devnum;
1321 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); 1020 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
1322 struct md_rdev *rrdev = rcu_dereference(
1323 conf->mirrors[d].replacement);
1324 if (rdev == rrdev)
1325 rrdev = NULL;
1326 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 1021 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1327 atomic_inc(&rdev->nr_pending); 1022 atomic_inc(&rdev->nr_pending);
1328 blocked_rdev = rdev; 1023 blocked_rdev = rdev;
1329 break; 1024 break;
1330 } 1025 }
1331 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1332 atomic_inc(&rrdev->nr_pending);
1333 blocked_rdev = rrdev;
1334 break;
1335 }
1336 if (rdev && (test_bit(Faulty, &rdev->flags)
1337 || test_bit(Unmerged, &rdev->flags)))
1338 rdev = NULL;
1339 if (rrdev && (test_bit(Faulty, &rrdev->flags)
1340 || test_bit(Unmerged, &rrdev->flags)))
1341 rrdev = NULL;
1342
1343 r10_bio->devs[i].bio = NULL; 1026 r10_bio->devs[i].bio = NULL;
1344 r10_bio->devs[i].repl_bio = NULL; 1027 if (!rdev || test_bit(Faulty, &rdev->flags)) {
1345
1346 if (!rdev && !rrdev) {
1347 set_bit(R10BIO_Degraded, &r10_bio->state); 1028 set_bit(R10BIO_Degraded, &r10_bio->state);
1348 continue; 1029 continue;
1349 } 1030 }
1350 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { 1031 if (test_bit(WriteErrorSeen, &rdev->flags)) {
1351 sector_t first_bad; 1032 sector_t first_bad;
1352 sector_t dev_sector = r10_bio->devs[i].addr; 1033 sector_t dev_sector = r10_bio->devs[i].addr;
1353 int bad_sectors; 1034 int bad_sectors;
@@ -1389,14 +1070,8 @@ retry_write:
1389 max_sectors = good_sectors; 1070 max_sectors = good_sectors;
1390 } 1071 }
1391 } 1072 }
1392 if (rdev) { 1073 r10_bio->devs[i].bio = bio;
1393 r10_bio->devs[i].bio = bio; 1074 atomic_inc(&rdev->nr_pending);
1394 atomic_inc(&rdev->nr_pending);
1395 }
1396 if (rrdev) {
1397 r10_bio->devs[i].repl_bio = bio;
1398 atomic_inc(&rrdev->nr_pending);
1399 }
1400 } 1075 }
1401 rcu_read_unlock(); 1076 rcu_read_unlock();
1402 1077
@@ -1405,23 +1080,11 @@ retry_write:
1405 int j; 1080 int j;
1406 int d; 1081 int d;
1407 1082
1408 for (j = 0; j < i; j++) { 1083 for (j = 0; j < i; j++)
1409 if (r10_bio->devs[j].bio) { 1084 if (r10_bio->devs[j].bio) {
1410 d = r10_bio->devs[j].devnum; 1085 d = r10_bio->devs[j].devnum;
1411 rdev_dec_pending(conf->mirrors[d].rdev, mddev); 1086 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1412 } 1087 }
1413 if (r10_bio->devs[j].repl_bio) {
1414 struct md_rdev *rdev;
1415 d = r10_bio->devs[j].devnum;
1416 rdev = conf->mirrors[d].replacement;
1417 if (!rdev) {
1418 /* Race with remove_disk */
1419 smp_mb();
1420 rdev = conf->mirrors[d].rdev;
1421 }
1422 rdev_dec_pending(rdev, mddev);
1423 }
1424 }
1425 allow_barrier(conf); 1088 allow_barrier(conf);
1426 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1089 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1427 wait_barrier(conf); 1090 wait_barrier(conf);
@@ -1448,71 +1111,25 @@ retry_write:
1448 for (i = 0; i < conf->copies; i++) { 1111 for (i = 0; i < conf->copies; i++) {
1449 struct bio *mbio; 1112 struct bio *mbio;
1450 int d = r10_bio->devs[i].devnum; 1113 int d = r10_bio->devs[i].devnum;
1451 if (r10_bio->devs[i].bio) { 1114 if (!r10_bio->devs[i].bio)
1452 struct md_rdev *rdev = conf->mirrors[d].rdev; 1115 continue;
1453 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1454 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1455 max_sectors);
1456 r10_bio->devs[i].bio = mbio;
1457
1458 mbio->bi_sector = (r10_bio->devs[i].addr+
1459 choose_data_offset(r10_bio,
1460 rdev));
1461 mbio->bi_bdev = rdev->bdev;
1462 mbio->bi_end_io = raid10_end_write_request;
1463 mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
1464 mbio->bi_private = r10_bio;
1465
1466 atomic_inc(&r10_bio->remaining);
1467 1116
1468 cb = blk_check_plugged(raid10_unplug, mddev, 1117 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1469 sizeof(*plug)); 1118 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1470 if (cb) 1119 max_sectors);
1471 plug = container_of(cb, struct raid10_plug_cb, 1120 r10_bio->devs[i].bio = mbio;
1472 cb);
1473 else
1474 plug = NULL;
1475 spin_lock_irqsave(&conf->device_lock, flags);
1476 if (plug) {
1477 bio_list_add(&plug->pending, mbio);
1478 plug->pending_cnt++;
1479 } else {
1480 bio_list_add(&conf->pending_bio_list, mbio);
1481 conf->pending_count++;
1482 }
1483 spin_unlock_irqrestore(&conf->device_lock, flags);
1484 if (!plug)
1485 md_wakeup_thread(mddev->thread);
1486 }
1487 1121
1488 if (r10_bio->devs[i].repl_bio) { 1122 mbio->bi_sector = (r10_bio->devs[i].addr+
1489 struct md_rdev *rdev = conf->mirrors[d].replacement; 1123 conf->mirrors[d].rdev->data_offset);
1490 if (rdev == NULL) { 1124 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
1491 /* Replacement just got moved to main 'rdev' */ 1125 mbio->bi_end_io = raid10_end_write_request;
1492 smp_mb(); 1126 mbio->bi_rw = WRITE | do_sync | do_fua;
1493 rdev = conf->mirrors[d].rdev; 1127 mbio->bi_private = r10_bio;
1494 }
1495 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1496 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1497 max_sectors);
1498 r10_bio->devs[i].repl_bio = mbio;
1499
1500 mbio->bi_sector = (r10_bio->devs[i].addr +
1501 choose_data_offset(
1502 r10_bio, rdev));
1503 mbio->bi_bdev = rdev->bdev;
1504 mbio->bi_end_io = raid10_end_write_request;
1505 mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
1506 mbio->bi_private = r10_bio;
1507 1128
1508 atomic_inc(&r10_bio->remaining); 1129 atomic_inc(&r10_bio->remaining);
1509 spin_lock_irqsave(&conf->device_lock, flags); 1130 spin_lock_irqsave(&conf->device_lock, flags);
1510 bio_list_add(&conf->pending_bio_list, mbio); 1131 bio_list_add(&conf->pending_bio_list, mbio);
1511 conf->pending_count++; 1132 spin_unlock_irqrestore(&conf->device_lock, flags);
1512 spin_unlock_irqrestore(&conf->device_lock, flags);
1513 if (!mddev_check_plugged(mddev))
1514 md_wakeup_thread(mddev->thread);
1515 }
1516 } 1133 }
1517 1134
1518 /* Don't remove the bias on 'remaining' (one_write_done) until 1135 /* Don't remove the bias on 'remaining' (one_write_done) until
@@ -1538,26 +1155,30 @@ retry_write:
1538 1155
1539 /* In case raid10d snuck in to freeze_array */ 1156 /* In case raid10d snuck in to freeze_array */
1540 wake_up(&conf->wait_barrier); 1157 wake_up(&conf->wait_barrier);
1158
1159 if (do_sync || !mddev->bitmap || !plugged)
1160 md_wakeup_thread(mddev->thread);
1161 return 0;
1541} 1162}
1542 1163
1543static void status(struct seq_file *seq, struct mddev *mddev) 1164static void status(struct seq_file *seq, mddev_t *mddev)
1544{ 1165{
1545 struct r10conf *conf = mddev->private; 1166 conf_t *conf = mddev->private;
1546 int i; 1167 int i;
1547 1168
1548 if (conf->geo.near_copies < conf->geo.raid_disks) 1169 if (conf->near_copies < conf->raid_disks)
1549 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); 1170 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1550 if (conf->geo.near_copies > 1) 1171 if (conf->near_copies > 1)
1551 seq_printf(seq, " %d near-copies", conf->geo.near_copies); 1172 seq_printf(seq, " %d near-copies", conf->near_copies);
1552 if (conf->geo.far_copies > 1) { 1173 if (conf->far_copies > 1) {
1553 if (conf->geo.far_offset) 1174 if (conf->far_offset)
1554 seq_printf(seq, " %d offset-copies", conf->geo.far_copies); 1175 seq_printf(seq, " %d offset-copies", conf->far_copies);
1555 else 1176 else
1556 seq_printf(seq, " %d far-copies", conf->geo.far_copies); 1177 seq_printf(seq, " %d far-copies", conf->far_copies);
1557 } 1178 }
1558 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks, 1179 seq_printf(seq, " [%d/%d] [", conf->raid_disks,
1559 conf->geo.raid_disks - mddev->degraded); 1180 conf->raid_disks - mddev->degraded);
1560 for (i = 0; i < conf->geo.raid_disks; i++) 1181 for (i = 0; i < conf->raid_disks; i++)
1561 seq_printf(seq, "%s", 1182 seq_printf(seq, "%s",
1562 conf->mirrors[i].rdev && 1183 conf->mirrors[i].rdev &&
1563 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); 1184 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
@@ -1569,37 +1190,29 @@ static void status(struct seq_file *seq, struct mddev *mddev)
1569 * Don't consider the device numbered 'ignore' 1190 * Don't consider the device numbered 'ignore'
1570 * as we might be about to remove it. 1191 * as we might be about to remove it.
1571 */ 1192 */
1572static int _enough(struct r10conf *conf, struct geom *geo, int ignore) 1193static int enough(conf_t *conf, int ignore)
1573{ 1194{
1574 int first = 0; 1195 int first = 0;
1575 1196
1576 do { 1197 do {
1577 int n = conf->copies; 1198 int n = conf->copies;
1578 int cnt = 0; 1199 int cnt = 0;
1579 int this = first;
1580 while (n--) { 1200 while (n--) {
1581 if (conf->mirrors[this].rdev && 1201 if (conf->mirrors[first].rdev &&
1582 this != ignore) 1202 first != ignore)
1583 cnt++; 1203 cnt++;
1584 this = (this+1) % geo->raid_disks; 1204 first = (first+1) % conf->raid_disks;
1585 } 1205 }
1586 if (cnt == 0) 1206 if (cnt == 0)
1587 return 0; 1207 return 0;
1588 first = (first + geo->near_copies) % geo->raid_disks;
1589 } while (first != 0); 1208 } while (first != 0);
1590 return 1; 1209 return 1;
1591} 1210}
1592 1211
1593static int enough(struct r10conf *conf, int ignore) 1212static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1594{
1595 return _enough(conf, &conf->geo, ignore) &&
1596 _enough(conf, &conf->prev, ignore);
1597}
1598
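
enough()/_enough() above walk the mirrors in groups determined by the layout and succeed only if every group of copies still has at least one working device, optionally pretending that one device ('ignore') is already gone. A small userspace sketch of that check for a plain near layout follows; layout_has_enough() and the working[] array are invented for the example.

/* Sketch of the survivability check: with a "near" layout each block and
 * its copies live on `copies` consecutive devices, so the array only
 * survives if every such group keeps at least one working member. */
#include <stdio.h>

static int layout_has_enough(const int *working, int raid_disks,
			     int near_copies, int copies, int ignore)
{
	int first = 0;

	do {
		int n = copies, cnt = 0, this = first;

		while (n--) {
			if (working[this] && this != ignore)
				cnt++;
			this = (this + 1) % raid_disks;
		}
		if (cnt == 0)
			return 0;           /* a whole copy-set is gone */
		first = (first + near_copies) % raid_disks;
	} while (first != 0);
	return 1;
}

int main(void)
{
	/* 4 disks, 2 near copies: disks {0,1} mirror each other, as do {2,3}. */
	int working[4] = { 1, 0, 1, 1 };    /* disk 1 has failed */

	printf("degraded array ok:              %d\n",
	       layout_has_enough(working, 4, 2, 2, -1));
	printf("ok after also removing disk 0:  %d\n",
	       layout_has_enough(working, 4, 2, 2, 0));
	return 0;
}
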
1599static void error(struct mddev *mddev, struct md_rdev *rdev)
1600{ 1213{
1601 char b[BDEVNAME_SIZE]; 1214 char b[BDEVNAME_SIZE];
1602 struct r10conf *conf = mddev->private; 1215 conf_t *conf = mddev->private;
1603 1216
1604 /* 1217 /*
1605 * If it is not operational, then we have already marked it as dead 1218 * If it is not operational, then we have already marked it as dead
@@ -1630,23 +1243,23 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
1630 "md/raid10:%s: Disk failure on %s, disabling device.\n" 1243 "md/raid10:%s: Disk failure on %s, disabling device.\n"
1631 "md/raid10:%s: Operation continuing on %d devices.\n", 1244 "md/raid10:%s: Operation continuing on %d devices.\n",
1632 mdname(mddev), bdevname(rdev->bdev, b), 1245 mdname(mddev), bdevname(rdev->bdev, b),
1633 mdname(mddev), conf->geo.raid_disks - mddev->degraded); 1246 mdname(mddev), conf->raid_disks - mddev->degraded);
1634} 1247}
1635 1248
1636static void print_conf(struct r10conf *conf) 1249static void print_conf(conf_t *conf)
1637{ 1250{
1638 int i; 1251 int i;
1639 struct raid10_info *tmp; 1252 mirror_info_t *tmp;
1640 1253
1641 printk(KERN_DEBUG "RAID10 conf printout:\n"); 1254 printk(KERN_DEBUG "RAID10 conf printout:\n");
1642 if (!conf) { 1255 if (!conf) {
1643 printk(KERN_DEBUG "(!conf)\n"); 1256 printk(KERN_DEBUG "(!conf)\n");
1644 return; 1257 return;
1645 } 1258 }
1646 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, 1259 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
1647 conf->geo.raid_disks); 1260 conf->raid_disks);
1648 1261
1649 for (i = 0; i < conf->geo.raid_disks; i++) { 1262 for (i = 0; i < conf->raid_disks; i++) {
1650 char b[BDEVNAME_SIZE]; 1263 char b[BDEVNAME_SIZE];
1651 tmp = conf->mirrors + i; 1264 tmp = conf->mirrors + i;
1652 if (tmp->rdev) 1265 if (tmp->rdev)
@@ -1657,7 +1270,7 @@ static void print_conf(struct r10conf *conf)
1657 } 1270 }
1658} 1271}
1659 1272
1660static void close_sync(struct r10conf *conf) 1273static void close_sync(conf_t *conf)
1661{ 1274{
1662 wait_barrier(conf); 1275 wait_barrier(conf);
1663 allow_barrier(conf); 1276 allow_barrier(conf);
@@ -1666,11 +1279,11 @@ static void close_sync(struct r10conf *conf)
1666 conf->r10buf_pool = NULL; 1279 conf->r10buf_pool = NULL;
1667} 1280}
1668 1281
1669static int raid10_spare_active(struct mddev *mddev) 1282static int raid10_spare_active(mddev_t *mddev)
1670{ 1283{
1671 int i; 1284 int i;
1672 struct r10conf *conf = mddev->private; 1285 conf_t *conf = mddev->private;
1673 struct raid10_info *tmp; 1286 mirror_info_t *tmp;
1674 int count = 0; 1287 int count = 0;
1675 unsigned long flags; 1288 unsigned long flags;
1676 1289
@@ -1678,31 +1291,13 @@ static int raid10_spare_active(struct mddev *mddev)
1678 * Find all non-in_sync disks within the RAID10 configuration 1291 * Find all non-in_sync disks within the RAID10 configuration
1679 * and mark them in_sync 1292 * and mark them in_sync
1680 */ 1293 */
1681 for (i = 0; i < conf->geo.raid_disks; i++) { 1294 for (i = 0; i < conf->raid_disks; i++) {
1682 tmp = conf->mirrors + i; 1295 tmp = conf->mirrors + i;
1683 if (tmp->replacement 1296 if (tmp->rdev
1684 && tmp->replacement->recovery_offset == MaxSector 1297 && !test_bit(Faulty, &tmp->rdev->flags)
1685 && !test_bit(Faulty, &tmp->replacement->flags) 1298 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1686 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1687 /* Replacement has just become active */
1688 if (!tmp->rdev
1689 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1690 count++;
1691 if (tmp->rdev) {
1692 /* Replaced device not technically faulty,
1693 * but we need to be sure it gets removed
1694 * and never re-added.
1695 */
1696 set_bit(Faulty, &tmp->rdev->flags);
1697 sysfs_notify_dirent_safe(
1698 tmp->rdev->sysfs_state);
1699 }
1700 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1701 } else if (tmp->rdev
1702 && !test_bit(Faulty, &tmp->rdev->flags)
1703 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1704 count++; 1299 count++;
1705 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 1300 sysfs_notify_dirent(tmp->rdev->sysfs_state);
1706 } 1301 }
1707 } 1302 }
1708 spin_lock_irqsave(&conf->device_lock, flags); 1303 spin_lock_irqsave(&conf->device_lock, flags);
@@ -1714,60 +1309,52 @@ static int raid10_spare_active(struct mddev *mddev)
1714} 1309}
1715 1310
1716 1311
1717static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) 1312static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1718{ 1313{
1719 struct r10conf *conf = mddev->private; 1314 conf_t *conf = mddev->private;
1720 int err = -EEXIST; 1315 int err = -EEXIST;
1721 int mirror; 1316 int mirror;
1722 int first = 0; 1317 int first = 0;
1723 int last = conf->geo.raid_disks - 1; 1318 int last = conf->raid_disks - 1;
1724 struct request_queue *q = bdev_get_queue(rdev->bdev);
1725 1319
1726 if (mddev->recovery_cp < MaxSector) 1320 if (mddev->recovery_cp < MaxSector)
1727 /* only hot-add to in-sync arrays, as recovery is 1321 /* only hot-add to in-sync arrays, as recovery is
1728 * very different from resync 1322 * very different from resync
1729 */ 1323 */
1730 return -EBUSY; 1324 return -EBUSY;
1731 if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1)) 1325 if (!enough(conf, -1))
1732 return -EINVAL; 1326 return -EINVAL;
1733 1327
1734 if (rdev->raid_disk >= 0) 1328 if (rdev->raid_disk >= 0)
1735 first = last = rdev->raid_disk; 1329 first = last = rdev->raid_disk;
1736 1330
1737 if (q->merge_bvec_fn) {
1738 set_bit(Unmerged, &rdev->flags);
1739 mddev->merge_check_needed = 1;
1740 }
1741
1742 if (rdev->saved_raid_disk >= first && 1331 if (rdev->saved_raid_disk >= first &&
1743 conf->mirrors[rdev->saved_raid_disk].rdev == NULL) 1332 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1744 mirror = rdev->saved_raid_disk; 1333 mirror = rdev->saved_raid_disk;
1745 else 1334 else
1746 mirror = first; 1335 mirror = first;
1747 for ( ; mirror <= last ; mirror++) { 1336 for ( ; mirror <= last ; mirror++) {
1748 struct raid10_info *p = &conf->mirrors[mirror]; 1337 mirror_info_t *p = &conf->mirrors[mirror];
1749 if (p->recovery_disabled == mddev->recovery_disabled) 1338 if (p->recovery_disabled == mddev->recovery_disabled)
1750 continue; 1339 continue;
1751 if (p->rdev) { 1340 if (p->rdev)
1752 if (!test_bit(WantReplacement, &p->rdev->flags) || 1341 continue;
1753 p->replacement != NULL)
1754 continue;
1755 clear_bit(In_sync, &rdev->flags);
1756 set_bit(Replacement, &rdev->flags);
1757 rdev->raid_disk = mirror;
1758 err = 0;
1759 disk_stack_limits(mddev->gendisk, rdev->bdev,
1760 rdev->data_offset << 9);
1761 conf->fullsync = 1;
1762 rcu_assign_pointer(p->replacement, rdev);
1763 break;
1764 }
1765 1342
1766 disk_stack_limits(mddev->gendisk, rdev->bdev, 1343 disk_stack_limits(mddev->gendisk, rdev->bdev,
1767 rdev->data_offset << 9); 1344 rdev->data_offset << 9);
1345 /* as we don't honour merge_bvec_fn, we must
1346 * never risk violating it, so limit
1347 * ->max_segments to one lying with a single
1348 * page, as a one page request is never in
1349 * violation.
1350 */
1351 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1352 blk_queue_max_segments(mddev->queue, 1);
1353 blk_queue_segment_boundary(mddev->queue,
1354 PAGE_CACHE_SIZE - 1);
1355 }
1768 1356
1769 p->head_position = 0; 1357 p->head_position = 0;
1770 p->recovery_disabled = mddev->recovery_disabled - 1;
1771 rdev->raid_disk = mirror; 1358 rdev->raid_disk = mirror;
1772 err = 0; 1359 err = 0;
1773 if (rdev->saved_raid_disk != mirror) 1360 if (rdev->saved_raid_disk != mirror)
@@ -1775,83 +1362,46 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1775 rcu_assign_pointer(p->rdev, rdev); 1362 rcu_assign_pointer(p->rdev, rdev);
1776 break; 1363 break;
1777 } 1364 }
1778 if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
1779 /* Some requests might not have seen this new
1780 * merge_bvec_fn. We must wait for them to complete
1781 * before merging the device fully.
1782 * First we make sure any code which has tested
1783 * our function has submitted the request, then
1784 * we wait for all outstanding requests to complete.
1785 */
1786 synchronize_sched();
1787 raise_barrier(conf, 0);
1788 lower_barrier(conf);
1789 clear_bit(Unmerged, &rdev->flags);
1790 }
1791 md_integrity_add_rdev(rdev, mddev);
1792 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
1793 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
1794 1365
1366 md_integrity_add_rdev(rdev, mddev);
1795 print_conf(conf); 1367 print_conf(conf);
1796 return err; 1368 return err;
1797} 1369}
1798 1370
1799static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 1371static int raid10_remove_disk(mddev_t *mddev, int number)
1800{ 1372{
1801 struct r10conf *conf = mddev->private; 1373 conf_t *conf = mddev->private;
1802 int err = 0; 1374 int err = 0;
1803 int number = rdev->raid_disk; 1375 mdk_rdev_t *rdev;
1804 struct md_rdev **rdevp; 1376 mirror_info_t *p = conf->mirrors+ number;
1805 struct raid10_info *p = conf->mirrors + number;
1806 1377
1807 print_conf(conf); 1378 print_conf(conf);
1808 if (rdev == p->rdev) 1379 rdev = p->rdev;
1809 rdevp = &p->rdev; 1380 if (rdev) {
1810 else if (rdev == p->replacement) 1381 if (test_bit(In_sync, &rdev->flags) ||
1811 rdevp = &p->replacement; 1382 atomic_read(&rdev->nr_pending)) {
1812 else 1383 err = -EBUSY;
1813 return 0; 1384 goto abort;
1814 1385 }
1815 if (test_bit(In_sync, &rdev->flags) || 1386 /* Only remove faulty devices in recovery
1816 atomic_read(&rdev->nr_pending)) { 1387 * is not possible.
1817 err = -EBUSY;
1818 goto abort;
1819 }
1820 /* Only remove faulty devices if recovery
1821 * is not possible.
1822 */
1823 if (!test_bit(Faulty, &rdev->flags) &&
1824 mddev->recovery_disabled != p->recovery_disabled &&
1825 (!p->replacement || p->replacement == rdev) &&
1826 number < conf->geo.raid_disks &&
1827 enough(conf, -1)) {
1828 err = -EBUSY;
1829 goto abort;
1830 }
1831 *rdevp = NULL;
1832 synchronize_rcu();
1833 if (atomic_read(&rdev->nr_pending)) {
1834 /* lost the race, try later */
1835 err = -EBUSY;
1836 *rdevp = rdev;
1837 goto abort;
1838 } else if (p->replacement) {
1839 /* We must have just cleared 'rdev' */
1840 p->rdev = p->replacement;
1841 clear_bit(Replacement, &p->replacement->flags);
1842 smp_mb(); /* Make sure other CPUs may see both as identical
1843 * but will never see neither -- if they are careful.
1844 */
1845 p->replacement = NULL;
1846 clear_bit(WantReplacement, &rdev->flags);
1847 } else
1848 /* We might have just remove the Replacement as faulty
1849 * Clear the flag just in case
1850 */ 1388 */
1851 clear_bit(WantReplacement, &rdev->flags); 1389 if (!test_bit(Faulty, &rdev->flags) &&
1852 1390 mddev->recovery_disabled != p->recovery_disabled &&
1853 err = md_integrity_register(mddev); 1391 enough(conf, -1)) {
1854 1392 err = -EBUSY;
1393 goto abort;
1394 }
1395 p->rdev = NULL;
1396 synchronize_rcu();
1397 if (atomic_read(&rdev->nr_pending)) {
1398 /* lost the race, try later */
1399 err = -EBUSY;
1400 p->rdev = rdev;
1401 goto abort;
1402 }
1403 err = md_integrity_register(mddev);
1404 }
1855abort: 1405abort:
1856 1406
1857 print_conf(conf); 1407 print_conf(conf);
@@ -1861,15 +1411,11 @@ abort:
1861 1411
1862static void end_sync_read(struct bio *bio, int error) 1412static void end_sync_read(struct bio *bio, int error)
1863{ 1413{
1864 struct r10bio *r10_bio = bio->bi_private; 1414 r10bio_t *r10_bio = bio->bi_private;
1865 struct r10conf *conf = r10_bio->mddev->private; 1415 conf_t *conf = r10_bio->mddev->private;
1866 int d; 1416 int d;
1867 1417
1868 if (bio == r10_bio->master_bio) { 1418 d = find_bio_disk(conf, r10_bio, bio, NULL);
1869 /* this is a reshape read */
1870 d = r10_bio->read_slot; /* really the read dev */
1871 } else
1872 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1873 1419
1874 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 1420 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1875 set_bit(R10BIO_Uptodate, &r10_bio->state); 1421 set_bit(R10BIO_Uptodate, &r10_bio->state);
@@ -1893,9 +1439,9 @@ static void end_sync_read(struct bio *bio, int error)
1893 } 1439 }
1894} 1440}
1895 1441
1896static void end_sync_request(struct r10bio *r10_bio) 1442static void end_sync_request(r10bio_t *r10_bio)
1897{ 1443{
1898 struct mddev *mddev = r10_bio->mddev; 1444 mddev_t *mddev = r10_bio->mddev;
1899 1445
1900 while (atomic_dec_and_test(&r10_bio->remaining)) { 1446 while (atomic_dec_and_test(&r10_bio->remaining)) {
1901 if (r10_bio->master_bio == NULL) { 1447 if (r10_bio->master_bio == NULL) {
@@ -1909,7 +1455,7 @@ static void end_sync_request(struct r10bio *r10_bio)
1909 md_done_sync(mddev, s, 1); 1455 md_done_sync(mddev, s, 1);
1910 break; 1456 break;
1911 } else { 1457 } else {
1912 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio; 1458 r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio;
1913 if (test_bit(R10BIO_MadeGood, &r10_bio->state) || 1459 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1914 test_bit(R10BIO_WriteError, &r10_bio->state)) 1460 test_bit(R10BIO_WriteError, &r10_bio->state))
1915 reschedule_retry(r10_bio); 1461 reschedule_retry(r10_bio);
@@ -1923,39 +1469,26 @@ static void end_sync_request(struct r10bio *r10_bio)
1923static void end_sync_write(struct bio *bio, int error) 1469static void end_sync_write(struct bio *bio, int error)
1924{ 1470{
1925 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1471 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1926 struct r10bio *r10_bio = bio->bi_private; 1472 r10bio_t *r10_bio = bio->bi_private;
1927 struct mddev *mddev = r10_bio->mddev; 1473 mddev_t *mddev = r10_bio->mddev;
1928 struct r10conf *conf = mddev->private; 1474 conf_t *conf = mddev->private;
1929 int d; 1475 int d;
1930 sector_t first_bad; 1476 sector_t first_bad;
1931 int bad_sectors; 1477 int bad_sectors;
1932 int slot; 1478 int slot;
1933 int repl;
1934 struct md_rdev *rdev = NULL;
1935 1479
1936 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 1480 d = find_bio_disk(conf, r10_bio, bio, &slot);
1937 if (repl)
1938 rdev = conf->mirrors[d].replacement;
1939 else
1940 rdev = conf->mirrors[d].rdev;
1941 1481
1942 if (!uptodate) { 1482 if (!uptodate) {
1943 if (repl) 1483 set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags);
1944 md_error(mddev, rdev); 1484 set_bit(R10BIO_WriteError, &r10_bio->state);
1945 else { 1485 } else if (is_badblock(conf->mirrors[d].rdev,
1946 set_bit(WriteErrorSeen, &rdev->flags);
1947 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1948 set_bit(MD_RECOVERY_NEEDED,
1949 &rdev->mddev->recovery);
1950 set_bit(R10BIO_WriteError, &r10_bio->state);
1951 }
1952 } else if (is_badblock(rdev,
1953 r10_bio->devs[slot].addr, 1486 r10_bio->devs[slot].addr,
1954 r10_bio->sectors, 1487 r10_bio->sectors,
1955 &first_bad, &bad_sectors)) 1488 &first_bad, &bad_sectors))
1956 set_bit(R10BIO_MadeGood, &r10_bio->state); 1489 set_bit(R10BIO_MadeGood, &r10_bio->state);
1957 1490
1958 rdev_dec_pending(rdev, mddev); 1491 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1959 1492
1960 end_sync_request(r10_bio); 1493 end_sync_request(r10_bio);
1961} 1494}
@@ -1976,12 +1509,11 @@ static void end_sync_write(struct bio *bio, int error)
1976 * We check if all blocks are in-sync and only write to blocks that 1509 * We check if all blocks are in-sync and only write to blocks that
1977 * aren't in sync 1510 * aren't in sync
1978 */ 1511 */
1979static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) 1512static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1980{ 1513{
1981 struct r10conf *conf = mddev->private; 1514 conf_t *conf = mddev->private;
1982 int i, first; 1515 int i, first;
1983 struct bio *tbio, *fbio; 1516 struct bio *tbio, *fbio;
1984 int vcnt;
1985 1517
1986 atomic_set(&r10_bio->remaining, 1); 1518 atomic_set(&r10_bio->remaining, 1);
1987 1519
@@ -1996,10 +1528,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
1996 first = i; 1528 first = i;
1997 fbio = r10_bio->devs[i].bio; 1529 fbio = r10_bio->devs[i].bio;
1998 1530
1999 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
2000 /* now find blocks with errors */ 1531 /* now find blocks with errors */
2001 for (i=0 ; i < conf->copies ; i++) { 1532 for (i=0 ; i < conf->copies ; i++) {
2002 int j, d; 1533 int j, d;
1534 int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
2003 1535
2004 tbio = r10_bio->devs[i].bio; 1536 tbio = r10_bio->devs[i].bio;
2005 1537
@@ -2015,11 +1547,11 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2015 for (j = 0; j < vcnt; j++) 1547 for (j = 0; j < vcnt; j++)
2016 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), 1548 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
2017 page_address(tbio->bi_io_vec[j].bv_page), 1549 page_address(tbio->bi_io_vec[j].bv_page),
2018 fbio->bi_io_vec[j].bv_len)) 1550 PAGE_SIZE))
2019 break; 1551 break;
2020 if (j == vcnt) 1552 if (j == vcnt)
2021 continue; 1553 continue;
2022 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches); 1554 mddev->resync_mismatches += r10_bio->sectors;
2023 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 1555 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2024 /* Don't fix anything. */ 1556 /* Don't fix anything. */
2025 continue; 1557 continue;
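
The vcnt change in the hunks above rounds the resync window up to whole pages instead of truncating, so a window that is not page-aligned still gets its final, partially filled page compared (and the memcmp is bounded by bv_len rather than a full PAGE_SIZE). A small standalone illustration of the two roundings, assuming 512-byte sectors and 4 KiB pages (macro and function names here are ad hoc):

#include <stdio.h>

#define SECTOR_SHIFT    9
#define DEMO_PAGE_SHIFT 12                                    /* 4096-byte pages */
#define PAGE_SECTORS    (1 << (DEMO_PAGE_SHIFT - SECTOR_SHIFT))  /* 8 */

static int pages_truncated(int sectors)
{
	return sectors >> (DEMO_PAGE_SHIFT - SECTOR_SHIFT);
}

static int pages_rounded_up(int sectors)
{
	return (sectors + PAGE_SECTORS - 1) >> (DEMO_PAGE_SHIFT - SECTOR_SHIFT);
}

int main(void)
{
	int sectors = 20;    /* 10K of data: 2.5 pages */

	printf("%d sectors -> %d page(s) truncated, %d page(s) rounded up\n",
	       sectors, pages_truncated(sectors), pages_rounded_up(sectors));
	/* prints: 20 sectors -> 2 page(s) truncated, 3 page(s) rounded up */
	return 0;
}
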
@@ -2060,28 +1592,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2060 generic_make_request(tbio); 1592 generic_make_request(tbio);
2061 } 1593 }
2062 1594
2063 /* Now write out to any replacement devices
2064 * that are active
2065 */
2066 for (i = 0; i < conf->copies; i++) {
2067 int j, d;
2068
2069 tbio = r10_bio->devs[i].repl_bio;
2070 if (!tbio || !tbio->bi_end_io)
2071 continue;
2072 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
2073 && r10_bio->devs[i].bio != fbio)
2074 for (j = 0; j < vcnt; j++)
2075 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
2076 page_address(fbio->bi_io_vec[j].bv_page),
2077 PAGE_SIZE);
2078 d = r10_bio->devs[i].devnum;
2079 atomic_inc(&r10_bio->remaining);
2080 md_sync_acct(conf->mirrors[d].replacement->bdev,
2081 tbio->bi_size >> 9);
2082 generic_make_request(tbio);
2083 }
2084
2085done: 1595done:
2086 if (atomic_dec_and_test(&r10_bio->remaining)) { 1596 if (atomic_dec_and_test(&r10_bio->remaining)) {
2087 md_done_sync(mddev, r10_bio->sectors, 1); 1597 md_done_sync(mddev, r10_bio->sectors, 1);
@@ -2099,7 +1609,7 @@ done:
2099 * The second for writing. 1609 * The second for writing.
2100 * 1610 *
2101 */ 1611 */
2102static void fix_recovery_read_error(struct r10bio *r10_bio) 1612static void fix_recovery_read_error(r10bio_t *r10_bio)
2103{ 1613{
2104 /* We got a read error during recovery. 1614 /* We got a read error during recovery.
2105 * We repeat the read in smaller page-sized sections. 1615 * We repeat the read in smaller page-sized sections.
@@ -2108,8 +1618,8 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
2108 * If a read fails, record a bad block on both old and 1618 * If a read fails, record a bad block on both old and
2109 * new devices. 1619 * new devices.
2110 */ 1620 */
2111 struct mddev *mddev = r10_bio->mddev; 1621 mddev_t *mddev = r10_bio->mddev;
2112 struct r10conf *conf = mddev->private; 1622 conf_t *conf = mddev->private;
2113 struct bio *bio = r10_bio->devs[0].bio; 1623 struct bio *bio = r10_bio->devs[0].bio;
2114 sector_t sect = 0; 1624 sector_t sect = 0;
2115 int sectors = r10_bio->sectors; 1625 int sectors = r10_bio->sectors;
@@ -2119,7 +1629,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
2119 1629
2120 while (sectors) { 1630 while (sectors) {
2121 int s = sectors; 1631 int s = sectors;
2122 struct md_rdev *rdev; 1632 mdk_rdev_t *rdev;
2123 sector_t addr; 1633 sector_t addr;
2124 int ok; 1634 int ok;
2125 1635
@@ -2141,13 +1651,8 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
2141 s << 9, 1651 s << 9,
2142 bio->bi_io_vec[idx].bv_page, 1652 bio->bi_io_vec[idx].bv_page,
2143 WRITE, false); 1653 WRITE, false);
2144 if (!ok) { 1654 if (!ok)
2145 set_bit(WriteErrorSeen, &rdev->flags); 1655 set_bit(WriteErrorSeen, &rdev->flags);
2146 if (!test_and_set_bit(WantReplacement,
2147 &rdev->flags))
2148 set_bit(MD_RECOVERY_NEEDED,
2149 &rdev->mddev->recovery);
2150 }
2151 } 1656 }
2152 if (!ok) { 1657 if (!ok) {
2153 /* We don't worry if we cannot set a bad block - 1658 /* We don't worry if we cannot set a bad block -
@@ -2158,7 +1663,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
2158 1663
2159 if (rdev != conf->mirrors[dw].rdev) { 1664 if (rdev != conf->mirrors[dw].rdev) {
2160 /* need bad block on destination too */ 1665 /* need bad block on destination too */
2161 struct md_rdev *rdev2 = conf->mirrors[dw].rdev; 1666 mdk_rdev_t *rdev2 = conf->mirrors[dw].rdev;
2162 addr = r10_bio->devs[1].addr + sect; 1667 addr = r10_bio->devs[1].addr + sect;
2163 ok = rdev_set_badblocks(rdev2, addr, s, 0); 1668 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2164 if (!ok) { 1669 if (!ok) {
@@ -2183,11 +1688,11 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
2183 } 1688 }
2184} 1689}
2185 1690
2186static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) 1691static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
2187{ 1692{
2188 struct r10conf *conf = mddev->private; 1693 conf_t *conf = mddev->private;
2189 int d; 1694 int d;
2190 struct bio *wbio, *wbio2; 1695 struct bio *wbio;
2191 1696
2192 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { 1697 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2193 fix_recovery_read_error(r10_bio); 1698 fix_recovery_read_error(r10_bio);
@@ -2199,20 +1704,12 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2199 * share the pages with the first bio 1704 * share the pages with the first bio
2200 * and submit the write request 1705 * and submit the write request
2201 */ 1706 */
2202 d = r10_bio->devs[1].devnum;
2203 wbio = r10_bio->devs[1].bio; 1707 wbio = r10_bio->devs[1].bio;
2204 wbio2 = r10_bio->devs[1].repl_bio; 1708 d = r10_bio->devs[1].devnum;
2205 if (wbio->bi_end_io) { 1709
2206 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 1710 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2207 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); 1711 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
2208 generic_make_request(wbio); 1712 generic_make_request(wbio);
2209 }
2210 if (wbio2 && wbio2->bi_end_io) {
2211 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2212 md_sync_acct(conf->mirrors[d].replacement->bdev,
2213 wbio2->bi_size >> 9);
2214 generic_make_request(wbio2);
2215 }
2216} 1713}
2217 1714
2218 1715
@@ -2222,7 +1719,7 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2222 * since the last recorded read error. 1719 * since the last recorded read error.
2223 * 1720 *
2224 */ 1721 */
2225static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) 1722static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
2226{ 1723{
2227 struct timespec cur_time_mon; 1724 struct timespec cur_time_mon;
2228 unsigned long hours_since_last; 1725 unsigned long hours_since_last;
@@ -2253,7 +1750,7 @@ static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2253 atomic_set(&rdev->read_errors, read_errors >> hours_since_last); 1750 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2254} 1751}
2255 1752
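
check_decay_read_errors() above ages the per-device read error count by shifting it right once for each hour since the last error, i.e. the count halves every hour. A tiny standalone demo of that decay (decay_read_errors() is an ad hoc name):

#include <stdio.h>

static unsigned int decay_read_errors(unsigned int errors,
				      unsigned int hours_since_last)
{
	if (hours_since_last >= 31)   /* shifting by >= the width is undefined */
		return 0;
	return errors >> hours_since_last;
}

int main(void)
{
	printf("%u\n", decay_read_errors(40, 0));   /* 40 */
	printf("%u\n", decay_read_errors(40, 1));   /* 20 */
	printf("%u\n", decay_read_errors(40, 3));   /* 5  */
	printf("%u\n", decay_read_errors(40, 40));  /* 0  */
	return 0;
}
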
2256static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, 1753static int r10_sync_page_io(mdk_rdev_t *rdev, sector_t sector,
2257 int sectors, struct page *page, int rw) 1754 int sectors, struct page *page, int rw)
2258{ 1755{
2259 sector_t first_bad; 1756 sector_t first_bad;
@@ -2265,12 +1762,8 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2265 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) 1762 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
2266 /* success */ 1763 /* success */
2267 return 1; 1764 return 1;
2268 if (rw == WRITE) { 1765 if (rw == WRITE)
2269 set_bit(WriteErrorSeen, &rdev->flags); 1766 set_bit(WriteErrorSeen, &rdev->flags);
2270 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2271 set_bit(MD_RECOVERY_NEEDED,
2272 &rdev->mddev->recovery);
2273 }
2274 /* need to record an error - either for the block or the device */ 1767 /* need to record an error - either for the block or the device */
2275 if (!rdev_set_badblocks(rdev, sector, sectors, 0)) 1768 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2276 md_error(rdev->mddev, rdev); 1769 md_error(rdev->mddev, rdev);
@@ -2285,11 +1778,11 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2285 * 3. Performs writes following reads for array synchronising. 1778 * 3. Performs writes following reads for array synchronising.
2286 */ 1779 */
2287 1780
2288static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio) 1781static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
2289{ 1782{
2290 int sect = 0; /* Offset from r10_bio->sector */ 1783 int sect = 0; /* Offset from r10_bio->sector */
2291 int sectors = r10_bio->sectors; 1784 int sectors = r10_bio->sectors;
2292 struct md_rdev*rdev; 1785 mdk_rdev_t*rdev;
2293 int max_read_errors = atomic_read(&mddev->max_corr_read_errors); 1786 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2294 int d = r10_bio->devs[r10_bio->read_slot].devnum; 1787 int d = r10_bio->devs[r10_bio->read_slot].devnum;
2295 1788
@@ -2318,7 +1811,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2318 "md/raid10:%s: %s: Failing raid device\n", 1811 "md/raid10:%s: %s: Failing raid device\n",
2319 mdname(mddev), b); 1812 mdname(mddev), b);
2320 md_error(mddev, conf->mirrors[d].rdev); 1813 md_error(mddev, conf->mirrors[d].rdev);
2321 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2322 return; 1814 return;
2323 } 1815 }
2324 1816
@@ -2339,7 +1831,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2339 d = r10_bio->devs[sl].devnum; 1831 d = r10_bio->devs[sl].devnum;
2340 rdev = rcu_dereference(conf->mirrors[d].rdev); 1832 rdev = rcu_dereference(conf->mirrors[d].rdev);
2341 if (rdev && 1833 if (rdev &&
2342 !test_bit(Unmerged, &rdev->flags) &&
2343 test_bit(In_sync, &rdev->flags) && 1834 test_bit(In_sync, &rdev->flags) &&
2344 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, 1835 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2345 &first_bad, &bad_sectors) == 0) { 1836 &first_bad, &bad_sectors) == 0) {
@@ -2373,11 +1864,8 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2373 rdev, 1864 rdev,
2374 r10_bio->devs[r10_bio->read_slot].addr 1865 r10_bio->devs[r10_bio->read_slot].addr
2375 + sect, 1866 + sect,
2376 s, 0)) { 1867 s, 0))
2377 md_error(mddev, rdev); 1868 md_error(mddev, rdev);
2378 r10_bio->devs[r10_bio->read_slot].bio
2379 = IO_BLOCKED;
2380 }
2381 break; 1869 break;
2382 } 1870 }
2383 1871
@@ -2393,7 +1881,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2393 d = r10_bio->devs[sl].devnum; 1881 d = r10_bio->devs[sl].devnum;
2394 rdev = rcu_dereference(conf->mirrors[d].rdev); 1882 rdev = rcu_dereference(conf->mirrors[d].rdev);
2395 if (!rdev || 1883 if (!rdev ||
2396 test_bit(Unmerged, &rdev->flags) ||
2397 !test_bit(In_sync, &rdev->flags)) 1884 !test_bit(In_sync, &rdev->flags))
2398 continue; 1885 continue;
2399 1886
@@ -2402,7 +1889,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2402 if (r10_sync_page_io(rdev, 1889 if (r10_sync_page_io(rdev,
2403 r10_bio->devs[sl].addr + 1890 r10_bio->devs[sl].addr +
2404 sect, 1891 sect,
2405 s, conf->tmppage, WRITE) 1892 s<<9, conf->tmppage, WRITE)
2406 == 0) { 1893 == 0) {
2407 /* Well, this device is dead */ 1894 /* Well, this device is dead */
2408 printk(KERN_NOTICE 1895 printk(KERN_NOTICE
@@ -2411,9 +1898,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2411 " (%d sectors at %llu on %s)\n", 1898 " (%d sectors at %llu on %s)\n",
2412 mdname(mddev), s, 1899 mdname(mddev), s,
2413 (unsigned long long)( 1900 (unsigned long long)(
2414 sect + 1901 sect + rdev->data_offset),
2415 choose_data_offset(r10_bio,
2416 rdev)),
2417 bdevname(rdev->bdev, b)); 1902 bdevname(rdev->bdev, b));
2418 printk(KERN_NOTICE "md/raid10:%s: %s: failing " 1903 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2419 "drive\n", 1904 "drive\n",
@@ -2441,7 +1926,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2441 switch (r10_sync_page_io(rdev, 1926 switch (r10_sync_page_io(rdev,
2442 r10_bio->devs[sl].addr + 1927 r10_bio->devs[sl].addr +
2443 sect, 1928 sect,
2444 s, conf->tmppage, 1929 s<<9, conf->tmppage,
2445 READ)) { 1930 READ)) {
2446 case 0: 1931 case 0:
2447 /* Well, this device is dead */ 1932 /* Well, this device is dead */
@@ -2451,8 +1936,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2451 " (%d sectors at %llu on %s)\n", 1936 " (%d sectors at %llu on %s)\n",
2452 mdname(mddev), s, 1937 mdname(mddev), s,
2453 (unsigned long long)( 1938 (unsigned long long)(
2454 sect + 1939 sect + rdev->data_offset),
2455 choose_data_offset(r10_bio, rdev)),
2456 bdevname(rdev->bdev, b)); 1940 bdevname(rdev->bdev, b));
2457 printk(KERN_NOTICE "md/raid10:%s: %s: failing " 1941 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2458 "drive\n", 1942 "drive\n",
@@ -2465,8 +1949,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2465 " (%d sectors at %llu on %s)\n", 1949 " (%d sectors at %llu on %s)\n",
2466 mdname(mddev), s, 1950 mdname(mddev), s,
2467 (unsigned long long)( 1951 (unsigned long long)(
2468 sect + 1952 sect + rdev->data_offset),
2469 choose_data_offset(r10_bio, rdev)),
2470 bdevname(rdev->bdev, b)); 1953 bdevname(rdev->bdev, b));
2471 atomic_add(s, &rdev->corrected_errors); 1954 atomic_add(s, &rdev->corrected_errors);
2472 } 1955 }
@@ -2500,12 +1983,12 @@ static int submit_bio_wait(int rw, struct bio *bio)
2500 return test_bit(BIO_UPTODATE, &bio->bi_flags); 1983 return test_bit(BIO_UPTODATE, &bio->bi_flags);
2501} 1984}
2502 1985
2503static int narrow_write_error(struct r10bio *r10_bio, int i) 1986static int narrow_write_error(r10bio_t *r10_bio, int i)
2504{ 1987{
2505 struct bio *bio = r10_bio->master_bio; 1988 struct bio *bio = r10_bio->master_bio;
2506 struct mddev *mddev = r10_bio->mddev; 1989 mddev_t *mddev = r10_bio->mddev;
2507 struct r10conf *conf = mddev->private; 1990 conf_t *conf = mddev->private;
2508 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev; 1991 mdk_rdev_t *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2509 /* bio has the data to be written to slot 'i' where 1992 /* bio has the data to be written to slot 'i' where
2510 * we just recently had a write error. 1993 * we just recently had a write error.
2511 * We repeatedly clone the bio and trim down to one block, 1994 * We repeatedly clone the bio and trim down to one block,
@@ -2540,7 +2023,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
2540 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 2023 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2541 md_trim_bio(wbio, sector - bio->bi_sector, sectors); 2024 md_trim_bio(wbio, sector - bio->bi_sector, sectors);
2542 wbio->bi_sector = (r10_bio->devs[i].addr+ 2025 wbio->bi_sector = (r10_bio->devs[i].addr+
2543 choose_data_offset(r10_bio, rdev) + 2026 rdev->data_offset+
2544 (sector - r10_bio->sector)); 2027 (sector - r10_bio->sector));
2545 wbio->bi_bdev = rdev->bdev; 2028 wbio->bi_bdev = rdev->bdev;
2546 if (submit_bio_wait(WRITE, wbio) == 0) 2029 if (submit_bio_wait(WRITE, wbio) == 0)
@@ -2557,12 +2040,13 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
2557 return ok; 2040 return ok;
2558} 2041}
2559 2042
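
narrow_write_error() above retries a failed write in pieces that never cross a bad-block boundary: each piece runs from the current sector to the next multiple of block_sectors, or to the end of the request, whichever comes first. The sketch below reproduces just that splitting arithmetic for a power-of-two block_sectors; split_by_blocks() is an invented name.

#include <stdio.h>

static void split_by_blocks(unsigned long long sector, int sectors,
			    int block_sectors)
{
	while (sectors) {
		/* distance to the next block boundary */
		int piece = block_sectors - (int)(sector & (block_sectors - 1));

		if (piece > sectors)
			piece = sectors;
		printf("write %d sector(s) at %llu\n", piece, sector);
		sector += piece;
		sectors -= piece;
	}
}

int main(void)
{
	/* 8-sector bad-block granularity, a 20-sector request at sector 5. */
	split_by_blocks(5, 20, 8);
	/* pieces: 3 at 5, 8 at 8, 8 at 16, 1 at 24 */
	return 0;
}
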
2560static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) 2043static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio)
2561{ 2044{
2562 int slot = r10_bio->read_slot; 2045 int slot = r10_bio->read_slot;
2046 int mirror = r10_bio->devs[slot].devnum;
2563 struct bio *bio; 2047 struct bio *bio;
2564 struct r10conf *conf = mddev->private; 2048 conf_t *conf = mddev->private;
2565 struct md_rdev *rdev = r10_bio->devs[slot].rdev; 2049 mdk_rdev_t *rdev;
2566 char b[BDEVNAME_SIZE]; 2050 char b[BDEVNAME_SIZE];
2567 unsigned long do_sync; 2051 unsigned long do_sync;
2568 int max_sectors; 2052 int max_sectors;
@@ -2575,36 +2059,37 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2575 * This is all done synchronously while the array is 2059 * This is all done synchronously while the array is
2576 * frozen. 2060 * frozen.
2577 */ 2061 */
2578 bio = r10_bio->devs[slot].bio;
2579 bdevname(bio->bi_bdev, b);
2580 bio_put(bio);
2581 r10_bio->devs[slot].bio = NULL;
2582
2583 if (mddev->ro == 0) { 2062 if (mddev->ro == 0) {
2584 freeze_array(conf); 2063 freeze_array(conf);
2585 fix_read_error(conf, mddev, r10_bio); 2064 fix_read_error(conf, mddev, r10_bio);
2586 unfreeze_array(conf); 2065 unfreeze_array(conf);
2587 } else 2066 }
2588 r10_bio->devs[slot].bio = IO_BLOCKED; 2067 rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
2589
2590 rdev_dec_pending(rdev, mddev);
2591 2068
2069 bio = r10_bio->devs[slot].bio;
2070 bdevname(bio->bi_bdev, b);
2071 r10_bio->devs[slot].bio =
2072 mddev->ro ? IO_BLOCKED : NULL;
2592read_more: 2073read_more:
2593 rdev = read_balance(conf, r10_bio, &max_sectors); 2074 mirror = read_balance(conf, r10_bio, &max_sectors);
2594 if (rdev == NULL) { 2075 if (mirror == -1) {
2595 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" 2076 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
2596 " read error for block %llu\n", 2077 " read error for block %llu\n",
2597 mdname(mddev), b, 2078 mdname(mddev), b,
2598 (unsigned long long)r10_bio->sector); 2079 (unsigned long long)r10_bio->sector);
2599 raid_end_bio_io(r10_bio); 2080 raid_end_bio_io(r10_bio);
2081 bio_put(bio);
2600 return; 2082 return;
2601 } 2083 }
2602 2084
2603 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); 2085 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
2086 if (bio)
2087 bio_put(bio);
2604 slot = r10_bio->read_slot; 2088 slot = r10_bio->read_slot;
2089 rdev = conf->mirrors[mirror].rdev;
2605 printk_ratelimited( 2090 printk_ratelimited(
2606 KERN_ERR 2091 KERN_ERR
2607 "md/raid10:%s: %s: redirecting " 2092 "md/raid10:%s: %s: redirecting"
2608 "sector %llu to another mirror\n", 2093 "sector %llu to another mirror\n",
2609 mdname(mddev), 2094 mdname(mddev),
2610 bdevname(rdev->bdev, b), 2095 bdevname(rdev->bdev, b),
@@ -2615,9 +2100,8 @@ read_more:
2615 r10_bio->sector - bio->bi_sector, 2100 r10_bio->sector - bio->bi_sector,
2616 max_sectors); 2101 max_sectors);
2617 r10_bio->devs[slot].bio = bio; 2102 r10_bio->devs[slot].bio = bio;
2618 r10_bio->devs[slot].rdev = rdev;
2619 bio->bi_sector = r10_bio->devs[slot].addr 2103 bio->bi_sector = r10_bio->devs[slot].addr
2620 + choose_data_offset(r10_bio, rdev); 2104 + rdev->data_offset;
2621 bio->bi_bdev = rdev->bdev; 2105 bio->bi_bdev = rdev->bdev;
2622 bio->bi_rw = READ | do_sync; 2106 bio->bi_rw = READ | do_sync;
2623 bio->bi_private = r10_bio; 2107 bio->bi_private = r10_bio;
@@ -2636,6 +2120,7 @@ read_more:
2636 mbio->bi_phys_segments++; 2120 mbio->bi_phys_segments++;
2637 spin_unlock_irq(&conf->device_lock); 2121 spin_unlock_irq(&conf->device_lock);
2638 generic_make_request(bio); 2122 generic_make_request(bio);
2123 bio = NULL;
2639 2124
2640 r10_bio = mempool_alloc(conf->r10bio_pool, 2125 r10_bio = mempool_alloc(conf->r10bio_pool,
2641 GFP_NOIO); 2126 GFP_NOIO);
@@ -2654,7 +2139,7 @@ read_more:
2654 generic_make_request(bio); 2139 generic_make_request(bio);
2655} 2140}
2656 2141
2657static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) 2142static void handle_write_completed(conf_t *conf, r10bio_t *r10_bio)
2658{ 2143{
2659 /* Some sort of write request has finished and it 2144 /* Some sort of write request has finished and it
2660 * succeeded in writing where we thought there was a 2145 * succeeded in writing where we thought there was a
@@ -2663,7 +2148,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2663 * a bad block. 2148 * a bad block.
2664 */ 2149 */
2665 int m; 2150 int m;
2666 struct md_rdev *rdev; 2151 mdk_rdev_t *rdev;
2667 2152
2668 if (test_bit(R10BIO_IsSync, &r10_bio->state) || 2153 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2669 test_bit(R10BIO_IsRecover, &r10_bio->state)) { 2154 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
@@ -2677,23 +2162,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2677 rdev_clear_badblocks( 2162 rdev_clear_badblocks(
2678 rdev, 2163 rdev,
2679 r10_bio->devs[m].addr, 2164 r10_bio->devs[m].addr,
2680 r10_bio->sectors, 0); 2165 r10_bio->sectors);
2681 } else {
2682 if (!rdev_set_badblocks(
2683 rdev,
2684 r10_bio->devs[m].addr,
2685 r10_bio->sectors, 0))
2686 md_error(conf->mddev, rdev);
2687 }
2688 rdev = conf->mirrors[dev].replacement;
2689 if (r10_bio->devs[m].repl_bio == NULL)
2690 continue;
2691 if (test_bit(BIO_UPTODATE,
2692 &r10_bio->devs[m].repl_bio->bi_flags)) {
2693 rdev_clear_badblocks(
2694 rdev,
2695 r10_bio->devs[m].addr,
2696 r10_bio->sectors, 0);
2697 } else { 2166 } else {
2698 if (!rdev_set_badblocks( 2167 if (!rdev_set_badblocks(
2699 rdev, 2168 rdev,
@@ -2712,7 +2181,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2712 rdev_clear_badblocks( 2181 rdev_clear_badblocks(
2713 rdev, 2182 rdev,
2714 r10_bio->devs[m].addr, 2183 r10_bio->devs[m].addr,
2715 r10_bio->sectors, 0); 2184 r10_bio->sectors);
2716 rdev_dec_pending(rdev, conf->mddev); 2185 rdev_dec_pending(rdev, conf->mddev);
2717 } else if (bio != NULL && 2186 } else if (bio != NULL &&
2718 !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 2187 !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
@@ -2723,15 +2192,6 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2723 } 2192 }
2724 rdev_dec_pending(rdev, conf->mddev); 2193 rdev_dec_pending(rdev, conf->mddev);
2725 } 2194 }
2726 bio = r10_bio->devs[m].repl_bio;
2727 rdev = conf->mirrors[dev].replacement;
2728 if (rdev && bio == IO_MADE_GOOD) {
2729 rdev_clear_badblocks(
2730 rdev,
2731 r10_bio->devs[m].addr,
2732 r10_bio->sectors, 0);
2733 rdev_dec_pending(rdev, conf->mddev);
2734 }
2735 } 2195 }
2736 if (test_bit(R10BIO_WriteError, 2196 if (test_bit(R10BIO_WriteError,
2737 &r10_bio->state)) 2197 &r10_bio->state))
@@ -2740,12 +2200,11 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2740 } 2200 }
2741} 2201}
2742 2202
2743static void raid10d(struct md_thread *thread) 2203static void raid10d(mddev_t *mddev)
2744{ 2204{
2745 struct mddev *mddev = thread->mddev; 2205 r10bio_t *r10_bio;
2746 struct r10bio *r10_bio;
2747 unsigned long flags; 2206 unsigned long flags;
2748 struct r10conf *conf = mddev->private; 2207 conf_t *conf = mddev->private;
2749 struct list_head *head = &conf->retry_list; 2208 struct list_head *head = &conf->retry_list;
2750 struct blk_plug plug; 2209 struct blk_plug plug;
2751 2210
@@ -2761,7 +2220,7 @@ static void raid10d(struct md_thread *thread)
2761 spin_unlock_irqrestore(&conf->device_lock, flags); 2220 spin_unlock_irqrestore(&conf->device_lock, flags);
2762 break; 2221 break;
2763 } 2222 }
2764 r10_bio = list_entry(head->prev, struct r10bio, retry_list); 2223 r10_bio = list_entry(head->prev, r10bio_t, retry_list);
2765 list_del(head->prev); 2224 list_del(head->prev);
2766 conf->nr_queued--; 2225 conf->nr_queued--;
2767 spin_unlock_irqrestore(&conf->device_lock, flags); 2226 spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -2771,8 +2230,6 @@ static void raid10d(struct md_thread *thread)
2771 if (test_bit(R10BIO_MadeGood, &r10_bio->state) || 2230 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2772 test_bit(R10BIO_WriteError, &r10_bio->state)) 2231 test_bit(R10BIO_WriteError, &r10_bio->state))
2773 handle_write_completed(conf, r10_bio); 2232 handle_write_completed(conf, r10_bio);
2774 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2775 reshape_request_write(mddev, r10_bio);
2776 else if (test_bit(R10BIO_IsSync, &r10_bio->state)) 2233 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2777 sync_request_write(mddev, r10_bio); 2234 sync_request_write(mddev, r10_bio);
2778 else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) 2235 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
@@ -2795,17 +2252,12 @@ static void raid10d(struct md_thread *thread)
2795} 2252}
2796 2253
2797 2254
2798static int init_resync(struct r10conf *conf) 2255static int init_resync(conf_t *conf)
2799{ 2256{
2800 int buffs; 2257 int buffs;
2801 int i;
2802 2258
2803 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; 2259 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2804 BUG_ON(conf->r10buf_pool); 2260 BUG_ON(conf->r10buf_pool);
2805 conf->have_replacement = 0;
2806 for (i = 0; i < conf->geo.raid_disks; i++)
2807 if (conf->mirrors[i].replacement)
2808 conf->have_replacement = 1;
2809 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); 2261 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2810 if (!conf->r10buf_pool) 2262 if (!conf->r10buf_pool)
2811 return -ENOMEM; 2263 return -ENOMEM;
@@ -2845,11 +2297,11 @@ static int init_resync(struct r10conf *conf)
2845 * 2297 *
2846 */ 2298 */
2847 2299
2848static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, 2300static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
2849 int *skipped, int go_faster) 2301 int *skipped, int go_faster)
2850{ 2302{
2851 struct r10conf *conf = mddev->private; 2303 conf_t *conf = mddev->private;
2852 struct r10bio *r10_bio; 2304 r10bio_t *r10_bio;
2853 struct bio *biolist = NULL, *bio; 2305 struct bio *biolist = NULL, *bio;
2854 sector_t max_sector, nr_sectors; 2306 sector_t max_sector, nr_sectors;
2855 int i; 2307 int i;
@@ -2857,7 +2309,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2857 sector_t sync_blocks; 2309 sector_t sync_blocks;
2858 sector_t sectors_skipped = 0; 2310 sector_t sectors_skipped = 0;
2859 int chunks_skipped = 0; 2311 int chunks_skipped = 0;
2860 sector_t chunk_mask = conf->geo.chunk_mask;
2861 2312
2862 if (!conf->r10buf_pool) 2313 if (!conf->r10buf_pool)
2863 if (init_resync(conf)) 2314 if (init_resync(conf))
@@ -2865,8 +2316,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2865 2316
2866 skipped: 2317 skipped:
2867 max_sector = mddev->dev_sectors; 2318 max_sector = mddev->dev_sectors;
2868 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 2319 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2869 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2870 max_sector = mddev->resync_max_sectors; 2320 max_sector = mddev->resync_max_sectors;
2871 if (sector_nr >= max_sector) { 2321 if (sector_nr >= max_sector) {
2872 /* If we aborted, we need to abort the 2322 /* If we aborted, we need to abort the
@@ -2878,47 +2328,25 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2878 * we need to convert that to several 2328 * we need to convert that to several
2879 * virtual addresses. 2329 * virtual addresses.
2880 */ 2330 */
2881 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2882 end_reshape(conf);
2883 return 0;
2884 }
2885
2886 if (mddev->curr_resync < max_sector) { /* aborted */ 2331 if (mddev->curr_resync < max_sector) { /* aborted */
2887 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 2332 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2888 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 2333 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2889 &sync_blocks, 1); 2334 &sync_blocks, 1);
2890 else for (i = 0; i < conf->geo.raid_disks; i++) { 2335 else for (i=0; i<conf->raid_disks; i++) {
2891 sector_t sect = 2336 sector_t sect =
2892 raid10_find_virt(conf, mddev->curr_resync, i); 2337 raid10_find_virt(conf, mddev->curr_resync, i);
2893 bitmap_end_sync(mddev->bitmap, sect, 2338 bitmap_end_sync(mddev->bitmap, sect,
2894 &sync_blocks, 1); 2339 &sync_blocks, 1);
2895 } 2340 }
2896 } else { 2341 } else /* completed sync */
2897 /* completed sync */
2898 if ((!mddev->bitmap || conf->fullsync)
2899 && conf->have_replacement
2900 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2901 /* Completed a full sync so the replacements
2902 * are now fully recovered.
2903 */
2904 for (i = 0; i < conf->geo.raid_disks; i++)
2905 if (conf->mirrors[i].replacement)
2906 conf->mirrors[i].replacement
2907 ->recovery_offset
2908 = MaxSector;
2909 }
2910 conf->fullsync = 0; 2342 conf->fullsync = 0;
2911 } 2343
2912 bitmap_close_sync(mddev->bitmap); 2344 bitmap_close_sync(mddev->bitmap);
2913 close_sync(conf); 2345 close_sync(conf);
2914 *skipped = 1; 2346 *skipped = 1;
2915 return sectors_skipped; 2347 return sectors_skipped;
2916 } 2348 }
2917 2349 if (chunks_skipped >= conf->raid_disks) {
2918 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2919 return reshape_request(mddev, sector_nr, skipped);
2920
2921 if (chunks_skipped >= conf->geo.raid_disks) {
2922 /* if there has been nothing to do on any drive, 2350 /* if there has been nothing to do on any drive,
2923 * then there is nothing to do at all.. 2351 * then there is nothing to do at all..
2924 */ 2352 */
@@ -2932,9 +2360,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2932 /* make sure whole request will fit in a chunk - if chunks 2360 /* make sure whole request will fit in a chunk - if chunks
2933 * are meaningful 2361 * are meaningful
2934 */ 2362 */
2935 if (conf->geo.near_copies < conf->geo.raid_disks && 2363 if (conf->near_copies < conf->raid_disks &&
2936 max_sector > (sector_nr | chunk_mask)) 2364 max_sector > (sector_nr | conf->chunk_mask))
2937 max_sector = (sector_nr | chunk_mask) + 1; 2365 max_sector = (sector_nr | conf->chunk_mask) + 1;
2938 /* 2366 /*
2939 * If there is non-resync activity waiting for us then 2367 * If there is non-resync activity waiting for us then
2940 * put in a delay to throttle resync. 2368 * put in a delay to throttle resync.
@@ -2963,42 +2391,29 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2963 int j; 2391 int j;
2964 r10_bio = NULL; 2392 r10_bio = NULL;
2965 2393
2966 for (i = 0 ; i < conf->geo.raid_disks; i++) { 2394 for (i=0 ; i<conf->raid_disks; i++) {
2967 int still_degraded; 2395 int still_degraded;
2968 struct r10bio *rb2; 2396 r10bio_t *rb2;
2969 sector_t sect; 2397 sector_t sect;
2970 int must_sync; 2398 int must_sync;
2971 int any_working; 2399 int any_working;
2972 struct raid10_info *mirror = &conf->mirrors[i]; 2400
2973 2401 if (conf->mirrors[i].rdev == NULL ||
2974 if ((mirror->rdev == NULL || 2402 test_bit(In_sync, &conf->mirrors[i].rdev->flags))
2975 test_bit(In_sync, &mirror->rdev->flags))
2976 &&
2977 (mirror->replacement == NULL ||
2978 test_bit(Faulty,
2979 &mirror->replacement->flags)))
2980 continue; 2403 continue;
2981 2404
2982 still_degraded = 0; 2405 still_degraded = 0;
2983 /* want to reconstruct this device */ 2406 /* want to reconstruct this device */
2984 rb2 = r10_bio; 2407 rb2 = r10_bio;
2985 sect = raid10_find_virt(conf, sector_nr, i); 2408 sect = raid10_find_virt(conf, sector_nr, i);
2986 if (sect >= mddev->resync_max_sectors) { 2409 /* Unless we are doing a full sync, we only need
2987 /* last stripe is not complete - don't 2410 * to recover the block if it is set in the bitmap
2988 * try to recover this sector.
2989 */
2990 continue;
2991 }
2992 /* Unless we are doing a full sync, or a replacement
2993 * we only need to recover the block if it is set in
2994 * the bitmap
2995 */ 2411 */
2996 must_sync = bitmap_start_sync(mddev->bitmap, sect, 2412 must_sync = bitmap_start_sync(mddev->bitmap, sect,
2997 &sync_blocks, 1); 2413 &sync_blocks, 1);
2998 if (sync_blocks < max_sync) 2414 if (sync_blocks < max_sync)
2999 max_sync = sync_blocks; 2415 max_sync = sync_blocks;
3000 if (!must_sync && 2416 if (!must_sync &&
3001 mirror->replacement == NULL &&
3002 !conf->fullsync) { 2417 !conf->fullsync) {
3003 /* yep, skip the sync_blocks here, but don't assume 2418 /* yep, skip the sync_blocks here, but don't assume
3004 * that there will never be anything to do here 2419 * that there will never be anything to do here
@@ -3023,7 +2438,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3023 /* Need to check if the array will still be 2438 /* Need to check if the array will still be
3024 * degraded 2439 * degraded
3025 */ 2440 */
3026 for (j = 0; j < conf->geo.raid_disks; j++) 2441 for (j=0; j<conf->raid_disks; j++)
3027 if (conf->mirrors[j].rdev == NULL || 2442 if (conf->mirrors[j].rdev == NULL ||
3028 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { 2443 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
3029 still_degraded = 1; 2444 still_degraded = 1;
@@ -3038,7 +2453,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3038 int k; 2453 int k;
3039 int d = r10_bio->devs[j].devnum; 2454 int d = r10_bio->devs[j].devnum;
3040 sector_t from_addr, to_addr; 2455 sector_t from_addr, to_addr;
3041 struct md_rdev *rdev; 2456 mdk_rdev_t *rdev;
3042 sector_t sector, first_bad; 2457 sector_t sector, first_bad;
3043 int bad_sectors; 2458 int bad_sectors;
3044 if (!conf->mirrors[d].rdev || 2459 if (!conf->mirrors[d].rdev ||
@@ -3068,60 +2483,33 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3068 bio->bi_end_io = end_sync_read; 2483 bio->bi_end_io = end_sync_read;
3069 bio->bi_rw = READ; 2484 bio->bi_rw = READ;
3070 from_addr = r10_bio->devs[j].addr; 2485 from_addr = r10_bio->devs[j].addr;
3071 bio->bi_sector = from_addr + rdev->data_offset; 2486 bio->bi_sector = from_addr +
3072 bio->bi_bdev = rdev->bdev; 2487 conf->mirrors[d].rdev->data_offset;
3073 atomic_inc(&rdev->nr_pending); 2488 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
3074 /* and we write to 'i' (if not in_sync) */ 2489 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2490 atomic_inc(&r10_bio->remaining);
2491 /* and we write to 'i' */
3075 2492
3076 for (k=0; k<conf->copies; k++) 2493 for (k=0; k<conf->copies; k++)
3077 if (r10_bio->devs[k].devnum == i) 2494 if (r10_bio->devs[k].devnum == i)
3078 break; 2495 break;
3079 BUG_ON(k == conf->copies); 2496 BUG_ON(k == conf->copies);
2497 bio = r10_bio->devs[1].bio;
2498 bio->bi_next = biolist;
2499 biolist = bio;
2500 bio->bi_private = r10_bio;
2501 bio->bi_end_io = end_sync_write;
2502 bio->bi_rw = WRITE;
3080 to_addr = r10_bio->devs[k].addr; 2503 to_addr = r10_bio->devs[k].addr;
2504 bio->bi_sector = to_addr +
2505 conf->mirrors[i].rdev->data_offset;
2506 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
2507
3081 r10_bio->devs[0].devnum = d; 2508 r10_bio->devs[0].devnum = d;
3082 r10_bio->devs[0].addr = from_addr; 2509 r10_bio->devs[0].addr = from_addr;
3083 r10_bio->devs[1].devnum = i; 2510 r10_bio->devs[1].devnum = i;
3084 r10_bio->devs[1].addr = to_addr; 2511 r10_bio->devs[1].addr = to_addr;
3085 2512
3086 rdev = mirror->rdev;
3087 if (!test_bit(In_sync, &rdev->flags)) {
3088 bio = r10_bio->devs[1].bio;
3089 bio->bi_next = biolist;
3090 biolist = bio;
3091 bio->bi_private = r10_bio;
3092 bio->bi_end_io = end_sync_write;
3093 bio->bi_rw = WRITE;
3094 bio->bi_sector = to_addr
3095 + rdev->data_offset;
3096 bio->bi_bdev = rdev->bdev;
3097 atomic_inc(&r10_bio->remaining);
3098 } else
3099 r10_bio->devs[1].bio->bi_end_io = NULL;
3100
3101 /* and maybe write to replacement */
3102 bio = r10_bio->devs[1].repl_bio;
3103 if (bio)
3104 bio->bi_end_io = NULL;
3105 rdev = mirror->replacement;
3106 /* Note: if rdev != NULL, then bio
3107 * cannot be NULL as r10buf_pool_alloc will
3108 * have allocated it.
3109 * So the second test here is pointless.
3110 * But it keeps semantic-checkers happy, and
3111 * this comment keeps human reviewers
3112 * happy.
3113 */
3114 if (rdev == NULL || bio == NULL ||
3115 test_bit(Faulty, &rdev->flags))
3116 break;
3117 bio->bi_next = biolist;
3118 biolist = bio;
3119 bio->bi_private = r10_bio;
3120 bio->bi_end_io = end_sync_write;
3121 bio->bi_rw = WRITE;
3122 bio->bi_sector = to_addr + rdev->data_offset;
3123 bio->bi_bdev = rdev->bdev;
3124 atomic_inc(&r10_bio->remaining);
3125 break; 2513 break;
3126 } 2514 }
3127 if (j == conf->copies) { 2515 if (j == conf->copies) {
@@ -3139,16 +2527,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3139 for (k = 0; k < conf->copies; k++) 2527 for (k = 0; k < conf->copies; k++)
3140 if (r10_bio->devs[k].devnum == i) 2528 if (r10_bio->devs[k].devnum == i)
3141 break; 2529 break;
3142 if (!test_bit(In_sync, 2530 if (!rdev_set_badblocks(
3143 &mirror->rdev->flags) 2531 conf->mirrors[i].rdev,
3144 && !rdev_set_badblocks(
3145 mirror->rdev,
3146 r10_bio->devs[k].addr,
3147 max_sync, 0))
3148 any_working = 0;
3149 if (mirror->replacement &&
3150 !rdev_set_badblocks(
3151 mirror->replacement,
3152 r10_bio->devs[k].addr, 2532 r10_bio->devs[k].addr,
3153 max_sync, 0)) 2533 max_sync, 0))
3154 any_working = 0; 2534 any_working = 0;
@@ -3159,7 +2539,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3159 printk(KERN_INFO "md/raid10:%s: insufficient " 2539 printk(KERN_INFO "md/raid10:%s: insufficient "
3160 "working devices for recovery.\n", 2540 "working devices for recovery.\n",
3161 mdname(mddev)); 2541 mdname(mddev));
3162 mirror->recovery_disabled 2542 conf->mirrors[i].recovery_disabled
3163 = mddev->recovery_disabled; 2543 = mddev->recovery_disabled;
3164 } 2544 }
3165 break; 2545 break;
@@ -3167,8 +2547,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3167 } 2547 }
3168 if (biolist == NULL) { 2548 if (biolist == NULL) {
3169 while (r10_bio) { 2549 while (r10_bio) {
3170 struct r10bio *rb2 = r10_bio; 2550 r10bio_t *rb2 = r10_bio;
3171 r10_bio = (struct r10bio*) rb2->master_bio; 2551 r10_bio = (r10bio_t*) rb2->master_bio;
3172 rb2->master_bio = NULL; 2552 rb2->master_bio = NULL;
3173 put_buf(rb2); 2553 put_buf(rb2);
3174 } 2554 }
@@ -3201,16 +2581,13 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3201 r10_bio->sector = sector_nr; 2581 r10_bio->sector = sector_nr;
3202 set_bit(R10BIO_IsSync, &r10_bio->state); 2582 set_bit(R10BIO_IsSync, &r10_bio->state);
3203 raid10_find_phys(conf, r10_bio); 2583 raid10_find_phys(conf, r10_bio);
3204 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1; 2584 r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1;
3205 2585
3206 for (i = 0; i < conf->copies; i++) { 2586 for (i=0; i<conf->copies; i++) {
3207 int d = r10_bio->devs[i].devnum; 2587 int d = r10_bio->devs[i].devnum;
3208 sector_t first_bad, sector; 2588 sector_t first_bad, sector;
3209 int bad_sectors; 2589 int bad_sectors;
3210 2590
3211 if (r10_bio->devs[i].repl_bio)
3212 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3213
3214 bio = r10_bio->devs[i].bio; 2591 bio = r10_bio->devs[i].bio;
3215 bio->bi_end_io = NULL; 2592 bio->bi_end_io = NULL;
3216 clear_bit(BIO_UPTODATE, &bio->bi_flags); 2593 clear_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -3226,7 +2603,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3226 else { 2603 else {
3227 bad_sectors -= (sector - first_bad); 2604 bad_sectors -= (sector - first_bad);
3228 if (max_sync > bad_sectors) 2605 if (max_sync > bad_sectors)
3229 max_sync = bad_sectors; 2606 max_sync = max_sync;
3230 continue; 2607 continue;
3231 } 2608 }
3232 } 2609 }
@@ -3241,27 +2618,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3241 conf->mirrors[d].rdev->data_offset; 2618 conf->mirrors[d].rdev->data_offset;
3242 bio->bi_bdev = conf->mirrors[d].rdev->bdev; 2619 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
3243 count++; 2620 count++;
3244
3245 if (conf->mirrors[d].replacement == NULL ||
3246 test_bit(Faulty,
3247 &conf->mirrors[d].replacement->flags))
3248 continue;
3249
3250 /* Need to set up for writing to the replacement */
3251 bio = r10_bio->devs[i].repl_bio;
3252 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3253
3254 sector = r10_bio->devs[i].addr;
3255 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3256 bio->bi_next = biolist;
3257 biolist = bio;
3258 bio->bi_private = r10_bio;
3259 bio->bi_end_io = end_sync_write;
3260 bio->bi_rw = WRITE;
3261 bio->bi_sector = sector +
3262 conf->mirrors[d].replacement->data_offset;
3263 bio->bi_bdev = conf->mirrors[d].replacement->bdev;
3264 count++;
3265 } 2621 }
3266 2622
3267 if (count < 2) { 2623 if (count < 2) {
@@ -3270,11 +2626,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3270 if (r10_bio->devs[i].bio->bi_end_io) 2626 if (r10_bio->devs[i].bio->bi_end_io)
3271 rdev_dec_pending(conf->mirrors[d].rdev, 2627 rdev_dec_pending(conf->mirrors[d].rdev,
3272 mddev); 2628 mddev);
3273 if (r10_bio->devs[i].repl_bio &&
3274 r10_bio->devs[i].repl_bio->bi_end_io)
3275 rdev_dec_pending(
3276 conf->mirrors[d].replacement,
3277 mddev);
3278 } 2629 }
3279 put_buf(r10_bio); 2630 put_buf(r10_bio);
3280 biolist = NULL; 2631 biolist = NULL;
@@ -3363,126 +2714,57 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3363} 2714}
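
The clamp near the top of sync_request() keeps each pass inside one chunk: with chunk_mask equal to chunk_sectors - 1, the expression (sector_nr | chunk_mask) + 1 is the first sector of the next chunk, so max_sector is capped there whenever near_copies < raid_disks. A minimal userspace sketch of just that arithmetic, using a hypothetical 128-sector (64 KiB) chunk and no md internals:

#include <stdio.h>
#include <stdint.h>

/* Round sector_nr up to the start of the next chunk, as sync_request() does
 * with "max_sector = (sector_nr | chunk_mask) + 1".  chunk_sectors must be a
 * power of two, so chunk_mask = chunk_sectors - 1 has all the low bits set.
 */
static uint64_t next_chunk_boundary(uint64_t sector_nr, uint64_t chunk_sectors)
{
        uint64_t chunk_mask = chunk_sectors - 1;
        return (sector_nr | chunk_mask) + 1;
}

int main(void)
{
        uint64_t chunk_sectors = 128;   /* hypothetical 64 KiB chunk */
        uint64_t sectors[] = { 0, 127, 128, 300 };

        for (int i = 0; i < 4; i++)
                printf("sector %llu -> next chunk boundary %llu\n",
                       (unsigned long long)sectors[i],
                       (unsigned long long)next_chunk_boundary(sectors[i],
                                                               chunk_sectors));
        return 0;
}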
3364 2715
3365static sector_t 2716static sector_t
3366raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks) 2717raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
3367{ 2718{
3368 sector_t size; 2719 sector_t size;
3369 struct r10conf *conf = mddev->private; 2720 conf_t *conf = mddev->private;
3370 2721
3371 if (!raid_disks) 2722 if (!raid_disks)
3372 raid_disks = min(conf->geo.raid_disks, 2723 raid_disks = conf->raid_disks;
3373 conf->prev.raid_disks);
3374 if (!sectors) 2724 if (!sectors)
3375 sectors = conf->dev_sectors; 2725 sectors = conf->dev_sectors;
3376 2726
3377 size = sectors >> conf->geo.chunk_shift; 2727 size = sectors >> conf->chunk_shift;
3378 sector_div(size, conf->geo.far_copies); 2728 sector_div(size, conf->far_copies);
3379 size = size * raid_disks; 2729 size = size * raid_disks;
3380 sector_div(size, conf->geo.near_copies); 2730 sector_div(size, conf->near_copies);
3381 2731
3382 return size << conf->geo.chunk_shift; 2732 return size << conf->chunk_shift;
3383} 2733}
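
raid10_size() converts per-device capacity into exported array capacity: chunks per device are divided among the far copies, multiplied by the number of disks, then divided by the near copies, and the result is converted back to sectors. A standalone sketch of the same arithmetic, with plain truncating division standing in for sector_div() and a purely illustrative geometry (4 disks, default n2 layout, 64 KiB chunks):

#include <stdio.h>
#include <stdint.h>

/* Model of the raid10_size() arithmetic: sectors the array exports for a
 * given per-device size and geometry.  chunk_shift is log2(chunk sectors).
 */
static uint64_t raid10_array_sectors(uint64_t dev_sectors, int raid_disks,
                                     int near_copies, int far_copies,
                                     int chunk_shift)
{
        uint64_t size = dev_sectors >> chunk_shift;     /* chunks per device */

        size /= far_copies;
        size *= raid_disks;
        size /= near_copies;
        return size << chunk_shift;
}

int main(void)
{
        /* Example only: 4 disks, near=2, far=1, 128-sector chunks,
         * 1 GiB (2097152 sectors) per disk -> 2 GiB exported.
         */
        printf("array sectors: %llu\n",
               (unsigned long long)raid10_array_sectors(2097152, 4, 2, 1, 7));
        return 0;
}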
3384 2734
3385static void calc_sectors(struct r10conf *conf, sector_t size)
3386{
3387 /* Calculate the number of sectors-per-device that will
3388 * actually be used, and set conf->dev_sectors and
3389 * conf->stride
3390 */
3391
3392 size = size >> conf->geo.chunk_shift;
3393 sector_div(size, conf->geo.far_copies);
3394 size = size * conf->geo.raid_disks;
3395 sector_div(size, conf->geo.near_copies);
3396 /* 'size' is now the number of chunks in the array */
3397 /* calculate "used chunks per device" */
3398 size = size * conf->copies;
3399
3400 /* We need to round up when dividing by raid_disks to
3401 * get the stride size.
3402 */
3403 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3404
3405 conf->dev_sectors = size << conf->geo.chunk_shift;
3406
3407 if (conf->geo.far_offset)
3408 conf->geo.stride = 1 << conf->geo.chunk_shift;
3409 else {
3410 sector_div(size, conf->geo.far_copies);
3411 conf->geo.stride = size << conf->geo.chunk_shift;
3412 }
3413}
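
calc_sectors() runs the same geometry arithmetic in the other direction: from the device size it derives the number of chunks actually used per device (rounded up, so nothing is lost to truncation) and the stride between far copies, which is one chunk for far_offset layouts and dev_sectors / far_copies otherwise. A rough userspace model of that calculation, assuming copies = near_copies * far_copies as setup_geo() returns, with an illustrative f2 geometry:

#include <stdio.h>
#include <stdint.h>

#define DIV_ROUND_UP_U64(n, d)  (((n) + (d) - 1) / (d))

struct geo_sketch {
        int raid_disks, near_copies, far_copies, far_offset, chunk_shift;
};

/* Derive used sectors per device and the far-copy stride, mirroring the
 * steps in calc_sectors(); plain division stands in for sector_div().
 */
static void calc_sectors_sketch(const struct geo_sketch *g, uint64_t size,
                                uint64_t *dev_sectors, uint64_t *stride)
{
        size >>= g->chunk_shift;                      /* chunks per device   */
        size /= g->far_copies;
        size *= g->raid_disks;
        size /= g->near_copies;                       /* chunks in the array */
        size *= g->near_copies * g->far_copies;       /* total used chunks   */
        size = DIV_ROUND_UP_U64(size, g->raid_disks); /* used chunks/device  */
        *dev_sectors = size << g->chunk_shift;

        if (g->far_offset)
                *stride = 1ULL << g->chunk_shift;
        else
                *stride = (size / g->far_copies) << g->chunk_shift;
}

int main(void)
{
        struct geo_sketch g = { 4, 1, 2, 0, 7 };      /* example f2 layout */
        uint64_t dev_sectors, stride;

        calc_sectors_sketch(&g, 2097152, &dev_sectors, &stride);
        printf("dev_sectors=%llu stride=%llu\n",
               (unsigned long long)dev_sectors, (unsigned long long)stride);
        return 0;
}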
3414 2735
3415enum geo_type {geo_new, geo_old, geo_start}; 2736static conf_t *setup_conf(mddev_t *mddev)
3416static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3417{ 2737{
2738 conf_t *conf = NULL;
3418 int nc, fc, fo; 2739 int nc, fc, fo;
3419 int layout, chunk, disks; 2740 sector_t stride, size;
3420 switch (new) {
3421 case geo_old:
3422 layout = mddev->layout;
3423 chunk = mddev->chunk_sectors;
3424 disks = mddev->raid_disks - mddev->delta_disks;
3425 break;
3426 case geo_new:
3427 layout = mddev->new_layout;
3428 chunk = mddev->new_chunk_sectors;
3429 disks = mddev->raid_disks;
3430 break;
3431 default: /* avoid 'may be unused' warnings */
3432 case geo_start: /* new when starting reshape - raid_disks not
3433 * updated yet. */
3434 layout = mddev->new_layout;
3435 chunk = mddev->new_chunk_sectors;
3436 disks = mddev->raid_disks + mddev->delta_disks;
3437 break;
3438 }
3439 if (layout >> 17)
3440 return -1;
3441 if (chunk < (PAGE_SIZE >> 9) ||
3442 !is_power_of_2(chunk))
3443 return -2;
3444 nc = layout & 255;
3445 fc = (layout >> 8) & 255;
3446 fo = layout & (1<<16);
3447 geo->raid_disks = disks;
3448 geo->near_copies = nc;
3449 geo->far_copies = fc;
3450 geo->far_offset = fo;
3451 geo->chunk_mask = chunk - 1;
3452 geo->chunk_shift = ffz(~chunk);
3453 return nc*fc;
3454}
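
setup_geo() is where the raid10 layout word gets decoded: the low byte is the near-copy count, the next byte the far-copy count, and bit 16 selects the "offset" arrangement of far copies; the function returns nc*fc, the total number of copies. A small decoder for the same bit layout (the sample values, 0x102 for n2, 0x201 for f2 and 0x10201 for o2, are just examples):

#include <stdio.h>

/* Decode a raid10 layout word the way setup_geo() does:
 *   bits 0-7   near copies
 *   bits 8-15  far copies
 *   bit 16     far copies use the "offset" arrangement
 * Returns nc * fc, the total copy count.
 */
static int decode_raid10_layout(int layout, int *nc, int *fc, int *fo)
{
        *nc = layout & 255;
        *fc = (layout >> 8) & 255;
        *fo = layout & (1 << 16);
        return (*nc) * (*fc);
}

int main(void)
{
        int layouts[] = { 0x102, 0x201, 0x10201 };

        for (int i = 0; i < 3; i++) {
                int nc, fc, fo;
                int copies = decode_raid10_layout(layouts[i], &nc, &fc, &fo);
                printf("layout 0x%x: near=%d far=%d offset=%s copies=%d\n",
                       layouts[i], nc, fc, fo ? "yes" : "no", copies);
        }
        return 0;
}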
3455
3456static struct r10conf *setup_conf(struct mddev *mddev)
3457{
3458 struct r10conf *conf = NULL;
3459 int err = -EINVAL; 2741 int err = -EINVAL;
3460 struct geom geo;
3461 int copies;
3462
3463 copies = setup_geo(&geo, mddev, geo_new);
3464 2742
3465 if (copies == -2) { 2743 if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) ||
2744 !is_power_of_2(mddev->new_chunk_sectors)) {
3466 printk(KERN_ERR "md/raid10:%s: chunk size must be " 2745 printk(KERN_ERR "md/raid10:%s: chunk size must be "
3467 "at least PAGE_SIZE(%ld) and be a power of 2.\n", 2746 "at least PAGE_SIZE(%ld) and be a power of 2.\n",
3468 mdname(mddev), PAGE_SIZE); 2747 mdname(mddev), PAGE_SIZE);
3469 goto out; 2748 goto out;
3470 } 2749 }
3471 2750
3472 if (copies < 2 || copies > mddev->raid_disks) { 2751 nc = mddev->new_layout & 255;
2752 fc = (mddev->new_layout >> 8) & 255;
2753 fo = mddev->new_layout & (1<<16);
2754
2755 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
2756 (mddev->new_layout >> 17)) {
3473 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", 2757 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3474 mdname(mddev), mddev->new_layout); 2758 mdname(mddev), mddev->new_layout);
3475 goto out; 2759 goto out;
3476 } 2760 }
3477 2761
3478 err = -ENOMEM; 2762 err = -ENOMEM;
3479 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL); 2763 conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
3480 if (!conf) 2764 if (!conf)
3481 goto out; 2765 goto out;
3482 2766
3483 /* FIXME calc properly */ 2767 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
3484 conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
3485 max(0,mddev->delta_disks)),
3486 GFP_KERNEL); 2768 GFP_KERNEL);
3487 if (!conf->mirrors) 2769 if (!conf->mirrors)
3488 goto out; 2770 goto out;
@@ -3491,36 +2773,50 @@ static struct r10conf *setup_conf(struct mddev *mddev)
3491 if (!conf->tmppage) 2773 if (!conf->tmppage)
3492 goto out; 2774 goto out;
3493 2775
3494 conf->geo = geo; 2776
3495 conf->copies = copies; 2777 conf->raid_disks = mddev->raid_disks;
2778 conf->near_copies = nc;
2779 conf->far_copies = fc;
2780 conf->copies = nc*fc;
2781 conf->far_offset = fo;
2782 conf->chunk_mask = mddev->new_chunk_sectors - 1;
2783 conf->chunk_shift = ffz(~mddev->new_chunk_sectors);
2784
3496 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, 2785 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
3497 r10bio_pool_free, conf); 2786 r10bio_pool_free, conf);
3498 if (!conf->r10bio_pool) 2787 if (!conf->r10bio_pool)
3499 goto out; 2788 goto out;
3500 2789
3501 calc_sectors(conf, mddev->dev_sectors); 2790 size = mddev->dev_sectors >> conf->chunk_shift;
3502 if (mddev->reshape_position == MaxSector) { 2791 sector_div(size, fc);
3503 conf->prev = conf->geo; 2792 size = size * conf->raid_disks;
3504 conf->reshape_progress = MaxSector; 2793 sector_div(size, nc);
3505 } else { 2794 /* 'size' is now the number of chunks in the array */
3506 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) { 2795 /* calculate "used chunks per device" in 'stride' */
3507 err = -EINVAL; 2796 stride = size * conf->copies;
3508 goto out; 2797
3509 } 2798 /* We need to round up when dividing by raid_disks to
3510 conf->reshape_progress = mddev->reshape_position; 2799 * get the stride size.
3511 if (conf->prev.far_offset) 2800 */
3512 conf->prev.stride = 1 << conf->prev.chunk_shift; 2801 stride += conf->raid_disks - 1;
3513 else 2802 sector_div(stride, conf->raid_disks);
3514 /* far_copies must be 1 */ 2803
3515 conf->prev.stride = conf->dev_sectors; 2804 conf->dev_sectors = stride << conf->chunk_shift;
3516 } 2805
2806 if (fo)
2807 stride = 1;
2808 else
2809 sector_div(stride, fc);
2810 conf->stride = stride << conf->chunk_shift;
2811
2812
3517 spin_lock_init(&conf->device_lock); 2813 spin_lock_init(&conf->device_lock);
3518 INIT_LIST_HEAD(&conf->retry_list); 2814 INIT_LIST_HEAD(&conf->retry_list);
3519 2815
3520 spin_lock_init(&conf->resync_lock); 2816 spin_lock_init(&conf->resync_lock);
3521 init_waitqueue_head(&conf->wait_barrier); 2817 init_waitqueue_head(&conf->wait_barrier);
3522 2818
3523 conf->thread = md_register_thread(raid10d, mddev, "raid10"); 2819 conf->thread = md_register_thread(raid10d, mddev, NULL);
3524 if (!conf->thread) 2820 if (!conf->thread)
3525 goto out; 2821 goto out;
3526 2822
@@ -3528,9 +2824,8 @@ static struct r10conf *setup_conf(struct mddev *mddev)
3528 return conf; 2824 return conf;
3529 2825
3530 out: 2826 out:
3531 if (err == -ENOMEM) 2827 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
3532 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", 2828 mdname(mddev));
3533 mdname(mddev));
3534 if (conf) { 2829 if (conf) {
3535 if (conf->r10bio_pool) 2830 if (conf->r10bio_pool)
3536 mempool_destroy(conf->r10bio_pool); 2831 mempool_destroy(conf->r10bio_pool);
@@ -3541,16 +2836,19 @@ static struct r10conf *setup_conf(struct mddev *mddev)
3541 return ERR_PTR(err); 2836 return ERR_PTR(err);
3542} 2837}
3543 2838
3544static int run(struct mddev *mddev) 2839static int run(mddev_t *mddev)
3545{ 2840{
3546 struct r10conf *conf; 2841 conf_t *conf;
3547 int i, disk_idx, chunk_size; 2842 int i, disk_idx, chunk_size;
3548 struct raid10_info *disk; 2843 mirror_info_t *disk;
3549 struct md_rdev *rdev; 2844 mdk_rdev_t *rdev;
3550 sector_t size; 2845 sector_t size;
3551 sector_t min_offset_diff = 0; 2846
3552 int first = 1; 2847 /*
3553 bool discard_supported = false; 2848 * copy the already verified devices into our private RAID10
2849 * bookkeeping area. [whatever we allocate in run(),
2850 * should be freed in stop()]
2851 */
3554 2852
3555 if (mddev->private == NULL) { 2853 if (mddev->private == NULL) {
3556 conf = setup_conf(mddev); 2854 conf = setup_conf(mddev);
@@ -3566,66 +2864,35 @@ static int run(struct mddev *mddev)
3566 conf->thread = NULL; 2864 conf->thread = NULL;
3567 2865
3568 chunk_size = mddev->chunk_sectors << 9; 2866 chunk_size = mddev->chunk_sectors << 9;
3569 if (mddev->queue) { 2867 blk_queue_io_min(mddev->queue, chunk_size);
3570 blk_queue_max_discard_sectors(mddev->queue, 2868 if (conf->raid_disks % conf->near_copies)
3571 mddev->chunk_sectors); 2869 blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks);
3572 blk_queue_io_min(mddev->queue, chunk_size); 2870 else
3573 if (conf->geo.raid_disks % conf->geo.near_copies) 2871 blk_queue_io_opt(mddev->queue, chunk_size *
3574 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); 2872 (conf->raid_disks / conf->near_copies));
3575 else
3576 blk_queue_io_opt(mddev->queue, chunk_size *
3577 (conf->geo.raid_disks / conf->geo.near_copies));
3578 }
3579 2873
3580 rdev_for_each(rdev, mddev) { 2874 list_for_each_entry(rdev, &mddev->disks, same_set) {
3581 long long diff;
3582 struct request_queue *q;
3583 2875
3584 disk_idx = rdev->raid_disk; 2876 disk_idx = rdev->raid_disk;
3585 if (disk_idx < 0) 2877 if (disk_idx >= conf->raid_disks
3586 continue; 2878 || disk_idx < 0)
3587 if (disk_idx >= conf->geo.raid_disks &&
3588 disk_idx >= conf->prev.raid_disks)
3589 continue; 2879 continue;
3590 disk = conf->mirrors + disk_idx; 2880 disk = conf->mirrors + disk_idx;
3591 2881
3592 if (test_bit(Replacement, &rdev->flags)) { 2882 disk->rdev = rdev;
3593 if (disk->replacement) 2883 disk_stack_limits(mddev->gendisk, rdev->bdev,
3594 goto out_free_conf; 2884 rdev->data_offset << 9);
3595 disk->replacement = rdev; 2885 /* as we don't honour merge_bvec_fn, we must never risk
3596 } else { 2886 * violating it, so limit max_segments to 1 lying
3597 if (disk->rdev) 2887 * within a single page.
3598 goto out_free_conf; 2888 */
3599 disk->rdev = rdev; 2889 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
2890 blk_queue_max_segments(mddev->queue, 1);
2891 blk_queue_segment_boundary(mddev->queue,
2892 PAGE_CACHE_SIZE - 1);
3600 } 2893 }
3601 q = bdev_get_queue(rdev->bdev);
3602 if (q->merge_bvec_fn)
3603 mddev->merge_check_needed = 1;
3604 diff = (rdev->new_data_offset - rdev->data_offset);
3605 if (!mddev->reshape_backwards)
3606 diff = -diff;
3607 if (diff < 0)
3608 diff = 0;
3609 if (first || diff < min_offset_diff)
3610 min_offset_diff = diff;
3611
3612 if (mddev->gendisk)
3613 disk_stack_limits(mddev->gendisk, rdev->bdev,
3614 rdev->data_offset << 9);
3615 2894
3616 disk->head_position = 0; 2895 disk->head_position = 0;
3617
3618 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3619 discard_supported = true;
3620 }
3621
3622 if (mddev->queue) {
3623 if (discard_supported)
3624 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
3625 mddev->queue);
3626 else
3627 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
3628 mddev->queue);
3629 } 2896 }
3630 /* need to check that every block has at least one working mirror */ 2897 /* need to check that every block has at least one working mirror */
3631 if (!enough(conf, -1)) { 2898 if (!enough(conf, -1)) {
@@ -3634,31 +2901,11 @@ static int run(struct mddev *mddev)
3634 goto out_free_conf; 2901 goto out_free_conf;
3635 } 2902 }
3636 2903
3637 if (conf->reshape_progress != MaxSector) {
3638 /* must ensure that shape change is supported */
3639 if (conf->geo.far_copies != 1 &&
3640 conf->geo.far_offset == 0)
3641 goto out_free_conf;
3642 if (conf->prev.far_copies != 1 &&
3643 conf->geo.far_offset == 0)
3644 goto out_free_conf;
3645 }
3646
3647 mddev->degraded = 0; 2904 mddev->degraded = 0;
3648 for (i = 0; 2905 for (i = 0; i < conf->raid_disks; i++) {
3649 i < conf->geo.raid_disks
3650 || i < conf->prev.raid_disks;
3651 i++) {
3652 2906
3653 disk = conf->mirrors + i; 2907 disk = conf->mirrors + i;
3654 2908
3655 if (!disk->rdev && disk->replacement) {
3656 /* The replacement is all we have - use it */
3657 disk->rdev = disk->replacement;
3658 disk->replacement = NULL;
3659 clear_bit(Replacement, &disk->rdev->flags);
3660 }
3661
3662 if (!disk->rdev || 2909 if (!disk->rdev ||
3663 !test_bit(In_sync, &disk->rdev->flags)) { 2910 !test_bit(In_sync, &disk->rdev->flags)) {
3664 disk->head_position = 0; 2911 disk->head_position = 0;
@@ -3666,7 +2913,6 @@ static int run(struct mddev *mddev)
3666 if (disk->rdev) 2913 if (disk->rdev)
3667 conf->fullsync = 1; 2914 conf->fullsync = 1;
3668 } 2915 }
3669 disk->recovery_disabled = mddev->recovery_disabled - 1;
3670 } 2916 }
3671 2917
3672 if (mddev->recovery_cp != MaxSector) 2918 if (mddev->recovery_cp != MaxSector)
@@ -3675,8 +2921,8 @@ static int run(struct mddev *mddev)
3675 mdname(mddev)); 2921 mdname(mddev));
3676 printk(KERN_INFO 2922 printk(KERN_INFO
3677 "md/raid10:%s: active with %d out of %d devices\n", 2923 "md/raid10:%s: active with %d out of %d devices\n",
3678 mdname(mddev), conf->geo.raid_disks - mddev->degraded, 2924 mdname(mddev), conf->raid_disks - mddev->degraded,
3679 conf->geo.raid_disks); 2925 conf->raid_disks);
3680 /* 2926 /*
3681 * Ok, everything is just fine now 2927 * Ok, everything is just fine now
3682 */ 2928 */
@@ -3685,50 +2931,27 @@ static int run(struct mddev *mddev)
3685 md_set_array_sectors(mddev, size); 2931 md_set_array_sectors(mddev, size);
3686 mddev->resync_max_sectors = size; 2932 mddev->resync_max_sectors = size;
3687 2933
3688 if (mddev->queue) { 2934 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
3689 int stripe = conf->geo.raid_disks * 2935 mddev->queue->backing_dev_info.congested_data = mddev;
3690 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3691 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
3692 mddev->queue->backing_dev_info.congested_data = mddev;
3693 2936
3694 /* Calculate max read-ahead size. 2937 /* Calculate max read-ahead size.
3695 * We need to readahead at least twice a whole stripe.... 2938 * We need to readahead at least twice a whole stripe....
3696 * maybe... 2939 * maybe...
3697 */ 2940 */
3698 stripe /= conf->geo.near_copies; 2941 {
3699 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 2942 int stripe = conf->raid_disks *
3700 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 2943 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3701 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); 2944 stripe /= conf->near_copies;
2945 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
2946 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
3702 } 2947 }
3703 2948
2949 if (conf->near_copies < conf->raid_disks)
2950 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3704 2951
3705 if (md_integrity_register(mddev)) 2952 if (md_integrity_register(mddev))
3706 goto out_free_conf; 2953 goto out_free_conf;
3707 2954
3708 if (conf->reshape_progress != MaxSector) {
3709 unsigned long before_length, after_length;
3710
3711 before_length = ((1 << conf->prev.chunk_shift) *
3712 conf->prev.far_copies);
3713 after_length = ((1 << conf->geo.chunk_shift) *
3714 conf->geo.far_copies);
3715
3716 if (max(before_length, after_length) > min_offset_diff) {
3717 /* This cannot work */
3718 printk("md/raid10: offset difference not enough to continue reshape\n");
3719 goto out_free_conf;
3720 }
3721 conf->offset_diff = min_offset_diff;
3722
3723 conf->reshape_safe = conf->reshape_progress;
3724 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3725 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3726 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3727 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3728 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3729 "reshape");
3730 }
3731
3732 return 0; 2955 return 0;
3733 2956
3734out_free_conf: 2957out_free_conf:
@@ -3743,18 +2966,15 @@ out:
3743 return -EIO; 2966 return -EIO;
3744} 2967}
3745 2968
3746static int stop(struct mddev *mddev) 2969static int stop(mddev_t *mddev)
3747{ 2970{
3748 struct r10conf *conf = mddev->private; 2971 conf_t *conf = mddev->private;
3749 2972
3750 raise_barrier(conf, 0); 2973 raise_barrier(conf, 0);
3751 lower_barrier(conf); 2974 lower_barrier(conf);
3752 2975
3753 md_unregister_thread(&mddev->thread); 2976 md_unregister_thread(&mddev->thread);
3754 if (mddev->queue) 2977 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
3755 /* the unplug fn references 'conf'*/
3756 blk_sync_queue(mddev->queue);
3757
3758 if (conf->r10bio_pool) 2978 if (conf->r10bio_pool)
3759 mempool_destroy(conf->r10bio_pool); 2979 mempool_destroy(conf->r10bio_pool);
3760 kfree(conf->mirrors); 2980 kfree(conf->mirrors);
@@ -3763,9 +2983,9 @@ static int stop(struct mddev *mddev)
3763 return 0; 2983 return 0;
3764} 2984}
3765 2985
3766static void raid10_quiesce(struct mddev *mddev, int state) 2986static void raid10_quiesce(mddev_t *mddev, int state)
3767{ 2987{
3768 struct r10conf *conf = mddev->private; 2988 conf_t *conf = mddev->private;
3769 2989
3770 switch(state) { 2990 switch(state) {
3771 case 1: 2991 case 1:
@@ -3777,57 +2997,10 @@ static void raid10_quiesce(struct mddev *mddev, int state)
3777 } 2997 }
3778} 2998}
3779 2999
3780static int raid10_resize(struct mddev *mddev, sector_t sectors) 3000static void *raid10_takeover_raid0(mddev_t *mddev)
3781{ 3001{
3782 /* Resize of 'far' arrays is not supported. 3002 mdk_rdev_t *rdev;
3783 * For 'near' and 'offset' arrays we can set the 3003 conf_t *conf;
3784 * number of sectors used to be an appropriate multiple
3785 * of the chunk size.
3786 * For 'offset', this is far_copies*chunksize.
3787 * For 'near' the multiplier is the LCM of
3788 * near_copies and raid_disks.
3789 * So if far_copies > 1 && !far_offset, fail.
3790 * Else find LCM(raid_disks, near_copy)*far_copies and
3791 * multiply by chunk_size. Then round to this number.
3792 * This is mostly done by raid10_size()
3793 */
3794 struct r10conf *conf = mddev->private;
3795 sector_t oldsize, size;
3796
3797 if (mddev->reshape_position != MaxSector)
3798 return -EBUSY;
3799
3800 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3801 return -EINVAL;
3802
3803 oldsize = raid10_size(mddev, 0, 0);
3804 size = raid10_size(mddev, sectors, 0);
3805 if (mddev->external_size &&
3806 mddev->array_sectors > size)
3807 return -EINVAL;
3808 if (mddev->bitmap) {
3809 int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
3810 if (ret)
3811 return ret;
3812 }
3813 md_set_array_sectors(mddev, size);
3814 set_capacity(mddev->gendisk, mddev->array_sectors);
3815 revalidate_disk(mddev->gendisk);
3816 if (sectors > mddev->dev_sectors &&
3817 mddev->recovery_cp > oldsize) {
3818 mddev->recovery_cp = oldsize;
3819 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3820 }
3821 calc_sectors(conf, sectors);
3822 mddev->dev_sectors = conf->dev_sectors;
3823 mddev->resync_max_sectors = size;
3824 return 0;
3825}
3826
3827static void *raid10_takeover_raid0(struct mddev *mddev)
3828{
3829 struct md_rdev *rdev;
3830 struct r10conf *conf;
3831 3004
3832 if (mddev->degraded > 0) { 3005 if (mddev->degraded > 0) {
3833 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n", 3006 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
@@ -3847,7 +3020,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev)
3847 3020
3848 conf = setup_conf(mddev); 3021 conf = setup_conf(mddev);
3849 if (!IS_ERR(conf)) { 3022 if (!IS_ERR(conf)) {
3850 rdev_for_each(rdev, mddev) 3023 list_for_each_entry(rdev, &mddev->disks, same_set)
3851 if (rdev->raid_disk >= 0) 3024 if (rdev->raid_disk >= 0)
3852 rdev->new_raid_disk = rdev->raid_disk * 2; 3025 rdev->new_raid_disk = rdev->raid_disk * 2;
3853 conf->barrier = 1; 3026 conf->barrier = 1;
@@ -3856,17 +3029,17 @@ static void *raid10_takeover_raid0(struct mddev *mddev)
3856 return conf; 3029 return conf;
3857} 3030}
3858 3031
3859static void *raid10_takeover(struct mddev *mddev) 3032static void *raid10_takeover(mddev_t *mddev)
3860{ 3033{
3861 struct r0conf *raid0_conf; 3034 struct raid0_private_data *raid0_priv;
3862 3035
3863 /* raid10 can take over: 3036 /* raid10 can take over:
3864 * raid0 - providing it has only two drives 3037 * raid0 - providing it has only two drives
3865 */ 3038 */
3866 if (mddev->level == 0) { 3039 if (mddev->level == 0) {
3867 /* for raid0 takeover only one zone is supported */ 3040 /* for raid0 takeover only one zone is supported */
3868 raid0_conf = mddev->private; 3041 raid0_priv = mddev->private;
3869 if (raid0_conf->nr_strip_zones > 1) { 3042 if (raid0_priv->nr_strip_zones > 1) {
3870 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0" 3043 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
3871 " with more than one zone.\n", 3044 " with more than one zone.\n",
3872 mdname(mddev)); 3045 mdname(mddev));
@@ -3877,763 +3050,7 @@ static void *raid10_takeover(struct mddev *mddev)
3877 return ERR_PTR(-EINVAL); 3050 return ERR_PTR(-EINVAL);
3878} 3051}
3879 3052
3880static int raid10_check_reshape(struct mddev *mddev) 3053static struct mdk_personality raid10_personality =
3881{
3882 /* Called when there is a request to change
3883 * - layout (to ->new_layout)
3884 * - chunk size (to ->new_chunk_sectors)
3885 * - raid_disks (by delta_disks)
3886 * or when trying to restart a reshape that was ongoing.
3887 *
3888 * We need to validate the request and possibly allocate
3889 * space if that might be an issue later.
3890 *
3891 * Currently we reject any reshape of a 'far' mode array,
3892 * allow chunk size to change if new is generally acceptable,
3893 * allow raid_disks to increase, and allow
3894 * a switch between 'near' mode and 'offset' mode.
3895 */
3896 struct r10conf *conf = mddev->private;
3897 struct geom geo;
3898
3899 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
3900 return -EINVAL;
3901
3902 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
3903 /* mustn't change number of copies */
3904 return -EINVAL;
3905 if (geo.far_copies > 1 && !geo.far_offset)
3906 /* Cannot switch to 'far' mode */
3907 return -EINVAL;
3908
3909 if (mddev->array_sectors & geo.chunk_mask)
3910 /* not factor of array size */
3911 return -EINVAL;
3912
3913 if (!enough(conf, -1))
3914 return -EINVAL;
3915
3916 kfree(conf->mirrors_new);
3917 conf->mirrors_new = NULL;
3918 if (mddev->delta_disks > 0) {
3919 /* allocate new 'mirrors' list */
3920 conf->mirrors_new = kzalloc(
3921 sizeof(struct raid10_info)
3922 *(mddev->raid_disks +
3923 mddev->delta_disks),
3924 GFP_KERNEL);
3925 if (!conf->mirrors_new)
3926 return -ENOMEM;
3927 }
3928 return 0;
3929}
3930
3931/*
3932 * Need to check if array has failed when deciding whether to:
3933 * - start an array
3934 * - remove non-faulty devices
3935 * - add a spare
3936 * - allow a reshape
3937 * This determination is simple when no reshape is happening.
3938 * However if there is a reshape, we need to carefully check
3939 * both the before and after sections.
3940 * This is because some failed devices may only affect one
3941 * of the two sections, and some non-in_sync devices may
3942 * be insync in the section most affected by failed devices.
3943 */
3944static int calc_degraded(struct r10conf *conf)
3945{
3946 int degraded, degraded2;
3947 int i;
3948
3949 rcu_read_lock();
3950 degraded = 0;
3951 /* 'prev' section first */
3952 for (i = 0; i < conf->prev.raid_disks; i++) {
3953 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3954 if (!rdev || test_bit(Faulty, &rdev->flags))
3955 degraded++;
3956 else if (!test_bit(In_sync, &rdev->flags))
3957 /* When we can reduce the number of devices in
3958 * an array, this might not contribute to
3959 * 'degraded'. It does now.
3960 */
3961 degraded++;
3962 }
3963 rcu_read_unlock();
3964 if (conf->geo.raid_disks == conf->prev.raid_disks)
3965 return degraded;
3966 rcu_read_lock();
3967 degraded2 = 0;
3968 for (i = 0; i < conf->geo.raid_disks; i++) {
3969 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3970 if (!rdev || test_bit(Faulty, &rdev->flags))
3971 degraded2++;
3972 else if (!test_bit(In_sync, &rdev->flags)) {
3973 /* If reshape is increasing the number of devices,
3974 * this section has already been recovered, so
3975 * it doesn't contribute to degraded.
3976 * else it does.
3977 */
3978 if (conf->geo.raid_disks <= conf->prev.raid_disks)
3979 degraded2++;
3980 }
3981 }
3982 rcu_read_unlock();
3983 if (degraded2 > degraded)
3984 return degraded2;
3985 return degraded;
3986}
3987
3988static int raid10_start_reshape(struct mddev *mddev)
3989{
3990 /* A 'reshape' has been requested. This commits
3991 * the various 'new' fields and sets MD_RECOVER_RESHAPE
3992 * This also checks if there are enough spares and adds them
3993 * to the array.
3994 * We currently require enough spares to make the final
3995 * array non-degraded. We also require that the difference
3996 * between old and new data_offset - on each device - is
3997 * enough that we never risk over-writing.
3998 */
3999
4000 unsigned long before_length, after_length;
4001 sector_t min_offset_diff = 0;
4002 int first = 1;
4003 struct geom new;
4004 struct r10conf *conf = mddev->private;
4005 struct md_rdev *rdev;
4006 int spares = 0;
4007 int ret;
4008
4009 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4010 return -EBUSY;
4011
4012 if (setup_geo(&new, mddev, geo_start) != conf->copies)
4013 return -EINVAL;
4014
4015 before_length = ((1 << conf->prev.chunk_shift) *
4016 conf->prev.far_copies);
4017 after_length = ((1 << conf->geo.chunk_shift) *
4018 conf->geo.far_copies);
4019
4020 rdev_for_each(rdev, mddev) {
4021 if (!test_bit(In_sync, &rdev->flags)
4022 && !test_bit(Faulty, &rdev->flags))
4023 spares++;
4024 if (rdev->raid_disk >= 0) {
4025 long long diff = (rdev->new_data_offset
4026 - rdev->data_offset);
4027 if (!mddev->reshape_backwards)
4028 diff = -diff;
4029 if (diff < 0)
4030 diff = 0;
4031 if (first || diff < min_offset_diff)
4032 min_offset_diff = diff;
4033 }
4034 }
4035
4036 if (max(before_length, after_length) > min_offset_diff)
4037 return -EINVAL;
4038
4039 if (spares < mddev->delta_disks)
4040 return -EINVAL;
4041
4042 conf->offset_diff = min_offset_diff;
4043 spin_lock_irq(&conf->device_lock);
4044 if (conf->mirrors_new) {
4045 memcpy(conf->mirrors_new, conf->mirrors,
4046 sizeof(struct raid10_info)*conf->prev.raid_disks);
4047 smp_mb();
4048 kfree(conf->mirrors_old); /* FIXME and elsewhere */
4049 conf->mirrors_old = conf->mirrors;
4050 conf->mirrors = conf->mirrors_new;
4051 conf->mirrors_new = NULL;
4052 }
4053 setup_geo(&conf->geo, mddev, geo_start);
4054 smp_mb();
4055 if (mddev->reshape_backwards) {
4056 sector_t size = raid10_size(mddev, 0, 0);
4057 if (size < mddev->array_sectors) {
4058 spin_unlock_irq(&conf->device_lock);
4059 printk(KERN_ERR "md/raid10:%s: array size must be reduce before number of disks\n",
4060 mdname(mddev));
4061 return -EINVAL;
4062 }
4063 mddev->resync_max_sectors = size;
4064 conf->reshape_progress = size;
4065 } else
4066 conf->reshape_progress = 0;
4067 spin_unlock_irq(&conf->device_lock);
4068
4069 if (mddev->delta_disks && mddev->bitmap) {
4070 ret = bitmap_resize(mddev->bitmap,
4071 raid10_size(mddev, 0,
4072 conf->geo.raid_disks),
4073 0, 0);
4074 if (ret)
4075 goto abort;
4076 }
4077 if (mddev->delta_disks > 0) {
4078 rdev_for_each(rdev, mddev)
4079 if (rdev->raid_disk < 0 &&
4080 !test_bit(Faulty, &rdev->flags)) {
4081 if (raid10_add_disk(mddev, rdev) == 0) {
4082 if (rdev->raid_disk >=
4083 conf->prev.raid_disks)
4084 set_bit(In_sync, &rdev->flags);
4085 else
4086 rdev->recovery_offset = 0;
4087
4088 if (sysfs_link_rdev(mddev, rdev))
4089 /* Failure here is OK */;
4090 }
4091 } else if (rdev->raid_disk >= conf->prev.raid_disks
4092 && !test_bit(Faulty, &rdev->flags)) {
4093 /* This is a spare that was manually added */
4094 set_bit(In_sync, &rdev->flags);
4095 }
4096 }
4097 /* When a reshape changes the number of devices,
4098 * ->degraded is measured against the larger of the
4099 * pre and post numbers.
4100 */
4101 spin_lock_irq(&conf->device_lock);
4102 mddev->degraded = calc_degraded(conf);
4103 spin_unlock_irq(&conf->device_lock);
4104 mddev->raid_disks = conf->geo.raid_disks;
4105 mddev->reshape_position = conf->reshape_progress;
4106 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4107
4108 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4109 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4110 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4111 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4112
4113 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4114 "reshape");
4115 if (!mddev->sync_thread) {
4116 ret = -EAGAIN;
4117 goto abort;
4118 }
4119 conf->reshape_checkpoint = jiffies;
4120 md_wakeup_thread(mddev->sync_thread);
4121 md_new_event(mddev);
4122 return 0;
4123
4124abort:
4125 mddev->recovery = 0;
4126 spin_lock_irq(&conf->device_lock);
4127 conf->geo = conf->prev;
4128 mddev->raid_disks = conf->geo.raid_disks;
4129 rdev_for_each(rdev, mddev)
4130 rdev->new_data_offset = rdev->data_offset;
4131 smp_wmb();
4132 conf->reshape_progress = MaxSector;
4133 mddev->reshape_position = MaxSector;
4134 spin_unlock_irq(&conf->device_lock);
4135 return ret;
4136}
4137
4138/* Calculate the last device-address that could contain
4139 * any block from the chunk that includes the array-address 's'
4140 * and report the next address.
4141 * i.e. the address returned will be chunk-aligned and after
4142 * any data that is in the chunk containing 's'.
4143 */
4144static sector_t last_dev_address(sector_t s, struct geom *geo)
4145{
4146 s = (s | geo->chunk_mask) + 1;
4147 s >>= geo->chunk_shift;
4148 s *= geo->near_copies;
4149 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4150 s *= geo->far_copies;
4151 s <<= geo->chunk_shift;
4152 return s;
4153}
4154
4155/* Calculate the first device-address that could contain
4156 * any block from the chunk that includes the array-address 's'.
4157 * This too will be the start of a chunk
4158 */
4159static sector_t first_dev_address(sector_t s, struct geom *geo)
4160{
4161 s >>= geo->chunk_shift;
4162 s *= geo->near_copies;
4163 sector_div(s, geo->raid_disks);
4164 s *= geo->far_copies;
4165 s <<= geo->chunk_shift;
4166 return s;
4167}
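
Together these two helpers bound, in device addresses, everything that the chunk containing array address 's' can touch: first_dev_address() rounds down and last_dev_address() rounds up to the next chunk boundary, and reshape_request() compares the resulting windows for the old and new geometry to decide when the reshape_position recorded in the metadata has to be updated. A standalone copy of the arithmetic with an example geometry (4 disks, n2, 64 KiB chunks) and DIV_ROUND_UP spelled out by hand:

#include <stdio.h>
#include <stdint.h>

struct geom_sketch {
        int raid_disks, near_copies, far_copies;
        uint64_t chunk_mask;
        int chunk_shift;
};

/* Chunk-aligned device address just past any data from the chunk holding
 * array address s (mirrors last_dev_address()).
 */
static uint64_t last_dev_addr(uint64_t s, const struct geom_sketch *geo)
{
        s = (s | geo->chunk_mask) + 1;
        s >>= geo->chunk_shift;
        s *= geo->near_copies;
        s = (s + geo->raid_disks - 1) / geo->raid_disks;   /* round up */
        s *= geo->far_copies;
        return s << geo->chunk_shift;
}

/* Chunk-aligned device address of the first data from that chunk
 * (mirrors first_dev_address()).
 */
static uint64_t first_dev_addr(uint64_t s, const struct geom_sketch *geo)
{
        s >>= geo->chunk_shift;
        s *= geo->near_copies;
        s /= geo->raid_disks;                              /* round down */
        s *= geo->far_copies;
        return s << geo->chunk_shift;
}

int main(void)
{
        struct geom_sketch geo = { 4, 2, 1, 127, 7 };
        uint64_t s = 1000;

        printf("array sector %llu maps into device window [%llu, %llu)\n",
               (unsigned long long)s,
               (unsigned long long)first_dev_addr(s, &geo),
               (unsigned long long)last_dev_addr(s, &geo));
        return 0;
}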
4168
4169static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4170 int *skipped)
4171{
4172 /* We simply copy at most one chunk (smallest of old and new)
4173 * at a time, possibly less if that exceeds RESYNC_PAGES,
4174 * or we hit a bad block or something.
4175 * This might mean we pause for normal IO in the middle of
4176 * a chunk, but that is not a problem was mddev->reshape_position
4177 * can record any location.
4178 *
4179 * If we will want to write to a location that isn't
4180 * yet recorded as 'safe' (i.e. in metadata on disk) then
4181 * we need to flush all reshape requests and update the metadata.
4182 *
4183 * When reshaping forwards (e.g. to more devices), we interpret
4184 * 'safe' as the earliest block which might not have been copied
4185 * down yet. We divide this by previous stripe size and multiply
4186 * by previous stripe length to get lowest device offset that we
4187 * cannot write to yet.
4188 * We interpret 'sector_nr' as an address that we want to write to.
4189 * From this we use last_device_address() to find where we might
4190 * write to, and first_device_address on the 'safe' position.
4191 * If this 'next' write position is after the 'safe' position,
4192 * we must update the metadata to increase the 'safe' position.
4193 *
4194 * When reshaping backwards, we round in the opposite direction
4195 * and perform the reverse test: next write position must not be
4196 * less than current safe position.
4197 *
4198 * In all this the minimum difference in data offsets
4199 * (conf->offset_diff - always positive) allows a bit of slack,
4200 * so next can be after 'safe', but not by more than offset_disk
4201 *
4202 * We need to prepare all the bios here before we start any IO
4203 * to ensure the size we choose is acceptable to all devices.
4204 * The means one for each copy for write-out and an extra one for
4205 * read-in.
4206 * We store the read-in bio in ->master_bio and the others in
4207 * ->devs[x].bio and ->devs[x].repl_bio.
4208 */
4209 struct r10conf *conf = mddev->private;
4210 struct r10bio *r10_bio;
4211 sector_t next, safe, last;
4212 int max_sectors;
4213 int nr_sectors;
4214 int s;
4215 struct md_rdev *rdev;
4216 int need_flush = 0;
4217 struct bio *blist;
4218 struct bio *bio, *read_bio;
4219 int sectors_done = 0;
4220
4221 if (sector_nr == 0) {
4222 /* If restarting in the middle, skip the initial sectors */
4223 if (mddev->reshape_backwards &&
4224 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4225 sector_nr = (raid10_size(mddev, 0, 0)
4226 - conf->reshape_progress);
4227 } else if (!mddev->reshape_backwards &&
4228 conf->reshape_progress > 0)
4229 sector_nr = conf->reshape_progress;
4230 if (sector_nr) {
4231 mddev->curr_resync_completed = sector_nr;
4232 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4233 *skipped = 1;
4234 return sector_nr;
4235 }
4236 }
4237
4238 /* We don't use sector_nr to track where we are up to
4239 * as that doesn't work well for ->reshape_backwards.
4240 * So just use ->reshape_progress.
4241 */
4242 if (mddev->reshape_backwards) {
4243 /* 'next' is the earliest device address that we might
4244 * write to for this chunk in the new layout
4245 */
4246 next = first_dev_address(conf->reshape_progress - 1,
4247 &conf->geo);
4248
4249 /* 'safe' is the last device address that we might read from
4250 * in the old layout after a restart
4251 */
4252 safe = last_dev_address(conf->reshape_safe - 1,
4253 &conf->prev);
4254
4255 if (next + conf->offset_diff < safe)
4256 need_flush = 1;
4257
4258 last = conf->reshape_progress - 1;
4259 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4260 & conf->prev.chunk_mask);
4261 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4262 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4263 } else {
4264 /* 'next' is after the last device address that we
4265 * might write to for this chunk in the new layout
4266 */
4267 next = last_dev_address(conf->reshape_progress, &conf->geo);
4268
4269 /* 'safe' is the earliest device address that we might
4270 * read from in the old layout after a restart
4271 */
4272 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4273
4274 /* Need to update metadata if 'next' might be beyond 'safe'
4275 * as that would possibly corrupt data
4276 */
4277 if (next > safe + conf->offset_diff)
4278 need_flush = 1;
4279
4280 sector_nr = conf->reshape_progress;
4281 last = sector_nr | (conf->geo.chunk_mask
4282 & conf->prev.chunk_mask);
4283
4284 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4285 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4286 }
4287
4288 if (need_flush ||
4289 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4290 /* Need to update reshape_position in metadata */
4291 wait_barrier(conf);
4292 mddev->reshape_position = conf->reshape_progress;
4293 if (mddev->reshape_backwards)
4294 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4295 - conf->reshape_progress;
4296 else
4297 mddev->curr_resync_completed = conf->reshape_progress;
4298 conf->reshape_checkpoint = jiffies;
4299 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4300 md_wakeup_thread(mddev->thread);
4301 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4302 kthread_should_stop());
4303 conf->reshape_safe = mddev->reshape_position;
4304 allow_barrier(conf);
4305 }
4306
4307read_more:
4308 /* Now schedule reads for blocks from sector_nr to last */
4309 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
4310 raise_barrier(conf, sectors_done != 0);
4311 atomic_set(&r10_bio->remaining, 0);
4312 r10_bio->mddev = mddev;
4313 r10_bio->sector = sector_nr;
4314 set_bit(R10BIO_IsReshape, &r10_bio->state);
4315 r10_bio->sectors = last - sector_nr + 1;
4316 rdev = read_balance(conf, r10_bio, &max_sectors);
4317 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4318
4319 if (!rdev) {
4320 /* Cannot read from here, so need to record bad blocks
4321 * on all the target devices.
4322 */
4323 // FIXME
4324 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4325 return sectors_done;
4326 }
4327
4328 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4329
4330 read_bio->bi_bdev = rdev->bdev;
4331 read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4332 + rdev->data_offset);
4333 read_bio->bi_private = r10_bio;
4334 read_bio->bi_end_io = end_sync_read;
4335 read_bio->bi_rw = READ;
4336 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
4337 read_bio->bi_flags |= 1 << BIO_UPTODATE;
4338 read_bio->bi_vcnt = 0;
4339 read_bio->bi_idx = 0;
4340 read_bio->bi_size = 0;
4341 r10_bio->master_bio = read_bio;
4342 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4343
4344 /* Now find the locations in the new layout */
4345 __raid10_find_phys(&conf->geo, r10_bio);
4346
4347 blist = read_bio;
4348 read_bio->bi_next = NULL;
4349
4350 for (s = 0; s < conf->copies*2; s++) {
4351 struct bio *b;
4352 int d = r10_bio->devs[s/2].devnum;
4353 struct md_rdev *rdev2;
4354 if (s&1) {
4355 rdev2 = conf->mirrors[d].replacement;
4356 b = r10_bio->devs[s/2].repl_bio;
4357 } else {
4358 rdev2 = conf->mirrors[d].rdev;
4359 b = r10_bio->devs[s/2].bio;
4360 }
4361 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4362 continue;
4363 b->bi_bdev = rdev2->bdev;
4364 b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
4365 b->bi_private = r10_bio;
4366 b->bi_end_io = end_reshape_write;
4367 b->bi_rw = WRITE;
4368 b->bi_flags &= ~(BIO_POOL_MASK - 1);
4369 b->bi_flags |= 1 << BIO_UPTODATE;
4370 b->bi_next = blist;
4371 b->bi_vcnt = 0;
4372 b->bi_idx = 0;
4373 b->bi_size = 0;
4374 blist = b;
4375 }
4376
4377 /* Now add as many pages as possible to all of these bios. */
4378
4379 nr_sectors = 0;
4380 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4381 struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
4382 int len = (max_sectors - s) << 9;
4383 if (len > PAGE_SIZE)
4384 len = PAGE_SIZE;
4385 for (bio = blist; bio ; bio = bio->bi_next) {
4386 struct bio *bio2;
4387 if (bio_add_page(bio, page, len, 0))
4388 continue;
4389
4390 /* Didn't fit, must stop */
4391 for (bio2 = blist;
4392 bio2 && bio2 != bio;
4393 bio2 = bio2->bi_next) {
4394 /* Remove last page from this bio */
4395 bio2->bi_vcnt--;
4396 bio2->bi_size -= len;
4397 bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
4398 }
4399 goto bio_full;
4400 }
4401 sector_nr += len >> 9;
4402 nr_sectors += len >> 9;
4403 }
4404bio_full:
4405 r10_bio->sectors = nr_sectors;
4406
4407 /* Now submit the read */
4408 md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
4409 atomic_inc(&r10_bio->remaining);
4410 read_bio->bi_next = NULL;
4411 generic_make_request(read_bio);
4412 sector_nr += nr_sectors;
4413 sectors_done += nr_sectors;
4414 if (sector_nr <= last)
4415 goto read_more;
4416
4417 /* Now that we have done the whole section we can
4418 * update reshape_progress
4419 */
4420 if (mddev->reshape_backwards)
4421 conf->reshape_progress -= sectors_done;
4422 else
4423 conf->reshape_progress += sectors_done;
4424
4425 return sectors_done;
4426}
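The page-filling loop inside reshape_request() relies on a small rollback idiom: try to add the page to every bio in the chain, and if any bio refuses it, strip the page back off the bios that already accepted it before bailing out to bio_full. A stripped-down sketch of that idiom, with a toy add_page_len() standing in for bio_add_page():

struct sbio {
        struct sbio *next;
        int vcnt;
        unsigned int size;
};

/* Pretend bio_add_page(): refuse once the bio already holds 16 pages. */
static int add_page_len(struct sbio *b, unsigned int len)
{
        if (b->vcnt >= 16)
                return 0;
        b->vcnt++;
        b->size += len;
        return len;
}

/* Returns 1 if the page was added to every bio in the chain. */
static int add_page_to_all(struct sbio *blist, unsigned int len)
{
        struct sbio *b, *b2;

        for (b = blist; b; b = b->next) {
                if (add_page_len(b, len))
                        continue;
                /* Did not fit: undo the adds already made and stop. */
                for (b2 = blist; b2 && b2 != b; b2 = b2->next) {
                        b2->vcnt--;
                        b2->size -= len;
                }
                return 0;
        }
        return 1;
}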
4427
4428static void end_reshape_request(struct r10bio *r10_bio);
4429static int handle_reshape_read_error(struct mddev *mddev,
4430 struct r10bio *r10_bio);
4431static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4432{
4433 /* Reshape read completed. Hopefully we have a block
4434 * to write out.
4435 * If we got a read error then we do sync 1-page reads from
4436 * elsewhere until we find the data - or give up.
4437 */
4438 struct r10conf *conf = mddev->private;
4439 int s;
4440
4441 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4442 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4443 /* Reshape has been aborted */
4444 md_done_sync(mddev, r10_bio->sectors, 0);
4445 return;
4446 }
4447
4448 /* We definitely have the data in the pages, schedule the
4449 * writes.
4450 */
4451 atomic_set(&r10_bio->remaining, 1);
4452 for (s = 0; s < conf->copies*2; s++) {
4453 struct bio *b;
4454 int d = r10_bio->devs[s/2].devnum;
4455 struct md_rdev *rdev;
4456 if (s&1) {
4457 rdev = conf->mirrors[d].replacement;
4458 b = r10_bio->devs[s/2].repl_bio;
4459 } else {
4460 rdev = conf->mirrors[d].rdev;
4461 b = r10_bio->devs[s/2].bio;
4462 }
4463 if (!rdev || test_bit(Faulty, &rdev->flags))
4464 continue;
4465 atomic_inc(&rdev->nr_pending);
4466 md_sync_acct(b->bi_bdev, r10_bio->sectors);
4467 atomic_inc(&r10_bio->remaining);
4468 b->bi_next = NULL;
4469 generic_make_request(b);
4470 }
4471 end_reshape_request(r10_bio);
4472}
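reshape_request_write() and end_reshape_request() together implement the usual md completion-count pattern: 'remaining' starts at 1 so the submitter keeps the request alive while it is still queueing, each submitted write adds a reference, each completion (and the submitter's final call) drops one, and whichever decrement reaches zero finishes the request. A minimal sketch using C11 atomics in place of the kernel's atomic_t:

#include <stdatomic.h>

struct reshape_req {
        atomic_int remaining;
        int done;
};

/* Called once per completed write, and once by the submitter itself. */
static void complete_one(struct reshape_req *r)
{
        if (atomic_fetch_sub(&r->remaining, 1) == 1)
                r->done = 1;            /* last reference gone: finish up */
}

static void submit_all(struct reshape_req *r, int nr_writes)
{
        int i;

        atomic_store(&r->remaining, 1); /* submitter's own reference */
        for (i = 0; i < nr_writes; i++) {
                atomic_fetch_add(&r->remaining, 1);
                /* the real code issues generic_make_request(b) here;
                 * each completion later calls complete_one() */
        }
        complete_one(r);                /* drop the submitter's reference */
}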
4473
4474static void end_reshape(struct r10conf *conf)
4475{
4476 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4477 return;
4478
4479 spin_lock_irq(&conf->device_lock);
4480 conf->prev = conf->geo;
4481 md_finish_reshape(conf->mddev);
4482 smp_wmb();
4483 conf->reshape_progress = MaxSector;
4484 spin_unlock_irq(&conf->device_lock);
4485
4486	/* read-ahead size must cover two whole stripes, which is
4487	 * 2 * (data disks) * chunksize, where data disks = raid_disks / near_copies
4488	 */
4489 if (conf->mddev->queue) {
4490 int stripe = conf->geo.raid_disks *
4491 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4492 stripe /= conf->geo.near_copies;
4493 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4494 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4495 }
4496 conf->fullsync = 0;
4497}
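The read-ahead sizing at the end of end_reshape() is plain arithmetic: one stripe of data is raid_disks chunks divided by near_copies, and the queue is asked to read ahead at least two stripes. A runnable sketch with assumed example values (4K pages, 512K chunks, 8 devices, 2 near copies), none of which come from the patch itself:

#include <stdio.h>

#define PAGE_SIZE 4096UL        /* assumed 4K pages */

/* Read-ahead, in pages, needed to cover two whole stripes of data. */
static unsigned long ra_pages_for(int raid_disks,
                                  unsigned long chunk_sectors,
                                  int near_copies)
{
        unsigned long stripe = raid_disks * ((chunk_sectors << 9) / PAGE_SIZE);

        stripe /= near_copies;
        return 2 * stripe;
}

int main(void)
{
        /* 8 devices, 512K chunks (1024 sectors), near_copies == 2 */
        printf("%lu pages\n", ra_pages_for(8, 1024, 2)); /* 1024 pages == 4 MiB */
        return 0;
}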
4498
4499
4500static int handle_reshape_read_error(struct mddev *mddev,
4501 struct r10bio *r10_bio)
4502{
4503 /* Use sync reads to get the blocks from somewhere else */
4504 int sectors = r10_bio->sectors;
4505 struct r10conf *conf = mddev->private;
4506 struct {
4507 struct r10bio r10_bio;
4508 struct r10dev devs[conf->copies];
4509 } on_stack;
4510 struct r10bio *r10b = &on_stack.r10_bio;
4511 int slot = 0;
4512 int idx = 0;
4513 struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
4514
4515 r10b->sector = r10_bio->sector;
4516 __raid10_find_phys(&conf->prev, r10b);
4517
4518 while (sectors) {
4519 int s = sectors;
4520 int success = 0;
4521 int first_slot = slot;
4522
4523 if (s > (PAGE_SIZE >> 9))
4524 s = PAGE_SIZE >> 9;
4525
4526 while (!success) {
4527 int d = r10b->devs[slot].devnum;
4528 struct md_rdev *rdev = conf->mirrors[d].rdev;
4529 sector_t addr;
4530 if (rdev == NULL ||
4531 test_bit(Faulty, &rdev->flags) ||
4532 !test_bit(In_sync, &rdev->flags))
4533 goto failed;
4534
4535 addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
4536 success = sync_page_io(rdev,
4537 addr,
4538 s << 9,
4539 bvec[idx].bv_page,
4540 READ, false);
4541 if (success)
4542 break;
4543 failed:
4544 slot++;
4545 if (slot >= conf->copies)
4546 slot = 0;
4547 if (slot == first_slot)
4548 break;
4549 }
4550 if (!success) {
4551 /* couldn't read this block, must give up */
4552 set_bit(MD_RECOVERY_INTR,
4553 &mddev->recovery);
4554 return -EIO;
4555 }
4556 sectors -= s;
4557 idx++;
4558 }
4559 return 0;
4560}
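handle_reshape_read_error() falls back to trying each copy in turn for every page-sized block, remembering which slot worked so the next block starts there, and giving up only once a full cycle of copies has failed. A compact sketch of that retry policy, with try_read() as a hypothetical stand-in for the rdev checks plus sync_page_io():

/* Try each copy in turn, starting from wherever the previous block
 * succeeded, and give up once every copy has been tried for this block.
 */
static int read_one_block(int copies, int *slot, int (*try_read)(int slot))
{
        int first_slot = *slot;

        do {
                if (try_read(*slot))
                        return 0;                       /* success */
                *slot = (*slot + 1) % copies;           /* next copy */
        } while (*slot != first_slot);

        return -1;                                      /* every copy failed */
}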
4561
4562static void end_reshape_write(struct bio *bio, int error)
4563{
4564 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
4565 struct r10bio *r10_bio = bio->bi_private;
4566 struct mddev *mddev = r10_bio->mddev;
4567 struct r10conf *conf = mddev->private;
4568 int d;
4569 int slot;
4570 int repl;
4571 struct md_rdev *rdev = NULL;
4572
4573 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4574 if (repl)
4575 rdev = conf->mirrors[d].replacement;
4576 if (!rdev) {
4577 smp_mb();
4578 rdev = conf->mirrors[d].rdev;
4579 }
4580
4581 if (!uptodate) {
4582 /* FIXME should record badblock */
4583 md_error(mddev, rdev);
4584 }
4585
4586 rdev_dec_pending(rdev, mddev);
4587 end_reshape_request(r10_bio);
4588}
4589
4590static void end_reshape_request(struct r10bio *r10_bio)
4591{
4592 if (!atomic_dec_and_test(&r10_bio->remaining))
4593 return;
4594 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4595 bio_put(r10_bio->master_bio);
4596 put_buf(r10_bio);
4597}
4598
4599static void raid10_finish_reshape(struct mddev *mddev)
4600{
4601 struct r10conf *conf = mddev->private;
4602
4603 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4604 return;
4605
4606 if (mddev->delta_disks > 0) {
4607 sector_t size = raid10_size(mddev, 0, 0);
4608 md_set_array_sectors(mddev, size);
4609 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4610 mddev->recovery_cp = mddev->resync_max_sectors;
4611 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4612 }
4613 mddev->resync_max_sectors = size;
4614 set_capacity(mddev->gendisk, mddev->array_sectors);
4615 revalidate_disk(mddev->gendisk);
4616 } else {
4617 int d;
4618 for (d = conf->geo.raid_disks ;
4619 d < conf->geo.raid_disks - mddev->delta_disks;
4620 d++) {
4621 struct md_rdev *rdev = conf->mirrors[d].rdev;
4622 if (rdev)
4623 clear_bit(In_sync, &rdev->flags);
4624 rdev = conf->mirrors[d].replacement;
4625 if (rdev)
4626 clear_bit(In_sync, &rdev->flags);
4627 }
4628 }
4629 mddev->layout = mddev->new_layout;
4630 mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4631 mddev->reshape_position = MaxSector;
4632 mddev->delta_disks = 0;
4633 mddev->reshape_backwards = 0;
4634}
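One detail in the shrink branch of raid10_finish_reshape() that is easy to misread: by the time it runs, conf->geo.raid_disks already holds the new, smaller count and delta_disks is negative, so the loop walks exactly the slots being dropped. A tiny sketch of that index range, with an example in the comment:

/* Example: new geometry has 4 disks and delta_disks == -2, so the loop
 * above visits slots 4 and 5 -- i.e. [raid_disks, raid_disks - delta_disks).
 */
static void removed_slot_range(int raid_disks, int delta_disks,
                               int *first, int *past_last)
{
        *first = raid_disks;                    /* first slot being dropped */
        *past_last = raid_disks - delta_disks;  /* exclusive upper bound */
}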
4635
4636static struct md_personality raid10_personality =
4637{ 3054{
4638 .name = "raid10", 3055 .name = "raid10",
4639 .level = 10, 3056 .level = 10,
@@ -4649,11 +3066,7 @@ static struct md_personality raid10_personality =
4649 .sync_request = sync_request, 3066 .sync_request = sync_request,
4650 .quiesce = raid10_quiesce, 3067 .quiesce = raid10_quiesce,
4651 .size = raid10_size, 3068 .size = raid10_size,
4652 .resize = raid10_resize,
4653 .takeover = raid10_takeover, 3069 .takeover = raid10_takeover,
4654 .check_reshape = raid10_check_reshape,
4655 .start_reshape = raid10_start_reshape,
4656 .finish_reshape = raid10_finish_reshape,
4657}; 3070};
4658 3071
4659static int __init raid_init(void) 3072static int __init raid_init(void)
@@ -4673,5 +3086,3 @@ MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
4673MODULE_ALIAS("md-personality-9"); /* RAID10 */ 3086MODULE_ALIAS("md-personality-9"); /* RAID10 */
4674MODULE_ALIAS("md-raid10"); 3087MODULE_ALIAS("md-raid10");
4675MODULE_ALIAS("md-level-10"); 3088MODULE_ALIAS("md-level-10");
4676
4677module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 1054cf60234..79cb52a0d4a 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -1,8 +1,10 @@
1#ifndef _RAID10_H 1#ifndef _RAID10_H
2#define _RAID10_H 2#define _RAID10_H
3 3
4struct raid10_info { 4typedef struct mirror_info mirror_info_t;
5 struct md_rdev *rdev, *replacement; 5
6struct mirror_info {
7 mdk_rdev_t *rdev;
6 sector_t head_position; 8 sector_t head_position;
7 int recovery_disabled; /* matches 9 int recovery_disabled; /* matches
8 * mddev->recovery_disabled 10 * mddev->recovery_disabled
@@ -11,72 +13,66 @@ struct raid10_info {
11 */ 13 */
12}; 14};
13 15
14struct r10conf { 16typedef struct r10bio_s r10bio_t;
15 struct mddev *mddev; 17
16 struct raid10_info *mirrors; 18struct r10_private_data_s {
17 struct raid10_info *mirrors_new, *mirrors_old; 19 mddev_t *mddev;
20 mirror_info_t *mirrors;
21 int raid_disks;
18 spinlock_t device_lock; 22 spinlock_t device_lock;
19 23
20 /* geometry */ 24 /* geometry */
21 struct geom { 25 int near_copies; /* number of copies laid out raid0 style */
22 int raid_disks; 26 int far_copies; /* number of copies laid out
23 int near_copies; /* number of copies laid out
24 * raid0 style */
25 int far_copies; /* number of copies laid out
26 * at large strides across drives 27 * at large strides across drives
27 */ 28 */
28 int far_offset; /* far_copies are offset by 1 29 int far_offset; /* far_copies are offset by 1 stripe
29 * stripe instead of many 30 * instead of many
31 */
32 int copies; /* near_copies * far_copies.
33 * must be <= raid_disks
30 */ 34 */
31 sector_t stride; /* distance between far copies. 35 sector_t stride; /* distance between far copies.
32 * This is size / far_copies unless 36 * This is size / far_copies unless
33 * far_offset, in which case it is 37 * far_offset, in which case it is
34 * 1 stripe. 38 * 1 stripe.
35 */ 39 */
36 int chunk_shift; /* shift from chunks to sectors */
37 sector_t chunk_mask;
38 } prev, geo;
39 int copies; /* near_copies * far_copies.
40 * must be <= raid_disks
41 */
42 40
43 sector_t dev_sectors; /* temp copy of 41 sector_t dev_sectors; /* temp copy of mddev->dev_sectors */
44 * mddev->dev_sectors */ 42
45 sector_t reshape_progress; 43 int chunk_shift; /* shift from chunks to sectors */
46 sector_t reshape_safe; 44 sector_t chunk_mask;
47 unsigned long reshape_checkpoint;
48 sector_t offset_diff;
49 45
50 struct list_head retry_list; 46 struct list_head retry_list;
51 /* queue pending writes and submit them on unplug */ 47 /* queue pending writes and submit them on unplug */
52 struct bio_list pending_bio_list; 48 struct bio_list pending_bio_list;
53 int pending_count; 49
54 50
55 spinlock_t resync_lock; 51 spinlock_t resync_lock;
56 int nr_pending; 52 int nr_pending;
57 int nr_waiting; 53 int nr_waiting;
58 int nr_queued; 54 int nr_queued;
59 int barrier; 55 int barrier;
60 sector_t next_resync; 56 sector_t next_resync;
61 int fullsync; /* set to 1 if a full sync is needed, 57 int fullsync; /* set to 1 if a full sync is needed,
62 * (fresh device added). 58 * (fresh device added).
63 * Cleared when a sync completes. 59 * Cleared when a sync completes.
64 */ 60 */
65 int have_replacement; /* There is at least one 61
66 * replacement device.
67 */
68 wait_queue_head_t wait_barrier; 62 wait_queue_head_t wait_barrier;
69 63
70 mempool_t *r10bio_pool; 64 mempool_t *r10bio_pool;
71 mempool_t *r10buf_pool; 65 mempool_t *r10buf_pool;
72 struct page *tmppage; 66 struct page *tmppage;
73 67
74 /* When taking over an array from a different personality, we store 68 /* When taking over an array from a different personality, we store
75 * the new thread here until we fully activate the array. 69 * the new thread here until we fully activate the array.
76 */ 70 */
77 struct md_thread *thread; 71 struct mdk_thread_s *thread;
78}; 72};
79 73
74typedef struct r10_private_data_s conf_t;
75
80/* 76/*
81 * this is our 'private' RAID10 bio. 77 * this is our 'private' RAID10 bio.
82 * 78 *
@@ -84,14 +80,14 @@ struct r10conf {
84 * for this RAID10 operation, and about their status: 80 * for this RAID10 operation, and about their status:
85 */ 81 */
86 82
87struct r10bio { 83struct r10bio_s {
88 atomic_t remaining; /* 'have we finished' count, 84 atomic_t remaining; /* 'have we finished' count,
89 * used from IRQ handlers 85 * used from IRQ handlers
90 */ 86 */
91 sector_t sector; /* virtual sector number */ 87 sector_t sector; /* virtual sector number */
92 int sectors; 88 int sectors;
93 unsigned long state; 89 unsigned long state;
94 struct mddev *mddev; 90 mddev_t *mddev;
95 /* 91 /*
96 * original bio going to /dev/mdx 92 * original bio going to /dev/mdx
97 */ 93 */
@@ -108,44 +104,40 @@ struct r10bio {
108 * When resyncing we also use one for each copy. 104 * When resyncing we also use one for each copy.
109 * When reconstructing, we use 2 bios, one for read, one for write. 105 * When reconstructing, we use 2 bios, one for read, one for write.
110 * We choose the number when they are allocated. 106 * We choose the number when they are allocated.
111 * We sometimes need an extra bio to write to the replacement.
112 */ 107 */
113 struct r10dev { 108 struct {
114 struct bio *bio; 109 struct bio *bio;
115 union { 110 sector_t addr;
116 struct bio *repl_bio; /* used for resync and 111 int devnum;
117 * writes */
118 struct md_rdev *rdev; /* used for reads
119 * (read_slot >= 0) */
120 };
121 sector_t addr;
122 int devnum;
123 } devs[0]; 112 } devs[0];
124}; 113};
125 114
115/* when we get a read error on a read-only array, we redirect to another
116 * device without failing the first device, or trying to over-write to
117 * correct the read error. To keep track of bad blocks on a per-bio
118 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
119 */
120#define IO_BLOCKED ((struct bio*)1)
121/* When we successfully write to a known bad-block, we need to remove the
122 * bad-block marking which must be done from process context. So we record
123 * the success by setting devs[n].bio to IO_MADE_GOOD
124 */
125#define IO_MADE_GOOD ((struct bio *)2)
126
127#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
128
126/* bits for r10bio.state */ 129/* bits for r10bio.state */
127enum r10bio_state { 130#define R10BIO_Uptodate 0
128 R10BIO_Uptodate, 131#define R10BIO_IsSync 1
129 R10BIO_IsSync, 132#define R10BIO_IsRecover 2
130 R10BIO_IsRecover, 133#define R10BIO_Degraded 3
131 R10BIO_IsReshape,
132 R10BIO_Degraded,
133/* Set ReadError on bios that experience a read error 134/* Set ReadError on bios that experience a read error
134 * so that raid10d knows what to do with them. 135 * so that raid10d knows what to do with them.
135 */ 136 */
136 R10BIO_ReadError, 137#define R10BIO_ReadError 4
137/* If a write for this request means we can clear some 138/* If a write for this request means we can clear some
138 * known-bad-block records, we set this flag. 139 * known-bad-block records, we set this flag.
139 */ 140 */
140 R10BIO_MadeGood, 141#define R10BIO_MadeGood 5
141 R10BIO_WriteError, 142#define R10BIO_WriteError 6
142/* During a reshape we might be performing IO on the
143 * 'previous' part of the array, in which case this
144 * flag is set
145 */
146 R10BIO_Previous,
147};
148
149extern int md_raid10_congested(struct mddev *mddev, int bits);
150
151#endif 143#endif
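Both sides of this header keep the per-copy slots in a trailing devs[0] array at the end of the r10bio structure (and handle_reshape_read_error() builds the same shape on the stack), so a single allocation holds the header plus 'copies' slots. A minimal sketch of that layout using the C99 flexible-array spelling; the field names are pared down for illustration:

#include <stdlib.h>

struct dev_slot {
        void *bio;
        unsigned long long addr;
        int devnum;
};

struct r10bio_like {
        unsigned long long sector;
        int sectors;
        struct dev_slot devs[];         /* C99 flexible array member;
                                         * the kernel spells it devs[0] */
};

/* One allocation carries the header plus one slot per copy. */
static struct r10bio_like *alloc_r10bio_like(int copies)
{
        return malloc(sizeof(struct r10bio_like) +
                      copies * sizeof(struct dev_slot));
}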
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 19d77a02663..b6200c3935c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -47,14 +47,11 @@
47#include <linux/kthread.h> 47#include <linux/kthread.h>
48#include <linux/raid/pq.h> 48#include <linux/raid/pq.h>
49#include <linux/async_tx.h> 49#include <linux/async_tx.h>
50#include <linux/module.h>
51#include <linux/async.h> 50#include <linux/async.h>
52#include <linux/seq_file.h> 51#include <linux/seq_file.h>
53#include <linux/cpu.h> 52#include <linux/cpu.h>
54#include <linux/slab.h> 53#include <linux/slab.h>
55#include <linux/ratelimit.h> 54#include <linux/ratelimit.h>
56#include <trace/events/block.h>
57
58#include "md.h" 55#include "md.h"
59#include "raid5.h" 56#include "raid5.h"
60#include "raid0.h" 57#include "raid0.h"
@@ -73,11 +70,7 @@
73#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) 70#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
74#define HASH_MASK (NR_HASH - 1) 71#define HASH_MASK (NR_HASH - 1)
75 72
76static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) 73#define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
77{
78 int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
79 return &conf->stripe_hashtbl[hash];
80}
81 74
82/* bio's attached to a stripe+device for I/O are linked together in bi_sector 75/* bio's attached to a stripe+device for I/O are linked together in bi_sector
83 * order without overlap. There may be several bio's per stripe+device, and 76 * order without overlap. There may be several bio's per stripe+device, and
@@ -85,56 +78,57 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
85 * When walking this list for a particular stripe+device, we must never proceed 78 * When walking this list for a particular stripe+device, we must never proceed
86 * beyond a bio that extends past this device, as the next bio might no longer 79 * beyond a bio that extends past this device, as the next bio might no longer
87 * be valid. 80 * be valid.
88 * This function is used to determine the 'next' bio in the list, given the sector 81 * This macro is used to determine the 'next' bio in the list, given the sector
89 * of the current stripe+device 82 * of the current stripe+device
90 */ 83 */
91static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) 84#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
92{ 85/*
93 int sectors = bio->bi_size >> 9; 86 * The following can be used to debug the driver
94 if (bio->bi_sector + sectors < sector + STRIPE_SECTORS) 87 */
95 return bio->bi_next; 88#define RAID5_PARANOIA 1
96 else 89#if RAID5_PARANOIA && defined(CONFIG_SMP)
97 return NULL; 90# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
98} 91#else
92# define CHECK_DEVLOCK()
93#endif
94
95#ifdef DEBUG
96#define inline
97#define __inline__
98#endif
99 99
100/* 100/*
101 * We maintain a biased count of active stripes in the bottom 16 bits of 101 * We maintain a biased count of active stripes in the bottom 16 bits of
102 * bi_phys_segments, and a count of processed stripes in the upper 16 bits 102 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
103 */ 103 */
104static inline int raid5_bi_processed_stripes(struct bio *bio) 104static inline int raid5_bi_phys_segments(struct bio *bio)
105{ 105{
106 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 106 return bio->bi_phys_segments & 0xffff;
107 return (atomic_read(segments) >> 16) & 0xffff;
108} 107}
109 108
110static inline int raid5_dec_bi_active_stripes(struct bio *bio) 109static inline int raid5_bi_hw_segments(struct bio *bio)
111{ 110{
112 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 111 return (bio->bi_phys_segments >> 16) & 0xffff;
113 return atomic_sub_return(1, segments) & 0xffff;
114} 112}
115 113
116static inline void raid5_inc_bi_active_stripes(struct bio *bio) 114static inline int raid5_dec_bi_phys_segments(struct bio *bio)
117{ 115{
118 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 116 --bio->bi_phys_segments;
119 atomic_inc(segments); 117 return raid5_bi_phys_segments(bio);
120} 118}
121 119
122static inline void raid5_set_bi_processed_stripes(struct bio *bio, 120static inline int raid5_dec_bi_hw_segments(struct bio *bio)
123 unsigned int cnt)
124{ 121{
125 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 122 unsigned short val = raid5_bi_hw_segments(bio);
126 int old, new;
127 123
128 do { 124 --val;
129 old = atomic_read(segments); 125 bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
130 new = (old & 0xffff) | (cnt << 16); 126 return val;
131 } while (atomic_cmpxchg(segments, old, new) != old);
132} 127}
133 128
134static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt) 129static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
135{ 130{
136 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 131 bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
137 atomic_set(segments, cnt);
138} 132}
139 133
140/* Find first data disk in a raid6 stripe */ 134/* Find first data disk in a raid6 stripe */
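The comment in this hunk ("a biased count of active stripes in the bottom 16 bits ... and a count of processed stripes in the upper 16 bits") describes a simple bit-packing of two counters into one 32-bit field; the pre-patch helpers just manipulate it through atomics. A plain, runnable sketch of the packing itself:

#include <stdio.h>

static unsigned int low16(unsigned int v)  { return v & 0xffff; }
static unsigned int high16(unsigned int v) { return (v >> 16) & 0xffff; }

static unsigned int pack(unsigned int active, unsigned int processed)
{
        return (processed << 16) | (active & 0xffff);
}

int main(void)
{
        unsigned int seg = pack(3, 7);

        printf("active=%u processed=%u\n", low16(seg), high16(seg));
        return 0;
}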
@@ -184,14 +178,12 @@ static void return_io(struct bio *return_bi)
184 return_bi = bi->bi_next; 178 return_bi = bi->bi_next;
185 bi->bi_next = NULL; 179 bi->bi_next = NULL;
186 bi->bi_size = 0; 180 bi->bi_size = 0;
187 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
188 bi, 0);
189 bio_endio(bi, 0); 181 bio_endio(bi, 0);
190 bi = return_bi; 182 bi = return_bi;
191 } 183 }
192} 184}
193 185
194static void print_raid5_conf (struct r5conf *conf); 186static void print_raid5_conf (raid5_conf_t *conf);
195 187
196static int stripe_operations_active(struct stripe_head *sh) 188static int stripe_operations_active(struct stripe_head *sh)
197{ 189{
@@ -200,56 +192,48 @@ static int stripe_operations_active(struct stripe_head *sh)
200 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 192 test_bit(STRIPE_COMPUTE_RUN, &sh->state);
201} 193}
202 194
203static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) 195static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
204{ 196{
205 BUG_ON(!list_empty(&sh->lru)); 197 if (atomic_dec_and_test(&sh->count)) {
206 BUG_ON(atomic_read(&conf->active_stripes)==0); 198 BUG_ON(!list_empty(&sh->lru));
207 if (test_bit(STRIPE_HANDLE, &sh->state)) { 199 BUG_ON(atomic_read(&conf->active_stripes)==0);
208 if (test_bit(STRIPE_DELAYED, &sh->state) && 200 if (test_bit(STRIPE_HANDLE, &sh->state)) {
209 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 201 if (test_bit(STRIPE_DELAYED, &sh->state))
210 list_add_tail(&sh->lru, &conf->delayed_list); 202 list_add_tail(&sh->lru, &conf->delayed_list);
211 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 203 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
212 sh->bm_seq - conf->seq_write > 0) 204 sh->bm_seq - conf->seq_write > 0)
213 list_add_tail(&sh->lru, &conf->bitmap_list); 205 list_add_tail(&sh->lru, &conf->bitmap_list);
214 else { 206 else {
215 clear_bit(STRIPE_DELAYED, &sh->state); 207 clear_bit(STRIPE_BIT_DELAY, &sh->state);
216 clear_bit(STRIPE_BIT_DELAY, &sh->state); 208 list_add_tail(&sh->lru, &conf->handle_list);
217 list_add_tail(&sh->lru, &conf->handle_list); 209 }
218 } 210 md_wakeup_thread(conf->mddev->thread);
219 md_wakeup_thread(conf->mddev->thread); 211 } else {
220 } else { 212 BUG_ON(stripe_operations_active(sh));
221 BUG_ON(stripe_operations_active(sh)); 213 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
222 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 214 atomic_dec(&conf->preread_active_stripes);
223 if (atomic_dec_return(&conf->preread_active_stripes) 215 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
224 < IO_THRESHOLD) 216 md_wakeup_thread(conf->mddev->thread);
225 md_wakeup_thread(conf->mddev->thread); 217 }
226 atomic_dec(&conf->active_stripes); 218 atomic_dec(&conf->active_stripes);
227 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 219 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
228 list_add_tail(&sh->lru, &conf->inactive_list); 220 list_add_tail(&sh->lru, &conf->inactive_list);
229 wake_up(&conf->wait_for_stripe); 221 wake_up(&conf->wait_for_stripe);
230 if (conf->retry_read_aligned) 222 if (conf->retry_read_aligned)
231 md_wakeup_thread(conf->mddev->thread); 223 md_wakeup_thread(conf->mddev->thread);
224 }
232 } 225 }
233 } 226 }
234} 227}
235 228
236static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
237{
238 if (atomic_dec_and_test(&sh->count))
239 do_release_stripe(conf, sh);
240}
241
242static void release_stripe(struct stripe_head *sh) 229static void release_stripe(struct stripe_head *sh)
243{ 230{
244 struct r5conf *conf = sh->raid_conf; 231 raid5_conf_t *conf = sh->raid_conf;
245 unsigned long flags; 232 unsigned long flags;
246 233
247 local_irq_save(flags); 234 spin_lock_irqsave(&conf->device_lock, flags);
248 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 235 __release_stripe(conf, sh);
249 do_release_stripe(conf, sh); 236 spin_unlock_irqrestore(&conf->device_lock, flags);
250 spin_unlock(&conf->device_lock);
251 }
252 local_irq_restore(flags);
253} 237}
254 238
255static inline void remove_hash(struct stripe_head *sh) 239static inline void remove_hash(struct stripe_head *sh)
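The pre-patch release_stripe() in this hunk uses atomic_dec_and_lock() so that device_lock is only taken when the stripe's reference count actually reaches zero, while the post-patch code takes the spinlock on every release. A user-space sketch of that fast-path/slow-path split, with pthread and C11 primitives standing in for the kernel's spinlock_t and atomic_t:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

/* Returns true with 'lock' held if the count just dropped to zero;
 * otherwise decrements without ever touching the lock. */
static bool dec_and_lock(atomic_int *cnt, pthread_mutex_t *lock)
{
        int old = atomic_load(cnt);

        while (old > 1) {                       /* fast path: no lock needed */
                if (atomic_compare_exchange_weak(cnt, &old, old - 1))
                        return false;
        }
        pthread_mutex_lock(lock);               /* slow path: might hit zero */
        if (atomic_fetch_sub(cnt, 1) == 1)
                return true;                    /* caller releases the lock */
        pthread_mutex_unlock(lock);
        return false;
}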
@@ -260,23 +244,25 @@ static inline void remove_hash(struct stripe_head *sh)
260 hlist_del_init(&sh->hash); 244 hlist_del_init(&sh->hash);
261} 245}
262 246
263static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) 247static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
264{ 248{
265 struct hlist_head *hp = stripe_hash(conf, sh->sector); 249 struct hlist_head *hp = stripe_hash(conf, sh->sector);
266 250
267 pr_debug("insert_hash(), stripe %llu\n", 251 pr_debug("insert_hash(), stripe %llu\n",
268 (unsigned long long)sh->sector); 252 (unsigned long long)sh->sector);
269 253
254 CHECK_DEVLOCK();
270 hlist_add_head(&sh->hash, hp); 255 hlist_add_head(&sh->hash, hp);
271} 256}
272 257
273 258
274/* find an idle stripe, make sure it is unhashed, and return it. */ 259/* find an idle stripe, make sure it is unhashed, and return it. */
275static struct stripe_head *get_free_stripe(struct r5conf *conf) 260static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
276{ 261{
277 struct stripe_head *sh = NULL; 262 struct stripe_head *sh = NULL;
278 struct list_head *first; 263 struct list_head *first;
279 264
265 CHECK_DEVLOCK();
280 if (list_empty(&conf->inactive_list)) 266 if (list_empty(&conf->inactive_list))
281 goto out; 267 goto out;
282 first = conf->inactive_list.next; 268 first = conf->inactive_list.next;
@@ -320,18 +306,19 @@ static int grow_buffers(struct stripe_head *sh)
320} 306}
321 307
322static void raid5_build_block(struct stripe_head *sh, int i, int previous); 308static void raid5_build_block(struct stripe_head *sh, int i, int previous);
323static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 309static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
324 struct stripe_head *sh); 310 struct stripe_head *sh);
325 311
326static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 312static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
327{ 313{
328 struct r5conf *conf = sh->raid_conf; 314 raid5_conf_t *conf = sh->raid_conf;
329 int i; 315 int i;
330 316
331 BUG_ON(atomic_read(&sh->count) != 0); 317 BUG_ON(atomic_read(&sh->count) != 0);
332 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 318 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
333 BUG_ON(stripe_operations_active(sh)); 319 BUG_ON(stripe_operations_active(sh));
334 320
321 CHECK_DEVLOCK();
335 pr_debug("init_stripe called, stripe %llu\n", 322 pr_debug("init_stripe called, stripe %llu\n",
336 (unsigned long long)sh->sector); 323 (unsigned long long)sh->sector);
337 324
@@ -361,12 +348,13 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
361 insert_hash(conf, sh); 348 insert_hash(conf, sh);
362} 349}
363 350
364static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 351static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
365 short generation) 352 short generation)
366{ 353{
367 struct stripe_head *sh; 354 struct stripe_head *sh;
368 struct hlist_node *hn; 355 struct hlist_node *hn;
369 356
357 CHECK_DEVLOCK();
370 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 358 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
371 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) 359 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
372 if (sh->sector == sector && sh->generation == generation) 360 if (sh->sector == sector && sh->generation == generation)
@@ -388,17 +376,17 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
388 * of the two sections, and some non-in_sync devices may 376 * of the two sections, and some non-in_sync devices may
389 * be insync in the section most affected by failed devices. 377 * be insync in the section most affected by failed devices.
390 */ 378 */
391static int calc_degraded(struct r5conf *conf) 379static int has_failed(raid5_conf_t *conf)
392{ 380{
393 int degraded, degraded2; 381 int degraded;
394 int i; 382 int i;
383 if (conf->mddev->reshape_position == MaxSector)
384 return conf->mddev->degraded > conf->max_degraded;
395 385
396 rcu_read_lock(); 386 rcu_read_lock();
397 degraded = 0; 387 degraded = 0;
398 for (i = 0; i < conf->previous_raid_disks; i++) { 388 for (i = 0; i < conf->previous_raid_disks; i++) {
399 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 389 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
400 if (rdev && test_bit(Faulty, &rdev->flags))
401 rdev = rcu_dereference(conf->disks[i].replacement);
402 if (!rdev || test_bit(Faulty, &rdev->flags)) 390 if (!rdev || test_bit(Faulty, &rdev->flags))
403 degraded++; 391 degraded++;
404 else if (test_bit(In_sync, &rdev->flags)) 392 else if (test_bit(In_sync, &rdev->flags))
@@ -417,16 +405,14 @@ static int calc_degraded(struct r5conf *conf)
417 degraded++; 405 degraded++;
418 } 406 }
419 rcu_read_unlock(); 407 rcu_read_unlock();
420 if (conf->raid_disks == conf->previous_raid_disks) 408 if (degraded > conf->max_degraded)
421 return degraded; 409 return 1;
422 rcu_read_lock(); 410 rcu_read_lock();
423 degraded2 = 0; 411 degraded = 0;
424 for (i = 0; i < conf->raid_disks; i++) { 412 for (i = 0; i < conf->raid_disks; i++) {
425 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 413 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
426 if (rdev && test_bit(Faulty, &rdev->flags))
427 rdev = rcu_dereference(conf->disks[i].replacement);
428 if (!rdev || test_bit(Faulty, &rdev->flags)) 414 if (!rdev || test_bit(Faulty, &rdev->flags))
429 degraded2++; 415 degraded++;
430 else if (test_bit(In_sync, &rdev->flags)) 416 else if (test_bit(In_sync, &rdev->flags))
431 ; 417 ;
432 else 418 else
@@ -436,29 +422,16 @@ static int calc_degraded(struct r5conf *conf)
436 * almost certainly hasn't. 422 * almost certainly hasn't.
437 */ 423 */
438 if (conf->raid_disks <= conf->previous_raid_disks) 424 if (conf->raid_disks <= conf->previous_raid_disks)
439 degraded2++; 425 degraded++;
440 } 426 }
441 rcu_read_unlock(); 427 rcu_read_unlock();
442 if (degraded2 > degraded)
443 return degraded2;
444 return degraded;
445}
446
447static int has_failed(struct r5conf *conf)
448{
449 int degraded;
450
451 if (conf->mddev->reshape_position == MaxSector)
452 return conf->mddev->degraded > conf->max_degraded;
453
454 degraded = calc_degraded(conf);
455 if (degraded > conf->max_degraded) 428 if (degraded > conf->max_degraded)
456 return 1; 429 return 1;
457 return 0; 430 return 0;
458} 431}
459 432
460static struct stripe_head * 433static struct stripe_head *
461get_active_stripe(struct r5conf *conf, sector_t sector, 434get_active_stripe(raid5_conf_t *conf, sector_t sector,
462 int previous, int noblock, int noquiesce) 435 int previous, int noblock, int noquiesce)
463{ 436{
464 struct stripe_head *sh; 437 struct stripe_head *sh;
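The pre-patch calc_degraded() on the left of this hunk counts unusable members against both the old and the new disk count during a reshape and returns the worse of the two; has_failed() then compares that with max_degraded. The sketch below keeps only that shape: it deliberately ignores the in-kernel special case for devices that are present but not yet in sync, and the in_sync[] flag array is a stand-in for the rdev checks.

/* in_sync[i] != 0 means slot i has a usable, in-sync member. */
static int count_missing(const int *in_sync, int disks)
{
        int i, missing = 0;

        for (i = 0; i < disks; i++)
                if (!in_sync[i])
                        missing++;
        return missing;
}

static int calc_degraded_like(const int *in_sync,
                              int previous_raid_disks, int raid_disks)
{
        int d_old = count_missing(in_sync, previous_raid_disks);
        int d_new = count_missing(in_sync, raid_disks);

        return d_new > d_old ? d_new : d_old;
}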
@@ -470,7 +443,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
470 do { 443 do {
471 wait_event_lock_irq(conf->wait_for_stripe, 444 wait_event_lock_irq(conf->wait_for_stripe,
472 conf->quiesce == 0 || noquiesce, 445 conf->quiesce == 0 || noquiesce,
473 conf->device_lock); 446 conf->device_lock, /* nothing */);
474 sh = __find_stripe(conf, sector, conf->generation - previous); 447 sh = __find_stripe(conf, sector, conf->generation - previous);
475 if (!sh) { 448 if (!sh) {
476 if (!conf->inactive_blocked) 449 if (!conf->inactive_blocked)
@@ -484,15 +457,15 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
484 (atomic_read(&conf->active_stripes) 457 (atomic_read(&conf->active_stripes)
485 < (conf->max_nr_stripes *3/4) 458 < (conf->max_nr_stripes *3/4)
486 || !conf->inactive_blocked), 459 || !conf->inactive_blocked),
487 conf->device_lock); 460 conf->device_lock,
461 );
488 conf->inactive_blocked = 0; 462 conf->inactive_blocked = 0;
489 } else 463 } else
490 init_stripe(sh, sector, previous); 464 init_stripe(sh, sector, previous);
491 } else { 465 } else {
492 if (atomic_read(&sh->count)) { 466 if (atomic_read(&sh->count)) {
493 BUG_ON(!list_empty(&sh->lru) 467 BUG_ON(!list_empty(&sh->lru)
494 && !test_bit(STRIPE_EXPANDING, &sh->state) 468 && !test_bit(STRIPE_EXPANDING, &sh->state));
495 && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state));
496 } else { 469 } else {
497 if (!test_bit(STRIPE_HANDLE, &sh->state)) 470 if (!test_bit(STRIPE_HANDLE, &sh->state))
498 atomic_inc(&conf->active_stripes); 471 atomic_inc(&conf->active_stripes);
@@ -511,27 +484,6 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
511 return sh; 484 return sh;
512} 485}
513 486
514/* Determine if 'data_offset' or 'new_data_offset' should be used
515 * in this stripe_head.
516 */
517static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
518{
519 sector_t progress = conf->reshape_progress;
520 /* Need a memory barrier to make sure we see the value
521 * of conf->generation, or ->data_offset that was set before
522 * reshape_progress was updated.
523 */
524 smp_rmb();
525 if (progress == MaxSector)
526 return 0;
527 if (sh->generation == conf->generation - 1)
528 return 0;
529 /* We are in a reshape, and this is a new-generation stripe,
530 * so use new_data_offset.
531 */
532 return 1;
533}
534
535static void 487static void
536raid5_end_read_request(struct bio *bi, int error); 488raid5_end_read_request(struct bio *bi, int error);
537static void 489static void
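The removed use_new_offset() helper (left column of this hunk) encodes one decision: during a reshape that moves data_offset, only stripes belonging to the current generation use new_data_offset; with no reshape running, or for previous-generation stripes, the old offset still applies. A small sketch of that test, with MAX_SECTOR standing in for the kernel's MaxSector:

#include <stdbool.h>
#include <stdint.h>

#define MAX_SECTOR UINT64_MAX   /* stand-in for the kernel's MaxSector */

static bool use_new_offset_like(uint64_t reshape_progress,
                                int stripe_generation, int conf_generation)
{
        if (reshape_progress == MAX_SECTOR)
                return false;           /* no reshape in progress */
        if (stripe_generation == conf_generation - 1)
                return false;           /* previous-generation stripe */
        return true;                    /* new layout: use new_data_offset */
}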
@@ -539,78 +491,43 @@ raid5_end_write_request(struct bio *bi, int error);
539 491
540static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 492static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
541{ 493{
542 struct r5conf *conf = sh->raid_conf; 494 raid5_conf_t *conf = sh->raid_conf;
543 int i, disks = sh->disks; 495 int i, disks = sh->disks;
544 496
545 might_sleep(); 497 might_sleep();
546 498
547 for (i = disks; i--; ) { 499 for (i = disks; i--; ) {
548 int rw; 500 int rw;
549 int replace_only = 0; 501 struct bio *bi;
550 struct bio *bi, *rbi; 502 mdk_rdev_t *rdev;
551 struct md_rdev *rdev, *rrdev = NULL;
552 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 503 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
553 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 504 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
554 rw = WRITE_FUA; 505 rw = WRITE_FUA;
555 else 506 else
556 rw = WRITE; 507 rw = WRITE;
557 if (test_bit(R5_Discard, &sh->dev[i].flags))
558 rw |= REQ_DISCARD;
559 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 508 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
560 rw = READ; 509 rw = READ;
561 else if (test_and_clear_bit(R5_WantReplace, 510 else
562 &sh->dev[i].flags)) {
563 rw = WRITE;
564 replace_only = 1;
565 } else
566 continue; 511 continue;
567 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
568 rw |= REQ_SYNC;
569 512
570 bi = &sh->dev[i].req; 513 bi = &sh->dev[i].req;
571 rbi = &sh->dev[i].rreq; /* For writing to replacement */
572 514
573 bi->bi_rw = rw; 515 bi->bi_rw = rw;
574 rbi->bi_rw = rw; 516 if (rw & WRITE)
575 if (rw & WRITE) {
576 bi->bi_end_io = raid5_end_write_request; 517 bi->bi_end_io = raid5_end_write_request;
577 rbi->bi_end_io = raid5_end_write_request; 518 else
578 } else
579 bi->bi_end_io = raid5_end_read_request; 519 bi->bi_end_io = raid5_end_read_request;
580 520
581 rcu_read_lock(); 521 rcu_read_lock();
582 rrdev = rcu_dereference(conf->disks[i].replacement);
583 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
584 rdev = rcu_dereference(conf->disks[i].rdev); 522 rdev = rcu_dereference(conf->disks[i].rdev);
585 if (!rdev) {
586 rdev = rrdev;
587 rrdev = NULL;
588 }
589 if (rw & WRITE) {
590 if (replace_only)
591 rdev = NULL;
592 if (rdev == rrdev)
593 /* We raced and saw duplicates */
594 rrdev = NULL;
595 } else {
596 if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
597 rdev = rrdev;
598 rrdev = NULL;
599 }
600
601 if (rdev && test_bit(Faulty, &rdev->flags)) 523 if (rdev && test_bit(Faulty, &rdev->flags))
602 rdev = NULL; 524 rdev = NULL;
603 if (rdev) 525 if (rdev)
604 atomic_inc(&rdev->nr_pending); 526 atomic_inc(&rdev->nr_pending);
605 if (rrdev && test_bit(Faulty, &rrdev->flags))
606 rrdev = NULL;
607 if (rrdev)
608 atomic_inc(&rrdev->nr_pending);
609 rcu_read_unlock(); 527 rcu_read_unlock();
610 528
611 /* We have already checked bad blocks for reads. Now 529 /* We have already checked bad blocks for reads. Now
612 * need to check for writes. We never accept write errors 530 * need to check for writes.
613	 * on the replacement, so we don't need to check rrdev. 531	 */
614 */ 531 */
615 while ((rw & WRITE) && rdev && 532 while ((rw & WRITE) && rdev &&
616 test_bit(WriteErrorSeen, &rdev->flags)) { 533 test_bit(WriteErrorSeen, &rdev->flags)) {
@@ -631,12 +548,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
631 * a chance*/ 548 * a chance*/
632 md_check_recovery(conf->mddev); 549 md_check_recovery(conf->mddev);
633 } 550 }
634 /*
635 * Because md_wait_for_blocked_rdev
636 * will dec nr_pending, we must
637 * increment it first.
638 */
639 atomic_inc(&rdev->nr_pending);
640 md_wait_for_blocked_rdev(rdev, conf->mddev); 551 md_wait_for_blocked_rdev(rdev, conf->mddev);
641 } else { 552 } else {
642 /* Acknowledged bad block - skip the write */ 553 /* Acknowledged bad block - skip the write */
@@ -646,8 +557,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
646 } 557 }
647 558
648 if (rdev) { 559 if (rdev) {
649 if (s->syncing || s->expanding || s->expanded 560 if (s->syncing || s->expanding || s->expanded)
650 || s->replacing)
651 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 561 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
652 562
653 set_bit(STRIPE_IO_STARTED, &sh->state); 563 set_bit(STRIPE_IO_STARTED, &sh->state);
@@ -657,59 +567,18 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
657 __func__, (unsigned long long)sh->sector, 567 __func__, (unsigned long long)sh->sector,
658 bi->bi_rw, i); 568 bi->bi_rw, i);
659 atomic_inc(&sh->count); 569 atomic_inc(&sh->count);
660 if (use_new_offset(conf, sh)) 570 bi->bi_sector = sh->sector + rdev->data_offset;
661 bi->bi_sector = (sh->sector
662 + rdev->new_data_offset);
663 else
664 bi->bi_sector = (sh->sector
665 + rdev->data_offset);
666 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
667 bi->bi_rw |= REQ_FLUSH;
668
669 bi->bi_flags = 1 << BIO_UPTODATE; 571 bi->bi_flags = 1 << BIO_UPTODATE;
572 bi->bi_vcnt = 1;
573 bi->bi_max_vecs = 1;
670 bi->bi_idx = 0; 574 bi->bi_idx = 0;
575 bi->bi_io_vec = &sh->dev[i].vec;
671 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 576 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
672 bi->bi_io_vec[0].bv_offset = 0; 577 bi->bi_io_vec[0].bv_offset = 0;
673 bi->bi_size = STRIPE_SIZE; 578 bi->bi_size = STRIPE_SIZE;
674 bi->bi_next = NULL; 579 bi->bi_next = NULL;
675 if (rrdev)
676 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
677 trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
678 bi, disk_devt(conf->mddev->gendisk),
679 sh->dev[i].sector);
680 generic_make_request(bi); 580 generic_make_request(bi);
681 } 581 } else {
682 if (rrdev) {
683 if (s->syncing || s->expanding || s->expanded
684 || s->replacing)
685 md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
686
687 set_bit(STRIPE_IO_STARTED, &sh->state);
688
689 rbi->bi_bdev = rrdev->bdev;
690 pr_debug("%s: for %llu schedule op %ld on "
691 "replacement disc %d\n",
692 __func__, (unsigned long long)sh->sector,
693 rbi->bi_rw, i);
694 atomic_inc(&sh->count);
695 if (use_new_offset(conf, sh))
696 rbi->bi_sector = (sh->sector
697 + rrdev->new_data_offset);
698 else
699 rbi->bi_sector = (sh->sector
700 + rrdev->data_offset);
701 rbi->bi_flags = 1 << BIO_UPTODATE;
702 rbi->bi_idx = 0;
703 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
704 rbi->bi_io_vec[0].bv_offset = 0;
705 rbi->bi_size = STRIPE_SIZE;
706 rbi->bi_next = NULL;
707 trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
708 rbi, disk_devt(conf->mddev->gendisk),
709 sh->dev[i].sector);
710 generic_make_request(rbi);
711 }
712 if (!rdev && !rrdev) {
713 if (rw & WRITE) 582 if (rw & WRITE)
714 set_bit(STRIPE_DEGRADED, &sh->state); 583 set_bit(STRIPE_DEGRADED, &sh->state);
715 pr_debug("skip op %ld on disc %d for sector %llu\n", 584 pr_debug("skip op %ld on disc %d for sector %llu\n",
@@ -781,12 +650,14 @@ static void ops_complete_biofill(void *stripe_head_ref)
781{ 650{
782 struct stripe_head *sh = stripe_head_ref; 651 struct stripe_head *sh = stripe_head_ref;
783 struct bio *return_bi = NULL; 652 struct bio *return_bi = NULL;
653 raid5_conf_t *conf = sh->raid_conf;
784 int i; 654 int i;
785 655
786 pr_debug("%s: stripe %llu\n", __func__, 656 pr_debug("%s: stripe %llu\n", __func__,
787 (unsigned long long)sh->sector); 657 (unsigned long long)sh->sector);
788 658
789 /* clear completed biofills */ 659 /* clear completed biofills */
660 spin_lock_irq(&conf->device_lock);
790 for (i = sh->disks; i--; ) { 661 for (i = sh->disks; i--; ) {
791 struct r5dev *dev = &sh->dev[i]; 662 struct r5dev *dev = &sh->dev[i];
792 663
@@ -804,7 +675,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
804 while (rbi && rbi->bi_sector < 675 while (rbi && rbi->bi_sector <
805 dev->sector + STRIPE_SECTORS) { 676 dev->sector + STRIPE_SECTORS) {
806 rbi2 = r5_next_bio(rbi, dev->sector); 677 rbi2 = r5_next_bio(rbi, dev->sector);
807 if (!raid5_dec_bi_active_stripes(rbi)) { 678 if (!raid5_dec_bi_phys_segments(rbi)) {
808 rbi->bi_next = return_bi; 679 rbi->bi_next = return_bi;
809 return_bi = rbi; 680 return_bi = rbi;
810 } 681 }
@@ -812,6 +683,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
812 } 683 }
813 } 684 }
814 } 685 }
686 spin_unlock_irq(&conf->device_lock);
815 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 687 clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
816 688
817 return_io(return_bi); 689 return_io(return_bi);
@@ -823,6 +695,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
823static void ops_run_biofill(struct stripe_head *sh) 695static void ops_run_biofill(struct stripe_head *sh)
824{ 696{
825 struct dma_async_tx_descriptor *tx = NULL; 697 struct dma_async_tx_descriptor *tx = NULL;
698 raid5_conf_t *conf = sh->raid_conf;
826 struct async_submit_ctl submit; 699 struct async_submit_ctl submit;
827 int i; 700 int i;
828 701
@@ -833,10 +706,10 @@ static void ops_run_biofill(struct stripe_head *sh)
833 struct r5dev *dev = &sh->dev[i]; 706 struct r5dev *dev = &sh->dev[i];
834 if (test_bit(R5_Wantfill, &dev->flags)) { 707 if (test_bit(R5_Wantfill, &dev->flags)) {
835 struct bio *rbi; 708 struct bio *rbi;
836 spin_lock_irq(&sh->stripe_lock); 709 spin_lock_irq(&conf->device_lock);
837 dev->read = rbi = dev->toread; 710 dev->read = rbi = dev->toread;
838 dev->toread = NULL; 711 dev->toread = NULL;
839 spin_unlock_irq(&sh->stripe_lock); 712 spin_unlock_irq(&conf->device_lock);
840 while (rbi && rbi->bi_sector < 713 while (rbi && rbi->bi_sector <
841 dev->sector + STRIPE_SECTORS) { 714 dev->sector + STRIPE_SECTORS) {
842 tx = async_copy_data(0, rbi, dev->page, 715 tx = async_copy_data(0, rbi, dev->page,
@@ -1172,24 +1045,19 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1172 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 1045 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
1173 struct bio *wbi; 1046 struct bio *wbi;
1174 1047
1175 spin_lock_irq(&sh->stripe_lock); 1048 spin_lock_irq(&sh->raid_conf->device_lock);
1176 chosen = dev->towrite; 1049 chosen = dev->towrite;
1177 dev->towrite = NULL; 1050 dev->towrite = NULL;
1178 BUG_ON(dev->written); 1051 BUG_ON(dev->written);
1179 wbi = dev->written = chosen; 1052 wbi = dev->written = chosen;
1180 spin_unlock_irq(&sh->stripe_lock); 1053 spin_unlock_irq(&sh->raid_conf->device_lock);
1181 1054
1182 while (wbi && wbi->bi_sector < 1055 while (wbi && wbi->bi_sector <
1183 dev->sector + STRIPE_SECTORS) { 1056 dev->sector + STRIPE_SECTORS) {
1184 if (wbi->bi_rw & REQ_FUA) 1057 if (wbi->bi_rw & REQ_FUA)
1185 set_bit(R5_WantFUA, &dev->flags); 1058 set_bit(R5_WantFUA, &dev->flags);
1186 if (wbi->bi_rw & REQ_SYNC) 1059 tx = async_copy_data(1, wbi, dev->page,
1187 set_bit(R5_SyncIO, &dev->flags); 1060 dev->sector, tx);
1188 if (wbi->bi_rw & REQ_DISCARD)
1189 set_bit(R5_Discard, &dev->flags);
1190 else
1191 tx = async_copy_data(1, wbi, dev->page,
1192 dev->sector, tx);
1193 wbi = r5_next_bio(wbi, dev->sector); 1061 wbi = r5_next_bio(wbi, dev->sector);
1194 } 1062 }
1195 } 1063 }
@@ -1205,27 +1073,21 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
1205 int pd_idx = sh->pd_idx; 1073 int pd_idx = sh->pd_idx;
1206 int qd_idx = sh->qd_idx; 1074 int qd_idx = sh->qd_idx;
1207 int i; 1075 int i;
1208 bool fua = false, sync = false, discard = false; 1076 bool fua = false;
1209 1077
1210 pr_debug("%s: stripe %llu\n", __func__, 1078 pr_debug("%s: stripe %llu\n", __func__,
1211 (unsigned long long)sh->sector); 1079 (unsigned long long)sh->sector);
1212 1080
1213 for (i = disks; i--; ) { 1081 for (i = disks; i--; )
1214 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1082 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1215 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
1216 discard |= test_bit(R5_Discard, &sh->dev[i].flags);
1217 }
1218 1083
1219 for (i = disks; i--; ) { 1084 for (i = disks; i--; ) {
1220 struct r5dev *dev = &sh->dev[i]; 1085 struct r5dev *dev = &sh->dev[i];
1221 1086
1222 if (dev->written || i == pd_idx || i == qd_idx) { 1087 if (dev->written || i == pd_idx || i == qd_idx) {
1223 if (!discard) 1088 set_bit(R5_UPTODATE, &dev->flags);
1224 set_bit(R5_UPTODATE, &dev->flags);
1225 if (fua) 1089 if (fua)
1226 set_bit(R5_WantFUA, &dev->flags); 1090 set_bit(R5_WantFUA, &dev->flags);
1227 if (sync)
1228 set_bit(R5_SyncIO, &dev->flags);
1229 } 1091 }
1230 } 1092 }
1231 1093
@@ -1257,18 +1119,6 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1257 pr_debug("%s: stripe %llu\n", __func__, 1119 pr_debug("%s: stripe %llu\n", __func__,
1258 (unsigned long long)sh->sector); 1120 (unsigned long long)sh->sector);
1259 1121
1260 for (i = 0; i < sh->disks; i++) {
1261 if (pd_idx == i)
1262 continue;
1263 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1264 break;
1265 }
1266 if (i >= sh->disks) {
1267 atomic_inc(&sh->count);
1268 set_bit(R5_Discard, &sh->dev[pd_idx].flags);
1269 ops_complete_reconstruct(sh);
1270 return;
1271 }
1272 /* check if prexor is active which means only process blocks 1122 /* check if prexor is active which means only process blocks
1273 * that are part of a read-modify-write (written) 1123 * that are part of a read-modify-write (written)
1274 */ 1124 */
@@ -1313,24 +1163,10 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1313{ 1163{
1314 struct async_submit_ctl submit; 1164 struct async_submit_ctl submit;
1315 struct page **blocks = percpu->scribble; 1165 struct page **blocks = percpu->scribble;
1316 int count, i; 1166 int count;
1317 1167
1318 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1168 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1319 1169
1320 for (i = 0; i < sh->disks; i++) {
1321 if (sh->pd_idx == i || sh->qd_idx == i)
1322 continue;
1323 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1324 break;
1325 }
1326 if (i >= sh->disks) {
1327 atomic_inc(&sh->count);
1328 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
1329 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
1330 ops_complete_reconstruct(sh);
1331 return;
1332 }
1333
1334 count = set_syndrome_sources(blocks, sh); 1170 count = set_syndrome_sources(blocks, sh);
1335 1171
1336 atomic_inc(&sh->count); 1172 atomic_inc(&sh->count);
@@ -1410,7 +1246,7 @@ static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1410{ 1246{
1411 int overlap_clear = 0, i, disks = sh->disks; 1247 int overlap_clear = 0, i, disks = sh->disks;
1412 struct dma_async_tx_descriptor *tx = NULL; 1248 struct dma_async_tx_descriptor *tx = NULL;
1413 struct r5conf *conf = sh->raid_conf; 1249 raid5_conf_t *conf = sh->raid_conf;
1414 int level = conf->level; 1250 int level = conf->level;
1415 struct raid5_percpu *percpu; 1251 struct raid5_percpu *percpu;
1416 unsigned long cpu; 1252 unsigned long cpu;
@@ -1501,7 +1337,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1501#define raid_run_ops __raid_run_ops 1337#define raid_run_ops __raid_run_ops
1502#endif 1338#endif
1503 1339
1504static int grow_one_stripe(struct r5conf *conf) 1340static int grow_one_stripe(raid5_conf_t *conf)
1505{ 1341{
1506 struct stripe_head *sh; 1342 struct stripe_head *sh;
1507 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); 1343 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
@@ -1513,8 +1349,6 @@ static int grow_one_stripe(struct r5conf *conf)
1513 init_waitqueue_head(&sh->ops.wait_for_ops); 1349 init_waitqueue_head(&sh->ops.wait_for_ops);
1514 #endif 1350 #endif
1515 1351
1516 spin_lock_init(&sh->stripe_lock);
1517
1518 if (grow_buffers(sh)) { 1352 if (grow_buffers(sh)) {
1519 shrink_buffers(sh); 1353 shrink_buffers(sh);
1520 kmem_cache_free(conf->slab_cache, sh); 1354 kmem_cache_free(conf->slab_cache, sh);
@@ -1528,7 +1362,7 @@ static int grow_one_stripe(struct r5conf *conf)
1528 return 1; 1362 return 1;
1529} 1363}
1530 1364
1531static int grow_stripes(struct r5conf *conf, int num) 1365static int grow_stripes(raid5_conf_t *conf, int num)
1532{ 1366{
1533 struct kmem_cache *sc; 1367 struct kmem_cache *sc;
1534 int devs = max(conf->raid_disks, conf->previous_raid_disks); 1368 int devs = max(conf->raid_disks, conf->previous_raid_disks);
@@ -1577,7 +1411,7 @@ static size_t scribble_len(int num)
1577 return len; 1411 return len;
1578} 1412}
1579 1413
1580static int resize_stripes(struct r5conf *conf, int newsize) 1414static int resize_stripes(raid5_conf_t *conf, int newsize)
1581{ 1415{
1582 /* Make all the stripes able to hold 'newsize' devices. 1416 /* Make all the stripes able to hold 'newsize' devices.
1583 * New slots in each stripe get 'page' set to a new page. 1417 * New slots in each stripe get 'page' set to a new page.
@@ -1585,7 +1419,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1585 * This happens in stages: 1419 * This happens in stages:
1586 * 1/ create a new kmem_cache and allocate the required number of 1420 * 1/ create a new kmem_cache and allocate the required number of
1587 * stripe_heads. 1421 * stripe_heads.
1588	 * 2/ gather all the old stripe_heads and transfer the pages across 1422	 * 2/ gather all the old stripe_heads and transfer the pages across
1589 * to the new stripe_heads. This will have the side effect of 1423 * to the new stripe_heads. This will have the side effect of
1590 * freezing the array as once all stripe_heads have been collected, 1424 * freezing the array as once all stripe_heads have been collected,
1591 * no IO will be possible. Old stripe heads are freed once their 1425 * no IO will be possible. Old stripe heads are freed once their
@@ -1633,7 +1467,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1633 #ifdef CONFIG_MULTICORE_RAID456 1467 #ifdef CONFIG_MULTICORE_RAID456
1634 init_waitqueue_head(&nsh->ops.wait_for_ops); 1468 init_waitqueue_head(&nsh->ops.wait_for_ops);
1635 #endif 1469 #endif
1636 spin_lock_init(&nsh->stripe_lock);
1637 1470
1638 list_add(&nsh->lru, &newstripes); 1471 list_add(&nsh->lru, &newstripes);
1639 } 1472 }
@@ -1655,7 +1488,8 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1655 spin_lock_irq(&conf->device_lock); 1488 spin_lock_irq(&conf->device_lock);
1656 wait_event_lock_irq(conf->wait_for_stripe, 1489 wait_event_lock_irq(conf->wait_for_stripe,
1657 !list_empty(&conf->inactive_list), 1490 !list_empty(&conf->inactive_list),
1658 conf->device_lock); 1491 conf->device_lock,
1492 );
1659 osh = get_free_stripe(conf); 1493 osh = get_free_stripe(conf);
1660 spin_unlock_irq(&conf->device_lock); 1494 spin_unlock_irq(&conf->device_lock);
1661 atomic_set(&nsh->count, 1); 1495 atomic_set(&nsh->count, 1);
@@ -1722,7 +1556,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1722 return err; 1556 return err;
1723} 1557}
1724 1558
1725static int drop_one_stripe(struct r5conf *conf) 1559static int drop_one_stripe(raid5_conf_t *conf)
1726{ 1560{
1727 struct stripe_head *sh; 1561 struct stripe_head *sh;
1728 1562
@@ -1738,7 +1572,7 @@ static int drop_one_stripe(struct r5conf *conf)
1738 return 1; 1572 return 1;
1739} 1573}
1740 1574
1741static void shrink_stripes(struct r5conf *conf) 1575static void shrink_stripes(raid5_conf_t *conf)
1742{ 1576{
1743 while (drop_one_stripe(conf)) 1577 while (drop_one_stripe(conf))
1744 ; 1578 ;
@@ -1751,12 +1585,12 @@ static void shrink_stripes(struct r5conf *conf)
1751static void raid5_end_read_request(struct bio * bi, int error) 1585static void raid5_end_read_request(struct bio * bi, int error)
1752{ 1586{
1753 struct stripe_head *sh = bi->bi_private; 1587 struct stripe_head *sh = bi->bi_private;
1754 struct r5conf *conf = sh->raid_conf; 1588 raid5_conf_t *conf = sh->raid_conf;
1755 int disks = sh->disks, i; 1589 int disks = sh->disks, i;
1756 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1590 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1757 char b[BDEVNAME_SIZE]; 1591 char b[BDEVNAME_SIZE];
1758 struct md_rdev *rdev = NULL; 1592 mdk_rdev_t *rdev;
1759 sector_t s; 1593
1760 1594
1761 for (i=0 ; i<disks; i++) 1595 for (i=0 ; i<disks; i++)
1762 if (bi == &sh->dev[i].req) 1596 if (bi == &sh->dev[i].req)
@@ -1769,77 +1603,52 @@ static void raid5_end_read_request(struct bio * bi, int error)
1769 BUG(); 1603 BUG();
1770 return; 1604 return;
1771 } 1605 }
1772 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
1773 /* If replacement finished while this request was outstanding,
1774 * 'replacement' might be NULL already.
1775 * In that case it moved down to 'rdev'.
1776 * rdev is not removed until all requests are finished.
1777 */
1778 rdev = conf->disks[i].replacement;
1779 if (!rdev)
1780 rdev = conf->disks[i].rdev;
1781 1606
1782 if (use_new_offset(conf, sh))
1783 s = sh->sector + rdev->new_data_offset;
1784 else
1785 s = sh->sector + rdev->data_offset;
1786 if (uptodate) { 1607 if (uptodate) {
1787 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1608 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1788 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1609 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1789 /* Note that this cannot happen on a 1610 rdev = conf->disks[i].rdev;
1790 * replacement device. We just fail those on
1791 * any error
1792 */
1793 printk_ratelimited( 1611 printk_ratelimited(
1794 KERN_INFO 1612 KERN_INFO
1795 "md/raid:%s: read error corrected" 1613 "md/raid:%s: read error corrected"
1796 " (%lu sectors at %llu on %s)\n", 1614 " (%lu sectors at %llu on %s)\n",
1797 mdname(conf->mddev), STRIPE_SECTORS, 1615 mdname(conf->mddev), STRIPE_SECTORS,
1798 (unsigned long long)s, 1616 (unsigned long long)(sh->sector
1617 + rdev->data_offset),
1799 bdevname(rdev->bdev, b)); 1618 bdevname(rdev->bdev, b));
1800 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 1619 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1801 clear_bit(R5_ReadError, &sh->dev[i].flags); 1620 clear_bit(R5_ReadError, &sh->dev[i].flags);
1802 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1621 clear_bit(R5_ReWrite, &sh->dev[i].flags);
1803 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 1622 }
1804 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 1623 if (atomic_read(&conf->disks[i].rdev->read_errors))
1805 1624 atomic_set(&conf->disks[i].rdev->read_errors, 0);
1806 if (atomic_read(&rdev->read_errors))
1807 atomic_set(&rdev->read_errors, 0);
1808 } else { 1625 } else {
1809 const char *bdn = bdevname(rdev->bdev, b); 1626 const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
1810 int retry = 0; 1627 int retry = 0;
1811 int set_bad = 0; 1628 rdev = conf->disks[i].rdev;
1812 1629
1813 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1630 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1814 atomic_inc(&rdev->read_errors); 1631 atomic_inc(&rdev->read_errors);
1815 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 1632 if (conf->mddev->degraded >= conf->max_degraded)
1816 printk_ratelimited(
1817 KERN_WARNING
1818 "md/raid:%s: read error on replacement device "
1819 "(sector %llu on %s).\n",
1820 mdname(conf->mddev),
1821 (unsigned long long)s,
1822 bdn);
1823 else if (conf->mddev->degraded >= conf->max_degraded) {
1824 set_bad = 1;
1825 printk_ratelimited( 1633 printk_ratelimited(
1826 KERN_WARNING 1634 KERN_WARNING
1827 "md/raid:%s: read error not correctable " 1635 "md/raid:%s: read error not correctable "
1828 "(sector %llu on %s).\n", 1636 "(sector %llu on %s).\n",
1829 mdname(conf->mddev), 1637 mdname(conf->mddev),
1830 (unsigned long long)s, 1638 (unsigned long long)(sh->sector
1639 + rdev->data_offset),
1831 bdn); 1640 bdn);
1832 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { 1641 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
1833 /* Oh, no!!! */ 1642 /* Oh, no!!! */
1834 set_bad = 1;
1835 printk_ratelimited( 1643 printk_ratelimited(
1836 KERN_WARNING 1644 KERN_WARNING
1837 "md/raid:%s: read error NOT corrected!! " 1645 "md/raid:%s: read error NOT corrected!! "
1838 "(sector %llu on %s).\n", 1646 "(sector %llu on %s).\n",
1839 mdname(conf->mddev), 1647 mdname(conf->mddev),
1840 (unsigned long long)s, 1648 (unsigned long long)(sh->sector
1649 + rdev->data_offset),
1841 bdn); 1650 bdn);
1842 } else if (atomic_read(&rdev->read_errors) 1651 else if (atomic_read(&rdev->read_errors)
1843 > conf->max_nr_stripes) 1652 > conf->max_nr_stripes)
1844 printk(KERN_WARNING 1653 printk(KERN_WARNING
1845 "md/raid:%s: Too many read errors, failing device %s.\n", 1654 "md/raid:%s: Too many read errors, failing device %s.\n",
@@ -1847,22 +1656,14 @@ static void raid5_end_read_request(struct bio * bi, int error)
1847 else 1656 else
1848 retry = 1; 1657 retry = 1;
1849 if (retry) 1658 if (retry)
1850 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 1659 set_bit(R5_ReadError, &sh->dev[i].flags);
1851 set_bit(R5_ReadError, &sh->dev[i].flags);
1852 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
1853 } else
1854 set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
1855 else { 1660 else {
1856 clear_bit(R5_ReadError, &sh->dev[i].flags); 1661 clear_bit(R5_ReadError, &sh->dev[i].flags);
1857 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1662 clear_bit(R5_ReWrite, &sh->dev[i].flags);
1858 if (!(set_bad 1663 md_error(conf->mddev, rdev);
1859 && test_bit(In_sync, &rdev->flags)
1860 && rdev_set_badblocks(
1861 rdev, sh->sector, STRIPE_SECTORS, 0)))
1862 md_error(conf->mddev, rdev);
1863 } 1664 }
1864 } 1665 }
1865 rdev_dec_pending(rdev, conf->mddev); 1666 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1866 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1667 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1867 set_bit(STRIPE_HANDLE, &sh->state); 1668 set_bit(STRIPE_HANDLE, &sh->state);
1868 release_stripe(sh); 1669 release_stripe(sh);
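The read-completion hunk above, in both columns, does the same bookkeeping: a successful read that follows an earlier read error counts the sectors as corrected and resets the per-device error counter, while a failed read bumps the counter and either retries or fails the device once a threshold is exceeded. A minimal standalone sketch of that decision, with toy types that are assumptions of this illustration rather than kernel structures, might look like:

    #include <stdbool.h>
    #include <stdio.h>

    struct toy_rdev {
        int read_errors;        /* uncorrected read errors so far */
        int corrected_errors;   /* sectors fixed by a later good read */
    };

    enum action { RETRY_READ, FAIL_DEVICE, DONE };

    static enum action on_read_complete(struct toy_rdev *rdev, bool uptodate,
                                        bool had_read_error, int max_errors,
                                        int sectors)
    {
        if (uptodate) {
            if (had_read_error)
                rdev->corrected_errors += sectors;
            rdev->read_errors = 0;  /* a clean read resets the counter */
            return DONE;
        }
        if (++rdev->read_errors > max_errors)
            return FAIL_DEVICE;     /* too many errors: give up on the device */
        return RETRY_READ;          /* otherwise schedule a re-read/rewrite */
    }

    int main(void)
    {
        struct toy_rdev rdev = { 0, 0 };
        enum action a = DONE;
        int i;

        for (i = 0; i < 5; i++)
            a = on_read_complete(&rdev, false, false, 3, 8);
        printf("after 5 failures: %s\n",
               a == FAIL_DEVICE ? "fail device" : "retry");
        return 0;
    }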
@@ -1871,32 +1672,16 @@ static void raid5_end_read_request(struct bio * bi, int error)
1871static void raid5_end_write_request(struct bio *bi, int error) 1672static void raid5_end_write_request(struct bio *bi, int error)
1872{ 1673{
1873 struct stripe_head *sh = bi->bi_private; 1674 struct stripe_head *sh = bi->bi_private;
1874 struct r5conf *conf = sh->raid_conf; 1675 raid5_conf_t *conf = sh->raid_conf;
1875 int disks = sh->disks, i; 1676 int disks = sh->disks, i;
1876 struct md_rdev *uninitialized_var(rdev);
1877 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1677 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1878 sector_t first_bad; 1678 sector_t first_bad;
1879 int bad_sectors; 1679 int bad_sectors;
1880 int replacement = 0;
1881 1680
1882 for (i = 0 ; i < disks; i++) { 1681 for (i=0 ; i<disks; i++)
1883 if (bi == &sh->dev[i].req) { 1682 if (bi == &sh->dev[i].req)
1884 rdev = conf->disks[i].rdev;
1885 break;
1886 }
1887 if (bi == &sh->dev[i].rreq) {
1888 rdev = conf->disks[i].replacement;
1889 if (rdev)
1890 replacement = 1;
1891 else
1892 /* rdev was removed and 'replacement'
1893 * replaced it. rdev is not removed
1894 * until all requests are finished.
1895 */
1896 rdev = conf->disks[i].rdev;
1897 break; 1683 break;
1898 } 1684
1899 }
1900 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1685 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
1901 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1686 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
1902 uptodate); 1687 uptodate);
@@ -1905,33 +1690,21 @@ static void raid5_end_write_request(struct bio *bi, int error)
1905 return; 1690 return;
1906 } 1691 }
1907 1692
1908 if (replacement) { 1693 if (!uptodate) {
1909 if (!uptodate) 1694 set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags);
1910 md_error(conf->mddev, rdev); 1695 set_bit(R5_WriteError, &sh->dev[i].flags);
1911 else if (is_badblock(rdev, sh->sector, 1696 } else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS,
1912 STRIPE_SECTORS, 1697 &first_bad, &bad_sectors))
1913 &first_bad, &bad_sectors)) 1698 set_bit(R5_MadeGood, &sh->dev[i].flags);
1914 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
1915 } else {
1916 if (!uptodate) {
1917 set_bit(WriteErrorSeen, &rdev->flags);
1918 set_bit(R5_WriteError, &sh->dev[i].flags);
1919 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1920 set_bit(MD_RECOVERY_NEEDED,
1921 &rdev->mddev->recovery);
1922 } else if (is_badblock(rdev, sh->sector,
1923 STRIPE_SECTORS,
1924 &first_bad, &bad_sectors))
1925 set_bit(R5_MadeGood, &sh->dev[i].flags);
1926 }
1927 rdev_dec_pending(rdev, conf->mddev);
1928 1699
1929 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 1700 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1930 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1701
1702 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1931 set_bit(STRIPE_HANDLE, &sh->state); 1703 set_bit(STRIPE_HANDLE, &sh->state);
1932 release_stripe(sh); 1704 release_stripe(sh);
1933} 1705}
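On the write side, both versions of the hunk follow the same two-way rule: a failed write flags the device with a write error, and a successful write that lands on a previously recorded bad range marks it "made good" so the bad block can later be cleared. A small sketch of that rule, using an invented single-range bad-block check in place of is_badblock(), is:

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical result flags, not the kernel's bit layout. */
    enum { F_WRITE_ERROR = 1, F_MADE_GOOD = 2 };

    /* Does [sector, sector+len) overlap the single recorded bad range? */
    static bool hits_badblock(long long bad_start, int bad_len,
                              long long sector, int len)
    {
        return sector < bad_start + bad_len && bad_start < sector + len;
    }

    static int on_write_complete(bool uptodate, long long sector, int len,
                                 long long bad_start, int bad_len)
    {
        if (!uptodate)
            return F_WRITE_ERROR;   /* remember the failure for later handling */
        if (hits_badblock(bad_start, bad_len, sector, len))
            return F_MADE_GOOD;     /* fresh data now covers the bad range */
        return 0;
    }

    int main(void)
    {
        printf("%d\n", on_write_complete(true, 100, 8, 104, 4));  /* 2 */
        printf("%d\n", on_write_complete(false, 100, 8, 0, 0));   /* 1 */
        return 0;
    }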
1934 1706
1707
1935static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 1708static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
1936 1709
1937static void raid5_build_block(struct stripe_head *sh, int i, int previous) 1710static void raid5_build_block(struct stripe_head *sh, int i, int previous)
@@ -1942,33 +1715,33 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
1942 dev->req.bi_io_vec = &dev->vec; 1715 dev->req.bi_io_vec = &dev->vec;
1943 dev->req.bi_vcnt++; 1716 dev->req.bi_vcnt++;
1944 dev->req.bi_max_vecs++; 1717 dev->req.bi_max_vecs++;
1945 dev->req.bi_private = sh;
1946 dev->vec.bv_page = dev->page; 1718 dev->vec.bv_page = dev->page;
1719 dev->vec.bv_len = STRIPE_SIZE;
1720 dev->vec.bv_offset = 0;
1947 1721
1948 bio_init(&dev->rreq); 1722 dev->req.bi_sector = sh->sector;
1949 dev->rreq.bi_io_vec = &dev->rvec; 1723 dev->req.bi_private = sh;
1950 dev->rreq.bi_vcnt++;
1951 dev->rreq.bi_max_vecs++;
1952 dev->rreq.bi_private = sh;
1953 dev->rvec.bv_page = dev->page;
1954 1724
1955 dev->flags = 0; 1725 dev->flags = 0;
1956 dev->sector = compute_blocknr(sh, i, previous); 1726 dev->sector = compute_blocknr(sh, i, previous);
1957} 1727}
1958 1728
1959static void error(struct mddev *mddev, struct md_rdev *rdev) 1729static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1960{ 1730{
1961 char b[BDEVNAME_SIZE]; 1731 char b[BDEVNAME_SIZE];
1962 struct r5conf *conf = mddev->private; 1732 raid5_conf_t *conf = mddev->private;
1963 unsigned long flags;
1964 pr_debug("raid456: error called\n"); 1733 pr_debug("raid456: error called\n");
1965 1734
1966 spin_lock_irqsave(&conf->device_lock, flags); 1735 if (test_and_clear_bit(In_sync, &rdev->flags)) {
1967 clear_bit(In_sync, &rdev->flags); 1736 unsigned long flags;
1968 mddev->degraded = calc_degraded(conf); 1737 spin_lock_irqsave(&conf->device_lock, flags);
1969 spin_unlock_irqrestore(&conf->device_lock, flags); 1738 mddev->degraded++;
1970 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1739 spin_unlock_irqrestore(&conf->device_lock, flags);
1971 1740 /*
1741 * if recovery was running, make sure it aborts.
1742 */
1743 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1744 }
1972 set_bit(Blocked, &rdev->flags); 1745 set_bit(Blocked, &rdev->flags);
1973 set_bit(Faulty, &rdev->flags); 1746 set_bit(Faulty, &rdev->flags);
1974 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1747 set_bit(MD_CHANGE_DEVS, &mddev->flags);
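The error() hunk differs mainly in how the degraded count is maintained: the right column increments mddev->degraded when an in-sync device fails, while the left column recomputes it via calc_degraded(). A toy recomputation in the spirit of the left column — the struct below is illustrative, not the md disk_info layout — could be:

    #include <stdbool.h>
    #include <stdio.h>

    struct toy_disk {
        bool present;
        bool in_sync;
    };

    /* Recompute the degraded count from scratch rather than adjusting it. */
    static int calc_degraded_toy(const struct toy_disk *d, int raid_disks)
    {
        int i, degraded = 0;

        for (i = 0; i < raid_disks; i++)
            if (!d[i].present || !d[i].in_sync)
                degraded++;
        return degraded;
    }

    int main(void)
    {
        struct toy_disk disks[4] = {
            { true, true }, { true, false }, { false, false }, { true, true }
        };

        printf("degraded = %d\n", calc_degraded_toy(disks, 4));   /* 2 */
        return 0;
    }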
@@ -1985,7 +1758,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
1985 * Input: a 'big' sector number, 1758 * Input: a 'big' sector number,
1986 * Output: index of the data and parity disk, and the sector # in them. 1759 * Output: index of the data and parity disk, and the sector # in them.
1987 */ 1760 */
1988static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 1761static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1989 int previous, int *dd_idx, 1762 int previous, int *dd_idx,
1990 struct stripe_head *sh) 1763 struct stripe_head *sh)
1991{ 1764{
@@ -2190,7 +1963,7 @@ static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
2190 1963
2191static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) 1964static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
2192{ 1965{
2193 struct r5conf *conf = sh->raid_conf; 1966 raid5_conf_t *conf = sh->raid_conf;
2194 int raid_disks = sh->disks; 1967 int raid_disks = sh->disks;
2195 int data_disks = raid_disks - conf->max_degraded; 1968 int data_disks = raid_disks - conf->max_degraded;
2196 sector_t new_sector = sh->sector, check; 1969 sector_t new_sector = sh->sector, check;
@@ -2315,7 +2088,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2315 int rcw, int expand) 2088 int rcw, int expand)
2316{ 2089{
2317 int i, pd_idx = sh->pd_idx, disks = sh->disks; 2090 int i, pd_idx = sh->pd_idx, disks = sh->disks;
2318 struct r5conf *conf = sh->raid_conf; 2091 raid5_conf_t *conf = sh->raid_conf;
2319 int level = conf->level; 2092 int level = conf->level;
2320 2093
2321 if (rcw) { 2094 if (rcw) {
@@ -2400,25 +2173,18 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2400static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 2173static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
2401{ 2174{
2402 struct bio **bip; 2175 struct bio **bip;
2403 struct r5conf *conf = sh->raid_conf; 2176 raid5_conf_t *conf = sh->raid_conf;
2404 int firstwrite=0; 2177 int firstwrite=0;
2405 2178
2406 pr_debug("adding bi b#%llu to stripe s#%llu\n", 2179 pr_debug("adding bi b#%llu to stripe s#%llu\n",
2407 (unsigned long long)bi->bi_sector, 2180 (unsigned long long)bi->bi_sector,
2408 (unsigned long long)sh->sector); 2181 (unsigned long long)sh->sector);
2409 2182
2410 /* 2183
2411 * If several bio share a stripe. The bio bi_phys_segments acts as a 2184 spin_lock_irq(&conf->device_lock);
2412 * reference count to avoid race. The reference count should already be
2413 * increased before this function is called (for example, in
2414 * make_request()), so other bio sharing this stripe will not free the
2415 * stripe. If a stripe is owned by one stripe, the stripe lock will
2416 * protect it.
2417 */
2418 spin_lock_irq(&sh->stripe_lock);
2419 if (forwrite) { 2185 if (forwrite) {
2420 bip = &sh->dev[dd_idx].towrite; 2186 bip = &sh->dev[dd_idx].towrite;
2421 if (*bip == NULL) 2187 if (*bip == NULL && sh->dev[dd_idx].written == NULL)
2422 firstwrite = 1; 2188 firstwrite = 1;
2423 } else 2189 } else
2424 bip = &sh->dev[dd_idx].toread; 2190 bip = &sh->dev[dd_idx].toread;
@@ -2434,7 +2200,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2434 if (*bip) 2200 if (*bip)
2435 bi->bi_next = *bip; 2201 bi->bi_next = *bip;
2436 *bip = bi; 2202 *bip = bi;
2437 raid5_inc_bi_active_stripes(bi); 2203 bi->bi_phys_segments++;
2438 2204
2439 if (forwrite) { 2205 if (forwrite) {
2440 /* check if page is covered */ 2206 /* check if page is covered */
@@ -2449,11 +2215,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2449 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2215 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
2450 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2216 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
2451 } 2217 }
2218 spin_unlock_irq(&conf->device_lock);
2452 2219
2453 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2220 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
2454 (unsigned long long)(*bip)->bi_sector, 2221 (unsigned long long)(*bip)->bi_sector,
2455 (unsigned long long)sh->sector, dd_idx); 2222 (unsigned long long)sh->sector, dd_idx);
2456 spin_unlock_irq(&sh->stripe_lock);
2457 2223
2458 if (conf->mddev->bitmap && firstwrite) { 2224 if (conf->mddev->bitmap && firstwrite) {
2459 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 2225 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
@@ -2465,13 +2231,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2465 2231
2466 overlap: 2232 overlap:
2467 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2233 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
2468 spin_unlock_irq(&sh->stripe_lock); 2234 spin_unlock_irq(&conf->device_lock);
2469 return 0; 2235 return 0;
2470} 2236}
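Independently of whether the lock is the per-stripe stripe_lock (left) or conf->device_lock (right), add_stripe_bio() keeps each device's bio chain sorted by start sector and takes a per-bio reference so the stripe holds the bio alive. A self-contained sketch of that insertion-plus-refcount idea, with a toy bio type that is an assumption of this example, is:

    #include <stdio.h>

    /* Toy bio: only the fields the insertion logic needs. */
    struct toy_bio {
        long long sector;
        int active_stripes;     /* stands in for the per-bio refcount */
        struct toy_bio *next;
    };

    /* Insert 'bi' into the list kept sorted by start sector, then take a
     * reference so the stripe keeps the bio alive until it completes. */
    static void add_to_stripe(struct toy_bio **head, struct toy_bio *bi)
    {
        struct toy_bio **bip = head;

        while (*bip && (*bip)->sector < bi->sector)
            bip = &(*bip)->next;
        bi->next = *bip;
        *bip = bi;
        bi->active_stripes++;
    }

    int main(void)
    {
        struct toy_bio a = { 16, 0, NULL }, b = { 8, 0, NULL };
        struct toy_bio *head = NULL, *p;

        add_to_stripe(&head, &a);
        add_to_stripe(&head, &b);
        for (p = head; p; p = p->next)
            printf("sector %lld (refs %d)\n", p->sector, p->active_stripes);
        return 0;
    }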
2471 2237
2472static void end_reshape(struct r5conf *conf); 2238static void end_reshape(raid5_conf_t *conf);
2473 2239
2474static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 2240static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
2475 struct stripe_head *sh) 2241 struct stripe_head *sh)
2476{ 2242{
2477 int sectors_per_chunk = 2243 int sectors_per_chunk =
@@ -2488,7 +2254,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
2488} 2254}
2489 2255
2490static void 2256static void
2491handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 2257handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
2492 struct stripe_head_state *s, int disks, 2258 struct stripe_head_state *s, int disks,
2493 struct bio **return_bi) 2259 struct bio **return_bi)
2494{ 2260{
@@ -2498,7 +2264,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2498 int bitmap_end = 0; 2264 int bitmap_end = 0;
2499 2265
2500 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2266 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2501 struct md_rdev *rdev; 2267 mdk_rdev_t *rdev;
2502 rcu_read_lock(); 2268 rcu_read_lock();
2503 rdev = rcu_dereference(conf->disks[i].rdev); 2269 rdev = rcu_dereference(conf->disks[i].rdev);
2504 if (rdev && test_bit(In_sync, &rdev->flags)) 2270 if (rdev && test_bit(In_sync, &rdev->flags))
@@ -2515,13 +2281,14 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2515 rdev_dec_pending(rdev, conf->mddev); 2281 rdev_dec_pending(rdev, conf->mddev);
2516 } 2282 }
2517 } 2283 }
2518 spin_lock_irq(&sh->stripe_lock); 2284 spin_lock_irq(&conf->device_lock);
2519 /* fail all writes first */ 2285 /* fail all writes first */
2520 bi = sh->dev[i].towrite; 2286 bi = sh->dev[i].towrite;
2521 sh->dev[i].towrite = NULL; 2287 sh->dev[i].towrite = NULL;
2522 spin_unlock_irq(&sh->stripe_lock); 2288 if (bi) {
2523 if (bi) 2289 s->to_write--;
2524 bitmap_end = 1; 2290 bitmap_end = 1;
2291 }
2525 2292
2526 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2293 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2527 wake_up(&conf->wait_for_overlap); 2294 wake_up(&conf->wait_for_overlap);
@@ -2530,17 +2297,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2530 sh->dev[i].sector + STRIPE_SECTORS) { 2297 sh->dev[i].sector + STRIPE_SECTORS) {
2531 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2298 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2532 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2299 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2533 if (!raid5_dec_bi_active_stripes(bi)) { 2300 if (!raid5_dec_bi_phys_segments(bi)) {
2534 md_write_end(conf->mddev); 2301 md_write_end(conf->mddev);
2535 bi->bi_next = *return_bi; 2302 bi->bi_next = *return_bi;
2536 *return_bi = bi; 2303 *return_bi = bi;
2537 } 2304 }
2538 bi = nextbi; 2305 bi = nextbi;
2539 } 2306 }
2540 if (bitmap_end)
2541 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2542 STRIPE_SECTORS, 0, 0);
2543 bitmap_end = 0;
2544 /* and fail all 'written' */ 2307 /* and fail all 'written' */
2545 bi = sh->dev[i].written; 2308 bi = sh->dev[i].written;
2546 sh->dev[i].written = NULL; 2309 sh->dev[i].written = NULL;
@@ -2549,7 +2312,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2549 sh->dev[i].sector + STRIPE_SECTORS) { 2312 sh->dev[i].sector + STRIPE_SECTORS) {
2550 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2313 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2551 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2314 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2552 if (!raid5_dec_bi_active_stripes(bi)) { 2315 if (!raid5_dec_bi_phys_segments(bi)) {
2553 md_write_end(conf->mddev); 2316 md_write_end(conf->mddev);
2554 bi->bi_next = *return_bi; 2317 bi->bi_next = *return_bi;
2555 *return_bi = bi; 2318 *return_bi = bi;
@@ -2563,24 +2326,24 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2563 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 2326 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
2564 (!test_bit(R5_Insync, &sh->dev[i].flags) || 2327 (!test_bit(R5_Insync, &sh->dev[i].flags) ||
2565 test_bit(R5_ReadError, &sh->dev[i].flags))) { 2328 test_bit(R5_ReadError, &sh->dev[i].flags))) {
2566 spin_lock_irq(&sh->stripe_lock);
2567 bi = sh->dev[i].toread; 2329 bi = sh->dev[i].toread;
2568 sh->dev[i].toread = NULL; 2330 sh->dev[i].toread = NULL;
2569 spin_unlock_irq(&sh->stripe_lock);
2570 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2331 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2571 wake_up(&conf->wait_for_overlap); 2332 wake_up(&conf->wait_for_overlap);
2333 if (bi) s->to_read--;
2572 while (bi && bi->bi_sector < 2334 while (bi && bi->bi_sector <
2573 sh->dev[i].sector + STRIPE_SECTORS) { 2335 sh->dev[i].sector + STRIPE_SECTORS) {
2574 struct bio *nextbi = 2336 struct bio *nextbi =
2575 r5_next_bio(bi, sh->dev[i].sector); 2337 r5_next_bio(bi, sh->dev[i].sector);
2576 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2338 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2577 if (!raid5_dec_bi_active_stripes(bi)) { 2339 if (!raid5_dec_bi_phys_segments(bi)) {
2578 bi->bi_next = *return_bi; 2340 bi->bi_next = *return_bi;
2579 *return_bi = bi; 2341 *return_bi = bi;
2580 } 2342 }
2581 bi = nextbi; 2343 bi = nextbi;
2582 } 2344 }
2583 } 2345 }
2346 spin_unlock_irq(&conf->device_lock);
2584 if (bitmap_end) 2347 if (bitmap_end)
2585 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2348 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2586 STRIPE_SECTORS, 0, 0); 2349 STRIPE_SECTORS, 0, 0);
@@ -2596,63 +2359,38 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2596} 2359}
2597 2360
2598static void 2361static void
2599handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 2362handle_failed_sync(raid5_conf_t *conf, struct stripe_head *sh,
2600 struct stripe_head_state *s) 2363 struct stripe_head_state *s)
2601{ 2364{
2602 int abort = 0; 2365 int abort = 0;
2603 int i; 2366 int i;
2604 2367
2368 md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
2605 clear_bit(STRIPE_SYNCING, &sh->state); 2369 clear_bit(STRIPE_SYNCING, &sh->state);
2606 s->syncing = 0; 2370 s->syncing = 0;
2607 s->replacing = 0;
2608 /* There is nothing more to do for sync/check/repair. 2371 /* There is nothing more to do for sync/check/repair.
2609 * Don't even need to abort as that is handled elsewhere 2372 * For recover we need to record a bad block on all
2610 * if needed, and not always wanted e.g. if there is a known
2611 * bad block here.
2612 * For recover/replace we need to record a bad block on all
2613 * non-sync devices, or abort the recovery 2373 * non-sync devices, or abort the recovery
2614 */ 2374 */
2615 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { 2375 if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery))
2616 /* During recovery devices cannot be removed, so 2376 return;
2617 * locking and refcounting of rdevs is not needed 2377 /* During recovery devices cannot be removed, so locking and
2618 */ 2378 * refcounting of rdevs is not needed
2619 for (i = 0; i < conf->raid_disks; i++) { 2379 */
2620 struct md_rdev *rdev = conf->disks[i].rdev; 2380 for (i = 0; i < conf->raid_disks; i++) {
2621 if (rdev 2381 mdk_rdev_t *rdev = conf->disks[i].rdev;
2622 && !test_bit(Faulty, &rdev->flags) 2382 if (!rdev
2623 && !test_bit(In_sync, &rdev->flags) 2383 || test_bit(Faulty, &rdev->flags)
2624 && !rdev_set_badblocks(rdev, sh->sector, 2384 || test_bit(In_sync, &rdev->flags))
2625 STRIPE_SECTORS, 0)) 2385 continue;
2626 abort = 1; 2386 if (!rdev_set_badblocks(rdev, sh->sector,
2627 rdev = conf->disks[i].replacement; 2387 STRIPE_SECTORS, 0))
2628 if (rdev 2388 abort = 1;
2629 && !test_bit(Faulty, &rdev->flags) 2389 }
2630 && !test_bit(In_sync, &rdev->flags) 2390 if (abort) {
2631 && !rdev_set_badblocks(rdev, sh->sector, 2391 conf->recovery_disabled = conf->mddev->recovery_disabled;
2632 STRIPE_SECTORS, 0)) 2392 set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery);
2633 abort = 1;
2634 }
2635 if (abort)
2636 conf->recovery_disabled =
2637 conf->mddev->recovery_disabled;
2638 } 2393 }
2639 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
2640}
2641
2642static int want_replace(struct stripe_head *sh, int disk_idx)
2643{
2644 struct md_rdev *rdev;
2645 int rv = 0;
2646 /* Doing recovery so rcu locking not required */
2647 rdev = sh->raid_conf->disks[disk_idx].replacement;
2648 if (rdev
2649 && !test_bit(Faulty, &rdev->flags)
2650 && !test_bit(In_sync, &rdev->flags)
2651 && (rdev->recovery_offset <= sh->sector
2652 || rdev->mddev->recovery_cp <= sh->sector))
2653 rv = 1;
2654
2655 return rv;
2656} 2394}
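Both versions of handle_failed_sync() share one idea for the recovery case: every device that is neither faulty nor in-sync must get a bad block recorded for this stripe, and if recording fails anywhere, recovery has to be aborted. A minimal sketch of that walk — the badblock_table_full field below is an invented stand-in for rdev_set_badblocks() failing — is:

    #include <stdbool.h>
    #include <stdio.h>

    struct sync_disk {
        bool faulty;
        bool in_sync;
        bool badblock_table_full;   /* stand-in for a failed badblock record */
    };

    /* Returns true if recovery must be aborted. */
    static bool record_failed_sync(struct sync_disk *d, int n)
    {
        bool abort = false;
        int i;

        for (i = 0; i < n; i++) {
            if (d[i].faulty || d[i].in_sync)
                continue;           /* only devices still being recovered matter */
            if (d[i].badblock_table_full)
                abort = true;       /* cannot remember the bad range */
        }
        return abort;
    }

    int main(void)
    {
        struct sync_disk d[3] = {
            { false, true,  false },
            { false, false, false },
            { false, false, true  },
        };

        printf("abort recovery: %s\n", record_failed_sync(d, 3) ? "yes" : "no");
        return 0;
    }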
2657 2395
2658/* fetch_block - checks the given member device to see if its data needs 2396/* fetch_block - checks the given member device to see if its data needs
@@ -2674,7 +2412,6 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
2674 (dev->toread || 2412 (dev->toread ||
2675 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2413 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2676 s->syncing || s->expanding || 2414 s->syncing || s->expanding ||
2677 (s->replacing && want_replace(sh, disk_idx)) ||
2678 (s->failed >= 1 && fdev[0]->toread) || 2415 (s->failed >= 1 && fdev[0]->toread) ||
2679 (s->failed >= 2 && fdev[1]->toread) || 2416 (s->failed >= 2 && fdev[1]->toread) ||
2680 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && 2417 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
@@ -2771,7 +2508,7 @@ static void handle_stripe_fill(struct stripe_head *sh,
2771 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 2508 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
2772 * never LOCKED, so we don't need to test 'failed' directly. 2509 * never LOCKED, so we don't need to test 'failed' directly.
2773 */ 2510 */
2774static void handle_stripe_clean_event(struct r5conf *conf, 2511static void handle_stripe_clean_event(raid5_conf_t *conf,
2775 struct stripe_head *sh, int disks, struct bio **return_bi) 2512 struct stripe_head *sh, int disks, struct bio **return_bi)
2776{ 2513{
2777 int i; 2514 int i;
@@ -2781,63 +2518,53 @@ static void handle_stripe_clean_event(struct r5conf *conf,
2781 if (sh->dev[i].written) { 2518 if (sh->dev[i].written) {
2782 dev = &sh->dev[i]; 2519 dev = &sh->dev[i];
2783 if (!test_bit(R5_LOCKED, &dev->flags) && 2520 if (!test_bit(R5_LOCKED, &dev->flags) &&
2784 (test_bit(R5_UPTODATE, &dev->flags) || 2521 test_bit(R5_UPTODATE, &dev->flags)) {
2785 test_bit(R5_Discard, &dev->flags))) {
2786 /* We can return any write requests */ 2522 /* We can return any write requests */
2787 struct bio *wbi, *wbi2; 2523 struct bio *wbi, *wbi2;
2524 int bitmap_end = 0;
2788 pr_debug("Return write for disc %d\n", i); 2525 pr_debug("Return write for disc %d\n", i);
2789 if (test_and_clear_bit(R5_Discard, &dev->flags)) 2526 spin_lock_irq(&conf->device_lock);
2790 clear_bit(R5_UPTODATE, &dev->flags);
2791 wbi = dev->written; 2527 wbi = dev->written;
2792 dev->written = NULL; 2528 dev->written = NULL;
2793 while (wbi && wbi->bi_sector < 2529 while (wbi && wbi->bi_sector <
2794 dev->sector + STRIPE_SECTORS) { 2530 dev->sector + STRIPE_SECTORS) {
2795 wbi2 = r5_next_bio(wbi, dev->sector); 2531 wbi2 = r5_next_bio(wbi, dev->sector);
2796 if (!raid5_dec_bi_active_stripes(wbi)) { 2532 if (!raid5_dec_bi_phys_segments(wbi)) {
2797 md_write_end(conf->mddev); 2533 md_write_end(conf->mddev);
2798 wbi->bi_next = *return_bi; 2534 wbi->bi_next = *return_bi;
2799 *return_bi = wbi; 2535 *return_bi = wbi;
2800 } 2536 }
2801 wbi = wbi2; 2537 wbi = wbi2;
2802 } 2538 }
2803 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2539 if (dev->towrite == NULL)
2804 STRIPE_SECTORS, 2540 bitmap_end = 1;
2541 spin_unlock_irq(&conf->device_lock);
2542 if (bitmap_end)
2543 bitmap_endwrite(conf->mddev->bitmap,
2544 sh->sector,
2545 STRIPE_SECTORS,
2805 !test_bit(STRIPE_DEGRADED, &sh->state), 2546 !test_bit(STRIPE_DEGRADED, &sh->state),
2806 0); 2547 0);
2807 } 2548 }
2808 } else if (test_bit(R5_Discard, &sh->dev[i].flags)) 2549 }
2809 clear_bit(R5_Discard, &sh->dev[i].flags);
2810 2550
2811 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2551 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2812 if (atomic_dec_and_test(&conf->pending_full_writes)) 2552 if (atomic_dec_and_test(&conf->pending_full_writes))
2813 md_wakeup_thread(conf->mddev->thread); 2553 md_wakeup_thread(conf->mddev->thread);
2814} 2554}
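Whichever refcount helper is used (raid5_dec_bi_active_stripes on the left, raid5_dec_bi_phys_segments on the right), the clean-event loop walks the 'written' chain and returns a bio to its owner only when the last stripe reference drops. A compact sketch of that completion walk, with illustrative toy types, is:

    #include <stdio.h>

    struct done_bio {
        long long sector;
        int refs;               /* active-stripe references */
        struct done_bio *next;
    };

    /* Drop one reference per completed stripe write; a bio is handed back
     * (here: printed) only when the last reference goes away. */
    static void complete_written(struct done_bio *written)
    {
        struct done_bio *bi = written;

        while (bi) {
            struct done_bio *next = bi->next;

            if (--bi->refs == 0)
                printf("return bio at sector %lld\n", bi->sector);
            bi = next;
        }
    }

    int main(void)
    {
        struct done_bio b2 = { 24, 2, NULL };
        struct done_bio b1 = { 16, 1, &b2 };

        complete_written(&b1);  /* only the sector-16 bio is returned */
        return 0;
    }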
2815 2555
2816static void handle_stripe_dirtying(struct r5conf *conf, 2556static void handle_stripe_dirtying(raid5_conf_t *conf,
2817 struct stripe_head *sh, 2557 struct stripe_head *sh,
2818 struct stripe_head_state *s, 2558 struct stripe_head_state *s,
2819 int disks) 2559 int disks)
2820{ 2560{
2821 int rmw = 0, rcw = 0, i; 2561 int rmw = 0, rcw = 0, i;
2822 sector_t recovery_cp = conf->mddev->recovery_cp; 2562 if (conf->max_degraded == 2) {
2823 2563 /* RAID6 requires 'rcw' in current implementation
2824 /* RAID6 requires 'rcw' in current implementation. 2564 * Calculate the real rcw later - for now fake it
2825 * Otherwise, check whether resync is now happening or should start.
2826 * If yes, then the array is dirty (after unclean shutdown or
2827 * initial creation), so parity in some stripes might be inconsistent.
2828 * In this case, we need to always do reconstruct-write, to ensure
2829 * that in case of drive failure or read-error correction, we
2830 * generate correct data from the parity.
2831 */
2832 if (conf->max_degraded == 2 ||
2833 (recovery_cp < MaxSector && sh->sector >= recovery_cp)) {
2834 /* Calculate the real rcw later - for now make it
2835 * look like rcw is cheaper 2565 * look like rcw is cheaper
2836 */ 2566 */
2837 rcw = 1; rmw = 2; 2567 rcw = 1; rmw = 2;
2838 pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
2839 conf->max_degraded, (unsigned long long)recovery_cp,
2840 (unsigned long long)sh->sector);
2841 } else for (i = disks; i--; ) { 2568 } else for (i = disks; i--; ) {
2842 /* would I have to read this buffer for read_modify_write */ 2569 /* would I have to read this buffer for read_modify_write */
2843 struct r5dev *dev = &sh->dev[i]; 2570 struct r5dev *dev = &sh->dev[i];
@@ -2863,10 +2590,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,
2863 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 2590 pr_debug("for sector %llu, rmw=%d rcw=%d\n",
2864 (unsigned long long)sh->sector, rmw, rcw); 2591 (unsigned long long)sh->sector, rmw, rcw);
2865 set_bit(STRIPE_HANDLE, &sh->state); 2592 set_bit(STRIPE_HANDLE, &sh->state);
2866 if (rmw < rcw && rmw > 0) { 2593 if (rmw < rcw && rmw > 0)
2867 /* prefer read-modify-write, but need to get some data */ 2594 /* prefer read-modify-write, but need to get some data */
2868 blk_add_trace_msg(conf->mddev->queue, "raid5 rmw %llu %d",
2869 (unsigned long long)sh->sector, rmw);
2870 for (i = disks; i--; ) { 2595 for (i = disks; i--; ) {
2871 struct r5dev *dev = &sh->dev[i]; 2596 struct r5dev *dev = &sh->dev[i];
2872 if ((dev->towrite || i == sh->pd_idx) && 2597 if ((dev->towrite || i == sh->pd_idx) &&
@@ -2877,7 +2602,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
2877 if ( 2602 if (
2878 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2603 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2879 pr_debug("Read_old block " 2604 pr_debug("Read_old block "
2880 "%d for r-m-w\n", i); 2605 "%d for r-m-w\n", i);
2881 set_bit(R5_LOCKED, &dev->flags); 2606 set_bit(R5_LOCKED, &dev->flags);
2882 set_bit(R5_Wantread, &dev->flags); 2607 set_bit(R5_Wantread, &dev->flags);
2883 s->locked++; 2608 s->locked++;
@@ -2887,10 +2612,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,
2887 } 2612 }
2888 } 2613 }
2889 } 2614 }
2890 }
2891 if (rcw <= rmw && rcw > 0) { 2615 if (rcw <= rmw && rcw > 0) {
2892 /* want reconstruct write, but need to get some data */ 2616 /* want reconstruct write, but need to get some data */
2893 int qread =0;
2894 rcw = 0; 2617 rcw = 0;
2895 for (i = disks; i--; ) { 2618 for (i = disks; i--; ) {
2896 struct r5dev *dev = &sh->dev[i]; 2619 struct r5dev *dev = &sh->dev[i];
@@ -2909,17 +2632,12 @@ static void handle_stripe_dirtying(struct r5conf *conf,
2909 set_bit(R5_LOCKED, &dev->flags); 2632 set_bit(R5_LOCKED, &dev->flags);
2910 set_bit(R5_Wantread, &dev->flags); 2633 set_bit(R5_Wantread, &dev->flags);
2911 s->locked++; 2634 s->locked++;
2912 qread++;
2913 } else { 2635 } else {
2914 set_bit(STRIPE_DELAYED, &sh->state); 2636 set_bit(STRIPE_DELAYED, &sh->state);
2915 set_bit(STRIPE_HANDLE, &sh->state); 2637 set_bit(STRIPE_HANDLE, &sh->state);
2916 } 2638 }
2917 } 2639 }
2918 } 2640 }
2919 if (rcw)
2920 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
2921 (unsigned long long)sh->sector,
2922 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
2923 } 2641 }
2924 /* now if nothing is locked, and if we have enough data, 2642 /* now if nothing is locked, and if we have enough data,
2925 * we can start a write request 2643 * we can start a write request
@@ -2937,7 +2655,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
2937 schedule_reconstruction(sh, s, rcw == 0, 0); 2655 schedule_reconstruction(sh, s, rcw == 0, 0);
2938} 2656}
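The core of handle_stripe_dirtying(), unchanged across the hunk, is a cost comparison: count how many blocks a read-modify-write would have to read versus a reconstruct-write, then lock and read for the cheaper strategy (RAID6 always forces rcw). The sketch below is a simplified, standalone version of that counting — the per-device flags are reduced to three booleans, so it is an approximation of the kernel's conditions, not a copy of them:

    #include <stdbool.h>
    #include <stdio.h>

    struct toy_dev {
        bool towrite;       /* new data queued for this block */
        bool uptodate;      /* block already in the stripe cache */
        bool is_parity;
    };

    /* Count the extra reads each strategy would need. */
    static void count_reads(const struct toy_dev *dev, int disks,
                            int *rmw, int *rcw)
    {
        int i;

        *rmw = *rcw = 0;
        for (i = 0; i < disks; i++) {
            /* rmw must read old data for rewritten blocks plus old parity */
            if ((dev[i].towrite || dev[i].is_parity) && !dev[i].uptodate)
                (*rmw)++;
            /* rcw must read every data block that is not being overwritten */
            if (!dev[i].is_parity && !dev[i].towrite && !dev[i].uptodate)
                (*rcw)++;
        }
    }

    int main(void)
    {
        struct toy_dev dev[6] = {
            { true,  false, false },    /* data 0: being rewritten */
            { false, false, false },
            { false, false, false },
            { false, false, false },
            { false, false, false },
            { false, false, true  },    /* parity */
        };
        int rmw, rcw;

        count_reads(dev, 6, &rmw, &rcw);
        printf("rmw needs %d reads, rcw needs %d: choose %s\n",
               rmw, rcw, rmw < rcw ? "rmw" : "rcw");
        return 0;
    }

With one block rewritten out of five data blocks, rmw needs two reads and rcw needs four, so read-modify-write wins — which is why the narrow-write case prefers rmw.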
2939 2657
2940static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 2658static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2941 struct stripe_head_state *s, int disks) 2659 struct stripe_head_state *s, int disks)
2942{ 2660{
2943 struct r5dev *dev = NULL; 2661 struct r5dev *dev = NULL;
@@ -2998,7 +2716,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
2998 */ 2716 */
2999 set_bit(STRIPE_INSYNC, &sh->state); 2717 set_bit(STRIPE_INSYNC, &sh->state);
3000 else { 2718 else {
3001 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 2719 conf->mddev->resync_mismatches += STRIPE_SECTORS;
3002 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2720 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
3003 /* don't try to repair!! */ 2721 /* don't try to repair!! */
3004 set_bit(STRIPE_INSYNC, &sh->state); 2722 set_bit(STRIPE_INSYNC, &sh->state);
@@ -3025,7 +2743,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
3025} 2743}
3026 2744
3027 2745
3028static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 2746static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
3029 struct stripe_head_state *s, 2747 struct stripe_head_state *s,
3030 int disks) 2748 int disks)
3031{ 2749{
@@ -3150,7 +2868,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
3150 */ 2868 */
3151 } 2869 }
3152 } else { 2870 } else {
3153 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 2871 conf->mddev->resync_mismatches += STRIPE_SECTORS;
3154 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2872 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
3155 /* don't try to repair!! */ 2873 /* don't try to repair!! */
3156 set_bit(STRIPE_INSYNC, &sh->state); 2874 set_bit(STRIPE_INSYNC, &sh->state);
@@ -3188,7 +2906,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
3188 } 2906 }
3189} 2907}
3190 2908
3191static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 2909static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh)
3192{ 2910{
3193 int i; 2911 int i;
3194 2912
@@ -3241,33 +2959,40 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
3241 2959
3242 } 2960 }
3243 /* done submitting copies, wait for them to complete */ 2961 /* done submitting copies, wait for them to complete */
3244 async_tx_quiesce(&tx); 2962 if (tx) {
2963 async_tx_ack(tx);
2964 dma_wait_for_async_tx(tx);
2965 }
3245} 2966}
3246 2967
2968
3247/* 2969/*
3248 * handle_stripe - do things to a stripe. 2970 * handle_stripe - do things to a stripe.
3249 * 2971 *
3250 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 2972 * We lock the stripe and then examine the state of various bits
3251 * state of various bits to see what needs to be done. 2973 * to see what needs to be done.
3252 * Possible results: 2974 * Possible results:
3253 * return some read requests which now have data 2975 * return some read request which now have data
3254 * return some write requests which are safely on storage 2976 * return some write requests which are safely on disc
3255 * schedule a read on some buffers 2977 * schedule a read on some buffers
3256 * schedule a write of some buffers 2978 * schedule a write of some buffers
3257 * return confirmation of parity correctness 2979 * return confirmation of parity correctness
3258 * 2980 *
2981 * buffers are taken off read_list or write_list, and bh_cache buffers
2982 * get BH_Lock set before the stripe lock is released.
2983 *
3259 */ 2984 */
3260 2985
3261static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 2986static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3262{ 2987{
3263 struct r5conf *conf = sh->raid_conf; 2988 raid5_conf_t *conf = sh->raid_conf;
3264 int disks = sh->disks; 2989 int disks = sh->disks;
3265 struct r5dev *dev; 2990 struct r5dev *dev;
3266 int i; 2991 int i;
3267 int do_recovery = 0;
3268 2992
3269 memset(s, 0, sizeof(*s)); 2993 memset(s, 0, sizeof(*s));
3270 2994
2995 s->syncing = test_bit(STRIPE_SYNCING, &sh->state);
3271 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2996 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3272 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 2997 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
3273 s->failed_num[0] = -1; 2998 s->failed_num[0] = -1;
@@ -3275,8 +3000,9 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3275 3000
3276 /* Now to look around and see what can be done */ 3001 /* Now to look around and see what can be done */
3277 rcu_read_lock(); 3002 rcu_read_lock();
3003 spin_lock_irq(&conf->device_lock);
3278 for (i=disks; i--; ) { 3004 for (i=disks; i--; ) {
3279 struct md_rdev *rdev; 3005 mdk_rdev_t *rdev;
3280 sector_t first_bad; 3006 sector_t first_bad;
3281 int bad_sectors; 3007 int bad_sectors;
3282 int is_bad = 0; 3008 int is_bad = 0;
@@ -3284,8 +3010,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3284 dev = &sh->dev[i]; 3010 dev = &sh->dev[i];
3285 3011
3286 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3012 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3287 i, dev->flags, 3013 i, dev->flags, dev->toread, dev->towrite, dev->written);
3288 dev->toread, dev->towrite, dev->written);
3289 /* maybe we can reply to a read 3014 /* maybe we can reply to a read
3290 * 3015 *
3291 * new wantfill requests are only permitted while 3016 * new wantfill requests are only permitted while
@@ -3316,23 +3041,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3316 } 3041 }
3317 if (dev->written) 3042 if (dev->written)
3318 s->written++; 3043 s->written++;
3319 /* Prefer to use the replacement for reads, but only 3044 rdev = rcu_dereference(conf->disks[i].rdev);
3320 * if it is recovered enough and has no bad blocks.
3321 */
3322 rdev = rcu_dereference(conf->disks[i].replacement);
3323 if (rdev && !test_bit(Faulty, &rdev->flags) &&
3324 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
3325 !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
3326 &first_bad, &bad_sectors))
3327 set_bit(R5_ReadRepl, &dev->flags);
3328 else {
3329 if (rdev)
3330 set_bit(R5_NeedReplace, &dev->flags);
3331 rdev = rcu_dereference(conf->disks[i].rdev);
3332 clear_bit(R5_ReadRepl, &dev->flags);
3333 }
3334 if (rdev && test_bit(Faulty, &rdev->flags))
3335 rdev = NULL;
3336 if (rdev) { 3045 if (rdev) {
3337 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 3046 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
3338 &first_bad, &bad_sectors); 3047 &first_bad, &bad_sectors);
@@ -3351,8 +3060,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3351 /* Not in-sync */; 3060 /* Not in-sync */;
3352 else if (is_bad) { 3061 else if (is_bad) {
3353 /* also not in-sync */ 3062 /* also not in-sync */
3354 if (!test_bit(WriteErrorSeen, &rdev->flags) && 3063 if (!test_bit(WriteErrorSeen, &rdev->flags)) {
3355 test_bit(R5_UPTODATE, &dev->flags)) {
3356 /* treat as in-sync, but with a read error 3064 /* treat as in-sync, but with a read error
3357 * which we can now try to correct 3065 * which we can now try to correct
3358 */ 3066 */
@@ -3361,50 +3069,26 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3361 } 3069 }
3362 } else if (test_bit(In_sync, &rdev->flags)) 3070 } else if (test_bit(In_sync, &rdev->flags))
3363 set_bit(R5_Insync, &dev->flags); 3071 set_bit(R5_Insync, &dev->flags);
3364 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 3072 else if (!test_bit(Faulty, &rdev->flags)) {
3365 /* in sync if before recovery_offset */ 3073 /* in sync if before recovery_offset */
3366 set_bit(R5_Insync, &dev->flags); 3074 if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
3367 else if (test_bit(R5_UPTODATE, &dev->flags) && 3075 set_bit(R5_Insync, &dev->flags);
3368 test_bit(R5_Expanded, &dev->flags)) 3076 }
3369 /* If we've reshaped into here, we assume it is Insync. 3077 if (test_bit(R5_WriteError, &dev->flags)) {
3370 * We will shortly update recovery_offset to make 3078 clear_bit(R5_Insync, &dev->flags);
3371 * it official. 3079 if (!test_bit(Faulty, &rdev->flags)) {
3372 */
3373 set_bit(R5_Insync, &dev->flags);
3374
3375 if (rdev && test_bit(R5_WriteError, &dev->flags)) {
3376 /* This flag does not apply to '.replacement'
3377 * only to .rdev, so make sure to check that*/
3378 struct md_rdev *rdev2 = rcu_dereference(
3379 conf->disks[i].rdev);
3380 if (rdev2 == rdev)
3381 clear_bit(R5_Insync, &dev->flags);
3382 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3383 s->handle_bad_blocks = 1; 3080 s->handle_bad_blocks = 1;
3384 atomic_inc(&rdev2->nr_pending); 3081 atomic_inc(&rdev->nr_pending);
3385 } else 3082 } else
3386 clear_bit(R5_WriteError, &dev->flags); 3083 clear_bit(R5_WriteError, &dev->flags);
3387 } 3084 }
3388 if (rdev && test_bit(R5_MadeGood, &dev->flags)) { 3085 if (test_bit(R5_MadeGood, &dev->flags)) {
3389 /* This flag does not apply to '.replacement' 3086 if (!test_bit(Faulty, &rdev->flags)) {
3390 * only to .rdev, so make sure to check that*/
3391 struct md_rdev *rdev2 = rcu_dereference(
3392 conf->disks[i].rdev);
3393 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3394 s->handle_bad_blocks = 1; 3087 s->handle_bad_blocks = 1;
3395 atomic_inc(&rdev2->nr_pending); 3088 atomic_inc(&rdev->nr_pending);
3396 } else 3089 } else
3397 clear_bit(R5_MadeGood, &dev->flags); 3090 clear_bit(R5_MadeGood, &dev->flags);
3398 } 3091 }
3399 if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
3400 struct md_rdev *rdev2 = rcu_dereference(
3401 conf->disks[i].replacement);
3402 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3403 s->handle_bad_blocks = 1;
3404 atomic_inc(&rdev2->nr_pending);
3405 } else
3406 clear_bit(R5_MadeGoodRepl, &dev->flags);
3407 }
3408 if (!test_bit(R5_Insync, &dev->flags)) { 3092 if (!test_bit(R5_Insync, &dev->flags)) {
3409 /* The ReadError flag will just be confusing now */ 3093 /* The ReadError flag will just be confusing now */
3410 clear_bit(R5_ReadError, &dev->flags); 3094 clear_bit(R5_ReadError, &dev->flags);
@@ -3416,33 +3100,16 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3416 if (s->failed < 2) 3100 if (s->failed < 2)
3417 s->failed_num[s->failed] = i; 3101 s->failed_num[s->failed] = i;
3418 s->failed++; 3102 s->failed++;
3419 if (rdev && !test_bit(Faulty, &rdev->flags))
3420 do_recovery = 1;
3421 } 3103 }
3422 } 3104 }
3423 if (test_bit(STRIPE_SYNCING, &sh->state)) { 3105 spin_unlock_irq(&conf->device_lock);
3424 /* If there is a failed device being replaced,
3425 * we must be recovering.
3426 * else if we are after recovery_cp, we must be syncing
3427 * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
3428 * else we can only be replacing
3429 * sync and recovery both need to read all devices, and so
3430 * use the same flag.
3431 */
3432 if (do_recovery ||
3433 sh->sector >= conf->mddev->recovery_cp ||
3434 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
3435 s->syncing = 1;
3436 else
3437 s->replacing = 1;
3438 }
3439 rcu_read_unlock(); 3106 rcu_read_unlock();
3440} 3107}
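The largest behavioural difference in analyse_stripe() is on the left column: reads prefer the replacement device, but only when it has been rebuilt past this stripe and has no bad block covering it; otherwise the original rdev is used. A hedged sketch of that selection rule, with an invented candidate-device struct standing in for the rdev fields, is:

    #include <stdbool.h>
    #include <stdio.h>

    struct cand_dev {
        bool present;
        bool faulty;
        long long recovery_offset;  /* rebuilt up to this sector (exclusive) */
        bool has_badblock_here;
    };

    /* Pick the device to read a stripe from, preferring a replacement that
     * has already been rebuilt past this stripe. */
    static const char *pick_read_source(const struct cand_dev *repl,
                                        const struct cand_dev *rdev,
                                        long long sector, int sectors)
    {
        if (repl->present && !repl->faulty &&
            repl->recovery_offset >= sector + sectors &&
            !repl->has_badblock_here)
            return "replacement";
        if (rdev->present && !rdev->faulty)
            return "rdev";
        return "none";
    }

    int main(void)
    {
        struct cand_dev repl = { true, false, 1024, false };
        struct cand_dev rdev = { true, false, -1,   false };

        printf("read %s\n", pick_read_source(&repl, &rdev, 512, 8));
        printf("read %s\n", pick_read_source(&repl, &rdev, 2048, 8));
        return 0;
    }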
3441 3108
3442static void handle_stripe(struct stripe_head *sh) 3109static void handle_stripe(struct stripe_head *sh)
3443{ 3110{
3444 struct stripe_head_state s; 3111 struct stripe_head_state s;
3445 struct r5conf *conf = sh->raid_conf; 3112 raid5_conf_t *conf = sh->raid_conf;
3446 int i; 3113 int i;
3447 int prexor; 3114 int prexor;
3448 int disks = sh->disks; 3115 int disks = sh->disks;
@@ -3477,7 +3144,7 @@ static void handle_stripe(struct stripe_head *sh)
3477 3144
3478 if (unlikely(s.blocked_rdev)) { 3145 if (unlikely(s.blocked_rdev)) {
3479 if (s.syncing || s.expanding || s.expanded || 3146 if (s.syncing || s.expanding || s.expanded ||
3480 s.replacing || s.to_write || s.written) { 3147 s.to_write || s.written) {
3481 set_bit(STRIPE_HANDLE, &sh->state); 3148 set_bit(STRIPE_HANDLE, &sh->state);
3482 goto finish; 3149 goto finish;
3483 } 3150 }
@@ -3503,10 +3170,40 @@ static void handle_stripe(struct stripe_head *sh)
3503 sh->reconstruct_state = 0; 3170 sh->reconstruct_state = 0;
3504 if (s.to_read+s.to_write+s.written) 3171 if (s.to_read+s.to_write+s.written)
3505 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); 3172 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
3506 if (s.syncing + s.replacing) 3173 if (s.syncing)
3507 handle_failed_sync(conf, sh, &s); 3174 handle_failed_sync(conf, sh, &s);
3508 } 3175 }
3509 3176
3177 /*
3178 * might be able to return some write requests if the parity blocks
3179 * are safe, or on a failed drive
3180 */
3181 pdev = &sh->dev[sh->pd_idx];
3182 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
3183 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
3184 qdev = &sh->dev[sh->qd_idx];
3185 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
3186 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
3187 || conf->level < 6;
3188
3189 if (s.written &&
3190 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
3191 && !test_bit(R5_LOCKED, &pdev->flags)
3192 && test_bit(R5_UPTODATE, &pdev->flags)))) &&
3193 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
3194 && !test_bit(R5_LOCKED, &qdev->flags)
3195 && test_bit(R5_UPTODATE, &qdev->flags)))))
3196 handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
3197
3198 /* Now we might consider reading some blocks, either to check/generate
3199 * parity, or to satisfy requests
3200 * or to load a block that is being partially written.
3201 */
3202 if (s.to_read || s.non_overwrite
3203 || (conf->level == 6 && s.to_write && s.failed)
3204 || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
3205 handle_stripe_fill(sh, &s, disks);
3206
3510 /* Now we check to see if any write operations have recently 3207 /* Now we check to see if any write operations have recently
3511 * completed 3208 * completed
3512 */ 3209 */
@@ -3520,11 +3217,9 @@ static void handle_stripe(struct stripe_head *sh)
3520 /* All the 'written' buffers and the parity block are ready to 3217 /* All the 'written' buffers and the parity block are ready to
3521 * be written back to disk 3218 * be written back to disk
3522 */ 3219 */
3523 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && 3220 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
3524 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
3525 BUG_ON(sh->qd_idx >= 0 && 3221 BUG_ON(sh->qd_idx >= 0 &&
3526 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && 3222 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags));
3527 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
3528 for (i = disks; i--; ) { 3223 for (i = disks; i--; ) {
3529 struct r5dev *dev = &sh->dev[i]; 3224 struct r5dev *dev = &sh->dev[i];
3530 if (test_bit(R5_LOCKED, &dev->flags) && 3225 if (test_bit(R5_LOCKED, &dev->flags) &&
@@ -3544,40 +3239,6 @@ static void handle_stripe(struct stripe_head *sh)
3544 s.dec_preread_active = 1; 3239 s.dec_preread_active = 1;
3545 } 3240 }
3546 3241
3547 /*
3548 * might be able to return some write requests if the parity blocks
3549 * are safe, or on a failed drive
3550 */
3551 pdev = &sh->dev[sh->pd_idx];
3552 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
3553 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
3554 qdev = &sh->dev[sh->qd_idx];
3555 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
3556 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
3557 || conf->level < 6;
3558
3559 if (s.written &&
3560 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
3561 && !test_bit(R5_LOCKED, &pdev->flags)
3562 && (test_bit(R5_UPTODATE, &pdev->flags) ||
3563 test_bit(R5_Discard, &pdev->flags))))) &&
3564 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
3565 && !test_bit(R5_LOCKED, &qdev->flags)
3566 && (test_bit(R5_UPTODATE, &qdev->flags) ||
3567 test_bit(R5_Discard, &qdev->flags))))))
3568 handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
3569
3570 /* Now we might consider reading some blocks, either to check/generate
3571 * parity, or to satisfy requests
3572 * or to load a block that is being partially written.
3573 */
3574 if (s.to_read || s.non_overwrite
3575 || (conf->level == 6 && s.to_write && s.failed)
3576 || (s.syncing && (s.uptodate + s.compute < disks))
3577 || s.replacing
3578 || s.expanding)
3579 handle_stripe_fill(sh, &s, disks);
3580
3581 /* Now to consider new write requests and what else, if anything 3242 /* Now to consider new write requests and what else, if anything
3582 * should be read. We do not handle new writes when: 3243 * should be read. We do not handle new writes when:
3583 * 1/ A 'write' operation (copy+xor) is already in flight. 3244 * 1/ A 'write' operation (copy+xor) is already in flight.
@@ -3602,20 +3263,7 @@ static void handle_stripe(struct stripe_head *sh)
3602 handle_parity_checks5(conf, sh, &s, disks); 3263 handle_parity_checks5(conf, sh, &s, disks);
3603 } 3264 }
3604 3265
3605 if (s.replacing && s.locked == 0 3266 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3606 && !test_bit(STRIPE_INSYNC, &sh->state)) {
3607 /* Write out to replacement devices where possible */
3608 for (i = 0; i < conf->raid_disks; i++)
3609 if (test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
3610 test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
3611 set_bit(R5_WantReplace, &sh->dev[i].flags);
3612 set_bit(R5_LOCKED, &sh->dev[i].flags);
3613 s.locked++;
3614 }
3615 set_bit(STRIPE_INSYNC, &sh->state);
3616 }
3617 if ((s.syncing || s.replacing) && s.locked == 0 &&
3618 test_bit(STRIPE_INSYNC, &sh->state)) {
3619 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3267 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3620 clear_bit(STRIPE_SYNCING, &sh->state); 3268 clear_bit(STRIPE_SYNCING, &sh->state);
3621 } 3269 }
@@ -3692,22 +3340,12 @@ static void handle_stripe(struct stripe_head *sh)
3692 3340
3693finish: 3341finish:
3694 /* wait for this device to become unblocked */ 3342 /* wait for this device to become unblocked */
3695 if (unlikely(s.blocked_rdev)) { 3343 if (conf->mddev->external && unlikely(s.blocked_rdev))
3696 if (conf->mddev->external) 3344 md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev);
3697 md_wait_for_blocked_rdev(s.blocked_rdev,
3698 conf->mddev);
3699 else
3700 /* Internal metadata will immediately
3701 * be written by raid5d, so we don't
3702 * need to wait here.
3703 */
3704 rdev_dec_pending(s.blocked_rdev,
3705 conf->mddev);
3706 }
3707 3345
3708 if (s.handle_bad_blocks) 3346 if (s.handle_bad_blocks)
3709 for (i = disks; i--; ) { 3347 for (i = disks; i--; ) {
3710 struct md_rdev *rdev; 3348 mdk_rdev_t *rdev;
3711 struct r5dev *dev = &sh->dev[i]; 3349 struct r5dev *dev = &sh->dev[i];
3712 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 3350 if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
3713 /* We own a safe reference to the rdev */ 3351 /* We own a safe reference to the rdev */
@@ -3720,16 +3358,7 @@ finish:
3720 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 3358 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
3721 rdev = conf->disks[i].rdev; 3359 rdev = conf->disks[i].rdev;
3722 rdev_clear_badblocks(rdev, sh->sector, 3360 rdev_clear_badblocks(rdev, sh->sector,
3723 STRIPE_SECTORS, 0); 3361 STRIPE_SECTORS);
3724 rdev_dec_pending(rdev, conf->mddev);
3725 }
3726 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
3727 rdev = conf->disks[i].replacement;
3728 if (!rdev)
3729 /* rdev have been moved down */
3730 rdev = conf->disks[i].rdev;
3731 rdev_clear_badblocks(rdev, sh->sector,
3732 STRIPE_SECTORS, 0);
3733 rdev_dec_pending(rdev, conf->mddev); 3362 rdev_dec_pending(rdev, conf->mddev);
3734 } 3363 }
3735 } 3364 }
@@ -3755,7 +3384,7 @@ finish:
3755 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 3384 clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
3756} 3385}
3757 3386
3758static void raid5_activate_delayed(struct r5conf *conf) 3387static void raid5_activate_delayed(raid5_conf_t *conf)
3759{ 3388{
3760 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3389 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
3761 while (!list_empty(&conf->delayed_list)) { 3390 while (!list_empty(&conf->delayed_list)) {
@@ -3771,7 +3400,7 @@ static void raid5_activate_delayed(struct r5conf *conf)
3771 } 3400 }
3772} 3401}
3773 3402
3774static void activate_bit_delay(struct r5conf *conf) 3403static void activate_bit_delay(raid5_conf_t *conf)
3775{ 3404{
3776 /* device_lock is held */ 3405 /* device_lock is held */
3777 struct list_head head; 3406 struct list_head head;
@@ -3785,9 +3414,9 @@ static void activate_bit_delay(struct r5conf *conf)
3785 } 3414 }
3786} 3415}
3787 3416
3788int md_raid5_congested(struct mddev *mddev, int bits) 3417int md_raid5_congested(mddev_t *mddev, int bits)
3789{ 3418{
3790 struct r5conf *conf = mddev->private; 3419 raid5_conf_t *conf = mddev->private;
3791 3420
3792 /* No difference between reads and writes. Just check 3421 /* No difference between reads and writes. Just check
3793 * how busy the stripe_cache is 3422 * how busy the stripe_cache is
@@ -3806,7 +3435,7 @@ EXPORT_SYMBOL_GPL(md_raid5_congested);
3806 3435
3807static int raid5_congested(void *data, int bits) 3436static int raid5_congested(void *data, int bits)
3808{ 3437{
3809 struct mddev *mddev = data; 3438 mddev_t *mddev = data;
3810 3439
3811 return mddev_congested(mddev, bits) || 3440 return mddev_congested(mddev, bits) ||
3812 md_raid5_congested(mddev, bits); 3441 md_raid5_congested(mddev, bits);
@@ -3819,7 +3448,7 @@ static int raid5_mergeable_bvec(struct request_queue *q,
3819 struct bvec_merge_data *bvm, 3448 struct bvec_merge_data *bvm,
3820 struct bio_vec *biovec) 3449 struct bio_vec *biovec)
3821{ 3450{
3822 struct mddev *mddev = q->queuedata; 3451 mddev_t *mddev = q->queuedata;
3823 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 3452 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
3824 int max; 3453 int max;
3825 unsigned int chunk_sectors = mddev->chunk_sectors; 3454 unsigned int chunk_sectors = mddev->chunk_sectors;
@@ -3839,7 +3468,7 @@ static int raid5_mergeable_bvec(struct request_queue *q,
3839} 3468}
3840 3469
3841 3470
3842static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 3471static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
3843{ 3472{
3844 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 3473 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
3845 unsigned int chunk_sectors = mddev->chunk_sectors; 3474 unsigned int chunk_sectors = mddev->chunk_sectors;
@@ -3855,7 +3484,7 @@ static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
3855 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 3484 * add bio to the retry LIFO ( in O(1) ... we are in interrupt )
3856 * later sampled by raid5d. 3485 * later sampled by raid5d.
3857 */ 3486 */
3858static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 3487static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf)
3859{ 3488{
3860 unsigned long flags; 3489 unsigned long flags;
3861 3490
@@ -3869,7 +3498,7 @@ static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
3869} 3498}
3870 3499
3871 3500
3872static struct bio *remove_bio_from_retry(struct r5conf *conf) 3501static struct bio *remove_bio_from_retry(raid5_conf_t *conf)
3873{ 3502{
3874 struct bio *bi; 3503 struct bio *bi;
3875 3504
@@ -3886,7 +3515,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
3886 * this sets the active stripe count to 1 and the processed 3515 * this sets the active stripe count to 1 and the processed
3887 * stripe count to zero (upper 8 bits) 3516 * stripe count to zero (upper 8 bits)
3888 */ 3517 */
3889 raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ 3518 bi->bi_phys_segments = 1; /* biased count of active stripes */
3890 } 3519 }
3891 3520
3892 return bi; 3521 return bi;
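[Editor's note] Both sides of this hunk overload a bio field as a biased count of active stripes: the field starts at 1 so the submitting thread keeps the bio alive while stripes are attached, and the request completes once the count (including that bias) returns to zero. A small stand-alone sketch of the pattern; the helper names are hypothetical, not the raid5 accessors:

#include <stdatomic.h>
#include <stdbool.h>

struct request_ctx {
        atomic_int active_stripes;      /* plays the role of bi_phys_segments */
};

static void request_start(struct request_ctx *ctx)
{
        atomic_store(&ctx->active_stripes, 1);      /* the bias */
}

static void request_get(struct request_ctx *ctx)
{
        atomic_fetch_add(&ctx->active_stripes, 1);  /* one more stripe holds it */
}

static bool request_put(struct request_ctx *ctx)
{
        /* true when the last reference, including the bias, is gone */
        return atomic_fetch_sub(&ctx->active_stripes, 1) == 1;
}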
@@ -3902,10 +3531,10 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
3902static void raid5_align_endio(struct bio *bi, int error) 3531static void raid5_align_endio(struct bio *bi, int error)
3903{ 3532{
3904 struct bio* raid_bi = bi->bi_private; 3533 struct bio* raid_bi = bi->bi_private;
3905 struct mddev *mddev; 3534 mddev_t *mddev;
3906 struct r5conf *conf; 3535 raid5_conf_t *conf;
3907 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 3536 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
3908 struct md_rdev *rdev; 3537 mdk_rdev_t *rdev;
3909 3538
3910 bio_put(bi); 3539 bio_put(bi);
3911 3540
@@ -3917,8 +3546,6 @@ static void raid5_align_endio(struct bio *bi, int error)
3917 rdev_dec_pending(rdev, conf->mddev); 3546 rdev_dec_pending(rdev, conf->mddev);
3918 3547
3919 if (!error && uptodate) { 3548 if (!error && uptodate) {
3920 trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
3921 raid_bi, 0);
3922 bio_endio(raid_bi, 0); 3549 bio_endio(raid_bi, 0);
3923 if (atomic_dec_and_test(&conf->active_aligned_reads)) 3550 if (atomic_dec_and_test(&conf->active_aligned_reads))
3924 wake_up(&conf->wait_for_stripe); 3551 wake_up(&conf->wait_for_stripe);
@@ -3951,13 +3578,12 @@ static int bio_fits_rdev(struct bio *bi)
3951} 3578}
3952 3579
3953 3580
3954static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) 3581static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
3955{ 3582{
3956 struct r5conf *conf = mddev->private; 3583 raid5_conf_t *conf = mddev->private;
3957 int dd_idx; 3584 int dd_idx;
3958 struct bio* align_bi; 3585 struct bio* align_bi;
3959 struct md_rdev *rdev; 3586 mdk_rdev_t *rdev;
3960 sector_t end_sector;
3961 3587
3962 if (!in_chunk_boundary(mddev, raid_bio)) { 3588 if (!in_chunk_boundary(mddev, raid_bio)) {
3963 pr_debug("chunk_aligned_read : non aligned\n"); 3589 pr_debug("chunk_aligned_read : non aligned\n");
@@ -3982,19 +3608,9 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
3982 0, 3608 0,
3983 &dd_idx, NULL); 3609 &dd_idx, NULL);
3984 3610
3985 end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9);
3986 rcu_read_lock(); 3611 rcu_read_lock();
3987 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 3612 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
3988 if (!rdev || test_bit(Faulty, &rdev->flags) || 3613 if (rdev && test_bit(In_sync, &rdev->flags)) {
3989 rdev->recovery_offset < end_sector) {
3990 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
3991 if (rdev &&
3992 (test_bit(Faulty, &rdev->flags) ||
3993 !(test_bit(In_sync, &rdev->flags) ||
3994 rdev->recovery_offset >= end_sector)))
3995 rdev = NULL;
3996 }
3997 if (rdev) {
3998 sector_t first_bad; 3614 sector_t first_bad;
3999 int bad_sectors; 3615 int bad_sectors;
4000 3616
@@ -4003,6 +3619,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
4003 raid_bio->bi_next = (void*)rdev; 3619 raid_bio->bi_next = (void*)rdev;
4004 align_bi->bi_bdev = rdev->bdev; 3620 align_bi->bi_bdev = rdev->bdev;
4005 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 3621 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
3622 align_bi->bi_sector += rdev->data_offset;
4006 3623
4007 if (!bio_fits_rdev(align_bi) || 3624 if (!bio_fits_rdev(align_bi) ||
4008 is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, 3625 is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
@@ -4013,19 +3630,13 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
4013 return 0; 3630 return 0;
4014 } 3631 }
4015 3632
4016 /* No reshape active, so we can trust rdev->data_offset */
4017 align_bi->bi_sector += rdev->data_offset;
4018
4019 spin_lock_irq(&conf->device_lock); 3633 spin_lock_irq(&conf->device_lock);
4020 wait_event_lock_irq(conf->wait_for_stripe, 3634 wait_event_lock_irq(conf->wait_for_stripe,
4021 conf->quiesce == 0, 3635 conf->quiesce == 0,
4022 conf->device_lock); 3636 conf->device_lock, /* nothing */);
4023 atomic_inc(&conf->active_aligned_reads); 3637 atomic_inc(&conf->active_aligned_reads);
4024 spin_unlock_irq(&conf->device_lock); 3638 spin_unlock_irq(&conf->device_lock);
4025 3639
4026 trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
4027 align_bi, disk_devt(mddev->gendisk),
4028 raid_bio->bi_sector);
4029 generic_make_request(align_bi); 3640 generic_make_request(align_bi);
4030 return 1; 3641 return 1;
4031 } else { 3642 } else {
@@ -4045,7 +3656,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
4045 * head of the hold_list has changed, i.e. the head was promoted to the 3656 * head of the hold_list has changed, i.e. the head was promoted to the
4046 * handle_list. 3657 * handle_list.
4047 */ 3658 */
4048static struct stripe_head *__get_priority_stripe(struct r5conf *conf) 3659static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
4049{ 3660{
4050 struct stripe_head *sh; 3661 struct stripe_head *sh;
4051 3662
@@ -4088,160 +3699,20 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
4088 return sh; 3699 return sh;
4089} 3700}
4090 3701
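[Editor's note] __get_priority_stripe() above implements a two-queue policy: urgent stripes sit on handle_list, stripes that would need a read-modify-write preread wait on hold_list and may only be bypassed a limited number of times. A generic sketch of that flavour of scheduling, not the md data structures; names are illustrative:

#include <stddef.h>

struct work { struct work *next; };

struct stripe_sched {
        struct work *handle_list;   /* urgent: full-stripe writes, reads */
        struct work *hold_list;     /* delayed: would need preread */
        int bypass_count;           /* times the hold list was passed over */
        int bypass_threshold;
};

static struct work *pick_next(struct stripe_sched *s)
{
        struct work *w = NULL;

        if (s->handle_list &&
            !(s->hold_list && s->bypass_count > s->bypass_threshold)) {
                /* service urgent work, but count how long the hold list waits */
                w = s->handle_list;
                s->handle_list = w->next;
                if (s->hold_list)
                        s->bypass_count++;
        } else if (s->hold_list) {
                /* promote the hold list: nothing urgent, or bypassed too often */
                w = s->hold_list;
                s->hold_list = w->next;
                s->bypass_count = 0;
        }
        return w;
}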
4091struct raid5_plug_cb { 3702static int make_request(mddev_t *mddev, struct bio * bi)
4092 struct blk_plug_cb cb;
4093 struct list_head list;
4094};
4095
4096static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
4097{
4098 struct raid5_plug_cb *cb = container_of(
4099 blk_cb, struct raid5_plug_cb, cb);
4100 struct stripe_head *sh;
4101 struct mddev *mddev = cb->cb.data;
4102 struct r5conf *conf = mddev->private;
4103 int cnt = 0;
4104
4105 if (cb->list.next && !list_empty(&cb->list)) {
4106 spin_lock_irq(&conf->device_lock);
4107 while (!list_empty(&cb->list)) {
4108 sh = list_first_entry(&cb->list, struct stripe_head, lru);
4109 list_del_init(&sh->lru);
4110 /*
4111 * avoid race release_stripe_plug() sees
4112 * STRIPE_ON_UNPLUG_LIST clear but the stripe
4113 * is still in our list
4114 */
4115 smp_mb__before_clear_bit();
4116 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
4117 __release_stripe(conf, sh);
4118 cnt++;
4119 }
4120 spin_unlock_irq(&conf->device_lock);
4121 }
4122 trace_block_unplug(mddev->queue, cnt, !from_schedule);
4123 kfree(cb);
4124}
4125
4126static void release_stripe_plug(struct mddev *mddev,
4127 struct stripe_head *sh)
4128{
4129 struct blk_plug_cb *blk_cb = blk_check_plugged(
4130 raid5_unplug, mddev,
4131 sizeof(struct raid5_plug_cb));
4132 struct raid5_plug_cb *cb;
4133
4134 if (!blk_cb) {
4135 release_stripe(sh);
4136 return;
4137 }
4138
4139 cb = container_of(blk_cb, struct raid5_plug_cb, cb);
4140
4141 if (cb->list.next == NULL)
4142 INIT_LIST_HEAD(&cb->list);
4143
4144 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
4145 list_add_tail(&sh->lru, &cb->list);
4146 else
4147 release_stripe(sh);
4148}
4149
4150static void make_discard_request(struct mddev *mddev, struct bio *bi)
4151{
4152 struct r5conf *conf = mddev->private;
4153 sector_t logical_sector, last_sector;
4154 struct stripe_head *sh;
4155 int remaining;
4156 int stripe_sectors;
4157
4158 if (mddev->reshape_position != MaxSector)
4159 /* Skip discard while reshape is happening */
4160 return;
4161
4162 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4163 last_sector = bi->bi_sector + (bi->bi_size>>9);
4164
4165 bi->bi_next = NULL;
4166 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
4167
4168 stripe_sectors = conf->chunk_sectors *
4169 (conf->raid_disks - conf->max_degraded);
4170 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
4171 stripe_sectors);
4172 sector_div(last_sector, stripe_sectors);
4173
4174 logical_sector *= conf->chunk_sectors;
4175 last_sector *= conf->chunk_sectors;
4176
4177 for (; logical_sector < last_sector;
4178 logical_sector += STRIPE_SECTORS) {
4179 DEFINE_WAIT(w);
4180 int d;
4181 again:
4182 sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
4183 prepare_to_wait(&conf->wait_for_overlap, &w,
4184 TASK_UNINTERRUPTIBLE);
4185 spin_lock_irq(&sh->stripe_lock);
4186 for (d = 0; d < conf->raid_disks; d++) {
4187 if (d == sh->pd_idx || d == sh->qd_idx)
4188 continue;
4189 if (sh->dev[d].towrite || sh->dev[d].toread) {
4190 set_bit(R5_Overlap, &sh->dev[d].flags);
4191 spin_unlock_irq(&sh->stripe_lock);
4192 release_stripe(sh);
4193 schedule();
4194 goto again;
4195 }
4196 }
4197 finish_wait(&conf->wait_for_overlap, &w);
4198 for (d = 0; d < conf->raid_disks; d++) {
4199 if (d == sh->pd_idx || d == sh->qd_idx)
4200 continue;
4201 sh->dev[d].towrite = bi;
4202 set_bit(R5_OVERWRITE, &sh->dev[d].flags);
4203 raid5_inc_bi_active_stripes(bi);
4204 }
4205 spin_unlock_irq(&sh->stripe_lock);
4206 if (conf->mddev->bitmap) {
4207 for (d = 0;
4208 d < conf->raid_disks - conf->max_degraded;
4209 d++)
4210 bitmap_startwrite(mddev->bitmap,
4211 sh->sector,
4212 STRIPE_SECTORS,
4213 0);
4214 sh->bm_seq = conf->seq_flush + 1;
4215 set_bit(STRIPE_BIT_DELAY, &sh->state);
4216 }
4217
4218 set_bit(STRIPE_HANDLE, &sh->state);
4219 clear_bit(STRIPE_DELAYED, &sh->state);
4220 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4221 atomic_inc(&conf->preread_active_stripes);
4222 release_stripe_plug(mddev, sh);
4223 }
4224
4225 remaining = raid5_dec_bi_active_stripes(bi);
4226 if (remaining == 0) {
4227 md_write_end(mddev);
4228 bio_endio(bi, 0);
4229 }
4230}
4231
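[Editor's note] make_discard_request() above shrinks the discard range inward so only stripes that are fully covered get discarded: the start is rounded up and the end rounded down to a multiple of the full-stripe size. A minimal sketch of that rounding in array-sector space (illustrative only, not a transcription of the kernel arithmetic):

#include <stdint.h>

struct range { uint64_t first; uint64_t last; };   /* [first, last) in sectors */

static struct range clamp_to_full_stripes(uint64_t start, uint64_t end,
                                          unsigned int chunk_sectors,
                                          unsigned int data_disks)
{
        uint64_t stripe_sectors = (uint64_t)chunk_sectors * data_disks;
        struct range r;

        /* round the start up, the end down, to whole stripes */
        r.first = ((start + stripe_sectors - 1) / stripe_sectors) * stripe_sectors;
        r.last  = (end / stripe_sectors) * stripe_sectors;
        if (r.last < r.first)
                r.last = r.first;       /* no stripe is fully covered */
        return r;
}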
4232static void make_request(struct mddev *mddev, struct bio * bi)
4233{ 3703{
4234 struct r5conf *conf = mddev->private; 3704 raid5_conf_t *conf = mddev->private;
4235 int dd_idx; 3705 int dd_idx;
4236 sector_t new_sector; 3706 sector_t new_sector;
4237 sector_t logical_sector, last_sector; 3707 sector_t logical_sector, last_sector;
4238 struct stripe_head *sh; 3708 struct stripe_head *sh;
4239 const int rw = bio_data_dir(bi); 3709 const int rw = bio_data_dir(bi);
4240 int remaining; 3710 int remaining;
3711 int plugged;
4241 3712
4242 if (unlikely(bi->bi_rw & REQ_FLUSH)) { 3713 if (unlikely(bi->bi_rw & REQ_FLUSH)) {
4243 md_flush_request(mddev, bi); 3714 md_flush_request(mddev, bi);
4244 return; 3715 return 0;
4245 } 3716 }
4246 3717
4247 md_write_start(mddev, bi); 3718 md_write_start(mddev, bi);
@@ -4249,24 +3720,22 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4249 if (rw == READ && 3720 if (rw == READ &&
4250 mddev->reshape_position == MaxSector && 3721 mddev->reshape_position == MaxSector &&
4251 chunk_aligned_read(mddev,bi)) 3722 chunk_aligned_read(mddev,bi))
4252 return; 3723 return 0;
4253
4254 if (unlikely(bi->bi_rw & REQ_DISCARD)) {
4255 make_discard_request(mddev, bi);
4256 return;
4257 }
4258 3724
4259 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 3725 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4260 last_sector = bi->bi_sector + (bi->bi_size>>9); 3726 last_sector = bi->bi_sector + (bi->bi_size>>9);
4261 bi->bi_next = NULL; 3727 bi->bi_next = NULL;
4262 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 3728 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
4263 3729
3730 plugged = mddev_check_plugged(mddev);
4264 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 3731 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
4265 DEFINE_WAIT(w); 3732 DEFINE_WAIT(w);
3733 int disks, data_disks;
4266 int previous; 3734 int previous;
4267 3735
4268 retry: 3736 retry:
4269 previous = 0; 3737 previous = 0;
3738 disks = conf->raid_disks;
4270 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 3739 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
4271 if (unlikely(conf->reshape_progress != MaxSector)) { 3740 if (unlikely(conf->reshape_progress != MaxSector)) {
4272 /* spinlock is needed as reshape_progress may be 3741 /* spinlock is needed as reshape_progress may be
@@ -4278,12 +3747,13 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4278 * to check again. 3747 * to check again.
4279 */ 3748 */
4280 spin_lock_irq(&conf->device_lock); 3749 spin_lock_irq(&conf->device_lock);
4281 if (mddev->reshape_backwards 3750 if (mddev->delta_disks < 0
4282 ? logical_sector < conf->reshape_progress 3751 ? logical_sector < conf->reshape_progress
4283 : logical_sector >= conf->reshape_progress) { 3752 : logical_sector >= conf->reshape_progress) {
3753 disks = conf->previous_raid_disks;
4284 previous = 1; 3754 previous = 1;
4285 } else { 3755 } else {
4286 if (mddev->reshape_backwards 3756 if (mddev->delta_disks < 0
4287 ? logical_sector < conf->reshape_safe 3757 ? logical_sector < conf->reshape_safe
4288 : logical_sector >= conf->reshape_safe) { 3758 : logical_sector >= conf->reshape_safe) {
4289 spin_unlock_irq(&conf->device_lock); 3759 spin_unlock_irq(&conf->device_lock);
@@ -4293,6 +3763,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4293 } 3763 }
4294 spin_unlock_irq(&conf->device_lock); 3764 spin_unlock_irq(&conf->device_lock);
4295 } 3765 }
3766 data_disks = disks - conf->max_degraded;
4296 3767
4297 new_sector = raid5_compute_sector(conf, logical_sector, 3768 new_sector = raid5_compute_sector(conf, logical_sector,
4298 previous, 3769 previous,
@@ -4315,7 +3786,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4315 */ 3786 */
4316 int must_retry = 0; 3787 int must_retry = 0;
4317 spin_lock_irq(&conf->device_lock); 3788 spin_lock_irq(&conf->device_lock);
4318 if (mddev->reshape_backwards 3789 if (mddev->delta_disks < 0
4319 ? logical_sector >= conf->reshape_progress 3790 ? logical_sector >= conf->reshape_progress
4320 : logical_sector < conf->reshape_progress) 3791 : logical_sector < conf->reshape_progress)
4321 /* mismatch, need to try again */ 3792 /* mismatch, need to try again */
@@ -4362,30 +3833,35 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4362 if ((bi->bi_rw & REQ_SYNC) && 3833 if ((bi->bi_rw & REQ_SYNC) &&
4363 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3834 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4364 atomic_inc(&conf->preread_active_stripes); 3835 atomic_inc(&conf->preread_active_stripes);
4365 release_stripe_plug(mddev, sh); 3836 release_stripe(sh);
4366 } else { 3837 } else {
4367 /* cannot get stripe for read-ahead, just give up */ 3838 /* cannot get stripe for read-ahead, just give up */
4368 clear_bit(BIO_UPTODATE, &bi->bi_flags); 3839 clear_bit(BIO_UPTODATE, &bi->bi_flags);
4369 finish_wait(&conf->wait_for_overlap, &w); 3840 finish_wait(&conf->wait_for_overlap, &w);
4370 break; 3841 break;
4371 } 3842 }
3843
4372 } 3844 }
3845 if (!plugged)
3846 md_wakeup_thread(mddev->thread);
4373 3847
4374 remaining = raid5_dec_bi_active_stripes(bi); 3848 spin_lock_irq(&conf->device_lock);
3849 remaining = raid5_dec_bi_phys_segments(bi);
3850 spin_unlock_irq(&conf->device_lock);
4375 if (remaining == 0) { 3851 if (remaining == 0) {
4376 3852
4377 if ( rw == WRITE ) 3853 if ( rw == WRITE )
4378 md_write_end(mddev); 3854 md_write_end(mddev);
4379 3855
4380 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
4381 bi, 0);
4382 bio_endio(bi, 0); 3856 bio_endio(bi, 0);
4383 } 3857 }
3858
3859 return 0;
4384} 3860}
4385 3861
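[Editor's note] In both versions of make_request() above, the bio is walked one stripe at a time: the start sector is rounded down to a STRIPE_SECTORS boundary and the range is stepped through in STRIPE_SECTORS increments, one stripe_head per step. A tiny runnable sketch of that loop (printf stands in for get_active_stripe/add_stripe_bio):

#include <stdint.h>
#include <stdio.h>

#define STRIPE_SECTORS 8u   /* one 4K page per device, in 512-byte sectors */

static void walk_stripes(uint64_t bi_sector, uint32_t bi_size_bytes)
{
        uint64_t logical = bi_sector & ~((uint64_t)STRIPE_SECTORS - 1);
        uint64_t last = bi_sector + (bi_size_bytes >> 9);

        for (; logical < last; logical += STRIPE_SECTORS)
                printf("stripe at sector %llu\n", (unsigned long long)logical);
}

int main(void)
{
        walk_stripes(1003, 16384);      /* 32 sectors starting mid-stripe: 5 stripes */
        return 0;
}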
4386static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 3862static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks);
4387 3863
4388static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 3864static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped)
4389{ 3865{
4390 /* reshaping is quite different to recovery/resync so it is 3866 /* reshaping is quite different to recovery/resync so it is
4391 * handled quite separately ... here. 3867 * handled quite separately ... here.
@@ -4396,7 +3872,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4396 * As the reads complete, handle_stripe will copy the data 3872 * As the reads complete, handle_stripe will copy the data
4397 * into the destination stripe and release that stripe. 3873 * into the destination stripe and release that stripe.
4398 */ 3874 */
4399 struct r5conf *conf = mddev->private; 3875 raid5_conf_t *conf = mddev->private;
4400 struct stripe_head *sh; 3876 struct stripe_head *sh;
4401 sector_t first_sector, last_sector; 3877 sector_t first_sector, last_sector;
4402 int raid_disks = conf->previous_raid_disks; 3878 int raid_disks = conf->previous_raid_disks;
@@ -4411,11 +3887,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4411 3887
4412 if (sector_nr == 0) { 3888 if (sector_nr == 0) {
4413 /* If restarting in the middle, skip the initial sectors */ 3889 /* If restarting in the middle, skip the initial sectors */
4414 if (mddev->reshape_backwards && 3890 if (mddev->delta_disks < 0 &&
4415 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 3891 conf->reshape_progress < raid5_size(mddev, 0, 0)) {
4416 sector_nr = raid5_size(mddev, 0, 0) 3892 sector_nr = raid5_size(mddev, 0, 0)
4417 - conf->reshape_progress; 3893 - conf->reshape_progress;
4418 } else if (!mddev->reshape_backwards && 3894 } else if (mddev->delta_disks >= 0 &&
4419 conf->reshape_progress > 0) 3895 conf->reshape_progress > 0)
4420 sector_nr = conf->reshape_progress; 3896 sector_nr = conf->reshape_progress;
4421 sector_div(sector_nr, new_data_disks); 3897 sector_div(sector_nr, new_data_disks);
@@ -4436,11 +3912,13 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4436 else 3912 else
4437 reshape_sectors = mddev->chunk_sectors; 3913 reshape_sectors = mddev->chunk_sectors;
4438 3914
4439 /* We update the metadata at least every 10 seconds, or when 3915 /* we update the metadata when there is more than 3Meg
4440 * the data about to be copied would over-write the source of 3916 * in the block range (that is rather arbitrary, should
4441 * the data at the front of the range. i.e. one new_stripe 3917 * probably be time based) or when the data about to be
4442 * along from reshape_progress new_maps to after where 3918 * copied would over-write the source of the data at
4443 * reshape_safe old_maps to 3919 * the front of the range.
3920 * i.e. one new_stripe along from reshape_progress new_maps
3921 * to after where reshape_safe old_maps to
4444 */ 3922 */
4445 writepos = conf->reshape_progress; 3923 writepos = conf->reshape_progress;
4446 sector_div(writepos, new_data_disks); 3924 sector_div(writepos, new_data_disks);
@@ -4448,7 +3926,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4448 sector_div(readpos, data_disks); 3926 sector_div(readpos, data_disks);
4449 safepos = conf->reshape_safe; 3927 safepos = conf->reshape_safe;
4450 sector_div(safepos, data_disks); 3928 sector_div(safepos, data_disks);
4451 if (mddev->reshape_backwards) { 3929 if (mddev->delta_disks < 0) {
4452 writepos -= min_t(sector_t, reshape_sectors, writepos); 3930 writepos -= min_t(sector_t, reshape_sectors, writepos);
4453 readpos += reshape_sectors; 3931 readpos += reshape_sectors;
4454 safepos += reshape_sectors; 3932 safepos += reshape_sectors;
@@ -4458,29 +3936,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4458 safepos -= min_t(sector_t, reshape_sectors, safepos); 3936 safepos -= min_t(sector_t, reshape_sectors, safepos);
4459 } 3937 }
4460 3938
4461 /* Having calculated the 'writepos' possibly use it
4462 * to set 'stripe_addr' which is where we will write to.
4463 */
4464 if (mddev->reshape_backwards) {
4465 BUG_ON(conf->reshape_progress == 0);
4466 stripe_addr = writepos;
4467 BUG_ON((mddev->dev_sectors &
4468 ~((sector_t)reshape_sectors - 1))
4469 - reshape_sectors - stripe_addr
4470 != sector_nr);
4471 } else {
4472 BUG_ON(writepos != sector_nr + reshape_sectors);
4473 stripe_addr = sector_nr;
4474 }
4475
4476 /* 'writepos' is the most advanced device address we might write. 3939 /* 'writepos' is the most advanced device address we might write.
4477 * 'readpos' is the least advanced device address we might read. 3940 * 'readpos' is the least advanced device address we might read.
4478 * 'safepos' is the least address recorded in the metadata as having 3941 * 'safepos' is the least address recorded in the metadata as having
4479 * been reshaped. 3942 * been reshaped.
4480 * If there is a min_offset_diff, these are adjusted either by 3943 * If 'readpos' is behind 'writepos', then there is no way that we can
4481 * increasing the safepos/readpos if diff is negative, or
4482 * increasing writepos if diff is positive.
4483 * If 'readpos' is then behind 'writepos', there is no way that we can
4484 * ensure safety in the face of a crash - that must be done by userspace 3944 * ensure safety in the face of a crash - that must be done by userspace
4485 * making a backup of the data. So in that case there is no particular 3945 * making a backup of the data. So in that case there is no particular
4486 * rush to update metadata. 3946 * rush to update metadata.
@@ -4493,13 +3953,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4493 * Maybe that number should be configurable, but I'm not sure it is 3953 * Maybe that number should be configurable, but I'm not sure it is
4494 * worth it.... maybe it could be a multiple of safemode_delay??? 3954 * worth it.... maybe it could be a multiple of safemode_delay???
4495 */ 3955 */
4496 if (conf->min_offset_diff < 0) { 3956 if ((mddev->delta_disks < 0
4497 safepos += -conf->min_offset_diff;
4498 readpos += -conf->min_offset_diff;
4499 } else
4500 writepos += conf->min_offset_diff;
4501
4502 if ((mddev->reshape_backwards
4503 ? (safepos > writepos && readpos < writepos) 3957 ? (safepos > writepos && readpos < writepos)
4504 : (safepos < writepos && readpos > writepos)) || 3958 : (safepos < writepos && readpos > writepos)) ||
4505 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 3959 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
@@ -4520,6 +3974,17 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4520 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 3974 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4521 } 3975 }
4522 3976
3977 if (mddev->delta_disks < 0) {
3978 BUG_ON(conf->reshape_progress == 0);
3979 stripe_addr = writepos;
3980 BUG_ON((mddev->dev_sectors &
3981 ~((sector_t)reshape_sectors - 1))
3982 - reshape_sectors - stripe_addr
3983 != sector_nr);
3984 } else {
3985 BUG_ON(writepos != sector_nr + reshape_sectors);
3986 stripe_addr = sector_nr;
3987 }
4523 INIT_LIST_HEAD(&stripes); 3988 INIT_LIST_HEAD(&stripes);
4524 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 3989 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
4525 int j; 3990 int j;
@@ -4553,7 +4018,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4553 list_add(&sh->lru, &stripes); 4018 list_add(&sh->lru, &stripes);
4554 } 4019 }
4555 spin_lock_irq(&conf->device_lock); 4020 spin_lock_irq(&conf->device_lock);
4556 if (mddev->reshape_backwards) 4021 if (mddev->delta_disks < 0)
4557 conf->reshape_progress -= reshape_sectors * new_data_disks; 4022 conf->reshape_progress -= reshape_sectors * new_data_disks;
4558 else 4023 else
4559 conf->reshape_progress += reshape_sectors * new_data_disks; 4024 conf->reshape_progress += reshape_sectors * new_data_disks;
@@ -4614,9 +4079,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4614} 4079}
4615 4080
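[Editor's note] The checkpoint test earlier in reshape_request() converts array positions to per-device addresses and updates the metadata either periodically or when the region about to be written would overlap data the metadata still points at. A rough stand-alone sketch of that decision under simplified assumptions (the time-based trigger and min_offset_diff adjustment are omitted; field names are illustrative):

#include <stdbool.h>
#include <stdint.h>

struct reshape_state {
        uint64_t progress;          /* array sectors already reshaped */
        uint64_t safe;              /* array sectors recorded as done in metadata */
        unsigned int old_data_disks;
        unsigned int new_data_disks;
        bool backwards;             /* reshape running from the end of the array */
};

static bool need_metadata_update(const struct reshape_state *rs,
                                 uint64_t reshape_sectors)
{
        uint64_t writepos = rs->progress / rs->new_data_disks;
        uint64_t readpos  = rs->progress / rs->old_data_disks;
        uint64_t safepos  = rs->safe / rs->old_data_disks;

        if (rs->backwards) {
                writepos -= (reshape_sectors < writepos) ? reshape_sectors : writepos;
                readpos  += reshape_sectors;
                safepos  += reshape_sectors;
        } else {
                writepos += reshape_sectors;
                readpos  -= (reshape_sectors < readpos) ? reshape_sectors : readpos;
                safepos  -= (reshape_sectors < safepos) ? reshape_sectors : safepos;
        }

        /* about to overwrite data the on-disk metadata still relies on? */
        return rs->backwards ? (safepos > writepos && readpos < writepos)
                             : (safepos < writepos && readpos > writepos);
}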
4616/* FIXME go_faster isn't used */ 4081/* FIXME go_faster isn't used */
4617static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) 4082static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
4618{ 4083{
4619 struct r5conf *conf = mddev->private; 4084 raid5_conf_t *conf = mddev->private;
4620 struct stripe_head *sh; 4085 struct stripe_head *sh;
4621 sector_t max_sector = mddev->dev_sectors; 4086 sector_t max_sector = mddev->dev_sectors;
4622 sector_t sync_blocks; 4087 sector_t sync_blocks;
@@ -4672,6 +4137,7 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
4672 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 4137 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
4673 } 4138 }
4674 4139
4140
4675 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 4141 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
4676 4142
4677 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 4143 sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
@@ -4700,7 +4166,7 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
4700 return STRIPE_SECTORS; 4166 return STRIPE_SECTORS;
4701} 4167}
4702 4168
4703static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) 4169static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4704{ 4170{
4705 /* We may not be able to submit a whole bio at once as there 4171 /* We may not be able to submit a whole bio at once as there
4706 * may not be enough stripe_heads available. 4172 * may not be enough stripe_heads available.
@@ -4729,7 +4195,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4729 sector += STRIPE_SECTORS, 4195 sector += STRIPE_SECTORS,
4730 scnt++) { 4196 scnt++) {
4731 4197
4732 if (scnt < raid5_bi_processed_stripes(raid_bio)) 4198 if (scnt < raid5_bi_hw_segments(raid_bio))
4733 /* already done this stripe */ 4199 /* already done this stripe */
4734 continue; 4200 continue;
4735 4201
@@ -4737,58 +4203,33 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4737 4203
4738 if (!sh) { 4204 if (!sh) {
4739 /* failed to get a stripe - must wait */ 4205 /* failed to get a stripe - must wait */
4740 raid5_set_bi_processed_stripes(raid_bio, scnt); 4206 raid5_set_bi_hw_segments(raid_bio, scnt);
4741 conf->retry_read_aligned = raid_bio; 4207 conf->retry_read_aligned = raid_bio;
4742 return handled; 4208 return handled;
4743 } 4209 }
4744 4210
4211 set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
4745 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4212 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
4746 release_stripe(sh); 4213 release_stripe(sh);
4747 raid5_set_bi_processed_stripes(raid_bio, scnt); 4214 raid5_set_bi_hw_segments(raid_bio, scnt);
4748 conf->retry_read_aligned = raid_bio; 4215 conf->retry_read_aligned = raid_bio;
4749 return handled; 4216 return handled;
4750 } 4217 }
4751 4218
4752 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
4753 handle_stripe(sh); 4219 handle_stripe(sh);
4754 release_stripe(sh); 4220 release_stripe(sh);
4755 handled++; 4221 handled++;
4756 } 4222 }
4757 remaining = raid5_dec_bi_active_stripes(raid_bio); 4223 spin_lock_irq(&conf->device_lock);
4758 if (remaining == 0) { 4224 remaining = raid5_dec_bi_phys_segments(raid_bio);
4759 trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), 4225 spin_unlock_irq(&conf->device_lock);
4760 raid_bio, 0); 4226 if (remaining == 0)
4761 bio_endio(raid_bio, 0); 4227 bio_endio(raid_bio, 0);
4762 }
4763 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4228 if (atomic_dec_and_test(&conf->active_aligned_reads))
4764 wake_up(&conf->wait_for_stripe); 4229 wake_up(&conf->wait_for_stripe);
4765 return handled; 4230 return handled;
4766} 4231}
4767 4232
4768#define MAX_STRIPE_BATCH 8
4769static int handle_active_stripes(struct r5conf *conf)
4770{
4771 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
4772 int i, batch_size = 0;
4773
4774 while (batch_size < MAX_STRIPE_BATCH &&
4775 (sh = __get_priority_stripe(conf)) != NULL)
4776 batch[batch_size++] = sh;
4777
4778 if (batch_size == 0)
4779 return batch_size;
4780 spin_unlock_irq(&conf->device_lock);
4781
4782 for (i = 0; i < batch_size; i++)
4783 handle_stripe(batch[i]);
4784
4785 cond_resched();
4786
4787 spin_lock_irq(&conf->device_lock);
4788 for (i = 0; i < batch_size; i++)
4789 __release_stripe(conf, batch[i]);
4790 return batch_size;
4791}
4792 4233
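[Editor's note] handle_active_stripes() above (added on the left-hand side) uses a classic batching pattern: pull up to MAX_STRIPE_BATCH entries off the queue while holding the lock, do the expensive work with the lock dropped, then re-acquire it to release the batch. A generic pthreads sketch of the same shape, not the kernel locking rules:

#include <pthread.h>

#define BATCH 8

struct item { struct item *next; };

struct queue {
        pthread_mutex_t lock;
        struct item *head;
};

static void process(struct item *it) { (void)it; /* expensive work goes here */ }

static void put_back(struct queue *q, struct item *it)
{
        it->next = q->head;             /* caller holds q->lock */
        q->head = it;
}

static int handle_batch(struct queue *q)
{
        struct item *batch[BATCH];
        int n = 0, i;

        pthread_mutex_lock(&q->lock);
        while (n < BATCH && q->head) {
                batch[n++] = q->head;
                q->head = q->head->next;
        }
        pthread_mutex_unlock(&q->lock);

        for (i = 0; i < n; i++)
                process(batch[i]);      /* work done without the lock held */

        pthread_mutex_lock(&q->lock);
        for (i = 0; i < n; i++)
                put_back(q, batch[i]);
        pthread_mutex_unlock(&q->lock);
        return n;
}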
4793/* 4234/*
4794 * This is our raid5 kernel thread. 4235 * This is our raid5 kernel thread.
@@ -4797,10 +4238,10 @@ static int handle_active_stripes(struct r5conf *conf)
4797 * During the scan, completed stripes are saved for us by the interrupt 4238 * During the scan, completed stripes are saved for us by the interrupt
4798 * handler, so that they will not have to wait for our next wakeup. 4239 * handler, so that they will not have to wait for our next wakeup.
4799 */ 4240 */
4800static void raid5d(struct md_thread *thread) 4241static void raid5d(mddev_t *mddev)
4801{ 4242{
4802 struct mddev *mddev = thread->mddev; 4243 struct stripe_head *sh;
4803 struct r5conf *conf = mddev->private; 4244 raid5_conf_t *conf = mddev->private;
4804 int handled; 4245 int handled;
4805 struct blk_plug plug; 4246 struct blk_plug plug;
4806 4247
@@ -4813,9 +4254,8 @@ static void raid5d(struct md_thread *thread)
4813 spin_lock_irq(&conf->device_lock); 4254 spin_lock_irq(&conf->device_lock);
4814 while (1) { 4255 while (1) {
4815 struct bio *bio; 4256 struct bio *bio;
4816 int batch_size;
4817 4257
4818 if ( 4258 if (atomic_read(&mddev->plug_cnt) == 0 &&
4819 !list_empty(&conf->bitmap_list)) { 4259 !list_empty(&conf->bitmap_list)) {
4820 /* Now is a good time to flush some bitmap updates */ 4260 /* Now is a good time to flush some bitmap updates */
4821 conf->seq_flush++; 4261 conf->seq_flush++;
@@ -4825,7 +4265,8 @@ static void raid5d(struct md_thread *thread)
4825 conf->seq_write = conf->seq_flush; 4265 conf->seq_write = conf->seq_flush;
4826 activate_bit_delay(conf); 4266 activate_bit_delay(conf);
4827 } 4267 }
4828 raid5_activate_delayed(conf); 4268 if (atomic_read(&mddev->plug_cnt) == 0)
4269 raid5_activate_delayed(conf);
4829 4270
4830 while ((bio = remove_bio_from_retry(conf))) { 4271 while ((bio = remove_bio_from_retry(conf))) {
4831 int ok; 4272 int ok;
@@ -4837,16 +4278,21 @@ static void raid5d(struct md_thread *thread)
4837 handled++; 4278 handled++;
4838 } 4279 }
4839 4280
4840 batch_size = handle_active_stripes(conf); 4281 sh = __get_priority_stripe(conf);
4841 if (!batch_size) 4282
4283 if (!sh)
4842 break; 4284 break;
4843 handled += batch_size; 4285 spin_unlock_irq(&conf->device_lock);
4286
4287 handled++;
4288 handle_stripe(sh);
4289 release_stripe(sh);
4290 cond_resched();
4844 4291
4845 if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) { 4292 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
4846 spin_unlock_irq(&conf->device_lock);
4847 md_check_recovery(mddev); 4293 md_check_recovery(mddev);
4848 spin_lock_irq(&conf->device_lock); 4294
4849 } 4295 spin_lock_irq(&conf->device_lock);
4850 } 4296 }
4851 pr_debug("%d stripes handled\n", handled); 4297 pr_debug("%d stripes handled\n", handled);
4852 4298
@@ -4859,9 +4305,9 @@ static void raid5d(struct md_thread *thread)
4859} 4305}
4860 4306
4861static ssize_t 4307static ssize_t
4862raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 4308raid5_show_stripe_cache_size(mddev_t *mddev, char *page)
4863{ 4309{
4864 struct r5conf *conf = mddev->private; 4310 raid5_conf_t *conf = mddev->private;
4865 if (conf) 4311 if (conf)
4866 return sprintf(page, "%d\n", conf->max_nr_stripes); 4312 return sprintf(page, "%d\n", conf->max_nr_stripes);
4867 else 4313 else
@@ -4869,9 +4315,9 @@ raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
4869} 4315}
4870 4316
4871int 4317int
4872raid5_set_cache_size(struct mddev *mddev, int size) 4318raid5_set_cache_size(mddev_t *mddev, int size)
4873{ 4319{
4874 struct r5conf *conf = mddev->private; 4320 raid5_conf_t *conf = mddev->private;
4875 int err; 4321 int err;
4876 4322
4877 if (size <= 16 || size > 32768) 4323 if (size <= 16 || size > 32768)
@@ -4895,9 +4341,9 @@ raid5_set_cache_size(struct mddev *mddev, int size)
4895EXPORT_SYMBOL(raid5_set_cache_size); 4341EXPORT_SYMBOL(raid5_set_cache_size);
4896 4342
4897static ssize_t 4343static ssize_t
4898raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 4344raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
4899{ 4345{
4900 struct r5conf *conf = mddev->private; 4346 raid5_conf_t *conf = mddev->private;
4901 unsigned long new; 4347 unsigned long new;
4902 int err; 4348 int err;
4903 4349
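[Editor's note] The stripe_cache_size handlers around this hunk only accept values in the range (16, 32768]. A plain C stand-in for the parse-and-validate step of the store path (hypothetical helper, not the sysfs code itself):

#include <errno.h>
#include <stdlib.h>

static int parse_cache_size(const char *page, int *out)
{
        char *end;
        unsigned long val;

        errno = 0;
        val = strtoul(page, &end, 10);
        if (errno || end == page)
                return -EINVAL;
        if (val <= 16 || val > 32768)
                return -EINVAL;         /* same bounds raid5_set_cache_size enforces */
        *out = (int)val;
        return 0;
}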
@@ -4920,9 +4366,9 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
4920 raid5_store_stripe_cache_size); 4366 raid5_store_stripe_cache_size);
4921 4367
4922static ssize_t 4368static ssize_t
4923raid5_show_preread_threshold(struct mddev *mddev, char *page) 4369raid5_show_preread_threshold(mddev_t *mddev, char *page)
4924{ 4370{
4925 struct r5conf *conf = mddev->private; 4371 raid5_conf_t *conf = mddev->private;
4926 if (conf) 4372 if (conf)
4927 return sprintf(page, "%d\n", conf->bypass_threshold); 4373 return sprintf(page, "%d\n", conf->bypass_threshold);
4928 else 4374 else
@@ -4930,9 +4376,9 @@ raid5_show_preread_threshold(struct mddev *mddev, char *page)
4930} 4376}
4931 4377
4932static ssize_t 4378static ssize_t
4933raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 4379raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len)
4934{ 4380{
4935 struct r5conf *conf = mddev->private; 4381 raid5_conf_t *conf = mddev->private;
4936 unsigned long new; 4382 unsigned long new;
4937 if (len >= PAGE_SIZE) 4383 if (len >= PAGE_SIZE)
4938 return -EINVAL; 4384 return -EINVAL;
@@ -4954,9 +4400,9 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
4954 raid5_store_preread_threshold); 4400 raid5_store_preread_threshold);
4955 4401
4956static ssize_t 4402static ssize_t
4957stripe_cache_active_show(struct mddev *mddev, char *page) 4403stripe_cache_active_show(mddev_t *mddev, char *page)
4958{ 4404{
4959 struct r5conf *conf = mddev->private; 4405 raid5_conf_t *conf = mddev->private;
4960 if (conf) 4406 if (conf)
4961 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 4407 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
4962 else 4408 else
@@ -4978,9 +4424,9 @@ static struct attribute_group raid5_attrs_group = {
4978}; 4424};
4979 4425
4980static sector_t 4426static sector_t
4981raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 4427raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
4982{ 4428{
4983 struct r5conf *conf = mddev->private; 4429 raid5_conf_t *conf = mddev->private;
4984 4430
4985 if (!sectors) 4431 if (!sectors)
4986 sectors = mddev->dev_sectors; 4432 sectors = mddev->dev_sectors;
@@ -4993,7 +4439,7 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
4993 return sectors * (raid_disks - conf->max_degraded); 4439 return sectors * (raid_disks - conf->max_degraded);
4994} 4440}
4995 4441
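[Editor's note] raid5_size() just above exposes the capacity rule: usable size is the per-device size times the number of data disks, i.e. total disks minus the parity count (max_degraded). A tiny worked example of that multiplication (chunk rounding omitted; the device size is an arbitrary illustration):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t dev_sectors = 3907029168ULL;   /* one ~2TB member device */
        int raid_disks = 6, max_degraded = 2;   /* RAID6: two parity devices */

        uint64_t usable = dev_sectors * (uint64_t)(raid_disks - max_degraded);
        printf("usable sectors: %llu\n", (unsigned long long)usable);
        return 0;
}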
4996static void raid5_free_percpu(struct r5conf *conf) 4442static void raid5_free_percpu(raid5_conf_t *conf)
4997{ 4443{
4998 struct raid5_percpu *percpu; 4444 struct raid5_percpu *percpu;
4999 unsigned long cpu; 4445 unsigned long cpu;
@@ -5015,7 +4461,7 @@ static void raid5_free_percpu(struct r5conf *conf)
5015 free_percpu(conf->percpu); 4461 free_percpu(conf->percpu);
5016} 4462}
5017 4463
5018static void free_conf(struct r5conf *conf) 4464static void free_conf(raid5_conf_t *conf)
5019{ 4465{
5020 shrink_stripes(conf); 4466 shrink_stripes(conf);
5021 raid5_free_percpu(conf); 4467 raid5_free_percpu(conf);
@@ -5028,7 +4474,7 @@ static void free_conf(struct r5conf *conf)
5028static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, 4474static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
5029 void *hcpu) 4475 void *hcpu)
5030{ 4476{
5031 struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify); 4477 raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify);
5032 long cpu = (long)hcpu; 4478 long cpu = (long)hcpu;
5033 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 4479 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
5034 4480
@@ -5063,7 +4509,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
5063} 4509}
5064#endif 4510#endif
5065 4511
5066static int raid5_alloc_percpu(struct r5conf *conf) 4512static int raid5_alloc_percpu(raid5_conf_t *conf)
5067{ 4513{
5068 unsigned long cpu; 4514 unsigned long cpu;
5069 struct page *spare_page; 4515 struct page *spare_page;
@@ -5105,13 +4551,12 @@ static int raid5_alloc_percpu(struct r5conf *conf)
5105 return err; 4551 return err;
5106} 4552}
5107 4553
5108static struct r5conf *setup_conf(struct mddev *mddev) 4554static raid5_conf_t *setup_conf(mddev_t *mddev)
5109{ 4555{
5110 struct r5conf *conf; 4556 raid5_conf_t *conf;
5111 int raid_disk, memory, max_disks; 4557 int raid_disk, memory, max_disks;
5112 struct md_rdev *rdev; 4558 mdk_rdev_t *rdev;
5113 struct disk_info *disk; 4559 struct disk_info *disk;
5114 char pers_name[6];
5115 4560
5116 if (mddev->new_level != 5 4561 if (mddev->new_level != 5
5117 && mddev->new_level != 4 4562 && mddev->new_level != 4
@@ -5142,7 +4587,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5142 return ERR_PTR(-EINVAL); 4587 return ERR_PTR(-EINVAL);
5143 } 4588 }
5144 4589
5145 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 4590 conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL);
5146 if (conf == NULL) 4591 if (conf == NULL)
5147 goto abort; 4592 goto abort;
5148 spin_lock_init(&conf->device_lock); 4593 spin_lock_init(&conf->device_lock);
@@ -5157,7 +4602,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5157 atomic_set(&conf->preread_active_stripes, 0); 4602 atomic_set(&conf->preread_active_stripes, 0);
5158 atomic_set(&conf->active_aligned_reads, 0); 4603 atomic_set(&conf->active_aligned_reads, 0);
5159 conf->bypass_threshold = BYPASS_THRESHOLD; 4604 conf->bypass_threshold = BYPASS_THRESHOLD;
5160 conf->recovery_disabled = mddev->recovery_disabled - 1;
5161 4605
5162 conf->raid_disks = mddev->raid_disks; 4606 conf->raid_disks = mddev->raid_disks;
5163 if (mddev->reshape_position == MaxSector) 4607 if (mddev->reshape_position == MaxSector)
@@ -5183,22 +4627,14 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5183 4627
5184 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 4628 pr_debug("raid456: run(%s) called.\n", mdname(mddev));
5185 4629
5186 rdev_for_each(rdev, mddev) { 4630 list_for_each_entry(rdev, &mddev->disks, same_set) {
5187 raid_disk = rdev->raid_disk; 4631 raid_disk = rdev->raid_disk;
5188 if (raid_disk >= max_disks 4632 if (raid_disk >= max_disks
5189 || raid_disk < 0) 4633 || raid_disk < 0)
5190 continue; 4634 continue;
5191 disk = conf->disks + raid_disk; 4635 disk = conf->disks + raid_disk;
5192 4636
5193 if (test_bit(Replacement, &rdev->flags)) { 4637 disk->rdev = rdev;
5194 if (disk->replacement)
5195 goto abort;
5196 disk->replacement = rdev;
5197 } else {
5198 if (disk->rdev)
5199 goto abort;
5200 disk->rdev = rdev;
5201 }
5202 4638
5203 if (test_bit(In_sync, &rdev->flags)) { 4639 if (test_bit(In_sync, &rdev->flags)) {
5204 char b[BDEVNAME_SIZE]; 4640 char b[BDEVNAME_SIZE];
@@ -5235,8 +4671,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5235 printk(KERN_INFO "md/raid:%s: allocated %dkB\n", 4671 printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
5236 mdname(mddev), memory); 4672 mdname(mddev), memory);
5237 4673
5238 sprintf(pers_name, "raid%d", mddev->new_level); 4674 conf->thread = md_register_thread(raid5d, mddev, NULL);
5239 conf->thread = md_register_thread(raid5d, mddev, pers_name);
5240 if (!conf->thread) { 4675 if (!conf->thread) {
5241 printk(KERN_ERR 4676 printk(KERN_ERR
5242 "md/raid:%s: couldn't allocate thread.\n", 4677 "md/raid:%s: couldn't allocate thread.\n",
@@ -5281,50 +4716,23 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded
5281 return 0; 4716 return 0;
5282} 4717}
5283 4718
5284static int run(struct mddev *mddev) 4719static int run(mddev_t *mddev)
5285{ 4720{
5286 struct r5conf *conf; 4721 raid5_conf_t *conf;
5287 int working_disks = 0; 4722 int working_disks = 0;
5288 int dirty_parity_disks = 0; 4723 int dirty_parity_disks = 0;
5289 struct md_rdev *rdev; 4724 mdk_rdev_t *rdev;
5290 sector_t reshape_offset = 0; 4725 sector_t reshape_offset = 0;
5291 int i;
5292 long long min_offset_diff = 0;
5293 int first = 1;
5294 4726
5295 if (mddev->recovery_cp != MaxSector) 4727 if (mddev->recovery_cp != MaxSector)
5296 printk(KERN_NOTICE "md/raid:%s: not clean" 4728 printk(KERN_NOTICE "md/raid:%s: not clean"
5297 " -- starting background reconstruction\n", 4729 " -- starting background reconstruction\n",
5298 mdname(mddev)); 4730 mdname(mddev));
5299
5300 rdev_for_each(rdev, mddev) {
5301 long long diff;
5302 if (rdev->raid_disk < 0)
5303 continue;
5304 diff = (rdev->new_data_offset - rdev->data_offset);
5305 if (first) {
5306 min_offset_diff = diff;
5307 first = 0;
5308 } else if (mddev->reshape_backwards &&
5309 diff < min_offset_diff)
5310 min_offset_diff = diff;
5311 else if (!mddev->reshape_backwards &&
5312 diff > min_offset_diff)
5313 min_offset_diff = diff;
5314 }
5315
5316 if (mddev->reshape_position != MaxSector) { 4731 if (mddev->reshape_position != MaxSector) {
5317 /* Check that we can continue the reshape. 4732 /* Check that we can continue the reshape.
5318 * Difficulties arise if the stripe we would write to 4733 * Currently only disks can change, it must
5319 * next is at or after the stripe we would read from next. 4734 * increase, and we must be past the point where
5320 * For a reshape that changes the number of devices, this 4735 * a stripe over-writes itself
5321 * is only possible for a very short time, and mdadm makes
5322 * sure that time appears to have past before assembling
5323 * the array. So we fail if that time hasn't passed.
5324 * For a reshape that keeps the number of devices the same
5325 * mdadm must be monitoring the reshape can keeping the
5326 * critical areas read-only and backed up. It will start
5327 * the array in read-only mode, so we check for that.
5328 */ 4736 */
5329 sector_t here_new, here_old; 4737 sector_t here_new, here_old;
5330 int old_disks; 4738 int old_disks;
@@ -5356,34 +4764,26 @@ static int run(struct mddev *mddev)
5356 /* here_old is the first stripe that we might need to read 4764 /* here_old is the first stripe that we might need to read
5357 * from */ 4765 * from */
5358 if (mddev->delta_disks == 0) { 4766 if (mddev->delta_disks == 0) {
5359 if ((here_new * mddev->new_chunk_sectors !=
5360 here_old * mddev->chunk_sectors)) {
5361 printk(KERN_ERR "md/raid:%s: reshape position is"
5362 " confused - aborting\n", mdname(mddev));
5363 return -EINVAL;
5364 }
5365 /* We cannot be sure it is safe to start an in-place 4767 /* We cannot be sure it is safe to start an in-place
5366 * reshape. It is only safe if user-space is monitoring 4768 * reshape. It is only safe if user-space is monitoring
5367 * and taking constant backups. 4769 * and taking constant backups.
5368 * mdadm always starts a situation like this in 4770 * mdadm always starts a situation like this in
5369 * readonly mode so it can take control before 4771 * readonly mode so it can take control before
5370 * allowing any writes. So just check for that. 4772 * allowing any writes. So just check for that.
5371 */ 4773 */
5372 if (abs(min_offset_diff) >= mddev->chunk_sectors && 4774 if ((here_new * mddev->new_chunk_sectors !=
5373 abs(min_offset_diff) >= mddev->new_chunk_sectors) 4775 here_old * mddev->chunk_sectors) ||
5374 /* not really in-place - so OK */; 4776 mddev->ro == 0) {
5375 else if (mddev->ro == 0) { 4777 printk(KERN_ERR "md/raid:%s: in-place reshape must be started"
5376 printk(KERN_ERR "md/raid:%s: in-place reshape " 4778 " in read-only mode - aborting\n",
5377 "must be started in read-only mode "
5378 "- aborting\n",
5379 mdname(mddev)); 4779 mdname(mddev));
5380 return -EINVAL; 4780 return -EINVAL;
5381 } 4781 }
5382 } else if (mddev->reshape_backwards 4782 } else if (mddev->delta_disks < 0
5383 ? (here_new * mddev->new_chunk_sectors + min_offset_diff <= 4783 ? (here_new * mddev->new_chunk_sectors <=
5384 here_old * mddev->chunk_sectors) 4784 here_old * mddev->chunk_sectors)
5385 : (here_new * mddev->new_chunk_sectors >= 4785 : (here_new * mddev->new_chunk_sectors >=
5386 here_old * mddev->chunk_sectors + (-min_offset_diff))) { 4786 here_old * mddev->chunk_sectors)) {
5387 /* Reading from the same stripe as writing to - bad */ 4787 /* Reading from the same stripe as writing to - bad */
5388 printk(KERN_ERR "md/raid:%s: reshape_position too early for " 4788 printk(KERN_ERR "md/raid:%s: reshape_position too early for "
5389 "auto-recovery - aborting.\n", 4789 "auto-recovery - aborting.\n",
@@ -5408,30 +4808,16 @@ static int run(struct mddev *mddev)
5408 if (IS_ERR(conf)) 4808 if (IS_ERR(conf))
5409 return PTR_ERR(conf); 4809 return PTR_ERR(conf);
5410 4810
5411 conf->min_offset_diff = min_offset_diff;
5412 mddev->thread = conf->thread; 4811 mddev->thread = conf->thread;
5413 conf->thread = NULL; 4812 conf->thread = NULL;
5414 mddev->private = conf; 4813 mddev->private = conf;
5415 4814
5416 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 4815 /*
5417 i++) { 4816 * 0 for a fully functional array, 1 or 2 for a degraded array.
5418 rdev = conf->disks[i].rdev; 4817 */
5419 if (!rdev && conf->disks[i].replacement) { 4818 list_for_each_entry(rdev, &mddev->disks, same_set) {
5420 /* The replacement is all we have yet */ 4819 if (rdev->raid_disk < 0)
5421 rdev = conf->disks[i].replacement;
5422 conf->disks[i].replacement = NULL;
5423 clear_bit(Replacement, &rdev->flags);
5424 conf->disks[i].rdev = rdev;
5425 }
5426 if (!rdev)
5427 continue; 4820 continue;
5428 if (conf->disks[i].replacement &&
5429 conf->reshape_progress != MaxSector) {
5430 /* replacements and reshape simply do not mix. */
5431 printk(KERN_ERR "md: cannot handle concurrent "
5432 "replacement and reshape.\n");
5433 goto abort;
5434 }
5435 if (test_bit(In_sync, &rdev->flags)) { 4821 if (test_bit(In_sync, &rdev->flags)) {
5436 working_disks++; 4822 working_disks++;
5437 continue; 4823 continue;
@@ -5465,10 +4851,8 @@ static int run(struct mddev *mddev)
5465 dirty_parity_disks++; 4851 dirty_parity_disks++;
5466 } 4852 }
5467 4853
5468 /* 4854 mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks)
5469 * 0 for a fully functional array, 1 or 2 for a degraded array. 4855 - working_disks);
5470 */
5471 mddev->degraded = calc_degraded(conf);
5472 4856
5473 if (has_failed(conf)) { 4857 if (has_failed(conf)) {
5474 printk(KERN_ERR "md/raid:%s: not enough operational devices" 4858 printk(KERN_ERR "md/raid:%s: not enough operational devices"
@@ -5534,7 +4918,6 @@ static int run(struct mddev *mddev)
5534 4918
5535 if (mddev->queue) { 4919 if (mddev->queue) {
5536 int chunk_size; 4920 int chunk_size;
5537 bool discard_supported = true;
5538 /* read-ahead size must cover two whole stripes, which 4921 /* read-ahead size must cover two whole stripes, which
5539 * is 2 * (datadisks) * chunksize where 'n' is the 4922 * is 2 * (datadisks) * chunksize where 'n' is the
5540 * number of raid devices 4923 * number of raid devices
@@ -5554,67 +4937,27 @@ static int run(struct mddev *mddev)
5554 blk_queue_io_min(mddev->queue, chunk_size); 4937 blk_queue_io_min(mddev->queue, chunk_size);
5555 blk_queue_io_opt(mddev->queue, chunk_size * 4938 blk_queue_io_opt(mddev->queue, chunk_size *
5556 (conf->raid_disks - conf->max_degraded)); 4939 (conf->raid_disks - conf->max_degraded));
5557 /*
5558 * We can only discard a whole stripe. It doesn't make sense to
5559 * discard data disk but write parity disk
5560 */
5561 stripe = stripe * PAGE_SIZE;
5562 /* Round up to power of 2, as discard handling
5563 * currently assumes that */
5564 while ((stripe-1) & stripe)
5565 stripe = (stripe | (stripe-1)) + 1;
5566 mddev->queue->limits.discard_alignment = stripe;
5567 mddev->queue->limits.discard_granularity = stripe;
5568 /*
5569 * unaligned part of discard request will be ignored, so can't
5570 * guarantee discard_zeroes_data
5571 */
5572 mddev->queue->limits.discard_zeroes_data = 0;
5573 4940
5574 rdev_for_each(rdev, mddev) { 4941 list_for_each_entry(rdev, &mddev->disks, same_set)
5575 disk_stack_limits(mddev->gendisk, rdev->bdev, 4942 disk_stack_limits(mddev->gendisk, rdev->bdev,
5576 rdev->data_offset << 9); 4943 rdev->data_offset << 9);
5577 disk_stack_limits(mddev->gendisk, rdev->bdev,
5578 rdev->new_data_offset << 9);
5579 /*
5580 * discard_zeroes_data is required, otherwise data
5581 * could be lost. Consider a scenario: discard a stripe
5582 * (the stripe could be inconsistent if
5583 * discard_zeroes_data is 0); write one disk of the
5584 * stripe (the stripe could be inconsistent again
5585 * depending on which disks are used to calculate
5586 * parity); the disk is broken; The stripe data of this
5587 * disk is lost.
5588 */
5589 if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
5590 !bdev_get_queue(rdev->bdev)->
5591 limits.discard_zeroes_data)
5592 discard_supported = false;
5593 }
5594
5595 if (discard_supported &&
5596 mddev->queue->limits.max_discard_sectors >= stripe &&
5597 mddev->queue->limits.discard_granularity >= stripe)
5598 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
5599 mddev->queue);
5600 else
5601 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
5602 mddev->queue);
5603 } 4944 }
5604 4945
5605 return 0; 4946 return 0;
5606abort: 4947abort:
5607 md_unregister_thread(&mddev->thread); 4948 md_unregister_thread(&mddev->thread);
5608 print_raid5_conf(conf); 4949 if (conf) {
5609 free_conf(conf); 4950 print_raid5_conf(conf);
4951 free_conf(conf);
4952 }
5610 mddev->private = NULL; 4953 mddev->private = NULL;
5611 printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); 4954 printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev));
5612 return -EIO; 4955 return -EIO;
5613} 4956}
5614 4957
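[Editor's note] The discard setup removed above rounds the stripe size up to a power of two with "while ((stripe-1) & stripe) stripe = (stripe | (stripe-1)) + 1;". ORing x with x-1 fills every bit below the highest set bit, so adding 1 carries up to the next power of two. A self-contained sketch of the same trick:

#include <assert.h>
#include <stdint.h>

static uint64_t roundup_pow2(uint64_t x)
{
        while ((x - 1) & x)             /* more than one bit set? */
                x = (x | (x - 1)) + 1;  /* fill the low bits, then carry up */
        return x;
}

int main(void)
{
        assert(roundup_pow2(1) == 1);
        assert(roundup_pow2(3) == 4);
        assert(roundup_pow2(4096) == 4096);
        assert(roundup_pow2(192 * 4096) == 1024 * 1024);
        return 0;
}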
5615static int stop(struct mddev *mddev) 4958static int stop(mddev_t *mddev)
5616{ 4959{
5617 struct r5conf *conf = mddev->private; 4960 raid5_conf_t *conf = mddev->private;
5618 4961
5619 md_unregister_thread(&mddev->thread); 4962 md_unregister_thread(&mddev->thread);
5620 if (mddev->queue) 4963 if (mddev->queue)
@@ -5625,9 +4968,44 @@ static int stop(struct mddev *mddev)
5625 return 0; 4968 return 0;
5626} 4969}
5627 4970
5628static void status(struct seq_file *seq, struct mddev *mddev) 4971#ifdef DEBUG
4972static void print_sh(struct seq_file *seq, struct stripe_head *sh)
5629{ 4973{
5630 struct r5conf *conf = mddev->private; 4974 int i;
4975
4976 seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n",
4977 (unsigned long long)sh->sector, sh->pd_idx, sh->state);
4978 seq_printf(seq, "sh %llu, count %d.\n",
4979 (unsigned long long)sh->sector, atomic_read(&sh->count));
4980 seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector);
4981 for (i = 0; i < sh->disks; i++) {
4982 seq_printf(seq, "(cache%d: %p %ld) ",
4983 i, sh->dev[i].page, sh->dev[i].flags);
4984 }
4985 seq_printf(seq, "\n");
4986}
4987
4988static void printall(struct seq_file *seq, raid5_conf_t *conf)
4989{
4990 struct stripe_head *sh;
4991 struct hlist_node *hn;
4992 int i;
4993
4994 spin_lock_irq(&conf->device_lock);
4995 for (i = 0; i < NR_HASH; i++) {
4996 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
4997 if (sh->raid_conf != conf)
4998 continue;
4999 print_sh(seq, sh);
5000 }
5001 }
5002 spin_unlock_irq(&conf->device_lock);
5003}
5004#endif
5005
5006static void status(struct seq_file *seq, mddev_t *mddev)
5007{
5008 raid5_conf_t *conf = mddev->private;
5631 int i; 5009 int i;
5632 5010
5633 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 5011 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
@@ -5638,9 +5016,13 @@ static void status(struct seq_file *seq, struct mddev *mddev)
5638 conf->disks[i].rdev && 5016 conf->disks[i].rdev &&
5639 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); 5017 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
5640 seq_printf (seq, "]"); 5018 seq_printf (seq, "]");
5019#ifdef DEBUG
5020 seq_printf (seq, "\n");
5021 printall(seq, conf);
5022#endif
5641} 5023}
5642 5024
5643static void print_raid5_conf (struct r5conf *conf) 5025static void print_raid5_conf (raid5_conf_t *conf)
5644{ 5026{
5645 int i; 5027 int i;
5646 struct disk_info *tmp; 5028 struct disk_info *tmp;
@@ -5664,35 +5046,17 @@ static void print_raid5_conf (struct r5conf *conf)
5664 } 5046 }
5665} 5047}
5666 5048
5667static int raid5_spare_active(struct mddev *mddev) 5049static int raid5_spare_active(mddev_t *mddev)
5668{ 5050{
5669 int i; 5051 int i;
5670 struct r5conf *conf = mddev->private; 5052 raid5_conf_t *conf = mddev->private;
5671 struct disk_info *tmp; 5053 struct disk_info *tmp;
5672 int count = 0; 5054 int count = 0;
5673 unsigned long flags; 5055 unsigned long flags;
5674 5056
5675 for (i = 0; i < conf->raid_disks; i++) { 5057 for (i = 0; i < conf->raid_disks; i++) {
5676 tmp = conf->disks + i; 5058 tmp = conf->disks + i;
5677 if (tmp->replacement 5059 if (tmp->rdev
5678 && tmp->replacement->recovery_offset == MaxSector
5679 && !test_bit(Faulty, &tmp->replacement->flags)
5680 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
5681 /* Replacement has just become active. */
5682 if (!tmp->rdev
5683 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
5684 count++;
5685 if (tmp->rdev) {
5686 /* Replaced device not technically faulty,
5687 * but we need to be sure it gets removed
5688 * and never re-added.
5689 */
5690 set_bit(Faulty, &tmp->rdev->flags);
5691 sysfs_notify_dirent_safe(
5692 tmp->rdev->sysfs_state);
5693 }
5694 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
5695 } else if (tmp->rdev
5696 && tmp->rdev->recovery_offset == MaxSector 5060 && tmp->rdev->recovery_offset == MaxSector
5697 && !test_bit(Faulty, &tmp->rdev->flags) 5061 && !test_bit(Faulty, &tmp->rdev->flags)
5698 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 5062 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
@@ -5701,77 +5065,58 @@ static int raid5_spare_active(struct mddev *mddev)
5701 } 5065 }
5702 } 5066 }
5703 spin_lock_irqsave(&conf->device_lock, flags); 5067 spin_lock_irqsave(&conf->device_lock, flags);
5704 mddev->degraded = calc_degraded(conf); 5068 mddev->degraded -= count;
5705 spin_unlock_irqrestore(&conf->device_lock, flags); 5069 spin_unlock_irqrestore(&conf->device_lock, flags);
5706 print_raid5_conf(conf); 5070 print_raid5_conf(conf);
5707 return count; 5071 return count;
5708} 5072}
5709 5073
5710static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 5074static int raid5_remove_disk(mddev_t *mddev, int number)
5711{ 5075{
5712 struct r5conf *conf = mddev->private; 5076 raid5_conf_t *conf = mddev->private;
5713 int err = 0; 5077 int err = 0;
5714 int number = rdev->raid_disk; 5078 mdk_rdev_t *rdev;
5715 struct md_rdev **rdevp;
5716 struct disk_info *p = conf->disks + number; 5079 struct disk_info *p = conf->disks + number;
5717 5080
5718 print_raid5_conf(conf); 5081 print_raid5_conf(conf);
5719 if (rdev == p->rdev) 5082 rdev = p->rdev;
5720 rdevp = &p->rdev; 5083 if (rdev) {
5721 else if (rdev == p->replacement) 5084 if (number >= conf->raid_disks &&
5722 rdevp = &p->replacement; 5085 conf->reshape_progress == MaxSector)
5723 else 5086 clear_bit(In_sync, &rdev->flags);
5724 return 0;
5725
5726 if (number >= conf->raid_disks &&
5727 conf->reshape_progress == MaxSector)
5728 clear_bit(In_sync, &rdev->flags);
5729 5087
5730 if (test_bit(In_sync, &rdev->flags) || 5088 if (test_bit(In_sync, &rdev->flags) ||
5731 atomic_read(&rdev->nr_pending)) { 5089 atomic_read(&rdev->nr_pending)) {
5732 err = -EBUSY; 5090 err = -EBUSY;
5733 goto abort; 5091 goto abort;
5734 } 5092 }
5735 /* Only remove non-faulty devices if recovery 5093 /* Only remove non-faulty devices if recovery
5736 * isn't possible. 5094 * isn't possible.
5737 */
5738 if (!test_bit(Faulty, &rdev->flags) &&
5739 mddev->recovery_disabled != conf->recovery_disabled &&
5740 !has_failed(conf) &&
5741 (!p->replacement || p->replacement == rdev) &&
5742 number < conf->raid_disks) {
5743 err = -EBUSY;
5744 goto abort;
5745 }
5746 *rdevp = NULL;
5747 synchronize_rcu();
5748 if (atomic_read(&rdev->nr_pending)) {
5749 /* lost the race, try later */
5750 err = -EBUSY;
5751 *rdevp = rdev;
5752 } else if (p->replacement) {
5753 /* We must have just cleared 'rdev' */
5754 p->rdev = p->replacement;
5755 clear_bit(Replacement, &p->replacement->flags);
5756 smp_mb(); /* Make sure other CPUs may see both as identical
5757 * but will never see neither - if they are careful
5758 */
5759 p->replacement = NULL;
5760 clear_bit(WantReplacement, &rdev->flags);
5761 } else
5762 /* We might have just removed the Replacement as faulty-
5763 * clear the bit just in case
5764 */
5765 clear_bit(WantReplacement, &rdev->flags);
5096 if (!test_bit(Faulty, &rdev->flags) &&
5097 mddev->recovery_disabled != conf->recovery_disabled &&
5098 !has_failed(conf) &&
5099 number < conf->raid_disks) {
5100 err = -EBUSY;
5101 goto abort;
5102 }
5103 p->rdev = NULL;
5104 synchronize_rcu();
5105 if (atomic_read(&rdev->nr_pending)) {
5106 /* lost the race, try later */
5107 err = -EBUSY;
5108 p->rdev = rdev;
5109 }
5110 }
5766 abort:
5767
5768 print_raid5_conf(conf);
5769 return err;
5770 }
5771
5772 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
5117 static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
5773 {
5774 struct r5conf *conf = mddev->private;
5119 raid5_conf_t *conf = mddev->private;
5775 int err = -EEXIST;
5776 int disk;
5777 struct disk_info *p;
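The raid5_remove_disk() body above (removed side) follows a publish/withdraw pattern: clear the slot pointer, wait out concurrent readers, then re-check nr_pending and put the pointer back if I/O slipped in. A condensed userspace sketch of that shape; the stub wait_for_readers() stands in for the kernel's synchronize_rcu(), and all names here are illustrative rather than the driver's:

    #include <stdio.h>
    #include <stddef.h>

    struct rdev { int nr_pending; };        /* in-flight I/O count (atomic in the kernel) */
    struct slot { struct rdev *rdev; };     /* RCU-protected pointer in the kernel */

    /* Stand-in for synchronize_rcu(): wait until concurrent readers are done. */
    static void wait_for_readers(void) { }

    static int remove_disk(struct slot *p)
    {
            struct rdev *rdev = p->rdev;

            if (!rdev || rdev->nr_pending)
                    return -1;              /* busy, caller retries later */

            p->rdev = NULL;                 /* unpublish the device */
            wait_for_readers();             /* readers that started earlier finish here */
            if (rdev->nr_pending) {
                    p->rdev = rdev;         /* lost the race, restore and retry later */
                    return -1;
            }
            return 0;                       /* now safe to tear rdev down */
    }

    int main(void)
    {
            struct rdev r = { .nr_pending = 0 };
            struct slot s = { .rdev = &r };
            printf("remove_disk -> %d\n", remove_disk(&s));
            return 0;
    }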
@@ -5781,7 +5126,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
5781 if (mddev->recovery_disabled == conf->recovery_disabled)
5782 return -EBUSY;
5783
5784 if (rdev->saved_raid_disk < 0 && has_failed(conf))
5129 if (has_failed(conf))
5785 /* no point adding a device */
5786 return -EINVAL;
5787
@@ -5795,39 +5140,24 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
5795 if (rdev->saved_raid_disk >= 0 &&
5796 rdev->saved_raid_disk >= first &&
5797 conf->disks[rdev->saved_raid_disk].rdev == NULL)
5798 first = rdev->saved_raid_disk;
5799
5800 for (disk = first; disk <= last; disk++) {
5801 p = conf->disks + disk;
5802 if (p->rdev == NULL) {
5143 disk = rdev->saved_raid_disk;
5144 else
5145 disk = first;
5146 for ( ; disk <= last ; disk++)
5147 if ((p=conf->disks + disk)->rdev == NULL) {
5803 clear_bit(In_sync, &rdev->flags);
5804 rdev->raid_disk = disk;
5805 err = 0;
5806 if (rdev->saved_raid_disk != disk)
5807 conf->fullsync = 1;
5808 rcu_assign_pointer(p->rdev, rdev);
5809 goto out;
5810 }
5811 }
5812 for (disk = first; disk <= last; disk++) {
5813 p = conf->disks + disk;
5814 if (test_bit(WantReplacement, &p->rdev->flags) &&
5815 p->replacement == NULL) {
5816 clear_bit(In_sync, &rdev->flags);
5817 set_bit(Replacement, &rdev->flags);
5818 rdev->raid_disk = disk;
5819 err = 0;
5820 conf->fullsync = 1;
5821 rcu_assign_pointer(p->replacement, rdev);
5822 break;
5823 }
5824 }
5825out:
5826 print_raid5_conf(conf);
5827 return err;
5828 }
5829
5830 static int raid5_resize(struct mddev *mddev, sector_t sectors)
5160 static int raid5_resize(mddev_t *mddev, sector_t sectors)
5831 {
5832 /* no resync is happening, and there is enough space
5833 * on all devices, so we can resize.
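Both versions of raid5_add_disk() above prefer the slot a re-added device previously occupied (saved_raid_disk) and otherwise take the first free slot. A tiny standalone model of that choice; the helper is hypothetical, not the driver code:

    #include <stdio.h>

    static int pick_slot(const int slot_used[], int nslots, int saved_slot)
    {
            if (saved_slot >= 0 && saved_slot < nslots && !slot_used[saved_slot])
                    return saved_slot;          /* re-add where it used to live */
            for (int d = 0; d < nslots; d++)
                    if (!slot_used[d])
                            return d;           /* first free slot */
            return -1;                          /* array is full */
    }

    int main(void)
    {
            int used[5] = { 1, 1, 0, 0, 1 };
            printf("saved=3 -> %d\n", pick_slot(used, 5, 3));   /* 3 */
            printf("saved=4 -> %d\n", pick_slot(used, 5, 4));   /* 2: slot 4 is taken */
            return 0;
    }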
@@ -5836,18 +5166,12 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
5836 * any io in the removed space completes, but it hardly seems
5837 * worth it.
5838 */
5839 sector_t newsize;
5840 sectors &= ~((sector_t)mddev->chunk_sectors - 1);
5841 newsize = raid5_size(mddev, sectors, mddev->raid_disks);
5842 if (mddev->external_size &&
5843 mddev->array_sectors > newsize)
5170 md_set_array_sectors(mddev, raid5_size(mddev, sectors,
5171 mddev->raid_disks));
5172 if (mddev->array_sectors >
5173 raid5_size(mddev, sectors, mddev->raid_disks))
5844 return -EINVAL;
5845 if (mddev->bitmap) {
5846 int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0);
5847 if (ret)
5848 return ret;
5849 }
5850 md_set_array_sectors(mddev, newsize);
5851 set_capacity(mddev->gendisk, mddev->array_sectors);
5852 revalidate_disk(mddev->gendisk);
5853 if (sectors > mddev->dev_sectors &&
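raid5_resize() above rounds the requested size down to a whole number of chunks with sectors &= ~((sector_t)mddev->chunk_sectors - 1), which equals plain truncating division because the chunk size is a power of two. A quick standalone check with made-up numbers:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t chunk_sectors = 128;       /* 64 KiB chunks, a power of two */
            uint64_t sectors = 1000003;         /* requested device size in sectors */

            uint64_t masked  = sectors & ~(chunk_sectors - 1);
            uint64_t divided = (sectors / chunk_sectors) * chunk_sectors;

            printf("masked=%llu divided=%llu\n",
                   (unsigned long long)masked, (unsigned long long)divided);
            return 0;
    }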
@@ -5860,7 +5184,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
5860 return 0;
5861 }
5862
5863 static int check_stripe_cache(struct mddev *mddev)
5187 static int check_stripe_cache(mddev_t *mddev)
5864 {
5865 /* Can only proceed if there are plenty of stripe_heads.
5866 * We need a minimum of one full stripe,, and for sensible progress
@@ -5870,7 +5194,7 @@ static int check_stripe_cache(struct mddev *mddev)
5870 * If the chunk size is greater, user-space should request more
5871 * stripe_heads first.
5872 */
5873 struct r5conf *conf = mddev->private;
5197 raid5_conf_t *conf = mddev->private;
5874 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
5875 > conf->max_nr_stripes ||
5876 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
@@ -5884,14 +5208,17 @@ static int check_stripe_cache(struct mddev *mddev)
5884 return 1;
5885 }
5886
5887 static int check_reshape(struct mddev *mddev)
5211 static int check_reshape(mddev_t *mddev)
5888 {
5889 struct r5conf *conf = mddev->private;
5213 raid5_conf_t *conf = mddev->private;
5890
5891 if (mddev->delta_disks == 0 &&
5892 mddev->new_layout == mddev->layout &&
5893 mddev->new_chunk_sectors == mddev->chunk_sectors)
5894 return 0; /* nothing to do */
5219 if (mddev->bitmap)
5220 /* Cannot grow a bitmap yet */
5221 return -EBUSY;
5895 if (has_failed(conf))
5896 return -EINVAL;
5897 if (mddev->delta_disks < 0) {
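check_stripe_cache() above demands room for four full chunks' worth of stripe_heads at both the current and the new chunk size before a reshape may proceed. A standalone rendering of that arithmetic; STRIPE_SIZE is the 4 KiB raid5 stripe page, the other figures are invented for the demo:

    #include <stdio.h>

    #define STRIPE_SIZE 4096

    static int cache_big_enough(int chunk_sectors, int new_chunk_sectors, int max_nr_stripes)
    {
            /* sectors -> bytes (<< 9) -> stripe pages, then demand four chunks of them */
            int need_cur = ((chunk_sectors << 9) / STRIPE_SIZE) * 4;
            int need_new = ((new_chunk_sectors << 9) / STRIPE_SIZE) * 4;
            return need_cur <= max_nr_stripes && need_new <= max_nr_stripes;
    }

    int main(void)
    {
            /* 512 KiB chunks need 512 stripe_heads; a cache of 256 is too small. */
            printf("%d\n", cache_big_enough(1024, 1024, 256));    /* 0 */
            printf("%d\n", cache_big_enough(1024, 1024, 1024));   /* 1 */
            return 0;
    }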
@@ -5910,14 +5237,13 @@ static int check_reshape(struct mddev *mddev)
5910 if (!check_stripe_cache(mddev))
5911 return -ENOSPC;
5912
5913 return resize_stripes(conf, (conf->previous_raid_disks
5914 + mddev->delta_disks));
5240 return resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
5915 }
5916
5917 static int raid5_start_reshape(struct mddev *mddev)
5243 static int raid5_start_reshape(mddev_t *mddev)
5918 {
5919 struct r5conf *conf = mddev->private;
5245 raid5_conf_t *conf = mddev->private;
5920 struct md_rdev *rdev;
5246 mdk_rdev_t *rdev;
5921 int spares = 0;
5922 unsigned long flags;
5923
@@ -5927,14 +5253,10 @@ static int raid5_start_reshape(struct mddev *mddev)
5927 if (!check_stripe_cache(mddev))
5928 return -ENOSPC;
5929
5930 if (has_failed(conf))
5931 return -EINVAL;
5932
5933 rdev_for_each(rdev, mddev) {
5256 list_for_each_entry(rdev, &mddev->disks, same_set)
5934 if (!test_bit(In_sync, &rdev->flags)
5935 && !test_bit(Faulty, &rdev->flags))
5936 spares++;
5937 }
5938
5939 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
5940 /* Not enough devices even to make a degraded array
@@ -5961,16 +5283,12 @@ static int raid5_start_reshape(struct mddev *mddev)
5961 conf->chunk_sectors = mddev->new_chunk_sectors;
5962 conf->prev_algo = conf->algorithm;
5963 conf->algorithm = mddev->new_layout;
5964 conf->generation++;
5965 /* Code that selects data_offset needs to see the generation update
5966 * if reshape_progress has been set - so a memory barrier needed.
5967 */
5968 smp_mb();
5969 if (mddev->reshape_backwards)
5286 if (mddev->delta_disks < 0)
5970 conf->reshape_progress = raid5_size(mddev, 0, 0);
5971 else
5972 conf->reshape_progress = 0;
5973 conf->reshape_safe = conf->reshape_progress;
5291 conf->generation++;
5974 spin_unlock_irq(&conf->device_lock);
5975
5976 /* Add some new drives, as many as will fit.
@@ -5981,14 +5299,16 @@ static int raid5_start_reshape(struct mddev *mddev)
5981 * such devices during the reshape and confusion could result.
5982 */
5983 if (mddev->delta_disks >= 0) {
5984 rdev_for_each(rdev, mddev)
5302 int added_devices = 0;
5303 list_for_each_entry(rdev, &mddev->disks, same_set)
5985 if (rdev->raid_disk < 0 &&
5986 !test_bit(Faulty, &rdev->flags)) {
5987 if (raid5_add_disk(mddev, rdev) == 0) {
5988 if (rdev->raid_disk
5989 >= conf->previous_raid_disks)
5308 >= conf->previous_raid_disks) {
5990 set_bit(In_sync, &rdev->flags);
5991 else
5310 added_devices++;
5311 } else
5992 rdev->recovery_offset = 0;
5993
5994 if (sysfs_link_rdev(mddev, rdev))
@@ -5998,6 +5318,7 @@ static int raid5_start_reshape(struct mddev *mddev)
5998 && !test_bit(Faulty, &rdev->flags)) {
5999 /* This is a spare that was manually added */
6000 set_bit(In_sync, &rdev->flags);
5321 added_devices++;
6001 }
6002
6003 /* When a reshape changes the number of devices,
@@ -6005,7 +5326,8 @@ static int raid5_start_reshape(struct mddev *mddev)
6005 * pre and post number of devices.
6006 */
6007 spin_lock_irqsave(&conf->device_lock, flags);
6008 mddev->degraded = calc_degraded(conf);
5329 mddev->degraded += (conf->raid_disks - conf->previous_raid_disks)
5330 - added_devices;
6009 spin_unlock_irqrestore(&conf->device_lock, flags);
6010 }
6011 mddev->raid_disks = conf->raid_disks;
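The degraded-count adjustment above (restored side) charges one degraded slot for every newly created slot that did not receive a device when the reshape started. A toy model of that bookkeeping with hypothetical numbers:

    #include <stdio.h>

    int main(void)
    {
            int previous_raid_disks = 4;
            int raid_disks = 6;          /* growing by two slots */
            int added_devices = 1;       /* only one spare was available */
            int degraded = 0;            /* array was clean before the reshape */

            degraded += (raid_disks - previous_raid_disks) - added_devices;
            printf("degraded after starting the reshape: %d\n", degraded);   /* 1 */
            return 0;
    }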
@@ -6022,11 +5344,7 @@ static int raid5_start_reshape(struct mddev *mddev)
6022 mddev->recovery = 0;
6023 spin_lock_irq(&conf->device_lock);
6024 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
6025 rdev_for_each(rdev, mddev)
6026 rdev->new_data_offset = rdev->data_offset;
6027 smp_wmb();
6028 conf->reshape_progress = MaxSector;
6029 mddev->reshape_position = MaxSector;
6030 spin_unlock_irq(&conf->device_lock);
6031 return -EAGAIN;
6032 }
@@ -6039,17 +5357,13 @@ static int raid5_start_reshape(struct mddev *mddev)
6039 /* This is called from the reshape thread and should make any
6040 * changes needed in 'conf'
6041 */
6042 static void end_reshape(struct r5conf *conf)
5360 static void end_reshape(raid5_conf_t *conf)
6043 {
6044
6045 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
6046 struct md_rdev *rdev;
6047
6048 spin_lock_irq(&conf->device_lock);
6049 conf->previous_raid_disks = conf->raid_disks;
6050 rdev_for_each(rdev, conf->mddev)
6051 rdev->data_offset = rdev->new_data_offset;
6052 smp_wmb();
6053 conf->reshape_progress = MaxSector;
6054 spin_unlock_irq(&conf->device_lock);
6055 wake_up(&conf->wait_for_overlap);
@@ -6070,9 +5384,9 @@ static void end_reshape(struct r5conf *conf)
6070 /* This is called from the raid5d thread with mddev_lock held.
6071 * It makes config changes to the device.
6072 */
6073 static void raid5_finish_reshape(struct mddev *mddev)
5387 static void raid5_finish_reshape(mddev_t *mddev)
6074 {
6075 struct r5conf *conf = mddev->private;
5389 raid5_conf_t *conf = mddev->private;
6076
6077 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
6078
@@ -6082,31 +5396,32 @@ static void raid5_finish_reshape(struct mddev *mddev)
6082 revalidate_disk(mddev->gendisk);
6083 } else {
6084 int d;
6085 spin_lock_irq(&conf->device_lock);
6086 mddev->degraded = calc_degraded(conf);
6087 spin_unlock_irq(&conf->device_lock);
5399 mddev->degraded = conf->raid_disks;
5400 for (d = 0; d < conf->raid_disks ; d++)
5401 if (conf->disks[d].rdev &&
5402 test_bit(In_sync,
5403 &conf->disks[d].rdev->flags))
5404 mddev->degraded--;
6088 for (d = conf->raid_disks ;
6089 d < conf->raid_disks - mddev->delta_disks;
6090 d++) {
6091 struct md_rdev *rdev = conf->disks[d].rdev;
6092 if (rdev)
6093 clear_bit(In_sync, &rdev->flags);
6094 rdev = conf->disks[d].replacement;
6095 if (rdev)
6096 clear_bit(In_sync, &rdev->flags);
5408 mdk_rdev_t *rdev = conf->disks[d].rdev;
5409 if (rdev && raid5_remove_disk(mddev, d) == 0) {
5410 sysfs_unlink_rdev(mddev, rdev);
5411 rdev->raid_disk = -1;
5412 }
6097 }
6098 }
6099 mddev->layout = conf->algorithm;
6100 mddev->chunk_sectors = conf->chunk_sectors;
6101 mddev->reshape_position = MaxSector;
6102 mddev->delta_disks = 0;
6103 mddev->reshape_backwards = 0;
6104 }
6105 }
6106
6107 static void raid5_quiesce(struct mddev *mddev, int state)
5422 static void raid5_quiesce(mddev_t *mddev, int state)
6108 {
6109 struct r5conf *conf = mddev->private;
5424 raid5_conf_t *conf = mddev->private;
6110
6111 switch(state) {
6112 case 2: /* resume for a suspend */
@@ -6122,7 +5437,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
6122 wait_event_lock_irq(conf->wait_for_stripe,
6123 atomic_read(&conf->active_stripes) == 0 &&
6124 atomic_read(&conf->active_aligned_reads) == 0,
6125 conf->device_lock);
5440 conf->device_lock, /* nothing */);
6126 conf->quiesce = 1;
6127 spin_unlock_irq(&conf->device_lock);
6128 /* allow reshape to continue */
@@ -6140,20 +5455,20 @@ static void raid5_quiesce(struct mddev *mddev, int state)
6140 }
6141
6142
6143 static void *raid45_takeover_raid0(struct mddev *mddev, int level)
5458 static void *raid45_takeover_raid0(mddev_t *mddev, int level)
6144 {
6145 struct r0conf *raid0_conf = mddev->private;
5460 struct raid0_private_data *raid0_priv = mddev->private;
6146 sector_t sectors;
6147
6148 /* for raid0 takeover only one zone is supported */
6149 if (raid0_conf->nr_strip_zones > 1) {
5464 if (raid0_priv->nr_strip_zones > 1) {
6150 printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
6151 mdname(mddev));
6152 return ERR_PTR(-EINVAL);
6153 }
6154
6155 sectors = raid0_conf->strip_zone[0].zone_end;
6156 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
5470 sectors = raid0_priv->strip_zone[0].zone_end;
5471 sector_div(sectors, raid0_priv->strip_zone[0].nb_dev);
6157 mddev->dev_sectors = sectors;
6158 mddev->new_level = level;
6159 mddev->new_layout = ALGORITHM_PARITY_N;
@@ -6167,7 +5482,7 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level)
6167 }
6168
6169
6170 static void *raid5_takeover_raid1(struct mddev *mddev)
5485 static void *raid5_takeover_raid1(mddev_t *mddev)
6171 {
6172 int chunksect;
6173
@@ -6194,7 +5509,7 @@ static void *raid5_takeover_raid1(struct mddev *mddev)
6194 return setup_conf(mddev);
6195 }
6196
6197 static void *raid5_takeover_raid6(struct mddev *mddev)
5512 static void *raid5_takeover_raid6(mddev_t *mddev)
6198 {
6199 int new_layout;
6200
@@ -6228,14 +5543,14 @@ static void *raid5_takeover_raid6(struct mddev *mddev)
6228 }
6229
6230
6231 static int raid5_check_reshape(struct mddev *mddev)
5546 static int raid5_check_reshape(mddev_t *mddev)
6232 {
6233 /* For a 2-drive array, the layout and chunk size can be changed
6234 * immediately as not restriping is needed.
6235 * For larger arrays we record the new value - after validation
6236 * to be used by a reshape pass.
6237 */
6238 struct r5conf *conf = mddev->private;
5553 raid5_conf_t *conf = mddev->private;
6239 int new_chunk = mddev->new_chunk_sectors;
6240
6241 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
@@ -6268,7 +5583,7 @@ static int raid5_check_reshape(struct mddev *mddev)
6268 return check_reshape(mddev);
6269 }
6270
6271 static int raid6_check_reshape(struct mddev *mddev)
5586 static int raid6_check_reshape(mddev_t *mddev)
6272 {
6273 int new_chunk = mddev->new_chunk_sectors;
6274
@@ -6288,7 +5603,7 @@ static int raid6_check_reshape(struct mddev *mddev)
6288 return check_reshape(mddev);
6289 }
6290
6291 static void *raid5_takeover(struct mddev *mddev)
5606 static void *raid5_takeover(mddev_t *mddev)
6292 {
6293 /* raid5 can take over:
6294 * raid0 - if there is only one strip zone - make it a raid4 layout
@@ -6311,7 +5626,7 @@ static void *raid5_takeover(struct mddev *mddev)
6311 return ERR_PTR(-EINVAL);
6312 }
6313
6314 static void *raid4_takeover(struct mddev *mddev)
5629 static void *raid4_takeover(mddev_t *mddev)
6315 {
6316 /* raid4 can take over:
6317 * raid0 - if there is only one strip zone
@@ -6328,9 +5643,9 @@ static void *raid4_takeover(struct mddev *mddev)
6328 return ERR_PTR(-EINVAL);
6329 }
6330
6331 static struct md_personality raid5_personality;
5646 static struct mdk_personality raid5_personality;
6332
6333 static void *raid6_takeover(struct mddev *mddev)
5648 static void *raid6_takeover(mddev_t *mddev)
6334 {
6335 /* Currently can only take over a raid5. We map the
6336 * personality to an equivalent raid6 personality
@@ -6377,7 +5692,7 @@ static void *raid6_takeover(struct mddev *mddev)
6377 }
6378
6379
6380 static struct md_personality raid6_personality =
5695 static struct mdk_personality raid6_personality =
6381 {
6382 .name = "raid6",
6383 .level = 6,
@@ -6399,7 +5714,7 @@ static struct md_personality raid6_personality =
6399 .quiesce = raid5_quiesce,
6400 .takeover = raid6_takeover,
6401 };
6402 static struct md_personality raid5_personality =
5717 static struct mdk_personality raid5_personality =
6403 {
6404 .name = "raid5",
6405 .level = 5,
@@ -6422,7 +5737,7 @@ static struct md_personality raid5_personality =
6422 .takeover = raid5_takeover,
6423 };
6424
6425 static struct md_personality raid4_personality =
5740 static struct mdk_personality raid4_personality =
6426 {
6427 .name = "raid4",
6428 .level = 4,
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 18b2c4a8a1f..11b9566184b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -27,7 +27,7 @@
27 * The possible state transitions are:
28 *
29 * Empty -> Want - on read or write to get old data for parity calc
30 * Empty -> Dirty - on compute_parity to satisfy write/sync request.
30 * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE)
31 * Empty -> Clean - on compute_block when computing a block for failed drive
32 * Want -> Empty - on failed read
33 * Want -> Clean - on successful completion of read request
@@ -197,7 +197,7 @@ enum reconstruct_states {
197 struct stripe_head {
198 struct hlist_node hash;
199 struct list_head lru; /* inactive_list or handle_list */
200 struct r5conf *raid_conf;
200 struct raid5_private_data *raid_conf;
201 short generation; /* increments with every
202 * reshape */
203 sector_t sector; /* sector of this row */
@@ -210,7 +210,6 @@ struct stripe_head {
210 int disks; /* disks in stripe */
211 enum check_states check_state;
212 enum reconstruct_states reconstruct_state;
213 spinlock_t stripe_lock;
214 /**
215 * struct stripe_operations
216 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -227,11 +226,8 @@ struct stripe_head {
227 #endif
228 } ops;
229 struct r5dev {
230 /* rreq and rvec are used for the replacement device when
231 * writing data to both devices.
232 */
233 struct bio req, rreq;
234 struct bio_vec vec, rvec;
229 struct bio req;
230 struct bio_vec vec;
235 struct page *page;
236 struct bio *toread, *read, *towrite, *written;
237 sector_t sector; /* sector of this page */
@@ -243,13 +239,7 @@ struct stripe_head {
243 * for handle_stripe.
244 */
245 struct stripe_head_state {
246 /* 'syncing' means that we need to read all devices, either
247 * to check/correct parity, or to reconstruct a missing device.
248 * 'replacing' means we are replacing one or more drives and
249 * the source is valid at this point so we don't need to
250 * read all devices, just the replacement targets.
251 */
252 int syncing, expanding, expanded, replacing;
242 int syncing, expanding, expanded;
253 int locked, uptodate, to_read, to_write, failed, written;
254 int to_fill, compute, req_compute, non_overwrite;
255 int failed_num[2];
@@ -258,48 +248,42 @@ struct stripe_head_state {
258 unsigned long ops_request;
259
260 struct bio *return_bi;
261 struct md_rdev *blocked_rdev;
251 mdk_rdev_t *blocked_rdev;
262 int handle_bad_blocks;
263 };
264
265 /* Flags for struct r5dev.flags */
266 enum r5dev_flags {
267 R5_UPTODATE, /* page contains current data */
268 R5_LOCKED, /* IO has been submitted on "req" */
269 R5_DOUBLE_LOCKED,/* Cannot clear R5_LOCKED until 2 writes complete */
270 R5_OVERWRITE, /* towrite covers whole page */
255 /* Flags */
256 #define R5_UPTODATE 0 /* page contains current data */
257 #define R5_LOCKED 1 /* IO has been submitted on "req" */
258 #define R5_OVERWRITE 2 /* towrite covers whole page */
271 /* and some that are internal to handle_stripe */
272 R5_Insync, /* rdev && rdev->in_sync at start */
273 R5_Wantread, /* want to schedule a read */
274 R5_Wantwrite,
275 R5_Overlap, /* There is a pending overlapping request
276 * on this block */
277 R5_ReadNoMerge, /* prevent bio from merging in block-layer */
278 R5_ReadError, /* seen a read error here recently */
279 R5_ReWrite, /* have tried to over-write the readerror */
280
281 R5_Expanded, /* This block now has post-expand data */
282 R5_Wantcompute, /* compute_block in progress treat as
283 * uptodate
284 */
285 R5_Wantfill, /* dev->toread contains a bio that needs
286 * filling
287 */
288 R5_Wantdrain, /* dev->towrite needs to be drained */
289 R5_WantFUA, /* Write should be FUA */
290 R5_SyncIO, /* The IO is sync */
291 R5_WriteError, /* got a write error - need to record it */
292 R5_MadeGood, /* A bad block has been fixed by writing to it */
293 R5_ReadRepl, /* Will/did read from replacement rather than orig */
294 R5_MadeGoodRepl,/* A bad block on the replacement device has been
295 * fixed by writing to it */
296 R5_NeedReplace, /* This device has a replacement which is not
297 * up-to-date at this stripe. */
298 R5_WantReplace, /* We need to update the replacement, we have read
299 * data in, and now is a good time to write it out.
300 */
301 R5_Discard, /* Discard the stripe */
302 };
260 #define R5_Insync 3 /* rdev && rdev->in_sync at start */
261 #define R5_Wantread 4 /* want to schedule a read */
262 #define R5_Wantwrite 5
263 #define R5_Overlap 7 /* There is a pending overlapping request on this block */
264 #define R5_ReadError 8 /* seen a read error here recently */
265 #define R5_ReWrite 9 /* have tried to over-write the readerror */
266
267 #define R5_Expanded 10 /* This block now has post-expand data */
268 #define R5_Wantcompute 11 /* compute_block in progress treat as
269 * uptodate
270 */
271 #define R5_Wantfill 12 /* dev->toread contains a bio that needs
272 * filling
273 */
274 #define R5_Wantdrain 13 /* dev->towrite needs to be drained */
275 #define R5_WantFUA 14 /* Write should be FUA */
276 #define R5_WriteError 15 /* got a write error - need to record it */
277 #define R5_MadeGood 16 /* A bad block has been fixed by writing to it*/
278 /*
279 * Write method
280 */
281 #define RECONSTRUCT_WRITE 1
282 #define READ_MODIFY_WRITE 2
283 /* not a write method, but a compute_parity mode */
284 #define CHECK_PARITY 3
285 /* Additional compute_parity mode -- updates the parity w/o LOCKING */
286 #define UPDATE_PARITY 4
303
304 /*
305 * Stripe state
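Whether the R5_* values above are spelled as an enum (removed side) or as #defines (restored side), they are bit numbers used against an unsigned long flags word with set_bit()/test_bit(). A minimal userspace sketch of that usage, with portable helpers standing in for the kernel bit ops:

    #include <stdio.h>

    enum r5dev_flags {                  /* bit numbers, not masks */
            R5_UPTODATE,
            R5_LOCKED,
            R5_OVERWRITE,
            R5_Insync,
    };

    static void flag_set(unsigned long *flags, int bit) { *flags |= 1UL << bit; }
    static int  flag_test(unsigned long flags, int bit) { return (flags >> bit) & 1UL; }

    int main(void)
    {
            unsigned long flags = 0;

            flag_set(&flags, R5_UPTODATE);
            flag_set(&flags, R5_Insync);

            printf("uptodate=%d locked=%d insync=%d\n",
                   flag_test(flags, R5_UPTODATE),
                   flag_test(flags, R5_LOCKED),
                   flag_test(flags, R5_Insync));
            return 0;
    }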
@@ -322,20 +306,18 @@ enum {
322 STRIPE_BIOFILL_RUN,
323 STRIPE_COMPUTE_RUN,
324 STRIPE_OPS_REQ_PENDING,
325 STRIPE_ON_UNPLUG_LIST,
326 };
327
328 /*
329 * Operation request flags
330 */
331 enum {
332 STRIPE_OP_BIOFILL,
333 STRIPE_OP_COMPUTE_BLK,
334 STRIPE_OP_PREXOR,
335 STRIPE_OP_BIODRAIN,
336 STRIPE_OP_RECONSTRUCT,
337 STRIPE_OP_CHECK,
338 };
314 #define STRIPE_OP_BIOFILL 0
315 #define STRIPE_OP_COMPUTE_BLK 1
316 #define STRIPE_OP_PREXOR 2
317 #define STRIPE_OP_BIODRAIN 3
318 #define STRIPE_OP_RECONSTRUCT 4
319 #define STRIPE_OP_CHECK 5
320
339 /*
340 * Plugging:
341 *
@@ -362,12 +344,13 @@ enum {
362
363
364 struct disk_info {
365 struct md_rdev *rdev, *replacement;
347 mdk_rdev_t *rdev;
366 };
367
368 struct r5conf {
350 struct raid5_private_data {
369 struct hlist_head *stripe_hashtbl;
370 struct mddev *mddev;
352 mddev_t *mddev;
353 struct disk_info *spare;
371 int chunk_sectors;
372 int level, algorithm;
373 int max_degraded;
@@ -390,12 +373,6 @@ struct r5conf {
390 short generation; /* increments with every reshape */
391 unsigned long reshape_checkpoint; /* Time we last updated
392 * metadata */
393 long long min_offset_diff; /* minimum difference between
394 * data_offset and
395 * new_data_offset across all
396 * devices. May be negative,
397 * but is closest to zero.
398 */
399
400 struct list_head handle_list; /* stripes needing handling */
401 struct list_head hold_list; /* preread ready stripes */
@@ -459,9 +436,11 @@ struct r5conf {
459 /* When taking over an array from a different personality, we store
460 * the new thread here until we fully activate the array.
461 */
462 struct md_thread *thread;
439 struct mdk_thread_s *thread;
463 };
464
442 typedef struct raid5_private_data raid5_conf_t;
443
465 /*
466 * Our supported algorithms
467 */
@@ -524,7 +503,7 @@ static inline int algorithm_is_DDF(int layout)
524 return layout >= 8 && layout <= 10;
525 }
526
527 extern int md_raid5_congested(struct mddev *mddev, int bits);
506 extern int md_raid5_congested(mddev_t *mddev, int bits);
528 extern void md_raid5_kick_device(struct r5conf *conf);
507 extern void md_raid5_kick_device(raid5_conf_t *conf);
529 extern int raid5_set_cache_size(struct mddev *mddev, int size);
508 extern int raid5_set_cache_size(mddev_t *mddev, int size);
530 #endif