author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 18:20:36 -0400
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 18:20:36 -0400
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /drivers/md
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'drivers/md')
54 files changed, 28483 insertions, 0 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
new file mode 100644
index 000000000000..ac43f98062fd
--- /dev/null
+++ b/drivers/md/Kconfig
@@ -0,0 +1,240 @@
1 | # | ||
2 | # Block device driver configuration | ||
3 | # | ||
4 | |||
5 | menu "Multi-device support (RAID and LVM)" | ||
6 | |||
7 | config MD | ||
8 | bool "Multiple devices driver support (RAID and LVM)" | ||
9 | help | ||
10 | Support multiple physical spindles through a single logical device. | ||
11 | Required for RAID and logical volume management. | ||
12 | |||
13 | config BLK_DEV_MD | ||
14 | tristate "RAID support" | ||
15 | depends on MD | ||
16 | ---help--- | ||
17 | This driver lets you combine several hard disk partitions into one | ||
18 | logical block device. This can be used to simply append one | ||
19 | partition to another one or to combine several redundant hard disks | ||
20 | into a RAID1/4/5 device so as to provide protection against hard | ||
21 | disk failures. This is called "Software RAID" since the combining of | ||
22 | the partitions is done by the kernel. "Hardware RAID" means that the | ||
23 | combining is done by a dedicated controller; if you have such a | ||
24 | controller, you do not need to say Y here. | ||
25 | |||
26 | More information about Software RAID on Linux is contained in the | ||
27 | Software RAID mini-HOWTO, available from | ||
28 | <http://www.tldp.org/docs.html#howto>. There you will also learn | ||
29 | where to get the supporting user space utilities raidtools. | ||
30 | |||
31 | If unsure, say N. | ||
32 | |||
33 | config MD_LINEAR | ||
34 | tristate "Linear (append) mode" | ||
35 | depends on BLK_DEV_MD | ||
36 | ---help--- | ||
37 | If you say Y here, then your multiple devices driver will be able to | ||
38 | use the so-called linear mode, i.e. it will combine the hard disk | ||
39 | partitions by simply appending one to the other. | ||
40 | |||
41 | To compile this as a module, choose M here: the module | ||
42 | will be called linear. | ||
43 | |||
44 | If unsure, say Y. | ||
45 | |||
46 | config MD_RAID0 | ||
47 | tristate "RAID-0 (striping) mode" | ||
48 | depends on BLK_DEV_MD | ||
49 | ---help--- | ||
50 | If you say Y here, then your multiple devices driver will be able to | ||
51 | use the so-called raid0 mode, i.e. it will combine the hard disk | ||
52 | partitions into one logical device in such a fashion as to fill them | ||
53 | up evenly, one chunk here and one chunk there. This will increase | ||
54 | the throughput rate if the partitions reside on distinct disks. | ||
55 | |||
56 | Information about Software RAID on Linux is contained in the | ||
57 | Software-RAID mini-HOWTO, available from | ||
58 | <http://www.tldp.org/docs.html#howto>. There you will also | ||
59 | learn where to get the supporting user space utilities raidtools. | ||
60 | |||
61 | To compile this as a module, choose M here: the module | ||
62 | will be called raid0. | ||
63 | |||
64 | If unsure, say Y. | ||
65 | |||
66 | config MD_RAID1 | ||
67 | tristate "RAID-1 (mirroring) mode" | ||
68 | depends on BLK_DEV_MD | ||
69 | ---help--- | ||
70 | A RAID-1 set consists of several disk drives which are exact copies | ||
71 | of each other. In the event of a mirror failure, the RAID driver | ||
72 | will continue to use the operational mirrors in the set, providing | ||
73 | an error free MD (multiple device) to the higher levels of the | ||
74 | kernel. In a set with N drives, the available space is the capacity | ||
75 | of a single drive, and the set protects against a failure of (N - 1) | ||
76 | drives. | ||
77 | |||
78 | Information about Software RAID on Linux is contained in the | ||
79 | Software-RAID mini-HOWTO, available from | ||
80 | <http://www.tldp.org/docs.html#howto>. There you will also | ||
81 | learn where to get the supporting user space utilities raidtools. | ||
82 | |||
83 | If you want to use such a RAID-1 set, say Y. To compile this code | ||
84 | as a module, choose M here: the module will be called raid1. | ||
85 | |||
86 | If unsure, say Y. | ||
87 | |||
88 | config MD_RAID10 | ||
89 | tristate "RAID-10 (mirrored striping) mode (EXPERIMENTAL)" | ||
90 | depends on BLK_DEV_MD && EXPERIMENTAL | ||
91 | ---help--- | ||
92 | RAID-10 provides a combination of striping (RAID-0) and | ||
93 | mirroring (RAID-1) with easier configuration and more flexible | ||
94 | layout. | ||
95 | Unlike RAID-0, but like RAID-1, RAID-10 requires all devices to | ||
96 | be the same size (otherwise only as much as the smallest device | ||
97 | will be used). | ||
98 | RAID-10 provides a variety of layouts that provide different levels | ||
99 | of redundancy and performance. | ||
100 | |||
101 | RAID-10 requires mdadm-1.7.0 or later, available at: | ||
102 | |||
103 | ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/ | ||
104 | |||
105 | If unsure, say Y. | ||
106 | |||
107 | config MD_RAID5 | ||
108 | tristate "RAID-4/RAID-5 mode" | ||
109 | depends on BLK_DEV_MD | ||
110 | ---help--- | ||
111 | A RAID-5 set of N drives with a capacity of C MB per drive provides | ||
112 | the capacity of C * (N - 1) MB, and protects against a failure | ||
113 | of a single drive. For a given sector (row) number, (N - 1) drives | ||
114 | contain data sectors, and one drive contains the parity protection. | ||
115 | For a RAID-4 set, the parity blocks are present on a single drive, | ||
116 | while a RAID-5 set distributes the parity across the drives in one | ||
117 | of the available parity distribution methods. | ||
118 | |||
119 | Information about Software RAID on Linux is contained in the | ||
120 | Software-RAID mini-HOWTO, available from | ||
121 | <http://www.tldp.org/docs.html#howto>. There you will also | ||
122 | learn where to get the supporting user space utilities raidtools. | ||
123 | |||
124 | If you want to use such a RAID-4/RAID-5 set, say Y. To | ||
125 | compile this code as a module, choose M here: the module | ||
126 | will be called raid5. | ||
127 | |||
128 | If unsure, say Y. | ||
129 | |||
130 | config MD_RAID6 | ||
131 | tristate "RAID-6 mode" | ||
132 | depends on BLK_DEV_MD | ||
133 | ---help--- | ||
134 | A RAID-6 set of N drives with a capacity of C MB per drive | ||
135 | provides the capacity of C * (N - 2) MB, and protects | ||
136 | against a failure of any two drives. For a given sector | ||
137 | (row) number, (N - 2) drives contain data sectors, and two | ||
138 | drives contain two independent redundancy syndromes. Like | ||
139 | RAID-5, RAID-6 distributes the syndromes across the drives | ||
140 | in one of the available parity distribution methods. | ||
141 | |||
142 | RAID-6 requires mdadm-1.5.0 or later, available at: | ||
143 | |||
144 | ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/ | ||
145 | |||
146 | If you want to use such a RAID-6 set, say Y. To compile | ||
147 | this code as a module, choose M here: the module will be | ||
148 | called raid6. | ||
149 | |||
150 | If unsure, say Y. | ||
151 | |||
152 | config MD_MULTIPATH | ||
153 | tristate "Multipath I/O support" | ||
154 | depends on BLK_DEV_MD | ||
155 | help | ||
156 | Multipath-IO is the ability of certain devices to address the same | ||
157 | physical disk over multiple 'IO paths'. The code ensures that such | ||
158 | paths can be defined and handled at runtime, and ensures that a | ||
159 | transparent failover to the backup path(s) happens if an IO error | ||
160 | arrives on the primary path. | ||
161 | |||
162 | If unsure, say N. | ||
163 | |||
164 | config MD_FAULTY | ||
165 | tristate "Faulty test module for MD" | ||
166 | depends on BLK_DEV_MD | ||
167 | help | ||
168 | The "faulty" module allows for a block device that occasionally returns | ||
169 | read or write errors. It is useful for testing. | ||
170 | |||
171 | If unsure, say N. | ||
172 | |||
173 | config BLK_DEV_DM | ||
174 | tristate "Device mapper support" | ||
175 | depends on MD | ||
176 | ---help--- | ||
177 | Device-mapper is a low level volume manager. It works by allowing | ||
178 | people to specify mappings for ranges of logical sectors. Various | ||
179 | mapping types are available; in addition, people may write their own | ||
180 | modules containing custom mappings if they wish. | ||
181 | |||
182 | Higher level volume managers such as LVM2 use this driver. | ||
183 | |||
184 | To compile this as a module, choose M here: the module will be | ||
185 | called dm-mod. | ||
186 | |||
187 | If unsure, say N. | ||
188 | |||
189 | config DM_CRYPT | ||
190 | tristate "Crypt target support" | ||
191 | depends on BLK_DEV_DM && EXPERIMENTAL | ||
192 | select CRYPTO | ||
193 | ---help--- | ||
194 | This device-mapper target allows you to create a device that | ||
195 | transparently encrypts the data on it. You'll need to activate | ||
196 | the ciphers you're going to use in the cryptoapi configuration. | ||
197 | |||
198 | Information on how to use dm-crypt can be found on | ||
199 | |||
200 | <http://www.saout.de/misc/dm-crypt/> | ||
201 | |||
202 | To compile this code as a module, choose M here: the module will | ||
203 | be called dm-crypt. | ||
204 | |||
205 | If unsure, say N. | ||
206 | |||
207 | config DM_SNAPSHOT | ||
208 | tristate "Snapshot target (EXPERIMENTAL)" | ||
209 | depends on BLK_DEV_DM && EXPERIMENTAL | ||
210 | ---help--- | ||
211 | Allow volume managers to take writeable snapshots of a device. | ||
212 | |||
213 | config DM_MIRROR | ||
214 | tristate "Mirror target (EXPERIMENTAL)" | ||
215 | depends on BLK_DEV_DM && EXPERIMENTAL | ||
216 | ---help--- | ||
217 | Allow volume managers to mirror logical volumes, also | ||
218 | needed for live data migration tools such as 'pvmove'. | ||
219 | |||
220 | config DM_ZERO | ||
221 | tristate "Zero target (EXPERIMENTAL)" | ||
222 | depends on BLK_DEV_DM && EXPERIMENTAL | ||
223 | ---help--- | ||
224 | A target that discards writes, and returns all zeroes for | ||
225 | reads. Useful in some recovery situations. | ||
226 | |||
227 | config DM_MULTIPATH | ||
228 | tristate "Multipath target (EXPERIMENTAL)" | ||
229 | depends on BLK_DEV_DM && EXPERIMENTAL | ||
230 | ---help--- | ||
231 | Allow volume managers to support multipath hardware. | ||
232 | |||
233 | config DM_MULTIPATH_EMC | ||
234 | tristate "EMC CX/AX multipath support (EXPERIMENTAL)" | ||
235 | depends on DM_MULTIPATH && BLK_DEV_DM && EXPERIMENTAL | ||
236 | ---help--- | ||
237 | Multipath support for EMC CX/AX series hardware. | ||
238 | |||
239 | endmenu | ||
240 | |||
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
new file mode 100644
index 000000000000..90de9c146a5f
--- /dev/null
+++ b/drivers/md/Makefile
@@ -0,0 +1,107 @@
1 | # | ||
2 | # Makefile for the kernel software RAID and LVM drivers. | ||
3 | # | ||
4 | |||
5 | dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ | ||
6 | dm-ioctl.o dm-io.o kcopyd.o | ||
7 | dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o | ||
8 | dm-snapshot-objs := dm-snap.o dm-exception-store.o | ||
9 | dm-mirror-objs := dm-log.o dm-raid1.o | ||
10 | raid6-objs := raid6main.o raid6algos.o raid6recov.o raid6tables.o \ | ||
11 | raid6int1.o raid6int2.o raid6int4.o \ | ||
12 | raid6int8.o raid6int16.o raid6int32.o \ | ||
13 | raid6altivec1.o raid6altivec2.o raid6altivec4.o \ | ||
14 | raid6altivec8.o \ | ||
15 | raid6mmx.o raid6sse1.o raid6sse2.o | ||
16 | hostprogs-y := mktables | ||
17 | |||
18 | # Note: link order is important. All raid personalities | ||
19 | # and xor.o must come before md.o, as they each initialise | ||
20 | # themselves, and md.o may use the personalities when it is | ||
21 | # auto-initialised. | ||
22 | |||
23 | obj-$(CONFIG_MD_LINEAR) += linear.o | ||
24 | obj-$(CONFIG_MD_RAID0) += raid0.o | ||
25 | obj-$(CONFIG_MD_RAID1) += raid1.o | ||
26 | obj-$(CONFIG_MD_RAID10) += raid10.o | ||
27 | obj-$(CONFIG_MD_RAID5) += raid5.o xor.o | ||
28 | obj-$(CONFIG_MD_RAID6) += raid6.o xor.o | ||
29 | obj-$(CONFIG_MD_MULTIPATH) += multipath.o | ||
30 | obj-$(CONFIG_MD_FAULTY) += faulty.o | ||
31 | obj-$(CONFIG_BLK_DEV_MD) += md.o | ||
32 | obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o | ||
33 | obj-$(CONFIG_DM_CRYPT) += dm-crypt.o | ||
34 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o | ||
35 | obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc.o | ||
36 | obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o | ||
37 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o | ||
38 | obj-$(CONFIG_DM_ZERO) += dm-zero.o | ||
39 | |||
40 | quiet_cmd_unroll = UNROLL $@ | ||
41 | cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \ | ||
42 | < $< > $@ || ( rm -f $@ && exit 1 ) | ||
43 | |||
44 | ifeq ($(CONFIG_ALTIVEC),y) | ||
45 | altivec_flags := -maltivec -mabi=altivec | ||
46 | endif | ||
47 | |||
48 | targets += raid6int1.c | ||
49 | $(obj)/raid6int1.c: UNROLL := 1 | ||
50 | $(obj)/raid6int1.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE | ||
51 | $(call if_changed,unroll) | ||
52 | |||
53 | targets += raid6int2.c | ||
54 | $(obj)/raid6int2.c: UNROLL := 2 | ||
55 | $(obj)/raid6int2.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE | ||
56 | $(call if_changed,unroll) | ||
57 | |||
58 | targets += raid6int4.c | ||
59 | $(obj)/raid6int4.c: UNROLL := 4 | ||
60 | $(obj)/raid6int4.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE | ||
61 | $(call if_changed,unroll) | ||
62 | |||
63 | targets += raid6int8.c | ||
64 | $(obj)/raid6int8.c: UNROLL := 8 | ||
65 | $(obj)/raid6int8.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE | ||
66 | $(call if_changed,unroll) | ||
67 | |||
68 | targets += raid6int16.c | ||
69 | $(obj)/raid6int16.c: UNROLL := 16 | ||
70 | $(obj)/raid6int16.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE | ||
71 | $(call if_changed,unroll) | ||
72 | |||
73 | targets += raid6int32.c | ||
74 | $(obj)/raid6int32.c: UNROLL := 32 | ||
75 | $(obj)/raid6int32.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE | ||
76 | $(call if_changed,unroll) | ||
77 | |||
78 | CFLAGS_raid6altivec1.o += $(altivec_flags) | ||
79 | targets += raid6altivec1.c | ||
80 | $(obj)/raid6altivec1.c: UNROLL := 1 | ||
81 | $(obj)/raid6altivec1.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE | ||
82 | $(call if_changed,unroll) | ||
83 | |||
84 | CFLAGS_raid6altivec2.o += $(altivec_flags) | ||
85 | targets += raid6altivec2.c | ||
86 | $(obj)/raid6altivec2.c: UNROLL := 2 | ||
87 | $(obj)/raid6altivec2.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE | ||
88 | $(call if_changed,unroll) | ||
89 | |||
90 | CFLAGS_raid6altivec4.o += $(altivec_flags) | ||
91 | targets += raid6altivec4.c | ||
92 | $(obj)/raid6altivec4.c: UNROLL := 4 | ||
93 | $(obj)/raid6altivec4.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE | ||
94 | $(call if_changed,unroll) | ||
95 | |||
96 | CFLAGS_raid6altivec8.o += $(altivec_flags) | ||
97 | targets += raid6altivec8.c | ||
98 | $(obj)/raid6altivec8.c: UNROLL := 8 | ||
99 | $(obj)/raid6altivec8.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE | ||
100 | $(call if_changed,unroll) | ||
101 | |||
102 | quiet_cmd_mktable = TABLE $@ | ||
103 | cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) | ||
104 | |||
105 | targets += raid6tables.c | ||
106 | $(obj)/raid6tables.c: $(obj)/mktables FORCE | ||
107 | $(call if_changed,mktable) | ||
diff --git a/drivers/md/dm-bio-list.h b/drivers/md/dm-bio-list.h
new file mode 100644
index 000000000000..bc021e1fd4d1
--- /dev/null
+++ b/drivers/md/dm-bio-list.h
@@ -0,0 +1,68 @@
1 | /* | ||
2 | * Copyright (C) 2004 Red Hat UK Ltd. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #ifndef DM_BIO_LIST_H | ||
8 | #define DM_BIO_LIST_H | ||
9 | |||
10 | #include <linux/bio.h> | ||
11 | |||
12 | struct bio_list { | ||
13 | struct bio *head; | ||
14 | struct bio *tail; | ||
15 | }; | ||
16 | |||
17 | static inline void bio_list_init(struct bio_list *bl) | ||
18 | { | ||
19 | bl->head = bl->tail = NULL; | ||
20 | } | ||
21 | |||
22 | static inline void bio_list_add(struct bio_list *bl, struct bio *bio) | ||
23 | { | ||
24 | bio->bi_next = NULL; | ||
25 | |||
26 | if (bl->tail) | ||
27 | bl->tail->bi_next = bio; | ||
28 | else | ||
29 | bl->head = bio; | ||
30 | |||
31 | bl->tail = bio; | ||
32 | } | ||
33 | |||
34 | static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2) | ||
35 | { | ||
36 | if (bl->tail) | ||
37 | bl->tail->bi_next = bl2->head; | ||
38 | else | ||
39 | bl->head = bl2->head; | ||
40 | |||
41 | bl->tail = bl2->tail; | ||
42 | } | ||
43 | |||
44 | static inline struct bio *bio_list_pop(struct bio_list *bl) | ||
45 | { | ||
46 | struct bio *bio = bl->head; | ||
47 | |||
48 | if (bio) { | ||
49 | bl->head = bl->head->bi_next; | ||
50 | if (!bl->head) | ||
51 | bl->tail = NULL; | ||
52 | |||
53 | bio->bi_next = NULL; | ||
54 | } | ||
55 | |||
56 | return bio; | ||
57 | } | ||
58 | |||
59 | static inline struct bio *bio_list_get(struct bio_list *bl) | ||
60 | { | ||
61 | struct bio *bio = bl->head; | ||
62 | |||
63 | bl->head = bl->tail = NULL; | ||
64 | |||
65 | return bio; | ||
66 | } | ||
67 | |||
68 | #endif | ||
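
The header above is a self-contained singly linked FIFO of bios (head/tail pointers, O(1) append). A minimal sketch of how a target might use it to defer bios and resubmit them later; the queue, the function names and the resubmission via generic_make_request() are illustrative assumptions, not part of this file:

    #include <linux/blkdev.h>
    #include "dm-bio-list.h"

    /* Hypothetical per-target queue; a static bio_list starts out with
     * head and tail NULL, the same state bio_list_init() produces. */
    static struct bio_list deferred_bios;

    static void defer_bio(struct bio *bio)
    {
            bio_list_add(&deferred_bios, bio);      /* O(1) append at the tail */
    }

    static void flush_deferred_bios(void)
    {
            struct bio *bio;

            /* Drain in FIFO order; bio_list_pop() clears bi_next for us. */
            while ((bio = bio_list_pop(&deferred_bios)))
                    generic_make_request(bio);
    }
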
diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h
new file mode 100644
index 000000000000..d3ec217847d6
--- /dev/null
+++ b/drivers/md/dm-bio-record.h
@@ -0,0 +1,45 @@
1 | /* | ||
2 | * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #ifndef DM_BIO_RECORD_H | ||
8 | #define DM_BIO_RECORD_H | ||
9 | |||
10 | #include <linux/bio.h> | ||
11 | |||
12 | /* | ||
13 | * There are lots of mutable fields in the bio struct that get | ||
14 | * changed by the lower levels of the block layer. Some targets, | ||
15 | * such as multipath, may wish to resubmit a bio on error. The | ||
16 | * functions in this file help the target record and restore the | ||
17 | * original bio state. | ||
18 | */ | ||
19 | struct dm_bio_details { | ||
20 | sector_t bi_sector; | ||
21 | struct block_device *bi_bdev; | ||
22 | unsigned int bi_size; | ||
23 | unsigned short bi_idx; | ||
24 | unsigned long bi_flags; | ||
25 | }; | ||
26 | |||
27 | static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) | ||
28 | { | ||
29 | bd->bi_sector = bio->bi_sector; | ||
30 | bd->bi_bdev = bio->bi_bdev; | ||
31 | bd->bi_size = bio->bi_size; | ||
32 | bd->bi_idx = bio->bi_idx; | ||
33 | bd->bi_flags = bio->bi_flags; | ||
34 | } | ||
35 | |||
36 | static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) | ||
37 | { | ||
38 | bio->bi_sector = bd->bi_sector; | ||
39 | bio->bi_bdev = bd->bi_bdev; | ||
40 | bio->bi_size = bd->bi_size; | ||
41 | bio->bi_idx = bd->bi_idx; | ||
42 | bio->bi_flags = bd->bi_flags; | ||
43 | } | ||
44 | |||
45 | #endif | ||
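
A minimal sketch of the record/restore pattern this header is meant for; the function names and the resubmission path are illustrative assumptions, not part of this file:

    #include <linux/blkdev.h>
    #include "dm-bio-record.h"

    /* Snapshot the mutable bio fields before handing the bio to lower layers. */
    static void submit_recorded(struct dm_bio_details *bd, struct bio *bio)
    {
            dm_bio_record(bd, bio);
            generic_make_request(bio);
    }

    /* On error, put the bio back exactly as recorded and submit it again,
     * e.g. down another path, as the multipath target would. */
    static void resubmit_recorded(struct dm_bio_details *bd, struct bio *bio)
    {
            dm_bio_restore(bd, bio);
            generic_make_request(bio);
    }
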
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
new file mode 100644
index 000000000000..77619a56e2bf
--- /dev/null
+++ b/drivers/md/dm-crypt.c
@@ -0,0 +1,977 @@
1 | /* | ||
2 | * Copyright (C) 2003 Christophe Saout <christophe@saout.de> | ||
3 | * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> | ||
4 | * | ||
5 | * This file is released under the GPL. | ||
6 | */ | ||
7 | |||
8 | #include <linux/module.h> | ||
9 | #include <linux/init.h> | ||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/bio.h> | ||
12 | #include <linux/blkdev.h> | ||
13 | #include <linux/mempool.h> | ||
14 | #include <linux/slab.h> | ||
15 | #include <linux/crypto.h> | ||
16 | #include <linux/workqueue.h> | ||
17 | #include <asm/atomic.h> | ||
18 | #include <asm/scatterlist.h> | ||
19 | #include <asm/page.h> | ||
20 | |||
21 | #include "dm.h" | ||
22 | |||
23 | #define PFX "crypt: " | ||
24 | |||
25 | /* | ||
26 | * per bio private data | ||
27 | */ | ||
28 | struct crypt_io { | ||
29 | struct dm_target *target; | ||
30 | struct bio *bio; | ||
31 | struct bio *first_clone; | ||
32 | struct work_struct work; | ||
33 | atomic_t pending; | ||
34 | int error; | ||
35 | }; | ||
36 | |||
37 | /* | ||
38 | * context holding the current state of a multi-part conversion | ||
39 | */ | ||
40 | struct convert_context { | ||
41 | struct bio *bio_in; | ||
42 | struct bio *bio_out; | ||
43 | unsigned int offset_in; | ||
44 | unsigned int offset_out; | ||
45 | unsigned int idx_in; | ||
46 | unsigned int idx_out; | ||
47 | sector_t sector; | ||
48 | int write; | ||
49 | }; | ||
50 | |||
51 | struct crypt_config; | ||
52 | |||
53 | struct crypt_iv_operations { | ||
54 | int (*ctr)(struct crypt_config *cc, struct dm_target *ti, | ||
55 | const char *opts); | ||
56 | void (*dtr)(struct crypt_config *cc); | ||
57 | const char *(*status)(struct crypt_config *cc); | ||
58 | int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); | ||
59 | }; | ||
60 | |||
61 | /* | ||
62 | * Crypt: maps a linear range of a block device | ||
63 | * and encrypts / decrypts at the same time. | ||
64 | */ | ||
65 | struct crypt_config { | ||
66 | struct dm_dev *dev; | ||
67 | sector_t start; | ||
68 | |||
69 | /* | ||
70 | * pool for per bio private data and | ||
71 | * for encryption buffer pages | ||
72 | */ | ||
73 | mempool_t *io_pool; | ||
74 | mempool_t *page_pool; | ||
75 | |||
76 | /* | ||
77 | * crypto related data | ||
78 | */ | ||
79 | struct crypt_iv_operations *iv_gen_ops; | ||
80 | char *iv_mode; | ||
81 | void *iv_gen_private; | ||
82 | sector_t iv_offset; | ||
83 | unsigned int iv_size; | ||
84 | |||
85 | struct crypto_tfm *tfm; | ||
86 | unsigned int key_size; | ||
87 | u8 key[0]; | ||
88 | }; | ||
89 | |||
90 | #define MIN_IOS 256 | ||
91 | #define MIN_POOL_PAGES 32 | ||
92 | #define MIN_BIO_PAGES 8 | ||
93 | |||
94 | static kmem_cache_t *_crypt_io_pool; | ||
95 | |||
96 | /* | ||
97 | * Mempool alloc and free functions for the page | ||
98 | */ | ||
99 | static void *mempool_alloc_page(unsigned int __nocast gfp_mask, void *data) | ||
100 | { | ||
101 | return alloc_page(gfp_mask); | ||
102 | } | ||
103 | |||
104 | static void mempool_free_page(void *page, void *data) | ||
105 | { | ||
106 | __free_page(page); | ||
107 | } | ||
108 | |||
109 | |||
110 | /* | ||
111 | * Different IV generation algorithms: | ||
112 | * | ||
113 | * plain: the initial vector is the 32-bit little-endian version of the sector | ||
114 | * number, padded with zeros if necessary. | ||
115 | * | ||
116 | * essiv: "encrypted sector|salt initial vector", the sector number is | ||
117 | * encrypted with the bulk cipher using a salt as key. The salt | ||
118 | * should be derived from the bulk cipher's key via hashing. | ||
119 | * | ||
120 | * plumb: unimplemented, see: | ||
121 | * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 | ||
122 | */ | ||
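/*
 * Illustrative example (not part of the original comment): with a
 * 16-byte IV and sector number 0x12345678, the "plain" generator
 * below produces
 *
 *   78 56 34 12 00 00 00 00 00 00 00 00 00 00 00 00
 *
 * i.e. the low 32 bits of the sector number in little-endian byte
 * order, zero-padded to the cipher's IV size.
 */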
123 | |||
124 | static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector) | ||
125 | { | ||
126 | memset(iv, 0, cc->iv_size); | ||
127 | *(u32 *)iv = cpu_to_le32(sector & 0xffffffff); | ||
128 | |||
129 | return 0; | ||
130 | } | ||
131 | |||
132 | static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, | ||
133 | const char *opts) | ||
134 | { | ||
135 | struct crypto_tfm *essiv_tfm; | ||
136 | struct crypto_tfm *hash_tfm; | ||
137 | struct scatterlist sg; | ||
138 | unsigned int saltsize; | ||
139 | u8 *salt; | ||
140 | |||
141 | if (opts == NULL) { | ||
142 | ti->error = PFX "Digest algorithm missing for ESSIV mode"; | ||
143 | return -EINVAL; | ||
144 | } | ||
145 | |||
146 | /* Hash the cipher key with the given hash algorithm */ | ||
147 | hash_tfm = crypto_alloc_tfm(opts, 0); | ||
148 | if (hash_tfm == NULL) { | ||
149 | ti->error = PFX "Error initializing ESSIV hash"; | ||
150 | return -EINVAL; | ||
151 | } | ||
152 | |||
153 | if (crypto_tfm_alg_type(hash_tfm) != CRYPTO_ALG_TYPE_DIGEST) { | ||
154 | ti->error = PFX "Expected digest algorithm for ESSIV hash"; | ||
155 | crypto_free_tfm(hash_tfm); | ||
156 | return -EINVAL; | ||
157 | } | ||
158 | |||
159 | saltsize = crypto_tfm_alg_digestsize(hash_tfm); | ||
160 | salt = kmalloc(saltsize, GFP_KERNEL); | ||
161 | if (salt == NULL) { | ||
162 | ti->error = PFX "Error kmallocing salt storage in ESSIV"; | ||
163 | crypto_free_tfm(hash_tfm); | ||
164 | return -ENOMEM; | ||
165 | } | ||
166 | |||
167 | sg.page = virt_to_page(cc->key); | ||
168 | sg.offset = offset_in_page(cc->key); | ||
169 | sg.length = cc->key_size; | ||
170 | crypto_digest_digest(hash_tfm, &sg, 1, salt); | ||
171 | crypto_free_tfm(hash_tfm); | ||
172 | |||
173 | /* Setup the essiv_tfm with the given salt */ | ||
174 | essiv_tfm = crypto_alloc_tfm(crypto_tfm_alg_name(cc->tfm), | ||
175 | CRYPTO_TFM_MODE_ECB); | ||
176 | if (essiv_tfm == NULL) { | ||
177 | ti->error = PFX "Error allocating crypto tfm for ESSIV"; | ||
178 | kfree(salt); | ||
179 | return -EINVAL; | ||
180 | } | ||
181 | if (crypto_tfm_alg_blocksize(essiv_tfm) | ||
182 | != crypto_tfm_alg_ivsize(cc->tfm)) { | ||
183 | ti->error = PFX "Block size of ESSIV cipher does " | ||
184 | "not match IV size of block cipher"; | ||
185 | crypto_free_tfm(essiv_tfm); | ||
186 | kfree(salt); | ||
187 | return -EINVAL; | ||
188 | } | ||
189 | if (crypto_cipher_setkey(essiv_tfm, salt, saltsize) < 0) { | ||
190 | ti->error = PFX "Failed to set key for ESSIV cipher"; | ||
191 | crypto_free_tfm(essiv_tfm); | ||
192 | kfree(salt); | ||
193 | return -EINVAL; | ||
194 | } | ||
195 | kfree(salt); | ||
196 | |||
197 | cc->iv_gen_private = (void *)essiv_tfm; | ||
198 | return 0; | ||
199 | } | ||
200 | |||
201 | static void crypt_iv_essiv_dtr(struct crypt_config *cc) | ||
202 | { | ||
203 | crypto_free_tfm((struct crypto_tfm *)cc->iv_gen_private); | ||
204 | cc->iv_gen_private = NULL; | ||
205 | } | ||
206 | |||
207 | static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) | ||
208 | { | ||
209 | struct scatterlist sg = { NULL, }; | ||
210 | |||
211 | memset(iv, 0, cc->iv_size); | ||
212 | *(u64 *)iv = cpu_to_le64(sector); | ||
213 | |||
214 | sg.page = virt_to_page(iv); | ||
215 | sg.offset = offset_in_page(iv); | ||
216 | sg.length = cc->iv_size; | ||
217 | crypto_cipher_encrypt((struct crypto_tfm *)cc->iv_gen_private, | ||
218 | &sg, &sg, cc->iv_size); | ||
219 | |||
220 | return 0; | ||
221 | } | ||
222 | |||
223 | static struct crypt_iv_operations crypt_iv_plain_ops = { | ||
224 | .generator = crypt_iv_plain_gen | ||
225 | }; | ||
226 | |||
227 | static struct crypt_iv_operations crypt_iv_essiv_ops = { | ||
228 | .ctr = crypt_iv_essiv_ctr, | ||
229 | .dtr = crypt_iv_essiv_dtr, | ||
230 | .generator = crypt_iv_essiv_gen | ||
231 | }; | ||
232 | |||
233 | |||
234 | static inline int | ||
235 | crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out, | ||
236 | struct scatterlist *in, unsigned int length, | ||
237 | int write, sector_t sector) | ||
238 | { | ||
239 | u8 iv[cc->iv_size]; | ||
240 | int r; | ||
241 | |||
242 | if (cc->iv_gen_ops) { | ||
243 | r = cc->iv_gen_ops->generator(cc, iv, sector); | ||
244 | if (r < 0) | ||
245 | return r; | ||
246 | |||
247 | if (write) | ||
248 | r = crypto_cipher_encrypt_iv(cc->tfm, out, in, length, iv); | ||
249 | else | ||
250 | r = crypto_cipher_decrypt_iv(cc->tfm, out, in, length, iv); | ||
251 | } else { | ||
252 | if (write) | ||
253 | r = crypto_cipher_encrypt(cc->tfm, out, in, length); | ||
254 | else | ||
255 | r = crypto_cipher_decrypt(cc->tfm, out, in, length); | ||
256 | } | ||
257 | |||
258 | return r; | ||
259 | } | ||
260 | |||
261 | static void | ||
262 | crypt_convert_init(struct crypt_config *cc, struct convert_context *ctx, | ||
263 | struct bio *bio_out, struct bio *bio_in, | ||
264 | sector_t sector, int write) | ||
265 | { | ||
266 | ctx->bio_in = bio_in; | ||
267 | ctx->bio_out = bio_out; | ||
268 | ctx->offset_in = 0; | ||
269 | ctx->offset_out = 0; | ||
270 | ctx->idx_in = bio_in ? bio_in->bi_idx : 0; | ||
271 | ctx->idx_out = bio_out ? bio_out->bi_idx : 0; | ||
272 | ctx->sector = sector + cc->iv_offset; | ||
273 | ctx->write = write; | ||
274 | } | ||
275 | |||
276 | /* | ||
277 | * Encrypt / decrypt data from one bio to another one (can be the same one) | ||
278 | */ | ||
279 | static int crypt_convert(struct crypt_config *cc, | ||
280 | struct convert_context *ctx) | ||
281 | { | ||
282 | int r = 0; | ||
283 | |||
284 | while(ctx->idx_in < ctx->bio_in->bi_vcnt && | ||
285 | ctx->idx_out < ctx->bio_out->bi_vcnt) { | ||
286 | struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in); | ||
287 | struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out); | ||
288 | struct scatterlist sg_in = { | ||
289 | .page = bv_in->bv_page, | ||
290 | .offset = bv_in->bv_offset + ctx->offset_in, | ||
291 | .length = 1 << SECTOR_SHIFT | ||
292 | }; | ||
293 | struct scatterlist sg_out = { | ||
294 | .page = bv_out->bv_page, | ||
295 | .offset = bv_out->bv_offset + ctx->offset_out, | ||
296 | .length = 1 << SECTOR_SHIFT | ||
297 | }; | ||
298 | |||
299 | ctx->offset_in += sg_in.length; | ||
300 | if (ctx->offset_in >= bv_in->bv_len) { | ||
301 | ctx->offset_in = 0; | ||
302 | ctx->idx_in++; | ||
303 | } | ||
304 | |||
305 | ctx->offset_out += sg_out.length; | ||
306 | if (ctx->offset_out >= bv_out->bv_len) { | ||
307 | ctx->offset_out = 0; | ||
308 | ctx->idx_out++; | ||
309 | } | ||
310 | |||
311 | r = crypt_convert_scatterlist(cc, &sg_out, &sg_in, sg_in.length, | ||
312 | ctx->write, ctx->sector); | ||
313 | if (r < 0) | ||
314 | break; | ||
315 | |||
316 | ctx->sector++; | ||
317 | } | ||
318 | |||
319 | return r; | ||
320 | } | ||
321 | |||
322 | /* | ||
323 | * Generate a new unfragmented bio with the given size | ||
324 | * This should never violate the device limitations | ||
325 | * May return a smaller bio when running out of pages | ||
326 | */ | ||
327 | static struct bio * | ||
328 | crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, | ||
329 | struct bio *base_bio, unsigned int *bio_vec_idx) | ||
330 | { | ||
331 | struct bio *bio; | ||
332 | unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
333 | int gfp_mask = GFP_NOIO | __GFP_HIGHMEM; | ||
334 | unsigned long flags = current->flags; | ||
335 | unsigned int i; | ||
336 | |||
337 | /* | ||
338 | * Tell VM to act less aggressively and fail earlier. | ||
339 | * This is not necessary but increases throughput. | ||
340 | * FIXME: Is this really intelligent? | ||
341 | */ | ||
342 | current->flags &= ~PF_MEMALLOC; | ||
343 | |||
344 | if (base_bio) | ||
345 | bio = bio_clone(base_bio, GFP_NOIO); | ||
346 | else | ||
347 | bio = bio_alloc(GFP_NOIO, nr_iovecs); | ||
348 | if (!bio) { | ||
349 | if (flags & PF_MEMALLOC) | ||
350 | current->flags |= PF_MEMALLOC; | ||
351 | return NULL; | ||
352 | } | ||
353 | |||
354 | /* if the last bio was not complete, continue where that one ended */ | ||
355 | bio->bi_idx = *bio_vec_idx; | ||
356 | bio->bi_vcnt = *bio_vec_idx; | ||
357 | bio->bi_size = 0; | ||
358 | bio->bi_flags &= ~(1 << BIO_SEG_VALID); | ||
359 | |||
360 | /* bio->bi_idx pages have already been allocated */ | ||
361 | size -= bio->bi_idx * PAGE_SIZE; | ||
362 | |||
363 | for(i = bio->bi_idx; i < nr_iovecs; i++) { | ||
364 | struct bio_vec *bv = bio_iovec_idx(bio, i); | ||
365 | |||
366 | bv->bv_page = mempool_alloc(cc->page_pool, gfp_mask); | ||
367 | if (!bv->bv_page) | ||
368 | break; | ||
369 | |||
370 | /* | ||
371 | * if additional pages cannot be allocated without waiting, | ||
373 | * return a partially allocated bio; the caller will then try | ||
373 | * to allocate additional bios while submitting this partial bio | ||
374 | */ | ||
375 | if ((i - bio->bi_idx) == (MIN_BIO_PAGES - 1)) | ||
376 | gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT; | ||
377 | |||
378 | bv->bv_offset = 0; | ||
379 | if (size > PAGE_SIZE) | ||
380 | bv->bv_len = PAGE_SIZE; | ||
381 | else | ||
382 | bv->bv_len = size; | ||
383 | |||
384 | bio->bi_size += bv->bv_len; | ||
385 | bio->bi_vcnt++; | ||
386 | size -= bv->bv_len; | ||
387 | } | ||
388 | |||
389 | if (flags & PF_MEMALLOC) | ||
390 | current->flags |= PF_MEMALLOC; | ||
391 | |||
392 | if (!bio->bi_size) { | ||
393 | bio_put(bio); | ||
394 | return NULL; | ||
395 | } | ||
396 | |||
397 | /* | ||
398 | * Remember the last bio_vec allocated to be able | ||
399 | * to correctly continue after the splitting. | ||
400 | */ | ||
401 | *bio_vec_idx = bio->bi_vcnt; | ||
402 | |||
403 | return bio; | ||
404 | } | ||
405 | |||
406 | static void crypt_free_buffer_pages(struct crypt_config *cc, | ||
407 | struct bio *bio, unsigned int bytes) | ||
408 | { | ||
409 | unsigned int i, start, end; | ||
410 | struct bio_vec *bv; | ||
411 | |||
412 | /* | ||
413 | * This is ugly, but Jens Axboe thinks that using bi_idx in the | ||
414 | * endio function is too dangerous at the moment, so I calculate the | ||
415 | * correct position using bi_vcnt and bi_size. | ||
416 | * The bv_offset and bv_len fields might already be modified but we | ||
417 | * know that we always allocated whole pages. | ||
418 | * A fix to the bi_idx issue in the kernel is in the works, so | ||
419 | * we will hopefully be able to revert to the cleaner solution soon. | ||
420 | */ | ||
421 | i = bio->bi_vcnt - 1; | ||
422 | bv = bio_iovec_idx(bio, i); | ||
423 | end = (i << PAGE_SHIFT) + (bv->bv_offset + bv->bv_len) - bio->bi_size; | ||
424 | start = end - bytes; | ||
425 | |||
426 | start >>= PAGE_SHIFT; | ||
427 | if (!bio->bi_size) | ||
428 | end = bio->bi_vcnt; | ||
429 | else | ||
430 | end >>= PAGE_SHIFT; | ||
431 | |||
432 | for(i = start; i < end; i++) { | ||
433 | bv = bio_iovec_idx(bio, i); | ||
434 | BUG_ON(!bv->bv_page); | ||
435 | mempool_free(bv->bv_page, cc->page_pool); | ||
436 | bv->bv_page = NULL; | ||
437 | } | ||
438 | } | ||
439 | |||
440 | /* | ||
441 | * One of the bios was finished. Check for completion of | ||
442 | * the whole request and correctly clean up the buffer. | ||
443 | */ | ||
444 | static void dec_pending(struct crypt_io *io, int error) | ||
445 | { | ||
446 | struct crypt_config *cc = (struct crypt_config *) io->target->private; | ||
447 | |||
448 | if (error < 0) | ||
449 | io->error = error; | ||
450 | |||
451 | if (!atomic_dec_and_test(&io->pending)) | ||
452 | return; | ||
453 | |||
454 | if (io->first_clone) | ||
455 | bio_put(io->first_clone); | ||
456 | |||
457 | bio_endio(io->bio, io->bio->bi_size, io->error); | ||
458 | |||
459 | mempool_free(io, cc->io_pool); | ||
460 | } | ||
461 | |||
462 | /* | ||
463 | * kcryptd: | ||
464 | * | ||
465 | * Needed because it would be very unwise to do decryption in an | ||
466 | * interrupt context, so bios returning from read requests get | ||
467 | * queued here. | ||
468 | */ | ||
469 | static struct workqueue_struct *_kcryptd_workqueue; | ||
470 | |||
471 | static void kcryptd_do_work(void *data) | ||
472 | { | ||
473 | struct crypt_io *io = (struct crypt_io *) data; | ||
474 | struct crypt_config *cc = (struct crypt_config *) io->target->private; | ||
475 | struct convert_context ctx; | ||
476 | int r; | ||
477 | |||
478 | crypt_convert_init(cc, &ctx, io->bio, io->bio, | ||
479 | io->bio->bi_sector - io->target->begin, 0); | ||
480 | r = crypt_convert(cc, &ctx); | ||
481 | |||
482 | dec_pending(io, r); | ||
483 | } | ||
484 | |||
485 | static void kcryptd_queue_io(struct crypt_io *io) | ||
486 | { | ||
487 | INIT_WORK(&io->work, kcryptd_do_work, io); | ||
488 | queue_work(_kcryptd_workqueue, &io->work); | ||
489 | } | ||
490 | |||
491 | /* | ||
492 | * Decode key from its hex representation | ||
493 | */ | ||
494 | static int crypt_decode_key(u8 *key, char *hex, unsigned int size) | ||
495 | { | ||
496 | char buffer[3]; | ||
497 | char *endp; | ||
498 | unsigned int i; | ||
499 | |||
500 | buffer[2] = '\0'; | ||
501 | |||
502 | for(i = 0; i < size; i++) { | ||
503 | buffer[0] = *hex++; | ||
504 | buffer[1] = *hex++; | ||
505 | |||
506 | key[i] = (u8)simple_strtoul(buffer, &endp, 16); | ||
507 | |||
508 | if (endp != &buffer[2]) | ||
509 | return -EINVAL; | ||
510 | } | ||
511 | |||
512 | if (*hex != '\0') | ||
513 | return -EINVAL; | ||
514 | |||
515 | return 0; | ||
516 | } | ||
517 | |||
518 | /* | ||
519 | * Encode key into its hex representation | ||
520 | */ | ||
521 | static void crypt_encode_key(char *hex, u8 *key, unsigned int size) | ||
522 | { | ||
523 | unsigned int i; | ||
524 | |||
525 | for(i = 0; i < size; i++) { | ||
526 | sprintf(hex, "%02x", *key); | ||
527 | hex += 2; | ||
528 | key++; | ||
529 | } | ||
530 | } | ||
531 | |||
532 | /* | ||
533 | * Construct an encryption mapping: | ||
534 | * <cipher> <key> <iv_offset> <dev_path> <start> | ||
535 | */ | ||
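/*
 * Illustrative example (device path, sector counts and key below are
 * hypothetical): a table line such as
 *
 *   0 417792 crypt aes-cbc-essiv:sha256 <64 hex digits> 0 /dev/sda1 0
 *
 * reaches this constructor as
 *
 *   argv[] = { "aes-cbc-essiv:sha256", "<64 hex digits>", "0", "/dev/sda1", "0" }
 *
 * and is parsed as cipher "aes", chaining mode "cbc", IV mode "essiv"
 * with "sha256" as its option.
 */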
536 | static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | ||
537 | { | ||
538 | struct crypt_config *cc; | ||
539 | struct crypto_tfm *tfm; | ||
540 | char *tmp; | ||
541 | char *cipher; | ||
542 | char *chainmode; | ||
543 | char *ivmode; | ||
544 | char *ivopts; | ||
545 | unsigned int crypto_flags; | ||
546 | unsigned int key_size; | ||
547 | |||
548 | if (argc != 5) { | ||
549 | ti->error = PFX "Not enough arguments"; | ||
550 | return -EINVAL; | ||
551 | } | ||
552 | |||
553 | tmp = argv[0]; | ||
554 | cipher = strsep(&tmp, "-"); | ||
555 | chainmode = strsep(&tmp, "-"); | ||
556 | ivopts = strsep(&tmp, "-"); | ||
557 | ivmode = strsep(&ivopts, ":"); | ||
558 | |||
559 | if (tmp) | ||
560 | DMWARN(PFX "Unexpected additional cipher options"); | ||
561 | |||
562 | key_size = strlen(argv[1]) >> 1; | ||
563 | |||
564 | cc = kmalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); | ||
565 | if (cc == NULL) { | ||
566 | ti->error = | ||
567 | PFX "Cannot allocate transparent encryption context"; | ||
568 | return -ENOMEM; | ||
569 | } | ||
570 | |||
571 | cc->key_size = key_size; | ||
572 | if ((!key_size && strcmp(argv[1], "-") != 0) || | ||
573 | (key_size && crypt_decode_key(cc->key, argv[1], key_size) < 0)) { | ||
574 | ti->error = PFX "Error decoding key"; | ||
575 | goto bad1; | ||
576 | } | ||
577 | |||
578 | /* Compatibility mode for old dm-crypt cipher strings */ | ||
579 | if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) { | ||
580 | chainmode = "cbc"; | ||
581 | ivmode = "plain"; | ||
582 | } | ||
583 | |||
584 | /* Choose crypto_flags according to chainmode */ | ||
585 | if (strcmp(chainmode, "cbc") == 0) | ||
586 | crypto_flags = CRYPTO_TFM_MODE_CBC; | ||
587 | else if (strcmp(chainmode, "ecb") == 0) | ||
588 | crypto_flags = CRYPTO_TFM_MODE_ECB; | ||
589 | else { | ||
590 | ti->error = PFX "Unknown chaining mode"; | ||
591 | goto bad1; | ||
592 | } | ||
593 | |||
594 | if (crypto_flags != CRYPTO_TFM_MODE_ECB && !ivmode) { | ||
595 | ti->error = PFX "This chaining mode requires an IV mechanism"; | ||
596 | goto bad1; | ||
597 | } | ||
598 | |||
599 | tfm = crypto_alloc_tfm(cipher, crypto_flags); | ||
600 | if (!tfm) { | ||
601 | ti->error = PFX "Error allocating crypto tfm"; | ||
602 | goto bad1; | ||
603 | } | ||
604 | if (crypto_tfm_alg_type(tfm) != CRYPTO_ALG_TYPE_CIPHER) { | ||
605 | ti->error = PFX "Expected cipher algorithm"; | ||
606 | goto bad2; | ||
607 | } | ||
608 | |||
609 | cc->tfm = tfm; | ||
610 | |||
611 | /* | ||
612 | * Choose ivmode. Valid modes: "plain", "essiv:<esshash>". | ||
613 | * See the comments at the IV generation code above. | ||
614 | */ | ||
615 | |||
616 | if (ivmode == NULL) | ||
617 | cc->iv_gen_ops = NULL; | ||
618 | else if (strcmp(ivmode, "plain") == 0) | ||
619 | cc->iv_gen_ops = &crypt_iv_plain_ops; | ||
620 | else if (strcmp(ivmode, "essiv") == 0) | ||
621 | cc->iv_gen_ops = &crypt_iv_essiv_ops; | ||
622 | else { | ||
623 | ti->error = PFX "Invalid IV mode"; | ||
624 | goto bad2; | ||
625 | } | ||
626 | |||
627 | if (cc->iv_gen_ops && cc->iv_gen_ops->ctr && | ||
628 | cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) | ||
629 | goto bad2; | ||
630 | |||
631 | if (tfm->crt_cipher.cit_decrypt_iv && tfm->crt_cipher.cit_encrypt_iv) | ||
632 | /* at least a 64 bit sector number should fit in our buffer */ | ||
633 | cc->iv_size = max(crypto_tfm_alg_ivsize(tfm), | ||
634 | (unsigned int)(sizeof(u64) / sizeof(u8))); | ||
635 | else { | ||
636 | cc->iv_size = 0; | ||
637 | if (cc->iv_gen_ops) { | ||
638 | DMWARN(PFX "Selected cipher does not support IVs"); | ||
639 | if (cc->iv_gen_ops->dtr) | ||
640 | cc->iv_gen_ops->dtr(cc); | ||
641 | cc->iv_gen_ops = NULL; | ||
642 | } | ||
643 | } | ||
644 | |||
645 | cc->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab, | ||
646 | mempool_free_slab, _crypt_io_pool); | ||
647 | if (!cc->io_pool) { | ||
648 | ti->error = PFX "Cannot allocate crypt io mempool"; | ||
649 | goto bad3; | ||
650 | } | ||
651 | |||
652 | cc->page_pool = mempool_create(MIN_POOL_PAGES, mempool_alloc_page, | ||
653 | mempool_free_page, NULL); | ||
654 | if (!cc->page_pool) { | ||
655 | ti->error = PFX "Cannot allocate page mempool"; | ||
656 | goto bad4; | ||
657 | } | ||
658 | |||
659 | if (tfm->crt_cipher.cit_setkey(tfm, cc->key, key_size) < 0) { | ||
660 | ti->error = PFX "Error setting key"; | ||
661 | goto bad5; | ||
662 | } | ||
663 | |||
664 | if (sscanf(argv[2], SECTOR_FORMAT, &cc->iv_offset) != 1) { | ||
665 | ti->error = PFX "Invalid iv_offset sector"; | ||
666 | goto bad5; | ||
667 | } | ||
668 | |||
669 | if (sscanf(argv[4], SECTOR_FORMAT, &cc->start) != 1) { | ||
670 | ti->error = PFX "Invalid device sector"; | ||
671 | goto bad5; | ||
672 | } | ||
673 | |||
674 | if (dm_get_device(ti, argv[3], cc->start, ti->len, | ||
675 | dm_table_get_mode(ti->table), &cc->dev)) { | ||
676 | ti->error = PFX "Device lookup failed"; | ||
677 | goto bad5; | ||
678 | } | ||
679 | |||
680 | if (ivmode && cc->iv_gen_ops) { | ||
681 | if (ivopts) | ||
682 | *(ivopts - 1) = ':'; | ||
683 | cc->iv_mode = kmalloc(strlen(ivmode) + 1, GFP_KERNEL); | ||
684 | if (!cc->iv_mode) { | ||
685 | ti->error = PFX "Error kmallocing iv_mode string"; | ||
686 | goto bad5; | ||
687 | } | ||
688 | strcpy(cc->iv_mode, ivmode); | ||
689 | } else | ||
690 | cc->iv_mode = NULL; | ||
691 | |||
692 | ti->private = cc; | ||
693 | return 0; | ||
694 | |||
695 | bad5: | ||
696 | mempool_destroy(cc->page_pool); | ||
697 | bad4: | ||
698 | mempool_destroy(cc->io_pool); | ||
699 | bad3: | ||
700 | if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) | ||
701 | cc->iv_gen_ops->dtr(cc); | ||
702 | bad2: | ||
703 | crypto_free_tfm(tfm); | ||
704 | bad1: | ||
705 | kfree(cc); | ||
706 | return -EINVAL; | ||
707 | } | ||
708 | |||
709 | static void crypt_dtr(struct dm_target *ti) | ||
710 | { | ||
711 | struct crypt_config *cc = (struct crypt_config *) ti->private; | ||
712 | |||
713 | mempool_destroy(cc->page_pool); | ||
714 | mempool_destroy(cc->io_pool); | ||
715 | |||
716 | if (cc->iv_mode) | ||
717 | kfree(cc->iv_mode); | ||
718 | if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) | ||
719 | cc->iv_gen_ops->dtr(cc); | ||
720 | crypto_free_tfm(cc->tfm); | ||
721 | dm_put_device(ti, cc->dev); | ||
722 | kfree(cc); | ||
723 | } | ||
724 | |||
725 | static int crypt_endio(struct bio *bio, unsigned int done, int error) | ||
726 | { | ||
727 | struct crypt_io *io = (struct crypt_io *) bio->bi_private; | ||
728 | struct crypt_config *cc = (struct crypt_config *) io->target->private; | ||
729 | |||
730 | if (bio_data_dir(bio) == WRITE) { | ||
731 | /* | ||
732 | * free the processed pages, even if | ||
733 | * it's only a partially completed write | ||
734 | */ | ||
735 | crypt_free_buffer_pages(cc, bio, done); | ||
736 | } | ||
737 | |||
738 | if (bio->bi_size) | ||
739 | return 1; | ||
740 | |||
741 | bio_put(bio); | ||
742 | |||
743 | /* | ||
744 | * successful reads are decrypted by the worker thread | ||
745 | */ | ||
746 | if ((bio_data_dir(bio) == READ) | ||
747 | && bio_flagged(bio, BIO_UPTODATE)) { | ||
748 | kcryptd_queue_io(io); | ||
749 | return 0; | ||
750 | } | ||
751 | |||
752 | dec_pending(io, error); | ||
753 | return error; | ||
754 | } | ||
755 | |||
756 | static inline struct bio * | ||
757 | crypt_clone(struct crypt_config *cc, struct crypt_io *io, struct bio *bio, | ||
758 | sector_t sector, unsigned int *bvec_idx, | ||
759 | struct convert_context *ctx) | ||
760 | { | ||
761 | struct bio *clone; | ||
762 | |||
763 | if (bio_data_dir(bio) == WRITE) { | ||
764 | clone = crypt_alloc_buffer(cc, bio->bi_size, | ||
765 | io->first_clone, bvec_idx); | ||
766 | if (clone) { | ||
767 | ctx->bio_out = clone; | ||
768 | if (crypt_convert(cc, ctx) < 0) { | ||
769 | crypt_free_buffer_pages(cc, clone, | ||
770 | clone->bi_size); | ||
771 | bio_put(clone); | ||
772 | return NULL; | ||
773 | } | ||
774 | } | ||
775 | } else { | ||
776 | /* | ||
777 | * The block layer might modify the bvec array, so always | ||
778 | * copy the required bvecs because we need the original | ||
779 | * one in order to decrypt the whole bio data *afterwards*. | ||
780 | */ | ||
781 | clone = bio_alloc(GFP_NOIO, bio_segments(bio)); | ||
782 | if (clone) { | ||
783 | clone->bi_idx = 0; | ||
784 | clone->bi_vcnt = bio_segments(bio); | ||
785 | clone->bi_size = bio->bi_size; | ||
786 | memcpy(clone->bi_io_vec, bio_iovec(bio), | ||
787 | sizeof(struct bio_vec) * clone->bi_vcnt); | ||
788 | } | ||
789 | } | ||
790 | |||
791 | if (!clone) | ||
792 | return NULL; | ||
793 | |||
794 | clone->bi_private = io; | ||
795 | clone->bi_end_io = crypt_endio; | ||
796 | clone->bi_bdev = cc->dev->bdev; | ||
797 | clone->bi_sector = cc->start + sector; | ||
798 | clone->bi_rw = bio->bi_rw; | ||
799 | |||
800 | return clone; | ||
801 | } | ||
802 | |||
803 | static int crypt_map(struct dm_target *ti, struct bio *bio, | ||
804 | union map_info *map_context) | ||
805 | { | ||
806 | struct crypt_config *cc = (struct crypt_config *) ti->private; | ||
807 | struct crypt_io *io = mempool_alloc(cc->io_pool, GFP_NOIO); | ||
808 | struct convert_context ctx; | ||
809 | struct bio *clone; | ||
810 | unsigned int remaining = bio->bi_size; | ||
811 | sector_t sector = bio->bi_sector - ti->begin; | ||
812 | unsigned int bvec_idx = 0; | ||
813 | |||
814 | io->target = ti; | ||
815 | io->bio = bio; | ||
816 | io->first_clone = NULL; | ||
817 | io->error = 0; | ||
818 | atomic_set(&io->pending, 1); /* hold a reference */ | ||
819 | |||
820 | if (bio_data_dir(bio) == WRITE) | ||
821 | crypt_convert_init(cc, &ctx, NULL, bio, sector, 1); | ||
822 | |||
823 | /* | ||
824 | * The allocated buffers can be smaller than the whole bio, | ||
825 | * so repeat the whole process until all the data can be handled. | ||
826 | */ | ||
827 | while (remaining) { | ||
828 | clone = crypt_clone(cc, io, bio, sector, &bvec_idx, &ctx); | ||
829 | if (!clone) | ||
830 | goto cleanup; | ||
831 | |||
832 | if (!io->first_clone) { | ||
833 | /* | ||
834 | * hold a reference to the first clone, because it | ||
835 | * holds the bio_vec array and that can't be freed | ||
836 | * before all other clones are released | ||
837 | */ | ||
838 | bio_get(clone); | ||
839 | io->first_clone = clone; | ||
840 | } | ||
841 | atomic_inc(&io->pending); | ||
842 | |||
843 | remaining -= clone->bi_size; | ||
844 | sector += bio_sectors(clone); | ||
845 | |||
846 | generic_make_request(clone); | ||
847 | |||
848 | /* out of memory -> run queues */ | ||
849 | if (remaining) | ||
850 | blk_congestion_wait(bio_data_dir(clone), HZ/100); | ||
851 | } | ||
852 | |||
853 | /* drop reference, clones could have returned before we reach this */ | ||
854 | dec_pending(io, 0); | ||
855 | return 0; | ||
856 | |||
857 | cleanup: | ||
858 | if (io->first_clone) { | ||
859 | dec_pending(io, -ENOMEM); | ||
860 | return 0; | ||
861 | } | ||
862 | |||
863 | /* if no bio has been dispatched yet, we can directly return the error */ | ||
864 | mempool_free(io, cc->io_pool); | ||
865 | return -ENOMEM; | ||
866 | } | ||
867 | |||
868 | static int crypt_status(struct dm_target *ti, status_type_t type, | ||
869 | char *result, unsigned int maxlen) | ||
870 | { | ||
871 | struct crypt_config *cc = (struct crypt_config *) ti->private; | ||
872 | const char *cipher; | ||
873 | const char *chainmode = NULL; | ||
874 | unsigned int sz = 0; | ||
875 | |||
876 | switch (type) { | ||
877 | case STATUSTYPE_INFO: | ||
878 | result[0] = '\0'; | ||
879 | break; | ||
880 | |||
881 | case STATUSTYPE_TABLE: | ||
882 | cipher = crypto_tfm_alg_name(cc->tfm); | ||
883 | |||
884 | switch(cc->tfm->crt_cipher.cit_mode) { | ||
885 | case CRYPTO_TFM_MODE_CBC: | ||
886 | chainmode = "cbc"; | ||
887 | break; | ||
888 | case CRYPTO_TFM_MODE_ECB: | ||
889 | chainmode = "ecb"; | ||
890 | break; | ||
891 | default: | ||
892 | BUG(); | ||
893 | } | ||
894 | |||
895 | if (cc->iv_mode) | ||
896 | DMEMIT("%s-%s-%s ", cipher, chainmode, cc->iv_mode); | ||
897 | else | ||
898 | DMEMIT("%s-%s ", cipher, chainmode); | ||
899 | |||
900 | if (cc->key_size > 0) { | ||
901 | if ((maxlen - sz) < ((cc->key_size << 1) + 1)) | ||
902 | return -ENOMEM; | ||
903 | |||
904 | crypt_encode_key(result + sz, cc->key, cc->key_size); | ||
905 | sz += cc->key_size << 1; | ||
906 | } else { | ||
907 | if (sz >= maxlen) | ||
908 | return -ENOMEM; | ||
909 | result[sz++] = '-'; | ||
910 | } | ||
911 | |||
912 | DMEMIT(" " SECTOR_FORMAT " %s " SECTOR_FORMAT, | ||
913 | cc->iv_offset, cc->dev->name, cc->start); | ||
914 | break; | ||
915 | } | ||
916 | return 0; | ||
917 | } | ||
918 | |||
919 | static struct target_type crypt_target = { | ||
920 | .name = "crypt", | ||
921 | .version= {1, 1, 0}, | ||
922 | .module = THIS_MODULE, | ||
923 | .ctr = crypt_ctr, | ||
924 | .dtr = crypt_dtr, | ||
925 | .map = crypt_map, | ||
926 | .status = crypt_status, | ||
927 | }; | ||
928 | |||
929 | static int __init dm_crypt_init(void) | ||
930 | { | ||
931 | int r; | ||
932 | |||
933 | _crypt_io_pool = kmem_cache_create("dm-crypt_io", | ||
934 | sizeof(struct crypt_io), | ||
935 | 0, 0, NULL, NULL); | ||
936 | if (!_crypt_io_pool) | ||
937 | return -ENOMEM; | ||
938 | |||
939 | _kcryptd_workqueue = create_workqueue("kcryptd"); | ||
940 | if (!_kcryptd_workqueue) { | ||
941 | r = -ENOMEM; | ||
942 | DMERR(PFX "couldn't create kcryptd"); | ||
943 | goto bad1; | ||
944 | } | ||
945 | |||
946 | r = dm_register_target(&crypt_target); | ||
947 | if (r < 0) { | ||
948 | DMERR(PFX "register failed %d", r); | ||
949 | goto bad2; | ||
950 | } | ||
951 | |||
952 | return 0; | ||
953 | |||
954 | bad2: | ||
955 | destroy_workqueue(_kcryptd_workqueue); | ||
956 | bad1: | ||
957 | kmem_cache_destroy(_crypt_io_pool); | ||
958 | return r; | ||
959 | } | ||
960 | |||
961 | static void __exit dm_crypt_exit(void) | ||
962 | { | ||
963 | int r = dm_unregister_target(&crypt_target); | ||
964 | |||
965 | if (r < 0) | ||
966 | DMERR(PFX "unregister failed %d", r); | ||
967 | |||
968 | destroy_workqueue(_kcryptd_workqueue); | ||
969 | kmem_cache_destroy(_crypt_io_pool); | ||
970 | } | ||
971 | |||
972 | module_init(dm_crypt_init); | ||
973 | module_exit(dm_crypt_exit); | ||
974 | |||
975 | MODULE_AUTHOR("Christophe Saout <christophe@saout.de>"); | ||
976 | MODULE_DESCRIPTION(DM_NAME " target for transparent encryption / decryption"); | ||
977 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm-emc.c b/drivers/md/dm-emc.c
new file mode 100644
index 000000000000..700658664594
--- /dev/null
+++ b/drivers/md/dm-emc.c
@@ -0,0 +1,359 @@
1 | /* | ||
2 | * Copyright (C) 2004 SUSE LINUX Products GmbH. All rights reserved. | ||
3 | * Copyright (C) 2004 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * This file is released under the GPL. | ||
6 | * | ||
7 | * Multipath support for EMC CLARiiON AX/CX-series hardware. | ||
8 | */ | ||
9 | |||
10 | #include "dm.h" | ||
11 | #include "dm-hw-handler.h" | ||
12 | #include <scsi/scsi.h> | ||
13 | #include <scsi/scsi_cmnd.h> | ||
14 | |||
15 | struct emc_handler { | ||
16 | spinlock_t lock; | ||
17 | |||
18 | /* Whether we should send the short trespass command (FC-series) | ||
19 | * or the long version (default for AX/CX CLARiiON arrays). */ | ||
20 | unsigned short_trespass; | ||
21 | /* Whether or not to honor SCSI reservations when initiating a | ||
22 | * switch-over. Default: Don't. */ | ||
23 | unsigned hr; | ||
24 | |||
25 | unsigned char sense[SCSI_SENSE_BUFFERSIZE]; | ||
26 | }; | ||
27 | |||
28 | #define TRESPASS_PAGE 0x22 | ||
29 | #define EMC_FAILOVER_TIMEOUT (60 * HZ) | ||
30 | |||
31 | /* Code borrowed from dm-lsi-rdac by Mike Christie */ | ||
32 | |||
33 | static inline void free_bio(struct bio *bio) | ||
34 | { | ||
35 | __free_page(bio->bi_io_vec[0].bv_page); | ||
36 | bio_put(bio); | ||
37 | } | ||
38 | |||
39 | static int emc_endio(struct bio *bio, unsigned int bytes_done, int error) | ||
40 | { | ||
41 | struct path *path = bio->bi_private; | ||
42 | |||
43 | if (bio->bi_size) | ||
44 | return 1; | ||
45 | |||
46 | /* We also need to look at the sense keys here to decide whether | ||
47 | * or not to switch to the next PG etc. | ||
48 | * | ||
49 | * For now simple logic: either it works or it doesn't. | ||
50 | */ | ||
51 | if (error) | ||
52 | dm_pg_init_complete(path, MP_FAIL_PATH); | ||
53 | else | ||
54 | dm_pg_init_complete(path, 0); | ||
55 | |||
56 | /* request is freed in block layer */ | ||
57 | free_bio(bio); | ||
58 | |||
59 | return 0; | ||
60 | } | ||
61 | |||
62 | static struct bio *get_failover_bio(struct path *path, unsigned data_size) | ||
63 | { | ||
64 | struct bio *bio; | ||
65 | struct page *page; | ||
66 | |||
67 | bio = bio_alloc(GFP_ATOMIC, 1); | ||
68 | if (!bio) { | ||
69 | DMERR("dm-emc: get_failover_bio: bio_alloc() failed."); | ||
70 | return NULL; | ||
71 | } | ||
72 | |||
73 | bio->bi_rw |= (1 << BIO_RW); | ||
74 | bio->bi_bdev = path->dev->bdev; | ||
75 | bio->bi_sector = 0; | ||
76 | bio->bi_private = path; | ||
77 | bio->bi_end_io = emc_endio; | ||
78 | |||
79 | page = alloc_page(GFP_ATOMIC); | ||
80 | if (!page) { | ||
81 | DMERR("dm-emc: get_failover_bio: alloc_page() failed."); | ||
82 | bio_put(bio); | ||
83 | return NULL; | ||
84 | } | ||
85 | |||
86 | if (bio_add_page(bio, page, data_size, 0) != data_size) { | ||
87 | DMERR("dm-emc: get_failover_bio: bio_add_page() failed."); | ||
88 | __free_page(page); | ||
89 | bio_put(bio); | ||
90 | return NULL; | ||
91 | } | ||
92 | |||
93 | return bio; | ||
94 | } | ||
95 | |||
96 | static struct request *get_failover_req(struct emc_handler *h, | ||
97 | struct bio *bio, struct path *path) | ||
98 | { | ||
99 | struct request *rq; | ||
100 | struct block_device *bdev = bio->bi_bdev; | ||
101 | struct request_queue *q = bdev_get_queue(bdev); | ||
102 | |||
103 | /* FIXME: Figure out why it fails with GFP_ATOMIC. */ | ||
104 | rq = blk_get_request(q, WRITE, __GFP_WAIT); | ||
105 | if (!rq) { | ||
106 | DMERR("dm-emc: get_failover_req: blk_get_request failed"); | ||
107 | return NULL; | ||
108 | } | ||
109 | |||
110 | rq->bio = rq->biotail = bio; | ||
111 | blk_rq_bio_prep(q, rq, bio); | ||
112 | |||
113 | rq->rq_disk = bdev->bd_contains->bd_disk; | ||
114 | |||
115 | /* bio-backed request, don't set data */ | ||
116 | rq->buffer = rq->data = NULL; | ||
117 | /* rq data_len used for pc cmd's request_bufflen */ | ||
118 | rq->data_len = bio->bi_size; | ||
119 | |||
120 | rq->sense = h->sense; | ||
121 | memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE); | ||
122 | rq->sense_len = 0; | ||
123 | |||
124 | memset(&rq->cmd, 0, BLK_MAX_CDB); | ||
125 | |||
126 | rq->timeout = EMC_FAILOVER_TIMEOUT; | ||
127 | rq->flags |= (REQ_BLOCK_PC | REQ_FAILFAST | REQ_NOMERGE); | ||
128 | |||
129 | return rq; | ||
130 | } | ||
131 | |||
132 | static struct request *emc_trespass_get(struct emc_handler *h, | ||
133 | struct path *path) | ||
134 | { | ||
135 | struct bio *bio; | ||
136 | struct request *rq; | ||
137 | unsigned char *page22; | ||
138 | unsigned char long_trespass_pg[] = { | ||
139 | 0, 0, 0, 0, | ||
140 | TRESPASS_PAGE, /* Page code */ | ||
141 | 0x09, /* Page length - 2 */ | ||
142 | h->hr ? 0x01 : 0x81, /* Trespass code + Honor reservation bit */ | ||
143 | 0xff, 0xff, /* Trespass target */ | ||
144 | 0, 0, 0, 0, 0, 0 /* Reserved bytes / unknown */ | ||
145 | }; | ||
146 | unsigned char short_trespass_pg[] = { | ||
147 | 0, 0, 0, 0, | ||
148 | TRESPASS_PAGE, /* Page code */ | ||
149 | 0x02, /* Page length - 2 */ | ||
150 | h->hr ? 0x01 : 0x81, /* Trespass code + Honor reservation bit */ | ||
151 | 0xff, /* Trespass target */ | ||
152 | }; | ||
153 | unsigned data_size = h->short_trespass ? sizeof(short_trespass_pg) : | ||
154 | sizeof(long_trespass_pg); | ||
155 | |||
156 | /* get bio backing */ | ||
157 | if (data_size > PAGE_SIZE) | ||
158 | /* this should never happen */ | ||
159 | return NULL; | ||
160 | |||
161 | bio = get_failover_bio(path, data_size); | ||
162 | if (!bio) { | ||
163 | DMERR("dm-emc: emc_trespass_get: no bio"); | ||
164 | return NULL; | ||
165 | } | ||
166 | |||
167 | page22 = (unsigned char *)bio_data(bio); | ||
168 | memset(page22, 0, data_size); | ||
169 | |||
170 | memcpy(page22, h->short_trespass ? | ||
171 | short_trespass_pg : long_trespass_pg, data_size); | ||
172 | |||
173 | /* get request for block layer packet command */ | ||
174 | rq = get_failover_req(h, bio, path); | ||
175 | if (!rq) { | ||
176 | DMERR("dm-emc: emc_trespass_get: no rq"); | ||
177 | free_bio(bio); | ||
178 | return NULL; | ||
179 | } | ||
180 | |||
181 | /* Prepare the command. */ | ||
182 | rq->cmd[0] = MODE_SELECT; | ||
183 | rq->cmd[1] = 0x10; | ||
184 | rq->cmd[4] = data_size; | ||
185 | rq->cmd_len = COMMAND_SIZE(rq->cmd[0]); | ||
186 | |||
187 | return rq; | ||
188 | } | ||
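#if 0
/*
 * Illustrative sketch only, not part of the driver: with the short
 * trespass page above, data_size is 8 bytes (a 4-byte mode parameter
 * header plus a 4-byte page), so the CDB prepared above works out to
 * roughly the array below: opcode 0x15 with the PF bit set in byte 1
 * and the parameter list length in byte 4.  The byte meanings are
 * assumptions based on the usual SCSI MODE SELECT(6) layout rather
 * than anything this file defines.
 */
static const unsigned char example_short_trespass_cdb[6] = {
	MODE_SELECT, 0x10, 0, 0, 8, 0
};
#endif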
189 | |||
190 | static void emc_pg_init(struct hw_handler *hwh, unsigned bypassed, | ||
191 | struct path *path) | ||
192 | { | ||
193 | struct request *rq; | ||
194 | struct request_queue *q = bdev_get_queue(path->dev->bdev); | ||
195 | |||
196 | /* | ||
197 | * We can either blindly init the pg (then look at the sense), | ||
198 | * or we can send some commands to get the state here (then | ||
199 | * possibly send the failover command), or we can also have the | ||
200 | * initial state passed into us and then get an update here. | ||
201 | */ | ||
202 | if (!q) { | ||
203 | DMINFO("dm-emc: emc_pg_init: no queue"); | ||
204 | goto fail_path; | ||
205 | } | ||
206 | |||
207 | /* FIXME: The request should be pre-allocated. */ | ||
208 | rq = emc_trespass_get(hwh->context, path); | ||
209 | if (!rq) { | ||
210 | DMERR("dm-emc: emc_pg_init: no rq"); | ||
211 | goto fail_path; | ||
212 | } | ||
213 | |||
214 | DMINFO("dm-emc: emc_pg_init: sending switch-over command"); | ||
215 | elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 1); | ||
216 | return; | ||
217 | |||
218 | fail_path: | ||
219 | dm_pg_init_complete(path, MP_FAIL_PATH); | ||
220 | } | ||
221 | |||
222 | static struct emc_handler *alloc_emc_handler(void) | ||
223 | { | ||
224 | struct emc_handler *h = kmalloc(sizeof(*h), GFP_KERNEL); | ||
225 | |||
226 | if (h) | ||
227 | spin_lock_init(&h->lock); | ||
228 | |||
229 | return h; | ||
230 | } | ||
231 | |||
232 | static int emc_create(struct hw_handler *hwh, unsigned argc, char **argv) | ||
233 | { | ||
234 | struct emc_handler *h; | ||
235 | unsigned hr, short_trespass; | ||
236 | |||
237 | if (argc == 0) { | ||
238 | /* No arguments: use defaults */ | ||
239 | hr = 0; | ||
240 | short_trespass = 0; | ||
241 | } else if (argc != 2) { | ||
242 | DMWARN("dm-emc hwhandler: incorrect number of arguments"); | ||
243 | return -EINVAL; | ||
244 | } else { | ||
245 | if ((sscanf(argv[0], "%u", &short_trespass) != 1) | ||
246 | || (short_trespass > 1)) { | ||
247 | DMWARN("dm-emc: invalid trespass mode selected"); | ||
248 | return -EINVAL; | ||
249 | } | ||
250 | |||
251 | if ((sscanf(argv[1], "%u", &hr) != 1) | ||
252 | || (hr > 1)) { | ||
253 | DMWARN("dm-emc: invalid honor reservation flag selected"); | ||
254 | return -EINVAL; | ||
255 | } | ||
256 | } | ||
257 | |||
258 | h = alloc_emc_handler(); | ||
259 | if (!h) | ||
260 | return -ENOMEM; | ||
261 | |||
262 | memset(h, 0, sizeof(*h)); | ||
263 | |||
264 | hwh->context = h; | ||
265 | |||
266 | if ((h->short_trespass = short_trespass)) | ||
267 | DMWARN("dm-emc: short trespass command will be send"); | ||
268 | else | ||
269 | DMWARN("dm-emc: long trespass command will be send"); | ||
270 | |||
271 | if ((h->hr = hr)) | ||
272 | DMWARN("dm-emc: honor reservation bit will be set"); | ||
273 | else | ||
274 | DMWARN("dm-emc: honor reservation bit will not be set (default)"); | ||
275 | |||
276 | return 0; | ||
277 | } | ||
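#if 0
/*
 * Hedged usage sketch (not compiled): how the constructor above might
 * be driven for "short trespass, honor reservations".  The hw_handler
 * instance and the argv strings here are hypothetical; in practice
 * they come from the multipath target's table line.
 */
static int example_emc_create(struct hw_handler *hwh)
{
	char *argv[] = { "1", "1" };	/* short_trespass = 1, hr = 1 */

	return emc_create(hwh, 2, argv);
}
#endif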
278 | |||
279 | static void emc_destroy(struct hw_handler *hwh) | ||
280 | { | ||
281 | struct emc_handler *h = (struct emc_handler *) hwh->context; | ||
282 | |||
283 | kfree(h); | ||
284 | hwh->context = NULL; | ||
285 | } | ||
286 | |||
287 | static unsigned emc_error(struct hw_handler *hwh, struct bio *bio) | ||
288 | { | ||
289 | /* FIXME: Patch from axboe still missing */ | ||
290 | #if 0 | ||
291 | int sense; | ||
292 | |||
293 | if (bio->bi_error & BIO_SENSE) { | ||
294 | sense = bio->bi_error & 0xffffff; /* sense key / asc / ascq */ | ||
295 | |||
296 | if (sense == 0x020403) { | ||
297 | /* LUN Not Ready - Manual Intervention Required | ||
298 | * indicates this is a passive path. | ||
299 | * | ||
300 | * FIXME: However, if this is seen and EVPD C0 | ||
301 | * indicates that this is due to a NDU in | ||
302 | * progress, we should set FAIL_PATH too. | ||
303 | * This indicates we might have to do a SCSI | ||
304 | * inquiry in the end_io path. Ugh. */ | ||
305 | return MP_BYPASS_PG | MP_RETRY_IO; | ||
306 | } else if (sense == 0x052501) { | ||
307 | /* An array based copy is in progress. Do not | ||
308 | * fail the path, do not bypass to another PG, | ||
309 | * do not retry. Fail the IO immediately. | ||
310 | * (Actually this is the same conclusion as in | ||
311 | * the default handler, but lets make sure.) */ | ||
312 | return 0; | ||
313 | } else if (sense == 0x062900) { | ||
314 | /* Unit Attention Code. This is the first IO | ||
315 | * to the new path, so just retry. */ | ||
316 | return MP_RETRY_IO; | ||
317 | } | ||
318 | } | ||
319 | #endif | ||
320 | |||
321 | /* Try default handler */ | ||
322 | return dm_scsi_err_handler(hwh, bio); | ||
323 | } | ||
324 | |||
325 | static struct hw_handler_type emc_hwh = { | ||
326 | .name = "emc", | ||
327 | .module = THIS_MODULE, | ||
328 | .create = emc_create, | ||
329 | .destroy = emc_destroy, | ||
330 | .pg_init = emc_pg_init, | ||
331 | .error = emc_error, | ||
332 | }; | ||
333 | |||
334 | static int __init dm_emc_init(void) | ||
335 | { | ||
336 | int r = dm_register_hw_handler(&emc_hwh); | ||
337 | |||
338 | if (r < 0) | ||
339 | DMERR("emc: register failed %d", r); | ||
340 | |||
341 | DMINFO("dm-emc version 0.0.3 loaded"); | ||
342 | |||
343 | return r; | ||
344 | } | ||
345 | |||
346 | static void __exit dm_emc_exit(void) | ||
347 | { | ||
348 | int r = dm_unregister_hw_handler(&emc_hwh); | ||
349 | |||
350 | if (r < 0) | ||
351 | DMERR("emc: unregister failed %d", r); | ||
352 | } | ||
353 | |||
354 | module_init(dm_emc_init); | ||
355 | module_exit(dm_emc_exit); | ||
356 | |||
357 | MODULE_DESCRIPTION(DM_NAME " EMC CX/AX/FC-family multipath"); | ||
358 | MODULE_AUTHOR("Lars Marowsky-Bree <lmb@suse.de>"); | ||
359 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c new file mode 100644 index 000000000000..17212b4201a1 --- /dev/null +++ b/drivers/md/dm-exception-store.c | |||
@@ -0,0 +1,648 @@ | |||
1 | /* | ||
2 | * dm-exception-store.c | ||
3 | * | ||
4 | * Copyright (C) 2001-2002 Sistina Software (UK) Limited. | ||
5 | * | ||
6 | * This file is released under the GPL. | ||
7 | */ | ||
8 | |||
9 | #include "dm.h" | ||
10 | #include "dm-snap.h" | ||
11 | #include "dm-io.h" | ||
12 | #include "kcopyd.h" | ||
13 | |||
14 | #include <linux/mm.h> | ||
15 | #include <linux/pagemap.h> | ||
16 | #include <linux/vmalloc.h> | ||
17 | #include <linux/slab.h> | ||
18 | |||
19 | /*----------------------------------------------------------------- | ||
20 | * Persistent snapshots: by persistent we mean that the snapshot | ||
21 | * will survive a reboot. | ||
22 | *---------------------------------------------------------------*/ | ||
23 | |||
24 | /* | ||
25 | * We need to store a record of which parts of the origin have | ||
26 | * been copied to the snapshot device. The snapshot code | ||
27 | * requires that we copy exception chunks to chunk aligned areas | ||
28 | * of the COW store. It makes sense, therefore, to store the | ||
29 | * metadata in chunk size blocks. | ||
30 | * | ||
31 | * There is no backward or forward compatibility implemented, | ||
32 | * snapshots with different disk versions than the kernel will | ||
33 | * not be usable. It is expected that "lvcreate" will blank out | ||
34 | * the start of a fresh COW device before calling the snapshot | ||
35 | * constructor. | ||
36 | * | ||
37 | * The first chunk of the COW device just contains the header. | ||
38 | * After this there is a chunk filled with exception metadata, | ||
39 | * followed by as many exception chunks as can fit in the | ||
40 | * metadata areas. | ||
41 | * | ||
42 | * All on disk structures are in little-endian format. The end | ||
43 | * of the exceptions info is indicated by an exception with a | ||
44 | * new_chunk of 0, which is invalid since it would point to the | ||
45 | * header chunk. | ||
46 | */ | ||
47 | |||
48 | /* | ||
49 | * Magic for persistent snapshots: "SnAp" - Feeble isn't it. | ||
50 | */ | ||
51 | #define SNAP_MAGIC 0x70416e53 | ||
52 | |||
53 | /* | ||
54 | * The on-disk version of the metadata. | ||
55 | */ | ||
56 | #define SNAPSHOT_DISK_VERSION 1 | ||
57 | |||
58 | struct disk_header { | ||
59 | uint32_t magic; | ||
60 | |||
61 | /* | ||
62 | * Is this snapshot valid? There is no way of recovering | ||
63 | * an invalid snapshot. | ||
64 | */ | ||
65 | uint32_t valid; | ||
66 | |||
67 | /* | ||
68 | * Simple, incrementing version. No backward | ||
69 | * compatibility. | ||
70 | */ | ||
71 | uint32_t version; | ||
72 | |||
73 | /* In sectors */ | ||
74 | uint32_t chunk_size; | ||
75 | }; | ||
76 | |||
77 | struct disk_exception { | ||
78 | uint64_t old_chunk; | ||
79 | uint64_t new_chunk; | ||
80 | }; | ||
81 | |||
82 | struct commit_callback { | ||
83 | void (*callback)(void *, int success); | ||
84 | void *context; | ||
85 | }; | ||
86 | |||
87 | /* | ||
88 | * The top level structure for a persistent exception store. | ||
89 | */ | ||
90 | struct pstore { | ||
91 | struct dm_snapshot *snap; /* up pointer to my snapshot */ | ||
92 | int version; | ||
93 | int valid; | ||
94 | uint32_t chunk_size; | ||
95 | uint32_t exceptions_per_area; | ||
96 | |||
97 | /* | ||
98 | * Now that we have an asynchronous kcopyd there is no | ||
99 | * need for large chunk sizes, so it won't hurt to have a | ||
100 | * whole chunk's worth of metadata in memory at once. | ||
101 | */ | ||
102 | void *area; | ||
103 | |||
104 | /* | ||
105 | * Used to keep track of which metadata area the data in | ||
106 | * 'chunk' refers to. | ||
107 | */ | ||
108 | uint32_t current_area; | ||
109 | |||
110 | /* | ||
111 | * The next free chunk for an exception. | ||
112 | */ | ||
113 | uint32_t next_free; | ||
114 | |||
115 | /* | ||
116 | * The index of next free exception in the current | ||
117 | * metadata area. | ||
118 | */ | ||
119 | uint32_t current_committed; | ||
120 | |||
121 | atomic_t pending_count; | ||
122 | uint32_t callback_count; | ||
123 | struct commit_callback *callbacks; | ||
124 | }; | ||
125 | |||
126 | static inline unsigned int sectors_to_pages(unsigned int sectors) | ||
127 | { | ||
128 | return sectors / (PAGE_SIZE >> 9); | ||
129 | } | ||
130 | |||
131 | static int alloc_area(struct pstore *ps) | ||
132 | { | ||
133 | int r = -ENOMEM; | ||
134 | size_t len; | ||
135 | |||
136 | len = ps->chunk_size << SECTOR_SHIFT; | ||
137 | |||
138 | /* | ||
139 | * Allocate the chunk_size block of memory that will hold | ||
140 | * a single metadata area. | ||
141 | */ | ||
142 | ps->area = vmalloc(len); | ||
143 | if (!ps->area) | ||
144 | return r; | ||
145 | |||
146 | return 0; | ||
147 | } | ||
148 | |||
149 | static void free_area(struct pstore *ps) | ||
150 | { | ||
151 | vfree(ps->area); | ||
152 | } | ||
153 | |||
154 | /* | ||
155 | * Read or write a chunk aligned and sized block of data from a device. | ||
156 | */ | ||
157 | static int chunk_io(struct pstore *ps, uint32_t chunk, int rw) | ||
158 | { | ||
159 | struct io_region where; | ||
160 | unsigned long bits; | ||
161 | |||
162 | where.bdev = ps->snap->cow->bdev; | ||
163 | where.sector = ps->chunk_size * chunk; | ||
164 | where.count = ps->chunk_size; | ||
165 | |||
166 | return dm_io_sync_vm(1, &where, rw, ps->area, &bits); | ||
167 | } | ||
168 | |||
169 | /* | ||
170 | * Read or write a metadata area, remembering to skip the first | ||
171 | * chunk, which holds the header. | ||
172 | */ | ||
173 | static int area_io(struct pstore *ps, uint32_t area, int rw) | ||
174 | { | ||
175 | int r; | ||
176 | uint32_t chunk; | ||
177 | |||
178 | /* convert a metadata area index to a chunk index */ | ||
179 | chunk = 1 + ((ps->exceptions_per_area + 1) * area); | ||
180 | |||
181 | r = chunk_io(ps, chunk, rw); | ||
182 | if (r) | ||
183 | return r; | ||
184 | |||
185 | ps->current_area = area; | ||
186 | return 0; | ||
187 | } | ||
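#if 0
/*
 * Illustrative arithmetic only (values assumed, not read from a real
 * device): with 8KiB chunks (chunk_size = 16 sectors) and 16-byte
 * disk_exceptions, one area holds 8192 / 16 = 512 exceptions, so the
 * metadata areas live at chunks 1, 514, 1027, ... and everything in
 * between is exception data.
 */
static uint32_t example_area_to_chunk(struct pstore *ps, uint32_t area)
{
	/* same formula as area_io() above */
	return 1 + ((ps->exceptions_per_area + 1) * area);
}
#endif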
188 | |||
189 | static int zero_area(struct pstore *ps, uint32_t area) | ||
190 | { | ||
191 | memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); | ||
192 | return area_io(ps, area, WRITE); | ||
193 | } | ||
194 | |||
195 | static int read_header(struct pstore *ps, int *new_snapshot) | ||
196 | { | ||
197 | int r; | ||
198 | struct disk_header *dh; | ||
199 | |||
200 | r = chunk_io(ps, 0, READ); | ||
201 | if (r) | ||
202 | return r; | ||
203 | |||
204 | dh = (struct disk_header *) ps->area; | ||
205 | |||
206 | if (le32_to_cpu(dh->magic) == 0) { | ||
207 | *new_snapshot = 1; | ||
208 | |||
209 | } else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) { | ||
210 | *new_snapshot = 0; | ||
211 | ps->valid = le32_to_cpu(dh->valid); | ||
212 | ps->version = le32_to_cpu(dh->version); | ||
213 | ps->chunk_size = le32_to_cpu(dh->chunk_size); | ||
214 | |||
215 | } else { | ||
216 | DMWARN("Invalid/corrupt snapshot"); | ||
217 | r = -ENXIO; | ||
218 | } | ||
219 | |||
220 | return r; | ||
221 | } | ||
222 | |||
223 | static int write_header(struct pstore *ps) | ||
224 | { | ||
225 | struct disk_header *dh; | ||
226 | |||
227 | memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); | ||
228 | |||
229 | dh = (struct disk_header *) ps->area; | ||
230 | dh->magic = cpu_to_le32(SNAP_MAGIC); | ||
231 | dh->valid = cpu_to_le32(ps->valid); | ||
232 | dh->version = cpu_to_le32(ps->version); | ||
233 | dh->chunk_size = cpu_to_le32(ps->chunk_size); | ||
234 | |||
235 | return chunk_io(ps, 0, WRITE); | ||
236 | } | ||
237 | |||
238 | /* | ||
239 | * Access functions for the disk exceptions, these do the endian conversions. | ||
240 | */ | ||
241 | static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) | ||
242 | { | ||
243 | if (index >= ps->exceptions_per_area) | ||
244 | return NULL; | ||
245 | |||
246 | return ((struct disk_exception *) ps->area) + index; | ||
247 | } | ||
248 | |||
249 | static int read_exception(struct pstore *ps, | ||
250 | uint32_t index, struct disk_exception *result) | ||
251 | { | ||
252 | struct disk_exception *e; | ||
253 | |||
254 | e = get_exception(ps, index); | ||
255 | if (!e) | ||
256 | return -EINVAL; | ||
257 | |||
258 | /* copy it */ | ||
259 | result->old_chunk = le64_to_cpu(e->old_chunk); | ||
260 | result->new_chunk = le64_to_cpu(e->new_chunk); | ||
261 | |||
262 | return 0; | ||
263 | } | ||
264 | |||
265 | static int write_exception(struct pstore *ps, | ||
266 | uint32_t index, struct disk_exception *de) | ||
267 | { | ||
268 | struct disk_exception *e; | ||
269 | |||
270 | e = get_exception(ps, index); | ||
271 | if (!e) | ||
272 | return -EINVAL; | ||
273 | |||
274 | /* copy it */ | ||
275 | e->old_chunk = cpu_to_le64(de->old_chunk); | ||
276 | e->new_chunk = cpu_to_le64(de->new_chunk); | ||
277 | |||
278 | return 0; | ||
279 | } | ||
280 | |||
281 | /* | ||
282 | * Registers the exceptions that are present in the current area. | ||
283 | * 'full' is set to indicate whether the area has been | ||
284 | * completely filled. | ||
285 | */ | ||
286 | static int insert_exceptions(struct pstore *ps, int *full) | ||
287 | { | ||
288 | int r; | ||
289 | unsigned int i; | ||
290 | struct disk_exception de; | ||
291 | |||
292 | /* presume the area is full */ | ||
293 | *full = 1; | ||
294 | |||
295 | for (i = 0; i < ps->exceptions_per_area; i++) { | ||
296 | r = read_exception(ps, i, &de); | ||
297 | |||
298 | if (r) | ||
299 | return r; | ||
300 | |||
301 | /* | ||
302 | * If the new_chunk is pointing at the start of | ||
303 | * the COW device, where the first metadata area | ||
304 | * is, we know that we've hit the end of the | ||
305 | * exceptions. Therefore the area is not full. | ||
306 | */ | ||
307 | if (de.new_chunk == 0LL) { | ||
308 | ps->current_committed = i; | ||
309 | *full = 0; | ||
310 | break; | ||
311 | } | ||
312 | |||
313 | /* | ||
314 | * Keep track of the start of the free chunks. | ||
315 | */ | ||
316 | if (ps->next_free <= de.new_chunk) | ||
317 | ps->next_free = de.new_chunk + 1; | ||
318 | |||
319 | /* | ||
320 | * Otherwise we add the exception to the snapshot. | ||
321 | */ | ||
322 | r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk); | ||
323 | if (r) | ||
324 | return r; | ||
325 | } | ||
326 | |||
327 | return 0; | ||
328 | } | ||
329 | |||
330 | static int read_exceptions(struct pstore *ps) | ||
331 | { | ||
332 | uint32_t area; | ||
333 | int r, full = 1; | ||
334 | |||
335 | /* | ||
336 | * Keep reading chunks and inserting exceptions until | ||
337 | * we find a partially full area. | ||
338 | */ | ||
339 | for (area = 0; full; area++) { | ||
340 | r = area_io(ps, area, READ); | ||
341 | if (r) | ||
342 | return r; | ||
343 | |||
344 | r = insert_exceptions(ps, &full); | ||
345 | if (r) | ||
346 | return r; | ||
347 | } | ||
348 | |||
349 | return 0; | ||
350 | } | ||
351 | |||
352 | static inline struct pstore *get_info(struct exception_store *store) | ||
353 | { | ||
354 | return (struct pstore *) store->context; | ||
355 | } | ||
356 | |||
357 | static void persistent_fraction_full(struct exception_store *store, | ||
358 | sector_t *numerator, sector_t *denominator) | ||
359 | { | ||
360 | *numerator = get_info(store)->next_free * store->snap->chunk_size; | ||
361 | *denominator = get_dev_size(store->snap->cow->bdev); | ||
362 | } | ||
363 | |||
364 | static void persistent_destroy(struct exception_store *store) | ||
365 | { | ||
366 | struct pstore *ps = get_info(store); | ||
367 | |||
368 | dm_io_put(sectors_to_pages(ps->chunk_size)); | ||
369 | vfree(ps->callbacks); | ||
370 | free_area(ps); | ||
371 | kfree(ps); | ||
372 | } | ||
373 | |||
374 | static int persistent_read_metadata(struct exception_store *store) | ||
375 | { | ||
376 | int r, new_snapshot; | ||
377 | struct pstore *ps = get_info(store); | ||
378 | |||
379 | /* | ||
380 | * Read the snapshot header. | ||
381 | */ | ||
382 | r = read_header(ps, &new_snapshot); | ||
383 | if (r) | ||
384 | return r; | ||
385 | |||
386 | /* | ||
387 | * Do we need to set up a new snapshot? | ||
388 | */ | ||
389 | if (new_snapshot) { | ||
390 | r = write_header(ps); | ||
391 | if (r) { | ||
392 | DMWARN("write_header failed"); | ||
393 | return r; | ||
394 | } | ||
395 | |||
396 | r = zero_area(ps, 0); | ||
397 | if (r) { | ||
398 | DMWARN("zero_area(0) failed"); | ||
399 | return r; | ||
400 | } | ||
401 | |||
402 | } else { | ||
403 | /* | ||
404 | * Sanity checks. | ||
405 | */ | ||
406 | if (!ps->valid) { | ||
407 | DMWARN("snapshot is marked invalid"); | ||
408 | return -EINVAL; | ||
409 | } | ||
410 | |||
411 | if (ps->version != SNAPSHOT_DISK_VERSION) { | ||
412 | DMWARN("unable to handle snapshot disk version %d", | ||
413 | ps->version); | ||
414 | return -EINVAL; | ||
415 | } | ||
416 | |||
417 | /* | ||
418 | * Read the metadata. | ||
419 | */ | ||
420 | r = read_exceptions(ps); | ||
421 | if (r) | ||
422 | return r; | ||
423 | } | ||
424 | |||
425 | return 0; | ||
426 | } | ||
427 | |||
428 | static int persistent_prepare(struct exception_store *store, | ||
429 | struct exception *e) | ||
430 | { | ||
431 | struct pstore *ps = get_info(store); | ||
432 | uint32_t stride; | ||
433 | sector_t size = get_dev_size(store->snap->cow->bdev); | ||
434 | |||
435 | /* Is there enough room? */ | ||
436 | if (size < ((ps->next_free + 1) * store->snap->chunk_size)) | ||
437 | return -ENOSPC; | ||
438 | |||
439 | e->new_chunk = ps->next_free; | ||
440 | |||
441 | /* | ||
442 | * Move on to the next free chunk, making sure to take | ||
443 | * into account the location of the metadata chunks. | ||
444 | */ | ||
445 | stride = (ps->exceptions_per_area + 1); | ||
446 | if ((++ps->next_free % stride) == 1) | ||
447 | ps->next_free++; | ||
448 | |||
449 | atomic_inc(&ps->pending_count); | ||
450 | return 0; | ||
451 | } | ||
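/*
 * Illustrative walk-through (numbers assumed, matching the 8KiB-chunk
 * example earlier): with 512 exceptions per area the stride is 513, so
 * persistent_prepare() hands out new_chunk values 2, 3, ..., 513, then
 * skips 514 because (514 % 513) == 1 marks it as the next metadata
 * area, and carries on at 515.
 */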
452 | |||
453 | static void persistent_commit(struct exception_store *store, | ||
454 | struct exception *e, | ||
455 | void (*callback) (void *, int success), | ||
456 | void *callback_context) | ||
457 | { | ||
458 | int r; | ||
459 | unsigned int i; | ||
460 | struct pstore *ps = get_info(store); | ||
461 | struct disk_exception de; | ||
462 | struct commit_callback *cb; | ||
463 | |||
464 | de.old_chunk = e->old_chunk; | ||
465 | de.new_chunk = e->new_chunk; | ||
466 | write_exception(ps, ps->current_committed++, &de); | ||
467 | |||
468 | /* | ||
469 | * Add the callback to the back of the array. This code | ||
470 | * is the only place where the callback array is | ||
471 | * manipulated, and we know that it will never be called | ||
472 | * multiple times concurrently. | ||
473 | */ | ||
474 | cb = ps->callbacks + ps->callback_count++; | ||
475 | cb->callback = callback; | ||
476 | cb->context = callback_context; | ||
477 | |||
478 | /* | ||
479 | * If there are no more exceptions in flight, or we have | ||
480 | * filled this metadata area we commit the exceptions to | ||
481 | * disk. | ||
482 | */ | ||
483 | if (atomic_dec_and_test(&ps->pending_count) || | ||
484 | (ps->current_committed == ps->exceptions_per_area)) { | ||
485 | r = area_io(ps, ps->current_area, WRITE); | ||
486 | if (r) | ||
487 | ps->valid = 0; | ||
488 | |||
489 | for (i = 0; i < ps->callback_count; i++) { | ||
490 | cb = ps->callbacks + i; | ||
491 | cb->callback(cb->context, r == 0 ? 1 : 0); | ||
492 | } | ||
493 | |||
494 | ps->callback_count = 0; | ||
495 | } | ||
496 | |||
497 | /* | ||
498 | * Have we completely filled the current area? | ||
499 | */ | ||
500 | if (ps->current_committed == ps->exceptions_per_area) { | ||
501 | ps->current_committed = 0; | ||
502 | r = zero_area(ps, ps->current_area + 1); | ||
503 | if (r) | ||
504 | ps->valid = 0; | ||
505 | } | ||
506 | } | ||
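/*
 * Illustrative timeline (counts assumed): if three exceptions are
 * prepared before any of them is committed, pending_count goes
 * 3 -> 2 -> 1 -> 0 as the commits arrive; only the commit that drops
 * it to zero writes the metadata area out and runs all three queued
 * callbacks in one batch.
 */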
507 | |||
508 | static void persistent_drop(struct exception_store *store) | ||
509 | { | ||
510 | struct pstore *ps = get_info(store); | ||
511 | |||
512 | ps->valid = 0; | ||
513 | if (write_header(ps)) | ||
514 | DMWARN("write header failed"); | ||
515 | } | ||
516 | |||
517 | int dm_create_persistent(struct exception_store *store, uint32_t chunk_size) | ||
518 | { | ||
519 | int r; | ||
520 | struct pstore *ps; | ||
521 | |||
522 | r = dm_io_get(sectors_to_pages(chunk_size)); | ||
523 | if (r) | ||
524 | return r; | ||
525 | |||
526 | /* allocate the pstore */ | ||
527 | ps = kmalloc(sizeof(*ps), GFP_KERNEL); | ||
528 | if (!ps) { | ||
529 | r = -ENOMEM; | ||
530 | goto bad; | ||
531 | } | ||
532 | |||
533 | ps->snap = store->snap; | ||
534 | ps->valid = 1; | ||
535 | ps->version = SNAPSHOT_DISK_VERSION; | ||
536 | ps->chunk_size = chunk_size; | ||
537 | ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) / | ||
538 | sizeof(struct disk_exception); | ||
539 | ps->next_free = 2; /* skipping the header and first area */ | ||
540 | ps->current_committed = 0; | ||
541 | |||
542 | r = alloc_area(ps); | ||
543 | if (r) | ||
544 | goto bad; | ||
545 | |||
546 | /* | ||
547 | * Allocate space for all the callbacks. | ||
548 | */ | ||
549 | ps->callback_count = 0; | ||
550 | atomic_set(&ps->pending_count, 0); | ||
551 | ps->callbacks = dm_vcalloc(ps->exceptions_per_area, | ||
552 | sizeof(*ps->callbacks)); | ||
553 | |||
554 | if (!ps->callbacks) { | ||
555 | r = -ENOMEM; | ||
556 | goto bad; | ||
557 | } | ||
558 | |||
559 | store->destroy = persistent_destroy; | ||
560 | store->read_metadata = persistent_read_metadata; | ||
561 | store->prepare_exception = persistent_prepare; | ||
562 | store->commit_exception = persistent_commit; | ||
563 | store->drop_snapshot = persistent_drop; | ||
564 | store->fraction_full = persistent_fraction_full; | ||
565 | store->context = ps; | ||
566 | |||
567 | return 0; | ||
568 | |||
569 | bad: | ||
570 | dm_io_put(sectors_to_pages(chunk_size)); | ||
571 | if (ps) { | ||
572 | if (ps->area) | ||
573 | free_area(ps); | ||
574 | |||
575 | kfree(ps); | ||
576 | } | ||
577 | return r; | ||
578 | } | ||
579 | |||
580 | /*----------------------------------------------------------------- | ||
581 | * Implementation of the store for non-persistent snapshots. | ||
582 | *---------------------------------------------------------------*/ | ||
583 | struct transient_c { | ||
584 | sector_t next_free; | ||
585 | }; | ||
586 | |||
587 | static void transient_destroy(struct exception_store *store) | ||
588 | { | ||
589 | kfree(store->context); | ||
590 | } | ||
591 | |||
592 | static int transient_read_metadata(struct exception_store *store) | ||
593 | { | ||
594 | return 0; | ||
595 | } | ||
596 | |||
597 | static int transient_prepare(struct exception_store *store, struct exception *e) | ||
598 | { | ||
599 | struct transient_c *tc = (struct transient_c *) store->context; | ||
600 | sector_t size = get_dev_size(store->snap->cow->bdev); | ||
601 | |||
602 | if (size < (tc->next_free + store->snap->chunk_size)) | ||
603 | return -1; | ||
604 | |||
605 | e->new_chunk = sector_to_chunk(store->snap, tc->next_free); | ||
606 | tc->next_free += store->snap->chunk_size; | ||
607 | |||
608 | return 0; | ||
609 | } | ||
610 | |||
611 | static void transient_commit(struct exception_store *store, | ||
612 | struct exception *e, | ||
613 | void (*callback) (void *, int success), | ||
614 | void *callback_context) | ||
615 | { | ||
616 | /* Just succeed */ | ||
617 | callback(callback_context, 1); | ||
618 | } | ||
619 | |||
620 | static void transient_fraction_full(struct exception_store *store, | ||
621 | sector_t *numerator, sector_t *denominator) | ||
622 | { | ||
623 | *numerator = ((struct transient_c *) store->context)->next_free; | ||
624 | *denominator = get_dev_size(store->snap->cow->bdev); | ||
625 | } | ||
626 | |||
627 | int dm_create_transient(struct exception_store *store, | ||
628 | struct dm_snapshot *s, int blocksize) | ||
629 | { | ||
630 | struct transient_c *tc; | ||
631 | |||
632 | memset(store, 0, sizeof(*store)); | ||
633 | store->destroy = transient_destroy; | ||
634 | store->read_metadata = transient_read_metadata; | ||
635 | store->prepare_exception = transient_prepare; | ||
636 | store->commit_exception = transient_commit; | ||
637 | store->fraction_full = transient_fraction_full; | ||
638 | store->snap = s; | ||
639 | |||
640 | tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL); | ||
641 | if (!tc) | ||
642 | return -ENOMEM; | ||
643 | |||
644 | tc->next_free = 0; | ||
645 | store->context = tc; | ||
646 | |||
647 | return 0; | ||
648 | } | ||
diff --git a/drivers/md/dm-hw-handler.c b/drivers/md/dm-hw-handler.c new file mode 100644 index 000000000000..ae63772e44c9 --- /dev/null +++ b/drivers/md/dm-hw-handler.c | |||
@@ -0,0 +1,216 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2004 Red Hat, Inc. All rights reserved. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | * | ||
6 | * Multipath hardware handler registration. | ||
7 | */ | ||
8 | |||
9 | #include "dm.h" | ||
10 | #include "dm-hw-handler.h" | ||
11 | |||
12 | #include <linux/slab.h> | ||
13 | |||
14 | struct hwh_internal { | ||
15 | struct hw_handler_type hwht; | ||
16 | |||
17 | struct list_head list; | ||
18 | long use; | ||
19 | }; | ||
20 | |||
21 | #define hwht_to_hwhi(__hwht) container_of((__hwht), struct hwh_internal, hwht) | ||
22 | |||
23 | static LIST_HEAD(_hw_handlers); | ||
24 | static DECLARE_RWSEM(_hwh_lock); | ||
25 | |||
26 | struct hwh_internal *__find_hw_handler_type(const char *name) | ||
27 | { | ||
28 | struct hwh_internal *hwhi; | ||
29 | |||
30 | list_for_each_entry(hwhi, &_hw_handlers, list) { | ||
31 | if (!strcmp(name, hwhi->hwht.name)) | ||
32 | return hwhi; | ||
33 | } | ||
34 | |||
35 | return NULL; | ||
36 | } | ||
37 | |||
38 | static struct hwh_internal *get_hw_handler(const char *name) | ||
39 | { | ||
40 | struct hwh_internal *hwhi; | ||
41 | |||
42 | down_read(&_hwh_lock); | ||
43 | hwhi = __find_hw_handler_type(name); | ||
44 | if (hwhi) { | ||
45 | if ((hwhi->use == 0) && !try_module_get(hwhi->hwht.module)) | ||
46 | hwhi = NULL; | ||
47 | else | ||
48 | hwhi->use++; | ||
49 | } | ||
50 | up_read(&_hwh_lock); | ||
51 | |||
52 | return hwhi; | ||
53 | } | ||
54 | |||
55 | struct hw_handler_type *dm_get_hw_handler(const char *name) | ||
56 | { | ||
57 | struct hwh_internal *hwhi; | ||
58 | |||
59 | if (!name) | ||
60 | return NULL; | ||
61 | |||
62 | hwhi = get_hw_handler(name); | ||
63 | if (!hwhi) { | ||
64 | request_module("dm-%s", name); | ||
65 | hwhi = get_hw_handler(name); | ||
66 | } | ||
67 | |||
68 | return hwhi ? &hwhi->hwht : NULL; | ||
69 | } | ||
70 | |||
71 | void dm_put_hw_handler(struct hw_handler_type *hwht) | ||
72 | { | ||
73 | struct hwh_internal *hwhi; | ||
74 | |||
75 | if (!hwht) | ||
76 | return; | ||
77 | |||
78 | down_read(&_hwh_lock); | ||
79 | hwhi = __find_hw_handler_type(hwht->name); | ||
80 | if (!hwhi) | ||
81 | goto out; | ||
82 | |||
83 | if (--hwhi->use == 0) | ||
84 | module_put(hwhi->hwht.module); | ||
85 | |||
86 | if (hwhi->use < 0) | ||
87 | BUG(); | ||
88 | |||
89 | out: | ||
90 | up_read(&_hwh_lock); | ||
91 | } | ||
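#if 0
/*
 * Hedged usage sketch (not compiled): how a caller, in practice the
 * multipath target, would look a handler type up by name and drop it
 * again.  The "emc" name is simply the handler registered elsewhere in
 * this directory.
 */
static void example_use_handler(void)
{
	struct hw_handler_type *hwht = dm_get_hw_handler("emc");

	if (hwht) {
		/* ... hand hwht to a struct hw_handler ... */
		dm_put_hw_handler(hwht);
	}
}
#endif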
92 | |||
93 | static struct hwh_internal *_alloc_hw_handler(struct hw_handler_type *hwht) | ||
94 | { | ||
95 | struct hwh_internal *hwhi = kmalloc(sizeof(*hwhi), GFP_KERNEL); | ||
96 | |||
97 | if (hwhi) { | ||
98 | memset(hwhi, 0, sizeof(*hwhi)); | ||
99 | hwhi->hwht = *hwht; | ||
100 | } | ||
101 | |||
102 | return hwhi; | ||
103 | } | ||
104 | |||
105 | int dm_register_hw_handler(struct hw_handler_type *hwht) | ||
106 | { | ||
107 | int r = 0; | ||
108 | struct hwh_internal *hwhi = _alloc_hw_handler(hwht); | ||
109 | |||
110 | if (!hwhi) | ||
111 | return -ENOMEM; | ||
112 | |||
113 | down_write(&_hwh_lock); | ||
114 | |||
115 | if (__find_hw_handler_type(hwht->name)) { | ||
116 | kfree(hwhi); | ||
117 | r = -EEXIST; | ||
118 | } else | ||
119 | list_add(&hwhi->list, &_hw_handlers); | ||
120 | |||
121 | up_write(&_hwh_lock); | ||
122 | |||
123 | return r; | ||
124 | } | ||
125 | |||
126 | int dm_unregister_hw_handler(struct hw_handler_type *hwht) | ||
127 | { | ||
128 | struct hwh_internal *hwhi; | ||
129 | |||
130 | down_write(&_hwh_lock); | ||
131 | |||
132 | hwhi = __find_hw_handler_type(hwht->name); | ||
133 | if (!hwhi) { | ||
134 | up_write(&_hwh_lock); | ||
135 | return -EINVAL; | ||
136 | } | ||
137 | |||
138 | if (hwhi->use) { | ||
139 | up_write(&_hwh_lock); | ||
140 | return -ETXTBSY; | ||
141 | } | ||
142 | |||
143 | list_del(&hwhi->list); | ||
144 | |||
145 | up_write(&_hwh_lock); | ||
146 | |||
147 | kfree(hwhi); | ||
148 | |||
149 | return 0; | ||
150 | } | ||
151 | |||
152 | unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio) | ||
153 | { | ||
154 | #if 0 | ||
155 | int sense_key, asc, ascq; | ||
156 | |||
157 | if (bio->bi_error & BIO_SENSE) { | ||
158 | /* FIXME: This is just an initial guess. */ | ||
159 | /* key / asc / ascq */ | ||
160 | sense_key = (bio->bi_error >> 16) & 0xff; | ||
161 | asc = (bio->bi_error >> 8) & 0xff; | ||
162 | ascq = bio->bi_error & 0xff; | ||
163 | |||
164 | switch (sense_key) { | ||
165 | /* This block as a whole comes from the device. | ||
166 | * So no point retrying on another path. */ | ||
167 | case 0x03: /* Medium error */ | ||
168 | case 0x05: /* Illegal request */ | ||
169 | case 0x07: /* Data protect */ | ||
170 | case 0x08: /* Blank check */ | ||
171 | case 0x0a: /* copy aborted */ | ||
172 | case 0x0c: /* obsolete - no clue ;-) */ | ||
173 | case 0x0d: /* volume overflow */ | ||
174 | case 0x0e: /* data miscompare */ | ||
175 | case 0x0f: /* reserved - no idea either. */ | ||
176 | return MP_ERROR_IO; | ||
177 | |||
178 | /* For these errors it's unclear whether they | ||
179 | * come from the device or the controller. | ||
180 | * So let's just try a different path, and if | ||
181 | * it eventually succeeds, user-space will clear | ||
182 | * the paths again... */ | ||
183 | case 0x02: /* Not ready */ | ||
184 | case 0x04: /* Hardware error */ | ||
185 | case 0x09: /* vendor specific */ | ||
186 | case 0x0b: /* Aborted command */ | ||
187 | return MP_FAIL_PATH; | ||
188 | |||
189 | case 0x06: /* Unit attention - might want to decode */ | ||
190 | if (asc == 0x04 && ascq == 0x01) | ||
191 | /* "Unit in the process of | ||
192 | * becoming ready" */ | ||
193 | return 0; | ||
194 | return MP_FAIL_PATH; | ||
195 | |||
196 | /* FIXME: For Unit Not Ready we may want | ||
197 | * to have a generic pg activation | ||
198 | * feature (START_UNIT). */ | ||
199 | |||
200 | /* Should these two ever end up in the | ||
201 | * error path? I don't think so. */ | ||
202 | case 0x00: /* No sense */ | ||
203 | case 0x01: /* Recovered error */ | ||
204 | return 0; | ||
205 | } | ||
206 | } | ||
207 | #endif | ||
208 | |||
209 | /* We have no idea how to decode the other kinds of errors -> | ||
210 | * assume a generic error condition. */ | ||
211 | return MP_FAIL_PATH; | ||
212 | } | ||
213 | |||
214 | EXPORT_SYMBOL_GPL(dm_register_hw_handler); | ||
215 | EXPORT_SYMBOL_GPL(dm_unregister_hw_handler); | ||
216 | EXPORT_SYMBOL_GPL(dm_scsi_err_handler); | ||
diff --git a/drivers/md/dm-hw-handler.h b/drivers/md/dm-hw-handler.h new file mode 100644 index 000000000000..15f5629e231a --- /dev/null +++ b/drivers/md/dm-hw-handler.h | |||
@@ -0,0 +1,61 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2004 Red Hat, Inc. All rights reserved. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | * | ||
6 | * Multipath hardware handler registration. | ||
7 | */ | ||
8 | |||
9 | #ifndef DM_HW_HANDLER_H | ||
10 | #define DM_HW_HANDLER_H | ||
11 | |||
12 | #include <linux/device-mapper.h> | ||
13 | |||
14 | #include "dm-mpath.h" | ||
15 | |||
16 | struct hw_handler_type; | ||
17 | struct hw_handler { | ||
18 | struct hw_handler_type *type; | ||
19 | void *context; | ||
20 | }; | ||
21 | |||
22 | /* | ||
23 | * Information about a hardware handler type. The create method | ||
24 | * constructs a handler object from its custom arguments. | ||
25 | */ | ||
26 | struct hw_handler_type { | ||
27 | char *name; | ||
28 | struct module *module; | ||
29 | |||
30 | int (*create) (struct hw_handler *handler, unsigned int argc, | ||
31 | char **argv); | ||
32 | void (*destroy) (struct hw_handler *hwh); | ||
33 | |||
34 | void (*pg_init) (struct hw_handler *hwh, unsigned bypassed, | ||
35 | struct path *path); | ||
36 | unsigned (*error) (struct hw_handler *hwh, struct bio *bio); | ||
37 | int (*status) (struct hw_handler *hwh, status_type_t type, | ||
38 | char *result, unsigned int maxlen); | ||
39 | }; | ||
40 | |||
41 | /* Register a hardware handler */ | ||
42 | int dm_register_hw_handler(struct hw_handler_type *type); | ||
43 | |||
44 | /* Unregister a hardware handler */ | ||
45 | int dm_unregister_hw_handler(struct hw_handler_type *type); | ||
46 | |||
47 | /* Returns a registered hardware handler type */ | ||
48 | struct hw_handler_type *dm_get_hw_handler(const char *name); | ||
49 | |||
50 | /* Releases a hardware handler */ | ||
51 | void dm_put_hw_handler(struct hw_handler_type *hwht); | ||
52 | |||
53 | /* Default err function */ | ||
54 | unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio); | ||
55 | |||
56 | /* Error flags for err and dm_pg_init_complete */ | ||
57 | #define MP_FAIL_PATH 1 | ||
58 | #define MP_BYPASS_PG 2 | ||
59 | #define MP_ERROR_IO 4 /* Don't retry this I/O */ | ||
60 | |||
61 | #endif | ||
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c new file mode 100644 index 000000000000..45754bb6a799 --- /dev/null +++ b/drivers/md/dm-io.c | |||
@@ -0,0 +1,426 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Sistina Software | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #include "dm-io.h" | ||
8 | |||
9 | #include <linux/bio.h> | ||
10 | #include <linux/mempool.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/sched.h> | ||
13 | #include <linux/slab.h> | ||
14 | |||
15 | static struct bio_set *_bios; | ||
16 | |||
17 | /* FIXME: can we shrink this ? */ | ||
18 | struct io { | ||
19 | unsigned long error; | ||
20 | atomic_t count; | ||
21 | struct task_struct *sleeper; | ||
22 | io_notify_fn callback; | ||
23 | void *context; | ||
24 | }; | ||
25 | |||
26 | /* | ||
27 | * io contexts are only dynamically allocated for asynchronous | ||
28 | * io. Since async io is likely to be the majority of io, we'll | ||
29 | * have the same number of io contexts as buffer heads! (FIXME: | ||
30 | * must reduce this). | ||
31 | */ | ||
32 | static unsigned _num_ios; | ||
33 | static mempool_t *_io_pool; | ||
34 | |||
35 | static void *alloc_io(unsigned int __nocast gfp_mask, void *pool_data) | ||
36 | { | ||
37 | return kmalloc(sizeof(struct io), gfp_mask); | ||
38 | } | ||
39 | |||
40 | static void free_io(void *element, void *pool_data) | ||
41 | { | ||
42 | kfree(element); | ||
43 | } | ||
44 | |||
45 | static unsigned int pages_to_ios(unsigned int pages) | ||
46 | { | ||
47 | return 4 * pages; /* too many ? */ | ||
48 | } | ||
49 | |||
50 | static int resize_pool(unsigned int new_ios) | ||
51 | { | ||
52 | int r = 0; | ||
53 | |||
54 | if (_io_pool) { | ||
55 | if (new_ios == 0) { | ||
56 | /* free off the pool */ | ||
57 | mempool_destroy(_io_pool); | ||
58 | _io_pool = NULL; | ||
59 | bioset_free(_bios); | ||
60 | |||
61 | } else { | ||
62 | /* resize the pool */ | ||
63 | r = mempool_resize(_io_pool, new_ios, GFP_KERNEL); | ||
64 | } | ||
65 | |||
66 | } else { | ||
67 | /* create new pool */ | ||
68 | _io_pool = mempool_create(new_ios, alloc_io, free_io, NULL); | ||
69 | if (!_io_pool) | ||
70 | return -ENOMEM; | ||
71 | |||
72 | _bios = bioset_create(16, 16, 4); | ||
73 | if (!_bios) { | ||
74 | mempool_destroy(_io_pool); | ||
75 | _io_pool = NULL; | ||
76 | return -ENOMEM; | ||
77 | } | ||
78 | } | ||
79 | |||
80 | if (!r) | ||
81 | _num_ios = new_ios; | ||
82 | |||
83 | return r; | ||
84 | } | ||
85 | |||
86 | int dm_io_get(unsigned int num_pages) | ||
87 | { | ||
88 | return resize_pool(_num_ios + pages_to_ios(num_pages)); | ||
89 | } | ||
90 | |||
91 | void dm_io_put(unsigned int num_pages) | ||
92 | { | ||
93 | resize_pool(_num_ios - pages_to_ios(num_pages)); | ||
94 | } | ||
95 | |||
96 | /*----------------------------------------------------------------- | ||
97 | * We need to keep track of which region a bio is doing io for. | ||
98 | * In order to save a memory allocation we store this in the last | ||
99 | * bvec, which we know is unused (blech). | ||
100 | * XXX This is ugly and can OOPS with some configs... find another way. | ||
101 | *---------------------------------------------------------------*/ | ||
102 | static inline void bio_set_region(struct bio *bio, unsigned region) | ||
103 | { | ||
104 | bio->bi_io_vec[bio->bi_max_vecs - 1].bv_len = region; | ||
105 | } | ||
106 | |||
107 | static inline unsigned bio_get_region(struct bio *bio) | ||
108 | { | ||
109 | return bio->bi_io_vec[bio->bi_max_vecs - 1].bv_len; | ||
110 | } | ||
111 | |||
112 | /*----------------------------------------------------------------- | ||
113 | * We need an io object to keep track of the number of bios that | ||
114 | * have been dispatched for a particular io. | ||
115 | *---------------------------------------------------------------*/ | ||
116 | static void dec_count(struct io *io, unsigned int region, int error) | ||
117 | { | ||
118 | if (error) | ||
119 | set_bit(region, &io->error); | ||
120 | |||
121 | if (atomic_dec_and_test(&io->count)) { | ||
122 | if (io->sleeper) | ||
123 | wake_up_process(io->sleeper); | ||
124 | |||
125 | else { | ||
126 | int r = io->error; | ||
127 | io_notify_fn fn = io->callback; | ||
128 | void *context = io->context; | ||
129 | |||
130 | mempool_free(io, _io_pool); | ||
131 | fn(r, context); | ||
132 | } | ||
133 | } | ||
134 | } | ||
135 | |||
136 | static int endio(struct bio *bio, unsigned int done, int error) | ||
137 | { | ||
138 | struct io *io = (struct io *) bio->bi_private; | ||
139 | |||
140 | /* keep going until we've finished */ | ||
141 | if (bio->bi_size) | ||
142 | return 1; | ||
143 | |||
144 | if (error && bio_data_dir(bio) == READ) | ||
145 | zero_fill_bio(bio); | ||
146 | |||
147 | dec_count(io, bio_get_region(bio), error); | ||
148 | bio_put(bio); | ||
149 | |||
150 | return 0; | ||
151 | } | ||
152 | |||
153 | /*----------------------------------------------------------------- | ||
154 | * These little objects provide an abstraction for getting a new | ||
155 | * destination page for io. | ||
156 | *---------------------------------------------------------------*/ | ||
157 | struct dpages { | ||
158 | void (*get_page)(struct dpages *dp, | ||
159 | struct page **p, unsigned long *len, unsigned *offset); | ||
160 | void (*next_page)(struct dpages *dp); | ||
161 | |||
162 | unsigned context_u; | ||
163 | void *context_ptr; | ||
164 | }; | ||
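#if 0
/*
 * Illustrative only: the smallest possible dpages source, feeding a
 * single page.  The in-tree providers below (page list, bvec, vmalloc
 * area) follow the same two-method pattern.
 */
static void one_page_get_page(struct dpages *dp,
		struct page **p, unsigned long *len, unsigned *offset)
{
	*p = (struct page *) dp->context_ptr;
	*len = PAGE_SIZE;
	*offset = 0;
}

static void one_page_next_page(struct dpages *dp)
{
	/* a single page: nothing to advance to */
}

static void one_page_dp_init(struct dpages *dp, struct page *page)
{
	dp->get_page = one_page_get_page;
	dp->next_page = one_page_next_page;
	dp->context_ptr = page;
	dp->context_u = 0;
}
#endif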
165 | |||
166 | /* | ||
167 | * Functions for getting the pages from a list. | ||
168 | */ | ||
169 | static void list_get_page(struct dpages *dp, | ||
170 | struct page **p, unsigned long *len, unsigned *offset) | ||
171 | { | ||
172 | unsigned o = dp->context_u; | ||
173 | struct page_list *pl = (struct page_list *) dp->context_ptr; | ||
174 | |||
175 | *p = pl->page; | ||
176 | *len = PAGE_SIZE - o; | ||
177 | *offset = o; | ||
178 | } | ||
179 | |||
180 | static void list_next_page(struct dpages *dp) | ||
181 | { | ||
182 | struct page_list *pl = (struct page_list *) dp->context_ptr; | ||
183 | dp->context_ptr = pl->next; | ||
184 | dp->context_u = 0; | ||
185 | } | ||
186 | |||
187 | static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offset) | ||
188 | { | ||
189 | dp->get_page = list_get_page; | ||
190 | dp->next_page = list_next_page; | ||
191 | dp->context_u = offset; | ||
192 | dp->context_ptr = pl; | ||
193 | } | ||
194 | |||
195 | /* | ||
196 | * Functions for getting the pages from a bvec. | ||
197 | */ | ||
198 | static void bvec_get_page(struct dpages *dp, | ||
199 | struct page **p, unsigned long *len, unsigned *offset) | ||
200 | { | ||
201 | struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr; | ||
202 | *p = bvec->bv_page; | ||
203 | *len = bvec->bv_len; | ||
204 | *offset = bvec->bv_offset; | ||
205 | } | ||
206 | |||
207 | static void bvec_next_page(struct dpages *dp) | ||
208 | { | ||
209 | struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr; | ||
210 | dp->context_ptr = bvec + 1; | ||
211 | } | ||
212 | |||
213 | static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec) | ||
214 | { | ||
215 | dp->get_page = bvec_get_page; | ||
216 | dp->next_page = bvec_next_page; | ||
217 | dp->context_ptr = bvec; | ||
218 | } | ||
219 | |||
220 | static void vm_get_page(struct dpages *dp, | ||
221 | struct page **p, unsigned long *len, unsigned *offset) | ||
222 | { | ||
223 | *p = vmalloc_to_page(dp->context_ptr); | ||
224 | *offset = dp->context_u; | ||
225 | *len = PAGE_SIZE - dp->context_u; | ||
226 | } | ||
227 | |||
228 | static void vm_next_page(struct dpages *dp) | ||
229 | { | ||
230 | dp->context_ptr += PAGE_SIZE - dp->context_u; | ||
231 | dp->context_u = 0; | ||
232 | } | ||
233 | |||
234 | static void vm_dp_init(struct dpages *dp, void *data) | ||
235 | { | ||
236 | dp->get_page = vm_get_page; | ||
237 | dp->next_page = vm_next_page; | ||
238 | dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1); | ||
239 | dp->context_ptr = data; | ||
240 | } | ||
241 | |||
242 | /*----------------------------------------------------------------- | ||
243 | * IO routines that accept a list of pages. | ||
244 | *---------------------------------------------------------------*/ | ||
245 | static void do_region(int rw, unsigned int region, struct io_region *where, | ||
246 | struct dpages *dp, struct io *io) | ||
247 | { | ||
248 | struct bio *bio; | ||
249 | struct page *page; | ||
250 | unsigned long len; | ||
251 | unsigned offset; | ||
252 | unsigned num_bvecs; | ||
253 | sector_t remaining = where->count; | ||
254 | |||
255 | while (remaining) { | ||
256 | /* | ||
257 | * Allocate a suitably sized bio, we add an extra | ||
258 | * bvec for bio_get/set_region(). | ||
259 | */ | ||
260 | num_bvecs = (remaining / (PAGE_SIZE >> 9)) + 2; | ||
261 | bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, _bios); | ||
262 | bio->bi_sector = where->sector + (where->count - remaining); | ||
263 | bio->bi_bdev = where->bdev; | ||
264 | bio->bi_end_io = endio; | ||
265 | bio->bi_private = io; | ||
266 | bio_set_region(bio, region); | ||
267 | |||
268 | /* | ||
269 | * Try and add as many pages as possible. | ||
270 | */ | ||
271 | while (remaining) { | ||
272 | dp->get_page(dp, &page, &len, &offset); | ||
273 | len = min(len, to_bytes(remaining)); | ||
274 | if (!bio_add_page(bio, page, len, offset)) | ||
275 | break; | ||
276 | |||
277 | offset = 0; | ||
278 | remaining -= to_sector(len); | ||
279 | dp->next_page(dp); | ||
280 | } | ||
281 | |||
282 | atomic_inc(&io->count); | ||
283 | submit_bio(rw, bio); | ||
284 | } | ||
285 | } | ||
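/*
 * Worked sizing example (numbers assumed): for a 32KiB region on 4KiB
 * pages, remaining starts at 64 sectors, so num_bvecs is 64/8 + 2 = 10.
 * Eight of those cover full data pages; the two extra presumably absorb
 * integer-division slack and the spare bvec reserved above for
 * bio_{set,get}_region().
 */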
286 | |||
287 | static void dispatch_io(int rw, unsigned int num_regions, | ||
288 | struct io_region *where, struct dpages *dp, | ||
289 | struct io *io, int sync) | ||
290 | { | ||
291 | int i; | ||
292 | struct dpages old_pages = *dp; | ||
293 | |||
294 | if (sync) | ||
295 | rw |= (1 << BIO_RW_SYNC); | ||
296 | |||
297 | /* | ||
298 | * For multiple regions we need to be careful to rewind | ||
299 | * the dp object for each call to do_region. | ||
300 | */ | ||
301 | for (i = 0; i < num_regions; i++) { | ||
302 | *dp = old_pages; | ||
303 | if (where[i].count) | ||
304 | do_region(rw, i, where + i, dp, io); | ||
305 | } | ||
306 | |||
307 | /* | ||
308 | * Drop the extra reference that we were holding to avoid | ||
309 | * the io being completed too early. | ||
310 | */ | ||
311 | dec_count(io, 0, 0); | ||
312 | } | ||
313 | |||
314 | static int sync_io(unsigned int num_regions, struct io_region *where, | ||
315 | int rw, struct dpages *dp, unsigned long *error_bits) | ||
316 | { | ||
317 | struct io io; | ||
318 | |||
319 | if (num_regions > 1 && rw != WRITE) { | ||
320 | WARN_ON(1); | ||
321 | return -EIO; | ||
322 | } | ||
323 | |||
324 | io.error = 0; | ||
325 | atomic_set(&io.count, 1); /* see dispatch_io() */ | ||
326 | io.sleeper = current; | ||
327 | |||
328 | dispatch_io(rw, num_regions, where, dp, &io, 1); | ||
329 | |||
330 | while (1) { | ||
331 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
332 | |||
333 | if (!atomic_read(&io.count) || signal_pending(current)) | ||
334 | break; | ||
335 | |||
336 | io_schedule(); | ||
337 | } | ||
338 | set_current_state(TASK_RUNNING); | ||
339 | |||
340 | if (atomic_read(&io.count)) | ||
341 | return -EINTR; | ||
342 | |||
343 | *error_bits = io.error; | ||
344 | return io.error ? -EIO : 0; | ||
345 | } | ||
346 | |||
347 | static int async_io(unsigned int num_regions, struct io_region *where, int rw, | ||
348 | struct dpages *dp, io_notify_fn fn, void *context) | ||
349 | { | ||
350 | struct io *io; | ||
351 | |||
352 | if (num_regions > 1 && rw != WRITE) { | ||
353 | WARN_ON(1); | ||
354 | fn(1, context); | ||
355 | return -EIO; | ||
356 | } | ||
357 | |||
358 | io = mempool_alloc(_io_pool, GFP_NOIO); | ||
359 | io->error = 0; | ||
360 | atomic_set(&io->count, 1); /* see dispatch_io() */ | ||
361 | io->sleeper = NULL; | ||
362 | io->callback = fn; | ||
363 | io->context = context; | ||
364 | |||
365 | dispatch_io(rw, num_regions, where, dp, io, 0); | ||
366 | return 0; | ||
367 | } | ||
368 | |||
369 | int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw, | ||
370 | struct page_list *pl, unsigned int offset, | ||
371 | unsigned long *error_bits) | ||
372 | { | ||
373 | struct dpages dp; | ||
374 | list_dp_init(&dp, pl, offset); | ||
375 | return sync_io(num_regions, where, rw, &dp, error_bits); | ||
376 | } | ||
377 | |||
378 | int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw, | ||
379 | struct bio_vec *bvec, unsigned long *error_bits) | ||
380 | { | ||
381 | struct dpages dp; | ||
382 | bvec_dp_init(&dp, bvec); | ||
383 | return sync_io(num_regions, where, rw, &dp, error_bits); | ||
384 | } | ||
385 | |||
386 | int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw, | ||
387 | void *data, unsigned long *error_bits) | ||
388 | { | ||
389 | struct dpages dp; | ||
390 | vm_dp_init(&dp, data); | ||
391 | return sync_io(num_regions, where, rw, &dp, error_bits); | ||
392 | } | ||
393 | |||
394 | int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, | ||
395 | struct page_list *pl, unsigned int offset, | ||
396 | io_notify_fn fn, void *context) | ||
397 | { | ||
398 | struct dpages dp; | ||
399 | list_dp_init(&dp, pl, offset); | ||
400 | return async_io(num_regions, where, rw, &dp, fn, context); | ||
401 | } | ||
402 | |||
403 | int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw, | ||
404 | struct bio_vec *bvec, io_notify_fn fn, void *context) | ||
405 | { | ||
406 | struct dpages dp; | ||
407 | bvec_dp_init(&dp, bvec); | ||
408 | return async_io(num_regions, where, rw, &dp, fn, context); | ||
409 | } | ||
410 | |||
411 | int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw, | ||
412 | void *data, io_notify_fn fn, void *context) | ||
413 | { | ||
414 | struct dpages dp; | ||
415 | vm_dp_init(&dp, data); | ||
416 | return async_io(num_regions, where, rw, &dp, fn, context); | ||
417 | } | ||
418 | |||
419 | EXPORT_SYMBOL(dm_io_get); | ||
420 | EXPORT_SYMBOL(dm_io_put); | ||
421 | EXPORT_SYMBOL(dm_io_sync); | ||
422 | EXPORT_SYMBOL(dm_io_async); | ||
423 | EXPORT_SYMBOL(dm_io_sync_bvec); | ||
424 | EXPORT_SYMBOL(dm_io_async_bvec); | ||
425 | EXPORT_SYMBOL(dm_io_sync_vm); | ||
426 | EXPORT_SYMBOL(dm_io_async_vm); | ||
diff --git a/drivers/md/dm-io.h b/drivers/md/dm-io.h new file mode 100644 index 000000000000..1a77f3265706 --- /dev/null +++ b/drivers/md/dm-io.h | |||
@@ -0,0 +1,77 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Sistina Software | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #ifndef _DM_IO_H | ||
8 | #define _DM_IO_H | ||
9 | |||
10 | #include "dm.h" | ||
11 | |||
12 | /* FIXME make this configurable */ | ||
13 | #define DM_MAX_IO_REGIONS 8 | ||
14 | |||
15 | struct io_region { | ||
16 | struct block_device *bdev; | ||
17 | sector_t sector; | ||
18 | sector_t count; | ||
19 | }; | ||
20 | |||
21 | struct page_list { | ||
22 | struct page_list *next; | ||
23 | struct page *page; | ||
24 | }; | ||
25 | |||
26 | |||
27 | /* | ||
28 | * 'error' is a bitset, with each bit indicating whether an error | ||
29 | * occurred doing io to the corresponding region. | ||
30 | */ | ||
31 | typedef void (*io_notify_fn)(unsigned long error, void *context); | ||
32 | |||
33 | |||
34 | /* | ||
35 | * Before anyone uses the IO interface they should call | ||
36 | * dm_io_get(), specifying roughly how many pages they are | ||
37 | * expecting to perform io on concurrently. | ||
38 | * | ||
39 | * This function may block. | ||
40 | */ | ||
41 | int dm_io_get(unsigned int num_pages); | ||
42 | void dm_io_put(unsigned int num_pages); | ||
43 | |||
44 | /* | ||
45 | * Synchronous IO. | ||
46 | * | ||
47 | * Please ensure that the rw flag in the next two functions is | ||
48 | * either READ or WRITE, ie. we don't take READA. Any | ||
49 | * regions with a zero count field will be ignored. | ||
50 | */ | ||
51 | int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw, | ||
52 | struct page_list *pl, unsigned int offset, | ||
53 | unsigned long *error_bits); | ||
54 | |||
55 | int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw, | ||
56 | struct bio_vec *bvec, unsigned long *error_bits); | ||
57 | |||
58 | int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw, | ||
59 | void *data, unsigned long *error_bits); | ||
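#if 0
/*
 * Hedged usage sketch (not compiled): reserve roughly one page of io
 * capacity, read one chunk of a device into a vmalloc'd buffer
 * synchronously, then release the reservation.  The device, sector and
 * count values are assumptions for illustration only.
 */
static int example_read_chunk(struct block_device *bdev, void *vmalloc_buf)
{
	struct io_region where;
	unsigned long error_bits;
	int r;

	r = dm_io_get(1);
	if (r)
		return r;

	where.bdev = bdev;
	where.sector = 0;
	where.count = 8;		/* 4KiB, in sectors */

	r = dm_io_sync_vm(1, &where, READ, vmalloc_buf, &error_bits);
	dm_io_put(1);
	return r;
}
#endif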
60 | |||
61 | /* | ||
62 | * Asynchronous IO. | ||
63 | * | ||
64 | * The 'where' array may be safely allocated on the stack since | ||
65 | * the function takes a copy. | ||
66 | */ | ||
67 | int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, | ||
68 | struct page_list *pl, unsigned int offset, | ||
69 | io_notify_fn fn, void *context); | ||
70 | |||
71 | int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw, | ||
72 | struct bio_vec *bvec, io_notify_fn fn, void *context); | ||
73 | |||
74 | int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw, | ||
75 | void *data, io_notify_fn fn, void *context); | ||
76 | |||
77 | #endif | ||
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c new file mode 100644 index 000000000000..ee3c869d9701 --- /dev/null +++ b/drivers/md/dm-ioctl.c | |||
@@ -0,0 +1,1416 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. | ||
3 | * Copyright (C) 2004 - 2005 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * This file is released under the GPL. | ||
6 | */ | ||
7 | |||
8 | #include "dm.h" | ||
9 | |||
10 | #include <linux/module.h> | ||
11 | #include <linux/vmalloc.h> | ||
12 | #include <linux/miscdevice.h> | ||
13 | #include <linux/init.h> | ||
14 | #include <linux/wait.h> | ||
15 | #include <linux/slab.h> | ||
16 | #include <linux/devfs_fs_kernel.h> | ||
17 | #include <linux/dm-ioctl.h> | ||
18 | |||
19 | #include <asm/uaccess.h> | ||
20 | |||
21 | #define DM_DRIVER_EMAIL "dm-devel@redhat.com" | ||
22 | |||
23 | /*----------------------------------------------------------------- | ||
24 | * The ioctl interface needs to be able to look up devices by | ||
25 | * name or uuid. | ||
26 | *---------------------------------------------------------------*/ | ||
27 | struct hash_cell { | ||
28 | struct list_head name_list; | ||
29 | struct list_head uuid_list; | ||
30 | |||
31 | char *name; | ||
32 | char *uuid; | ||
33 | struct mapped_device *md; | ||
34 | struct dm_table *new_map; | ||
35 | }; | ||
36 | |||
37 | struct vers_iter { | ||
38 | size_t param_size; | ||
39 | struct dm_target_versions *vers, *old_vers; | ||
40 | char *end; | ||
41 | uint32_t flags; | ||
42 | }; | ||
43 | |||
44 | |||
45 | #define NUM_BUCKETS 64 | ||
46 | #define MASK_BUCKETS (NUM_BUCKETS - 1) | ||
47 | static struct list_head _name_buckets[NUM_BUCKETS]; | ||
48 | static struct list_head _uuid_buckets[NUM_BUCKETS]; | ||
49 | |||
50 | static void dm_hash_remove_all(void); | ||
51 | |||
52 | /* | ||
53 | * Guards access to both hash tables. | ||
54 | */ | ||
55 | static DECLARE_RWSEM(_hash_lock); | ||
56 | |||
57 | static void init_buckets(struct list_head *buckets) | ||
58 | { | ||
59 | unsigned int i; | ||
60 | |||
61 | for (i = 0; i < NUM_BUCKETS; i++) | ||
62 | INIT_LIST_HEAD(buckets + i); | ||
63 | } | ||
64 | |||
65 | static int dm_hash_init(void) | ||
66 | { | ||
67 | init_buckets(_name_buckets); | ||
68 | init_buckets(_uuid_buckets); | ||
69 | devfs_mk_dir(DM_DIR); | ||
70 | return 0; | ||
71 | } | ||
72 | |||
73 | static void dm_hash_exit(void) | ||
74 | { | ||
75 | dm_hash_remove_all(); | ||
76 | devfs_remove(DM_DIR); | ||
77 | } | ||
78 | |||
79 | /*----------------------------------------------------------------- | ||
80 | * Hash function: | ||
81 | * We're not really concerned with the str hash function being | ||
82 | * fast since it's only used by the ioctl interface. | ||
83 | *---------------------------------------------------------------*/ | ||
84 | static unsigned int hash_str(const char *str) | ||
85 | { | ||
86 | const unsigned int hash_mult = 2654435387U; | ||
87 | unsigned int h = 0; | ||
88 | |||
89 | while (*str) | ||
90 | h = (h + (unsigned int) *str++) * hash_mult; | ||
91 | |||
92 | return h & MASK_BUCKETS; | ||
93 | } | ||
94 | |||
95 | /*----------------------------------------------------------------- | ||
96 | * Code for looking up a device by name | ||
97 | *---------------------------------------------------------------*/ | ||
98 | static struct hash_cell *__get_name_cell(const char *str) | ||
99 | { | ||
100 | struct hash_cell *hc; | ||
101 | unsigned int h = hash_str(str); | ||
102 | |||
103 | list_for_each_entry (hc, _name_buckets + h, name_list) | ||
104 | if (!strcmp(hc->name, str)) | ||
105 | return hc; | ||
106 | |||
107 | return NULL; | ||
108 | } | ||
109 | |||
110 | static struct hash_cell *__get_uuid_cell(const char *str) | ||
111 | { | ||
112 | struct hash_cell *hc; | ||
113 | unsigned int h = hash_str(str); | ||
114 | |||
115 | list_for_each_entry (hc, _uuid_buckets + h, uuid_list) | ||
116 | if (!strcmp(hc->uuid, str)) | ||
117 | return hc; | ||
118 | |||
119 | return NULL; | ||
120 | } | ||
121 | |||
122 | /*----------------------------------------------------------------- | ||
123 | * Inserting, removing and renaming a device. | ||
124 | *---------------------------------------------------------------*/ | ||
125 | static inline char *kstrdup(const char *str) | ||
126 | { | ||
127 | char *r = kmalloc(strlen(str) + 1, GFP_KERNEL); | ||
128 | if (r) | ||
129 | strcpy(r, str); | ||
130 | return r; | ||
131 | } | ||
132 | |||
133 | static struct hash_cell *alloc_cell(const char *name, const char *uuid, | ||
134 | struct mapped_device *md) | ||
135 | { | ||
136 | struct hash_cell *hc; | ||
137 | |||
138 | hc = kmalloc(sizeof(*hc), GFP_KERNEL); | ||
139 | if (!hc) | ||
140 | return NULL; | ||
141 | |||
142 | hc->name = kstrdup(name); | ||
143 | if (!hc->name) { | ||
144 | kfree(hc); | ||
145 | return NULL; | ||
146 | } | ||
147 | |||
148 | if (!uuid) | ||
149 | hc->uuid = NULL; | ||
150 | |||
151 | else { | ||
152 | hc->uuid = kstrdup(uuid); | ||
153 | if (!hc->uuid) { | ||
154 | kfree(hc->name); | ||
155 | kfree(hc); | ||
156 | return NULL; | ||
157 | } | ||
158 | } | ||
159 | |||
160 | INIT_LIST_HEAD(&hc->name_list); | ||
161 | INIT_LIST_HEAD(&hc->uuid_list); | ||
162 | hc->md = md; | ||
163 | hc->new_map = NULL; | ||
164 | return hc; | ||
165 | } | ||
166 | |||
167 | static void free_cell(struct hash_cell *hc) | ||
168 | { | ||
169 | if (hc) { | ||
170 | kfree(hc->name); | ||
171 | kfree(hc->uuid); | ||
172 | kfree(hc); | ||
173 | } | ||
174 | } | ||
175 | |||
176 | /* | ||
177 | * devfs stuff. | ||
178 | */ | ||
179 | static int register_with_devfs(struct hash_cell *hc) | ||
180 | { | ||
181 | struct gendisk *disk = dm_disk(hc->md); | ||
182 | |||
183 | devfs_mk_bdev(MKDEV(disk->major, disk->first_minor), | ||
184 | S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP, | ||
185 | DM_DIR "/%s", hc->name); | ||
186 | return 0; | ||
187 | } | ||
188 | |||
189 | static int unregister_with_devfs(struct hash_cell *hc) | ||
190 | { | ||
191 | devfs_remove(DM_DIR"/%s", hc->name); | ||
192 | return 0; | ||
193 | } | ||
194 | |||
195 | /* | ||
196 | * The kdev_t and uuid of a device can never change once it is | ||
197 | * initially inserted. | ||
198 | */ | ||
199 | static int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md) | ||
200 | { | ||
201 | struct hash_cell *cell; | ||
202 | |||
203 | /* | ||
204 | * Allocate the new cells. | ||
205 | */ | ||
206 | cell = alloc_cell(name, uuid, md); | ||
207 | if (!cell) | ||
208 | return -ENOMEM; | ||
209 | |||
210 | /* | ||
211 | * Insert the cell into both hash tables. | ||
212 | */ | ||
213 | down_write(&_hash_lock); | ||
214 | if (__get_name_cell(name)) | ||
215 | goto bad; | ||
216 | |||
217 | list_add(&cell->name_list, _name_buckets + hash_str(name)); | ||
218 | |||
219 | if (uuid) { | ||
220 | if (__get_uuid_cell(uuid)) { | ||
221 | list_del(&cell->name_list); | ||
222 | goto bad; | ||
223 | } | ||
224 | list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); | ||
225 | } | ||
226 | register_with_devfs(cell); | ||
227 | dm_get(md); | ||
228 | dm_set_mdptr(md, cell); | ||
229 | up_write(&_hash_lock); | ||
230 | |||
231 | return 0; | ||
232 | |||
233 | bad: | ||
234 | up_write(&_hash_lock); | ||
235 | free_cell(cell); | ||
236 | return -EBUSY; | ||
237 | } | ||
238 | |||
239 | static void __hash_remove(struct hash_cell *hc) | ||
240 | { | ||
241 | /* remove from the dev hash */ | ||
242 | list_del(&hc->uuid_list); | ||
243 | list_del(&hc->name_list); | ||
244 | unregister_with_devfs(hc); | ||
245 | dm_set_mdptr(hc->md, NULL); | ||
246 | dm_put(hc->md); | ||
247 | if (hc->new_map) | ||
248 | dm_table_put(hc->new_map); | ||
249 | free_cell(hc); | ||
250 | } | ||
251 | |||
252 | static void dm_hash_remove_all(void) | ||
253 | { | ||
254 | int i; | ||
255 | struct hash_cell *hc; | ||
256 | struct list_head *tmp, *n; | ||
257 | |||
258 | down_write(&_hash_lock); | ||
259 | for (i = 0; i < NUM_BUCKETS; i++) { | ||
260 | list_for_each_safe (tmp, n, _name_buckets + i) { | ||
261 | hc = list_entry(tmp, struct hash_cell, name_list); | ||
262 | __hash_remove(hc); | ||
263 | } | ||
264 | } | ||
265 | up_write(&_hash_lock); | ||
266 | } | ||
267 | |||
268 | static int dm_hash_rename(const char *old, const char *new) | ||
269 | { | ||
270 | char *new_name, *old_name; | ||
271 | struct hash_cell *hc; | ||
272 | |||
273 | /* | ||
274 | * duplicate new. | ||
275 | */ | ||
276 | new_name = kstrdup(new); | ||
277 | if (!new_name) | ||
278 | return -ENOMEM; | ||
279 | |||
280 | down_write(&_hash_lock); | ||
281 | |||
282 | /* | ||
283 | * Is the new name free? | ||
284 | */ | ||
285 | hc = __get_name_cell(new); | ||
286 | if (hc) { | ||
287 | DMWARN("asked to rename to an already existing name %s -> %s", | ||
288 | old, new); | ||
289 | up_write(&_hash_lock); | ||
290 | kfree(new_name); | ||
291 | return -EBUSY; | ||
292 | } | ||
293 | |||
294 | /* | ||
295 | * Is there such a device as 'old'? | ||
296 | */ | ||
297 | hc = __get_name_cell(old); | ||
298 | if (!hc) { | ||
299 | DMWARN("asked to rename a non existent device %s -> %s", | ||
300 | old, new); | ||
301 | up_write(&_hash_lock); | ||
302 | kfree(new_name); | ||
303 | return -ENXIO; | ||
304 | } | ||
305 | |||
306 | /* | ||
307 | * rename and move the name cell. | ||
308 | */ | ||
309 | unregister_with_devfs(hc); | ||
310 | |||
311 | list_del(&hc->name_list); | ||
312 | old_name = hc->name; | ||
313 | hc->name = new_name; | ||
314 | list_add(&hc->name_list, _name_buckets + hash_str(new_name)); | ||
315 | |||
316 | /* rename the device node in devfs */ | ||
317 | register_with_devfs(hc); | ||
318 | |||
319 | up_write(&_hash_lock); | ||
320 | kfree(old_name); | ||
321 | return 0; | ||
322 | } | ||
323 | |||
324 | /*----------------------------------------------------------------- | ||
325 | * Implementation of the ioctl commands | ||
326 | *---------------------------------------------------------------*/ | ||
327 | /* | ||
328 | * All the ioctl commands get dispatched to functions with this | ||
329 | * prototype. | ||
330 | */ | ||
331 | typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size); | ||
332 | |||
333 | static int remove_all(struct dm_ioctl *param, size_t param_size) | ||
334 | { | ||
335 | dm_hash_remove_all(); | ||
336 | param->data_size = 0; | ||
337 | return 0; | ||
338 | } | ||
339 | |||
340 | /* | ||
341 | * Round up the ptr to an 8-byte boundary. | ||
342 | */ | ||
343 | #define ALIGN_MASK 7 | ||
344 | static inline void *align_ptr(void *ptr) | ||
345 | { | ||
346 | return (void *) (((size_t) (ptr + ALIGN_MASK)) & ~ALIGN_MASK); | ||
347 | } | ||
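A quick worked example of the rounding above, as a standalone sketch on plain offsets rather than pointers:

#include <stdio.h>
#include <stddef.h>

#define ALIGN_MASK 7

/* Same round-up-to-8 arithmetic as align_ptr(), applied to offsets. */
static size_t align_up(size_t off)
{
	return (off + ALIGN_MASK) & ~(size_t) ALIGN_MASK;
}

int main(void)
{
	printf("%zu\n", align_up(0));	/* 0  */
	printf("%zu\n", align_up(5));	/* 8  */
	printf("%zu\n", align_up(16));	/* 16 */
	return 0;
}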
348 | |||
349 | /* | ||
350 | * Retrieves the data payload buffer from an already allocated | ||
351 | * struct dm_ioctl. | ||
352 | */ | ||
353 | static void *get_result_buffer(struct dm_ioctl *param, size_t param_size, | ||
354 | size_t *len) | ||
355 | { | ||
356 | param->data_start = align_ptr(param + 1) - (void *) param; | ||
357 | |||
358 | if (param->data_start < param_size) | ||
359 | *len = param_size - param->data_start; | ||
360 | else | ||
361 | *len = 0; | ||
362 | |||
363 | return ((void *) param) + param->data_start; | ||
364 | } | ||
365 | |||
366 | static int list_devices(struct dm_ioctl *param, size_t param_size) | ||
367 | { | ||
368 | unsigned int i; | ||
369 | struct hash_cell *hc; | ||
370 | size_t len, needed = 0; | ||
371 | struct gendisk *disk; | ||
372 | struct dm_name_list *nl, *old_nl = NULL; | ||
373 | |||
374 | down_write(&_hash_lock); | ||
375 | |||
376 | /* | ||
377 | * Loop through all the devices working out how much | ||
378 | * space we need. | ||
379 | */ | ||
380 | for (i = 0; i < NUM_BUCKETS; i++) { | ||
381 | list_for_each_entry (hc, _name_buckets + i, name_list) { | ||
382 | needed += sizeof(struct dm_name_list); | ||
383 | needed += strlen(hc->name) + 1; | ||
384 | needed += ALIGN_MASK; | ||
385 | } | ||
386 | } | ||
387 | |||
388 | /* | ||
389 | * Grab our output buffer. | ||
390 | */ | ||
391 | nl = get_result_buffer(param, param_size, &len); | ||
392 | if (len < needed) { | ||
393 | param->flags |= DM_BUFFER_FULL_FLAG; | ||
394 | goto out; | ||
395 | } | ||
396 | param->data_size = param->data_start + needed; | ||
397 | |||
398 | nl->dev = 0; /* Flags no data */ | ||
399 | |||
400 | /* | ||
401 | * Now loop through filling out the names. | ||
402 | */ | ||
403 | for (i = 0; i < NUM_BUCKETS; i++) { | ||
404 | list_for_each_entry (hc, _name_buckets + i, name_list) { | ||
405 | if (old_nl) | ||
406 | old_nl->next = (uint32_t) ((void *) nl - | ||
407 | (void *) old_nl); | ||
408 | disk = dm_disk(hc->md); | ||
409 | nl->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor)); | ||
410 | nl->next = 0; | ||
411 | strcpy(nl->name, hc->name); | ||
412 | |||
413 | old_nl = nl; | ||
414 | nl = align_ptr(((void *) ++nl) + strlen(hc->name) + 1); | ||
415 | } | ||
416 | } | ||
417 | |||
418 | out: | ||
419 | up_write(&_hash_lock); | ||
420 | return 0; | ||
421 | } | ||
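The list built above is packed: each dm_name_list record carries in its next field the byte offset from the start of that record to the start of the following one, 0 marks the last entry, and dev == 0 on the first entry flags an empty list. A userspace sketch of how a caller might walk the returned payload (illustration only, not part of this commit):

#include <stdio.h>
#include <linux/dm-ioctl.h>

static void walk_names(struct dm_ioctl *param)
{
	struct dm_name_list *nl =
		(struct dm_name_list *) ((char *) param + param->data_start);

	if (!nl->dev)		/* dev == 0 flags an empty list */
		return;

	for (;;) {
		printf("%s (dev %llu)\n", nl->name,
		       (unsigned long long) nl->dev);
		if (!nl->next)	/* next == 0 marks the last entry */
			break;
		nl = (struct dm_name_list *) ((char *) nl + nl->next);
	}
}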
422 | |||
423 | static void list_version_get_needed(struct target_type *tt, void *needed_param) | ||
424 | { | ||
425 | size_t *needed = needed_param; | ||
426 | |||
427 | *needed += strlen(tt->name); | ||
428 | *needed += sizeof(tt->version); | ||
429 | *needed += ALIGN_MASK; | ||
430 | } | ||
431 | |||
432 | static void list_version_get_info(struct target_type *tt, void *param) | ||
433 | { | ||
434 | struct vers_iter *info = param; | ||
435 | |||
436 | /* Check space - it might have changed since the first iteration */ | ||
437 | if ((char *)info->vers + sizeof(tt->version) + strlen(tt->name) + 1 > | ||
438 | info->end) { | ||
439 | |||
440 | info->flags = DM_BUFFER_FULL_FLAG; | ||
441 | return; | ||
442 | } | ||
443 | |||
444 | if (info->old_vers) | ||
445 | info->old_vers->next = (uint32_t) ((void *)info->vers - | ||
446 | (void *)info->old_vers); | ||
447 | info->vers->version[0] = tt->version[0]; | ||
448 | info->vers->version[1] = tt->version[1]; | ||
449 | info->vers->version[2] = tt->version[2]; | ||
450 | info->vers->next = 0; | ||
451 | strcpy(info->vers->name, tt->name); | ||
452 | |||
453 | info->old_vers = info->vers; | ||
454 | info->vers = align_ptr(((void *) ++info->vers) + strlen(tt->name) + 1); | ||
455 | } | ||
456 | |||
457 | static int list_versions(struct dm_ioctl *param, size_t param_size) | ||
458 | { | ||
459 | size_t len, needed = 0; | ||
460 | struct dm_target_versions *vers; | ||
461 | struct vers_iter iter_info; | ||
462 | |||
463 | /* | ||
464 | * Loop through all the devices working out how much | ||
465 | * space we need. | ||
466 | */ | ||
467 | dm_target_iterate(list_version_get_needed, &needed); | ||
468 | |||
469 | /* | ||
470 | * Grab our output buffer. | ||
471 | */ | ||
472 | vers = get_result_buffer(param, param_size, &len); | ||
473 | if (len < needed) { | ||
474 | param->flags |= DM_BUFFER_FULL_FLAG; | ||
475 | goto out; | ||
476 | } | ||
477 | param->data_size = param->data_start + needed; | ||
478 | |||
479 | iter_info.param_size = param_size; | ||
480 | iter_info.old_vers = NULL; | ||
481 | iter_info.vers = vers; | ||
482 | iter_info.flags = 0; | ||
483 | iter_info.end = (char *)vers+len; | ||
484 | |||
485 | /* | ||
486 | * Now loop through filling out the names & versions. | ||
487 | */ | ||
488 | dm_target_iterate(list_version_get_info, &iter_info); | ||
489 | param->flags |= iter_info.flags; | ||
490 | |||
491 | out: | ||
492 | return 0; | ||
493 | } | ||
494 | |||
495 | |||
496 | |||
497 | static int check_name(const char *name) | ||
498 | { | ||
499 | if (strchr(name, '/')) { | ||
500 | DMWARN("invalid device name"); | ||
501 | return -EINVAL; | ||
502 | } | ||
503 | |||
504 | return 0; | ||
505 | } | ||
506 | |||
507 | /* | ||
508 | * Fills in a dm_ioctl structure, ready for sending back to | ||
509 | * userland. | ||
510 | */ | ||
511 | static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) | ||
512 | { | ||
513 | struct gendisk *disk = dm_disk(md); | ||
514 | struct dm_table *table; | ||
515 | struct block_device *bdev; | ||
516 | |||
517 | param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | | ||
518 | DM_ACTIVE_PRESENT_FLAG); | ||
519 | |||
520 | if (dm_suspended(md)) | ||
521 | param->flags |= DM_SUSPEND_FLAG; | ||
522 | |||
523 | param->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor)); | ||
524 | |||
525 | if (!(param->flags & DM_SKIP_BDGET_FLAG)) { | ||
526 | bdev = bdget_disk(disk, 0); | ||
527 | if (!bdev) | ||
528 | return -ENXIO; | ||
529 | |||
530 | /* | ||
531 | * Yes, this will be out of date by the time it gets back | ||
532 | * to userland, but it is still very useful for | ||
533 | * debugging. | ||
534 | */ | ||
535 | param->open_count = bdev->bd_openers; | ||
536 | bdput(bdev); | ||
537 | } else | ||
538 | param->open_count = -1; | ||
539 | |||
540 | if (disk->policy) | ||
541 | param->flags |= DM_READONLY_FLAG; | ||
542 | |||
543 | param->event_nr = dm_get_event_nr(md); | ||
544 | |||
545 | table = dm_get_table(md); | ||
546 | if (table) { | ||
547 | param->flags |= DM_ACTIVE_PRESENT_FLAG; | ||
548 | param->target_count = dm_table_get_num_targets(table); | ||
549 | dm_table_put(table); | ||
550 | } else | ||
551 | param->target_count = 0; | ||
552 | |||
553 | return 0; | ||
554 | } | ||
555 | |||
556 | static int dev_create(struct dm_ioctl *param, size_t param_size) | ||
557 | { | ||
558 | int r; | ||
559 | struct mapped_device *md; | ||
560 | |||
561 | r = check_name(param->name); | ||
562 | if (r) | ||
563 | return r; | ||
564 | |||
565 | if (param->flags & DM_PERSISTENT_DEV_FLAG) | ||
566 | r = dm_create_with_minor(MINOR(huge_decode_dev(param->dev)), &md); | ||
567 | else | ||
568 | r = dm_create(&md); | ||
569 | |||
570 | if (r) | ||
571 | return r; | ||
572 | |||
573 | r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md); | ||
574 | if (r) { | ||
575 | dm_put(md); | ||
576 | return r; | ||
577 | } | ||
578 | |||
579 | param->flags &= ~DM_INACTIVE_PRESENT_FLAG; | ||
580 | |||
581 | r = __dev_status(md, param); | ||
582 | dm_put(md); | ||
583 | |||
584 | return r; | ||
585 | } | ||
586 | |||
587 | /* | ||
588 | * Always use UUID for lookups if it's present, otherwise use name or dev. | ||
589 | */ | ||
590 | static inline struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) | ||
591 | { | ||
592 | if (*param->uuid) | ||
593 | return __get_uuid_cell(param->uuid); | ||
594 | else if (*param->name) | ||
595 | return __get_name_cell(param->name); | ||
596 | else | ||
597 | return dm_get_mdptr(huge_decode_dev(param->dev)); | ||
598 | } | ||
599 | |||
600 | static inline struct mapped_device *find_device(struct dm_ioctl *param) | ||
601 | { | ||
602 | struct hash_cell *hc; | ||
603 | struct mapped_device *md = NULL; | ||
604 | |||
605 | down_read(&_hash_lock); | ||
606 | hc = __find_device_hash_cell(param); | ||
607 | if (hc) { | ||
608 | md = hc->md; | ||
609 | dm_get(md); | ||
610 | |||
611 | /* | ||
612 | * Sneakily write in both the name and the uuid | ||
613 | * while we have the cell. | ||
614 | */ | ||
615 | strncpy(param->name, hc->name, sizeof(param->name)); | ||
616 | if (hc->uuid) | ||
617 | strncpy(param->uuid, hc->uuid, sizeof(param->uuid)-1); | ||
618 | else | ||
619 | param->uuid[0] = '\0'; | ||
620 | |||
621 | if (hc->new_map) | ||
622 | param->flags |= DM_INACTIVE_PRESENT_FLAG; | ||
623 | else | ||
624 | param->flags &= ~DM_INACTIVE_PRESENT_FLAG; | ||
625 | } | ||
626 | up_read(&_hash_lock); | ||
627 | |||
628 | return md; | ||
629 | } | ||
630 | |||
631 | static int dev_remove(struct dm_ioctl *param, size_t param_size) | ||
632 | { | ||
633 | struct hash_cell *hc; | ||
634 | |||
635 | down_write(&_hash_lock); | ||
636 | hc = __find_device_hash_cell(param); | ||
637 | |||
638 | if (!hc) { | ||
639 | DMWARN("device doesn't appear to be in the dev hash table."); | ||
640 | up_write(&_hash_lock); | ||
641 | return -ENXIO; | ||
642 | } | ||
643 | |||
644 | __hash_remove(hc); | ||
645 | up_write(&_hash_lock); | ||
646 | param->data_size = 0; | ||
647 | return 0; | ||
648 | } | ||
649 | |||
650 | /* | ||
651 | * Check a string doesn't overrun the chunk of | ||
652 | * memory we copied from userland. | ||
653 | */ | ||
654 | static int invalid_str(char *str, void *end) | ||
655 | { | ||
656 | while ((void *) str < end) | ||
657 | if (!*str++) | ||
658 | return 0; | ||
659 | |||
660 | return -EINVAL; | ||
661 | } | ||
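A userspace sketch of the check (illustration only): 0 means a terminator was found before end, and the -EINVAL case catches a string that runs off the end of the copied parameter block.

#include <stdio.h>
#include <errno.h>

/* Userspace copy of invalid_str() above. */
static int invalid_str(char *str, void *end)
{
	while ((void *) str < end)
		if (!*str++)
			return 0;

	return -EINVAL;
}

int main(void)
{
	char buf[8] = { 'o', 'k', '\0', 'x', 'x', 'x', 'x', 'x' };

	printf("%d\n", invalid_str(buf, buf + sizeof(buf)));		/* 0 */
	printf("%d\n", invalid_str(buf + 3, buf + sizeof(buf)));	/* -EINVAL */
	return 0;
}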
662 | |||
663 | static int dev_rename(struct dm_ioctl *param, size_t param_size) | ||
664 | { | ||
665 | int r; | ||
666 | char *new_name = (char *) param + param->data_start; | ||
667 | |||
668 | if (new_name < (char *) (param + 1) || | ||
669 | invalid_str(new_name, (void *) param + param_size)) { | ||
670 | DMWARN("Invalid new logical volume name supplied."); | ||
671 | return -EINVAL; | ||
672 | } | ||
673 | |||
674 | r = check_name(new_name); | ||
675 | if (r) | ||
676 | return r; | ||
677 | |||
678 | param->data_size = 0; | ||
679 | return dm_hash_rename(param->name, new_name); | ||
680 | } | ||
681 | |||
682 | static int do_suspend(struct dm_ioctl *param) | ||
683 | { | ||
684 | int r = 0; | ||
685 | struct mapped_device *md; | ||
686 | |||
687 | md = find_device(param); | ||
688 | if (!md) | ||
689 | return -ENXIO; | ||
690 | |||
691 | if (!dm_suspended(md)) | ||
692 | r = dm_suspend(md); | ||
693 | |||
694 | if (!r) | ||
695 | r = __dev_status(md, param); | ||
696 | |||
697 | dm_put(md); | ||
698 | return r; | ||
699 | } | ||
700 | |||
701 | static int do_resume(struct dm_ioctl *param) | ||
702 | { | ||
703 | int r = 0; | ||
704 | struct hash_cell *hc; | ||
705 | struct mapped_device *md; | ||
706 | struct dm_table *new_map; | ||
707 | |||
708 | down_write(&_hash_lock); | ||
709 | |||
710 | hc = __find_device_hash_cell(param); | ||
711 | if (!hc) { | ||
712 | DMWARN("device doesn't appear to be in the dev hash table."); | ||
713 | up_write(&_hash_lock); | ||
714 | return -ENXIO; | ||
715 | } | ||
716 | |||
717 | md = hc->md; | ||
718 | dm_get(md); | ||
719 | |||
720 | new_map = hc->new_map; | ||
721 | hc->new_map = NULL; | ||
722 | param->flags &= ~DM_INACTIVE_PRESENT_FLAG; | ||
723 | |||
724 | up_write(&_hash_lock); | ||
725 | |||
726 | /* Do we need to load a new map? */ | ||
727 | if (new_map) { | ||
728 | /* Suspend if it isn't already suspended */ | ||
729 | if (!dm_suspended(md)) | ||
730 | dm_suspend(md); | ||
731 | |||
732 | r = dm_swap_table(md, new_map); | ||
733 | if (r) { | ||
734 | dm_put(md); | ||
735 | dm_table_put(new_map); | ||
736 | return r; | ||
737 | } | ||
738 | |||
739 | if (dm_table_get_mode(new_map) & FMODE_WRITE) | ||
740 | set_disk_ro(dm_disk(md), 0); | ||
741 | else | ||
742 | set_disk_ro(dm_disk(md), 1); | ||
743 | |||
744 | dm_table_put(new_map); | ||
745 | } | ||
746 | |||
747 | if (dm_suspended(md)) | ||
748 | r = dm_resume(md); | ||
749 | |||
750 | if (!r) | ||
751 | r = __dev_status(md, param); | ||
752 | |||
753 | dm_put(md); | ||
754 | return r; | ||
755 | } | ||
756 | |||
757 | /* | ||
758 | * Set or unset the suspension state of a device. | ||
759 | * If the device already is in the requested state we just return its status. | ||
760 | */ | ||
761 | static int dev_suspend(struct dm_ioctl *param, size_t param_size) | ||
762 | { | ||
763 | if (param->flags & DM_SUSPEND_FLAG) | ||
764 | return do_suspend(param); | ||
765 | |||
766 | return do_resume(param); | ||
767 | } | ||
768 | |||
769 | /* | ||
770 | * Copies device info back to user space, used by | ||
771 | * the create and info ioctls. | ||
772 | */ | ||
773 | static int dev_status(struct dm_ioctl *param, size_t param_size) | ||
774 | { | ||
775 | int r; | ||
776 | struct mapped_device *md; | ||
777 | |||
778 | md = find_device(param); | ||
779 | if (!md) | ||
780 | return -ENXIO; | ||
781 | |||
782 | r = __dev_status(md, param); | ||
783 | dm_put(md); | ||
784 | return r; | ||
785 | } | ||
786 | |||
787 | /* | ||
788 | * Build up the status struct for each target | ||
789 | */ | ||
790 | static void retrieve_status(struct dm_table *table, | ||
791 | struct dm_ioctl *param, size_t param_size) | ||
792 | { | ||
793 | unsigned int i, num_targets; | ||
794 | struct dm_target_spec *spec; | ||
795 | char *outbuf, *outptr; | ||
796 | status_type_t type; | ||
797 | size_t remaining, len, used = 0; | ||
798 | |||
799 | outptr = outbuf = get_result_buffer(param, param_size, &len); | ||
800 | |||
801 | if (param->flags & DM_STATUS_TABLE_FLAG) | ||
802 | type = STATUSTYPE_TABLE; | ||
803 | else | ||
804 | type = STATUSTYPE_INFO; | ||
805 | |||
806 | /* Get all the target info */ | ||
807 | num_targets = dm_table_get_num_targets(table); | ||
808 | for (i = 0; i < num_targets; i++) { | ||
809 | struct dm_target *ti = dm_table_get_target(table, i); | ||
810 | |||
811 | remaining = len - (outptr - outbuf); | ||
812 | if (remaining <= sizeof(struct dm_target_spec)) { | ||
813 | param->flags |= DM_BUFFER_FULL_FLAG; | ||
814 | break; | ||
815 | } | ||
816 | |||
817 | spec = (struct dm_target_spec *) outptr; | ||
818 | |||
819 | spec->status = 0; | ||
820 | spec->sector_start = ti->begin; | ||
821 | spec->length = ti->len; | ||
822 | strncpy(spec->target_type, ti->type->name, | ||
823 | sizeof(spec->target_type)); | ||
824 | |||
825 | outptr += sizeof(struct dm_target_spec); | ||
826 | remaining = len - (outptr - outbuf); | ||
827 | if (remaining <= 0) { | ||
828 | param->flags |= DM_BUFFER_FULL_FLAG; | ||
829 | break; | ||
830 | } | ||
831 | |||
832 | /* Get the status/table string from the target driver */ | ||
833 | if (ti->type->status) { | ||
834 | if (ti->type->status(ti, type, outptr, remaining)) { | ||
835 | param->flags |= DM_BUFFER_FULL_FLAG; | ||
836 | break; | ||
837 | } | ||
838 | } else | ||
839 | outptr[0] = '\0'; | ||
840 | |||
841 | outptr += strlen(outptr) + 1; | ||
842 | used = param->data_start + (outptr - outbuf); | ||
843 | |||
844 | outptr = align_ptr(outptr); | ||
845 | spec->next = outptr - outbuf; | ||
846 | } | ||
847 | |||
848 | if (used) | ||
849 | param->data_size = used; | ||
850 | |||
851 | param->target_count = num_targets; | ||
852 | } | ||
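retrieve_status() lays the result out as a chain of dm_target_spec headers, each followed by its NUL-terminated status/table string; note that here spec->next is the offset from the start of the payload area, not from the previous spec. A userspace sketch of reading it back (illustration only, not part of this commit):

#include <stdio.h>
#include <linux/dm-ioctl.h>

static void walk_status(struct dm_ioctl *param)
{
	char *outbuf = (char *) param + param->data_start;
	char *outptr = outbuf;
	unsigned int i;

	for (i = 0; i < param->target_count; i++) {
		struct dm_target_spec *spec = (struct dm_target_spec *) outptr;
		char *status = (char *) (spec + 1);

		printf("%llu %llu %s %s\n",
		       (unsigned long long) spec->sector_start,
		       (unsigned long long) spec->length,
		       spec->target_type, status);

		/* next is relative to the start of the payload area */
		outptr = outbuf + spec->next;
	}
}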
853 | |||
854 | /* | ||
855 | * Wait for a device to report an event | ||
856 | */ | ||
857 | static int dev_wait(struct dm_ioctl *param, size_t param_size) | ||
858 | { | ||
859 | int r; | ||
860 | struct mapped_device *md; | ||
861 | struct dm_table *table; | ||
862 | |||
863 | md = find_device(param); | ||
864 | if (!md) | ||
865 | return -ENXIO; | ||
866 | |||
867 | /* | ||
868 | * Wait for a notification event | ||
869 | */ | ||
870 | if (dm_wait_event(md, param->event_nr)) { | ||
871 | r = -ERESTARTSYS; | ||
872 | goto out; | ||
873 | } | ||
874 | |||
875 | /* | ||
876 | * The userland program is going to want to know what | ||
877 | * changed to trigger the event, so we may as well tell | ||
878 | * it and save an ioctl. | ||
879 | */ | ||
880 | r = __dev_status(md, param); | ||
881 | if (r) | ||
882 | goto out; | ||
883 | |||
884 | table = dm_get_table(md); | ||
885 | if (table) { | ||
886 | retrieve_status(table, param, param_size); | ||
887 | dm_table_put(table); | ||
888 | } | ||
889 | |||
890 | out: | ||
891 | dm_put(md); | ||
892 | return r; | ||
893 | } | ||
894 | |||
895 | static inline int get_mode(struct dm_ioctl *param) | ||
896 | { | ||
897 | int mode = FMODE_READ | FMODE_WRITE; | ||
898 | |||
899 | if (param->flags & DM_READONLY_FLAG) | ||
900 | mode = FMODE_READ; | ||
901 | |||
902 | return mode; | ||
903 | } | ||
904 | |||
905 | static int next_target(struct dm_target_spec *last, uint32_t next, void *end, | ||
906 | struct dm_target_spec **spec, char **target_params) | ||
907 | { | ||
908 | *spec = (struct dm_target_spec *) ((unsigned char *) last + next); | ||
909 | *target_params = (char *) (*spec + 1); | ||
910 | |||
911 | if (*spec < (last + 1)) | ||
912 | return -EINVAL; | ||
913 | |||
914 | return invalid_str(*target_params, end); | ||
915 | } | ||
916 | |||
917 | static int populate_table(struct dm_table *table, | ||
918 | struct dm_ioctl *param, size_t param_size) | ||
919 | { | ||
920 | int r; | ||
921 | unsigned int i = 0; | ||
922 | struct dm_target_spec *spec = (struct dm_target_spec *) param; | ||
923 | uint32_t next = param->data_start; | ||
924 | void *end = (void *) param + param_size; | ||
925 | char *target_params; | ||
926 | |||
927 | if (!param->target_count) { | ||
928 | DMWARN("populate_table: no targets specified"); | ||
929 | return -EINVAL; | ||
930 | } | ||
931 | |||
932 | for (i = 0; i < param->target_count; i++) { | ||
933 | |||
934 | r = next_target(spec, next, end, &spec, &target_params); | ||
935 | if (r) { | ||
936 | DMWARN("unable to find target"); | ||
937 | return r; | ||
938 | } | ||
939 | |||
940 | r = dm_table_add_target(table, spec->target_type, | ||
941 | (sector_t) spec->sector_start, | ||
942 | (sector_t) spec->length, | ||
943 | target_params); | ||
944 | if (r) { | ||
945 | DMWARN("error adding target to table"); | ||
946 | return r; | ||
947 | } | ||
948 | |||
949 | next = spec->next; | ||
950 | } | ||
951 | |||
952 | return dm_table_complete(table); | ||
953 | } | ||
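Going the other direction, a DM_TABLE_LOAD payload starts at param->data_start with a dm_target_spec immediately followed by its parameter string; per next_target() above, each spec's next field is the byte offset from that spec to the following one. A minimal userspace sketch packing one target (illustration only: the device path and sizes are made up, and the caller is assumed to have allocated a large enough buffer and set data_start past the dm_ioctl header):

#include <string.h>
#include <linux/dm-ioctl.h>

static void pack_one_linear_target(struct dm_ioctl *param)
{
	struct dm_target_spec *spec =
		(struct dm_target_spec *) ((char *) param + param->data_start);
	char *params = (char *) (spec + 1);

	spec->sector_start = 0;
	spec->length = 204800;			/* 100 MiB in 512-byte sectors */
	spec->status = 0;
	spec->next = 0;				/* only target, nothing follows */
	strcpy(spec->target_type, "linear");
	strcpy(params, "/dev/sda1 2048");	/* hypothetical <dev_path> <offset> */

	param->target_count = 1;
}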
954 | |||
955 | static int table_load(struct dm_ioctl *param, size_t param_size) | ||
956 | { | ||
957 | int r; | ||
958 | struct hash_cell *hc; | ||
959 | struct dm_table *t; | ||
960 | |||
961 | r = dm_table_create(&t, get_mode(param), param->target_count); | ||
962 | if (r) | ||
963 | return r; | ||
964 | |||
965 | r = populate_table(t, param, param_size); | ||
966 | if (r) { | ||
967 | dm_table_put(t); | ||
968 | return r; | ||
969 | } | ||
970 | |||
971 | down_write(&_hash_lock); | ||
972 | hc = __find_device_hash_cell(param); | ||
973 | if (!hc) { | ||
974 | DMWARN("device doesn't appear to be in the dev hash table."); | ||
975 | up_write(&_hash_lock); | ||
976 | return -ENXIO; | ||
977 | } | ||
978 | |||
979 | if (hc->new_map) | ||
980 | dm_table_put(hc->new_map); | ||
981 | hc->new_map = t; | ||
982 | param->flags |= DM_INACTIVE_PRESENT_FLAG; | ||
983 | |||
984 | r = __dev_status(hc->md, param); | ||
985 | up_write(&_hash_lock); | ||
986 | return r; | ||
987 | } | ||
988 | |||
989 | static int table_clear(struct dm_ioctl *param, size_t param_size) | ||
990 | { | ||
991 | int r; | ||
992 | struct hash_cell *hc; | ||
993 | |||
994 | down_write(&_hash_lock); | ||
995 | |||
996 | hc = __find_device_hash_cell(param); | ||
997 | if (!hc) { | ||
998 | DMWARN("device doesn't appear to be in the dev hash table."); | ||
999 | up_write(&_hash_lock); | ||
1000 | return -ENXIO; | ||
1001 | } | ||
1002 | |||
1003 | if (hc->new_map) { | ||
1004 | dm_table_put(hc->new_map); | ||
1005 | hc->new_map = NULL; | ||
1006 | } | ||
1007 | |||
1008 | param->flags &= ~DM_INACTIVE_PRESENT_FLAG; | ||
1009 | |||
1010 | r = __dev_status(hc->md, param); | ||
1011 | up_write(&_hash_lock); | ||
1012 | return r; | ||
1013 | } | ||
1014 | |||
1015 | /* | ||
1016 | * Retrieves a list of devices used by a particular dm device. | ||
1017 | */ | ||
1018 | static void retrieve_deps(struct dm_table *table, | ||
1019 | struct dm_ioctl *param, size_t param_size) | ||
1020 | { | ||
1021 | unsigned int count = 0; | ||
1022 | struct list_head *tmp; | ||
1023 | size_t len, needed; | ||
1024 | struct dm_dev *dd; | ||
1025 | struct dm_target_deps *deps; | ||
1026 | |||
1027 | deps = get_result_buffer(param, param_size, &len); | ||
1028 | |||
1029 | /* | ||
1030 | * Count the devices. | ||
1031 | */ | ||
1032 | list_for_each (tmp, dm_table_get_devices(table)) | ||
1033 | count++; | ||
1034 | |||
1035 | /* | ||
1036 | * Check we have enough space. | ||
1037 | */ | ||
1038 | needed = sizeof(*deps) + (sizeof(*deps->dev) * count); | ||
1039 | if (len < needed) { | ||
1040 | param->flags |= DM_BUFFER_FULL_FLAG; | ||
1041 | return; | ||
1042 | } | ||
1043 | |||
1044 | /* | ||
1045 | * Fill in the devices. | ||
1046 | */ | ||
1047 | deps->count = count; | ||
1048 | count = 0; | ||
1049 | list_for_each_entry (dd, dm_table_get_devices(table), list) | ||
1050 | deps->dev[count++] = huge_encode_dev(dd->bdev->bd_dev); | ||
1051 | |||
1052 | param->data_size = param->data_start + needed; | ||
1053 | } | ||
1054 | |||
1055 | static int table_deps(struct dm_ioctl *param, size_t param_size) | ||
1056 | { | ||
1057 | int r = 0; | ||
1058 | struct mapped_device *md; | ||
1059 | struct dm_table *table; | ||
1060 | |||
1061 | md = find_device(param); | ||
1062 | if (!md) | ||
1063 | return -ENXIO; | ||
1064 | |||
1065 | r = __dev_status(md, param); | ||
1066 | if (r) | ||
1067 | goto out; | ||
1068 | |||
1069 | table = dm_get_table(md); | ||
1070 | if (table) { | ||
1071 | retrieve_deps(table, param, param_size); | ||
1072 | dm_table_put(table); | ||
1073 | } | ||
1074 | |||
1075 | out: | ||
1076 | dm_put(md); | ||
1077 | return r; | ||
1078 | } | ||
1079 | |||
1080 | /* | ||
1081 | * Return the status of a device as a text string for each | ||
1082 | * target. | ||
1083 | */ | ||
1084 | static int table_status(struct dm_ioctl *param, size_t param_size) | ||
1085 | { | ||
1086 | int r; | ||
1087 | struct mapped_device *md; | ||
1088 | struct dm_table *table; | ||
1089 | |||
1090 | md = find_device(param); | ||
1091 | if (!md) | ||
1092 | return -ENXIO; | ||
1093 | |||
1094 | r = __dev_status(md, param); | ||
1095 | if (r) | ||
1096 | goto out; | ||
1097 | |||
1098 | table = dm_get_table(md); | ||
1099 | if (table) { | ||
1100 | retrieve_status(table, param, param_size); | ||
1101 | dm_table_put(table); | ||
1102 | } | ||
1103 | |||
1104 | out: | ||
1105 | dm_put(md); | ||
1106 | return r; | ||
1107 | } | ||
1108 | |||
1109 | /* | ||
1110 | * Pass a message to the target that's at the supplied device offset. | ||
1111 | */ | ||
1112 | static int target_message(struct dm_ioctl *param, size_t param_size) | ||
1113 | { | ||
1114 | int r, argc; | ||
1115 | char **argv; | ||
1116 | struct mapped_device *md; | ||
1117 | struct dm_table *table; | ||
1118 | struct dm_target *ti; | ||
1119 | struct dm_target_msg *tmsg = (void *) param + param->data_start; | ||
1120 | |||
1121 | md = find_device(param); | ||
1122 | if (!md) | ||
1123 | return -ENXIO; | ||
1124 | |||
1125 | r = __dev_status(md, param); | ||
1126 | if (r) | ||
1127 | goto out; | ||
1128 | |||
1129 | if (tmsg < (struct dm_target_msg *) (param + 1) || | ||
1130 | invalid_str(tmsg->message, (void *) param + param_size)) { | ||
1131 | DMWARN("Invalid target message parameters."); | ||
1132 | r = -EINVAL; | ||
1133 | goto out; | ||
1134 | } | ||
1135 | |||
1136 | r = dm_split_args(&argc, &argv, tmsg->message); | ||
1137 | if (r) { | ||
1138 | DMWARN("Failed to split target message parameters"); | ||
1139 | goto out; | ||
1140 | } | ||
1141 | |||
1142 | table = dm_get_table(md); | ||
1143 | if (!table) | ||
1144 | goto out_argv; | ||
1145 | |||
1146 | if (tmsg->sector >= dm_table_get_size(table)) { | ||
1147 | DMWARN("Target message sector outside device."); | ||
1148 | r = -EINVAL; | ||
1149 | goto out_table; | ||
1150 | } | ||
1151 | |||
1152 | ti = dm_table_find_target(table, tmsg->sector); | ||
1153 | if (ti->type->message) | ||
1154 | r = ti->type->message(ti, argc, argv); | ||
1155 | else { | ||
1156 | DMWARN("Target type does not support messages"); | ||
1157 | r = -EINVAL; | ||
1158 | } | ||
1159 | |||
1160 | out_table: | ||
1161 | dm_table_put(table); | ||
1162 | out_argv: | ||
1163 | kfree(argv); | ||
1164 | out: | ||
1165 | param->data_size = 0; | ||
1166 | dm_put(md); | ||
1167 | return r; | ||
1168 | } | ||
1169 | |||
1170 | /*----------------------------------------------------------------- | ||
1171 | * Implementation of open/close/ioctl on the special char | ||
1172 | * device. | ||
1173 | *---------------------------------------------------------------*/ | ||
1174 | static ioctl_fn lookup_ioctl(unsigned int cmd) | ||
1175 | { | ||
1176 | static struct { | ||
1177 | int cmd; | ||
1178 | ioctl_fn fn; | ||
1179 | } _ioctls[] = { | ||
1180 | {DM_VERSION_CMD, NULL}, /* version is dealt with elsewhere */ | ||
1181 | {DM_REMOVE_ALL_CMD, remove_all}, | ||
1182 | {DM_LIST_DEVICES_CMD, list_devices}, | ||
1183 | |||
1184 | {DM_DEV_CREATE_CMD, dev_create}, | ||
1185 | {DM_DEV_REMOVE_CMD, dev_remove}, | ||
1186 | {DM_DEV_RENAME_CMD, dev_rename}, | ||
1187 | {DM_DEV_SUSPEND_CMD, dev_suspend}, | ||
1188 | {DM_DEV_STATUS_CMD, dev_status}, | ||
1189 | {DM_DEV_WAIT_CMD, dev_wait}, | ||
1190 | |||
1191 | {DM_TABLE_LOAD_CMD, table_load}, | ||
1192 | {DM_TABLE_CLEAR_CMD, table_clear}, | ||
1193 | {DM_TABLE_DEPS_CMD, table_deps}, | ||
1194 | {DM_TABLE_STATUS_CMD, table_status}, | ||
1195 | |||
1196 | {DM_LIST_VERSIONS_CMD, list_versions}, | ||
1197 | |||
1198 | {DM_TARGET_MSG_CMD, target_message} | ||
1199 | }; | ||
1200 | |||
1201 | return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn; | ||
1202 | } | ||
1203 | |||
1204 | /* | ||
1205 | * As well as checking the version compatibility this always | ||
1206 | * copies the kernel interface version out. | ||
1207 | */ | ||
1208 | static int check_version(unsigned int cmd, struct dm_ioctl __user *user) | ||
1209 | { | ||
1210 | uint32_t version[3]; | ||
1211 | int r = 0; | ||
1212 | |||
1213 | if (copy_from_user(version, user->version, sizeof(version))) | ||
1214 | return -EFAULT; | ||
1215 | |||
1216 | if ((DM_VERSION_MAJOR != version[0]) || | ||
1217 | (DM_VERSION_MINOR < version[1])) { | ||
1218 | DMWARN("ioctl interface mismatch: " | ||
1219 | "kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)", | ||
1220 | DM_VERSION_MAJOR, DM_VERSION_MINOR, | ||
1221 | DM_VERSION_PATCHLEVEL, | ||
1222 | version[0], version[1], version[2], cmd); | ||
1223 | r = -EINVAL; | ||
1224 | } | ||
1225 | |||
1226 | /* | ||
1227 | * Fill in the kernel version. | ||
1228 | */ | ||
1229 | version[0] = DM_VERSION_MAJOR; | ||
1230 | version[1] = DM_VERSION_MINOR; | ||
1231 | version[2] = DM_VERSION_PATCHLEVEL; | ||
1232 | if (copy_to_user(user->version, version, sizeof(version))) | ||
1233 | return -EFAULT; | ||
1234 | |||
1235 | return r; | ||
1236 | } | ||
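The compatibility rule above is: the major numbers must match exactly, and the kernel's minor must be at least the caller's. Pulled out as a standalone sketch with hypothetical version numbers:

#include <stdio.h>
#include <stdbool.h>

/* The check_version() rule, isolated for illustration. */
static bool compatible(unsigned kmaj, unsigned kmin,
		       unsigned umaj, unsigned umin)
{
	return kmaj == umaj && kmin >= umin;
}

int main(void)
{
	/* Hypothetical kernel interface version 4.1.x */
	printf("%d\n", compatible(4, 1, 4, 0));	/* 1: accepted */
	printf("%d\n", compatible(4, 1, 4, 2));	/* 0: caller expects a newer minor */
	printf("%d\n", compatible(4, 1, 3, 0));	/* 0: major mismatch */
	return 0;
}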
1237 | |||
1238 | static void free_params(struct dm_ioctl *param) | ||
1239 | { | ||
1240 | vfree(param); | ||
1241 | } | ||
1242 | |||
1243 | static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param) | ||
1244 | { | ||
1245 | struct dm_ioctl tmp, *dmi; | ||
1246 | |||
1247 | if (copy_from_user(&tmp, user, sizeof(tmp))) | ||
1248 | return -EFAULT; | ||
1249 | |||
1250 | if (tmp.data_size < sizeof(tmp)) | ||
1251 | return -EINVAL; | ||
1252 | |||
1253 | dmi = (struct dm_ioctl *) vmalloc(tmp.data_size); | ||
1254 | if (!dmi) | ||
1255 | return -ENOMEM; | ||
1256 | |||
1257 | if (copy_from_user(dmi, user, tmp.data_size)) { | ||
1258 | vfree(dmi); | ||
1259 | return -EFAULT; | ||
1260 | } | ||
1261 | |||
1262 | *param = dmi; | ||
1263 | return 0; | ||
1264 | } | ||
1265 | |||
1266 | static int validate_params(uint cmd, struct dm_ioctl *param) | ||
1267 | { | ||
1268 | /* Always clear this flag */ | ||
1269 | param->flags &= ~DM_BUFFER_FULL_FLAG; | ||
1270 | |||
1271 | /* Commands that ignore their parameters */ | ||
1272 | if (cmd == DM_REMOVE_ALL_CMD || | ||
1273 | cmd == DM_LIST_DEVICES_CMD || | ||
1274 | cmd == DM_LIST_VERSIONS_CMD) | ||
1275 | return 0; | ||
1276 | |||
1277 | if ((cmd == DM_DEV_CREATE_CMD)) { | ||
1278 | if (!*param->name) { | ||
1279 | DMWARN("name not supplied when creating device"); | ||
1280 | return -EINVAL; | ||
1281 | } | ||
1282 | } else if ((*param->uuid && *param->name)) { | ||
1283 | DMWARN("only supply one of name or uuid, cmd(%u)", cmd); | ||
1284 | return -EINVAL; | ||
1285 | } | ||
1286 | |||
1287 | /* Ensure strings are terminated */ | ||
1288 | param->name[DM_NAME_LEN - 1] = '\0'; | ||
1289 | param->uuid[DM_UUID_LEN - 1] = '\0'; | ||
1290 | |||
1291 | return 0; | ||
1292 | } | ||
1293 | |||
1294 | static int ctl_ioctl(struct inode *inode, struct file *file, | ||
1295 | uint command, ulong u) | ||
1296 | { | ||
1297 | int r = 0; | ||
1298 | unsigned int cmd; | ||
1299 | struct dm_ioctl *param; | ||
1300 | struct dm_ioctl __user *user = (struct dm_ioctl __user *) u; | ||
1301 | ioctl_fn fn = NULL; | ||
1302 | size_t param_size; | ||
1303 | |||
1304 | /* only root can play with this */ | ||
1305 | if (!capable(CAP_SYS_ADMIN)) | ||
1306 | return -EACCES; | ||
1307 | |||
1308 | if (_IOC_TYPE(command) != DM_IOCTL) | ||
1309 | return -ENOTTY; | ||
1310 | |||
1311 | cmd = _IOC_NR(command); | ||
1312 | |||
1313 | /* | ||
1314 | * Check the interface version passed in. This also | ||
1315 | * writes out the kernel's interface version. | ||
1316 | */ | ||
1317 | r = check_version(cmd, user); | ||
1318 | if (r) | ||
1319 | return r; | ||
1320 | |||
1321 | /* | ||
1322 | * Nothing more to do for the version command. | ||
1323 | */ | ||
1324 | if (cmd == DM_VERSION_CMD) | ||
1325 | return 0; | ||
1326 | |||
1327 | fn = lookup_ioctl(cmd); | ||
1328 | if (!fn) { | ||
1329 | DMWARN("dm_ctl_ioctl: unknown command 0x%x", command); | ||
1330 | return -ENOTTY; | ||
1331 | } | ||
1332 | |||
1333 | /* | ||
1334 | * Trying to avoid low memory issues when a device is | ||
1335 | * suspended. | ||
1336 | */ | ||
1337 | current->flags |= PF_MEMALLOC; | ||
1338 | |||
1339 | /* | ||
1340 | * Copy the parameters into kernel space. | ||
1341 | */ | ||
1342 | r = copy_params(user, ¶m); | ||
1343 | if (r) { | ||
1344 | current->flags &= ~PF_MEMALLOC; | ||
1345 | return r; | ||
1346 | } | ||
1347 | |||
1348 | /* | ||
1349 | * FIXME: eventually we will remove the PF_MEMALLOC flag | ||
1350 | * here. However the tools still do nasty things like | ||
1351 | * 'load' while a device is suspended. | ||
1352 | */ | ||
1353 | |||
1354 | r = validate_params(cmd, param); | ||
1355 | if (r) | ||
1356 | goto out; | ||
1357 | |||
1358 | param_size = param->data_size; | ||
1359 | param->data_size = sizeof(*param); | ||
1360 | r = fn(param, param_size); | ||
1361 | |||
1362 | /* | ||
1363 | * Copy the results back to userland. | ||
1364 | */ | ||
1365 | if (!r && copy_to_user(user, param, param->data_size)) | ||
1366 | r = -EFAULT; | ||
1367 | |||
1368 | out: | ||
1369 | free_params(param); | ||
1370 | current->flags &= ~PF_MEMALLOC; | ||
1371 | return r; | ||
1372 | } | ||
1373 | |||
1374 | static struct file_operations _ctl_fops = { | ||
1375 | .ioctl = ctl_ioctl, | ||
1376 | .owner = THIS_MODULE, | ||
1377 | }; | ||
1378 | |||
1379 | static struct miscdevice _dm_misc = { | ||
1380 | .minor = MISC_DYNAMIC_MINOR, | ||
1381 | .name = DM_NAME, | ||
1382 | .devfs_name = "mapper/control", | ||
1383 | .fops = &_ctl_fops | ||
1384 | }; | ||
1385 | |||
1386 | /* | ||
1387 | * Create misc character device and link to DM_DIR/control. | ||
1388 | */ | ||
1389 | int __init dm_interface_init(void) | ||
1390 | { | ||
1391 | int r; | ||
1392 | |||
1393 | r = dm_hash_init(); | ||
1394 | if (r) | ||
1395 | return r; | ||
1396 | |||
1397 | r = misc_register(&_dm_misc); | ||
1398 | if (r) { | ||
1399 | DMERR("misc_register failed for control device"); | ||
1400 | dm_hash_exit(); | ||
1401 | return r; | ||
1402 | } | ||
1403 | |||
1404 | DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR, | ||
1405 | DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA, | ||
1406 | DM_DRIVER_EMAIL); | ||
1407 | return 0; | ||
1408 | } | ||
1409 | |||
1410 | void dm_interface_exit(void) | ||
1411 | { | ||
1412 | if (misc_deregister(&_dm_misc) < 0) | ||
1413 | DMERR("misc_deregister failed for control device"); | ||
1414 | |||
1415 | dm_hash_exit(); | ||
1416 | } | ||
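Putting the pieces of dm-ioctl.c together: a caller drives this interface by opening the misc control node, filling in the interface version and a name, and issuing one of the commands dispatched by lookup_ioctl(). A minimal userspace sketch (illustration only, not part of this commit; it assumes the control node lives at /dev/mapper/control, the device name is hypothetical, and error handling is trimmed):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/dm-ioctl.h>

int main(void)
{
	struct dm_ioctl dmi;
	int fd = open("/dev/mapper/control", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&dmi, 0, sizeof(dmi));
	dmi.version[0] = DM_VERSION_MAJOR;	/* checked by check_version() */
	dmi.version[1] = DM_VERSION_MINOR;
	dmi.version[2] = DM_VERSION_PATCHLEVEL;
	dmi.data_size = sizeof(dmi);		/* no extra payload */
	dmi.data_start = sizeof(dmi);
	strncpy(dmi.name, "example-dev", sizeof(dmi.name) - 1);

	if (ioctl(fd, DM_DEV_CREATE, &dmi) < 0)
		perror("DM_DEV_CREATE");

	return 0;
}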
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c new file mode 100644 index 000000000000..6a2cd5dc8a63 --- /dev/null +++ b/drivers/md/dm-linear.c | |||
@@ -0,0 +1,123 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2001-2003 Sistina Software (UK) Limited. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #include "dm.h" | ||
8 | |||
9 | #include <linux/module.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/blkdev.h> | ||
12 | #include <linux/bio.h> | ||
13 | #include <linux/slab.h> | ||
14 | |||
15 | /* | ||
16 | * Linear: maps a linear range of a device. | ||
17 | */ | ||
18 | struct linear_c { | ||
19 | struct dm_dev *dev; | ||
20 | sector_t start; | ||
21 | }; | ||
22 | |||
23 | /* | ||
24 | * Construct a linear mapping: <dev_path> <offset> | ||
25 | */ | ||
26 | static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) | ||
27 | { | ||
28 | struct linear_c *lc; | ||
29 | |||
30 | if (argc != 2) { | ||
31 | ti->error = "dm-linear: Invalid argument count"; | ||
32 | return -EINVAL; | ||
33 | } | ||
34 | |||
35 | lc = kmalloc(sizeof(*lc), GFP_KERNEL); | ||
36 | if (lc == NULL) { | ||
37 | ti->error = "dm-linear: Cannot allocate linear context"; | ||
38 | return -ENOMEM; | ||
39 | } | ||
40 | |||
41 | if (sscanf(argv[1], SECTOR_FORMAT, &lc->start) != 1) { | ||
42 | ti->error = "dm-linear: Invalid device sector"; | ||
43 | goto bad; | ||
44 | } | ||
45 | |||
46 | if (dm_get_device(ti, argv[0], lc->start, ti->len, | ||
47 | dm_table_get_mode(ti->table), &lc->dev)) { | ||
48 | ti->error = "dm-linear: Device lookup failed"; | ||
49 | goto bad; | ||
50 | } | ||
51 | |||
52 | ti->private = lc; | ||
53 | return 0; | ||
54 | |||
55 | bad: | ||
56 | kfree(lc); | ||
57 | return -EINVAL; | ||
58 | } | ||
59 | |||
60 | static void linear_dtr(struct dm_target *ti) | ||
61 | { | ||
62 | struct linear_c *lc = (struct linear_c *) ti->private; | ||
63 | |||
64 | dm_put_device(ti, lc->dev); | ||
65 | kfree(lc); | ||
66 | } | ||
67 | |||
68 | static int linear_map(struct dm_target *ti, struct bio *bio, | ||
69 | union map_info *map_context) | ||
70 | { | ||
71 | struct linear_c *lc = (struct linear_c *) ti->private; | ||
72 | |||
73 | bio->bi_bdev = lc->dev->bdev; | ||
74 | bio->bi_sector = lc->start + (bio->bi_sector - ti->begin); | ||
75 | |||
76 | return 1; | ||
77 | } | ||
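The remap above is pure offset arithmetic. With hypothetical numbers, a table line of "0 204800 linear /dev/sda1 2048" gives ti->begin = 0 and lc->start = 2048, so a bio aimed at logical sector 100 lands on sector 2148 of the underlying device:

#include <stdio.h>

int main(void)
{
	unsigned long long lc_start = 2048;	/* <offset> from the table line */
	unsigned long long ti_begin = 0;	/* target's start within the dm device */
	unsigned long long bi_sector = 100;	/* sector the bio asked for */

	/* Same arithmetic as linear_map() */
	printf("remapped to sector %llu\n",
	       lc_start + (bi_sector - ti_begin));	/* 2148 */
	return 0;
}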
78 | |||
79 | static int linear_status(struct dm_target *ti, status_type_t type, | ||
80 | char *result, unsigned int maxlen) | ||
81 | { | ||
82 | struct linear_c *lc = (struct linear_c *) ti->private; | ||
83 | |||
84 | switch (type) { | ||
85 | case STATUSTYPE_INFO: | ||
86 | result[0] = '\0'; | ||
87 | break; | ||
88 | |||
89 | case STATUSTYPE_TABLE: | ||
90 | snprintf(result, maxlen, "%s " SECTOR_FORMAT, lc->dev->name, | ||
91 | lc->start); | ||
92 | break; | ||
93 | } | ||
94 | return 0; | ||
95 | } | ||
96 | |||
97 | static struct target_type linear_target = { | ||
98 | .name = "linear", | ||
99 | .version= {1, 0, 1}, | ||
100 | .module = THIS_MODULE, | ||
101 | .ctr = linear_ctr, | ||
102 | .dtr = linear_dtr, | ||
103 | .map = linear_map, | ||
104 | .status = linear_status, | ||
105 | }; | ||
106 | |||
107 | int __init dm_linear_init(void) | ||
108 | { | ||
109 | int r = dm_register_target(&linear_target); | ||
110 | |||
111 | if (r < 0) | ||
112 | DMERR("linear: register failed %d", r); | ||
113 | |||
114 | return r; | ||
115 | } | ||
116 | |||
117 | void dm_linear_exit(void) | ||
118 | { | ||
119 | int r = dm_unregister_target(&linear_target); | ||
120 | |||
121 | if (r < 0) | ||
122 | DMERR("linear: unregister failed %d", r); | ||
123 | } | ||
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c new file mode 100644 index 000000000000..e110655eabdb --- /dev/null +++ b/drivers/md/dm-log.c | |||
@@ -0,0 +1,711 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Sistina Software | ||
3 | * | ||
4 | * This file is released under the LGPL. | ||
5 | */ | ||
6 | |||
7 | #include <linux/init.h> | ||
8 | #include <linux/slab.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/vmalloc.h> | ||
11 | |||
12 | #include "dm-log.h" | ||
13 | #include "dm-io.h" | ||
14 | |||
15 | static LIST_HEAD(_log_types); | ||
16 | static DEFINE_SPINLOCK(_lock); | ||
17 | |||
18 | int dm_register_dirty_log_type(struct dirty_log_type *type) | ||
19 | { | ||
20 | spin_lock(&_lock); | ||
21 | type->use_count = 0; | ||
22 | list_add(&type->list, &_log_types); | ||
23 | spin_unlock(&_lock); | ||
24 | |||
25 | return 0; | ||
26 | } | ||
27 | |||
28 | int dm_unregister_dirty_log_type(struct dirty_log_type *type) | ||
29 | { | ||
30 | spin_lock(&_lock); | ||
31 | |||
32 | if (type->use_count) | ||
33 | DMWARN("Attempt to unregister a log type that is still in use"); | ||
34 | else | ||
35 | list_del(&type->list); | ||
36 | |||
37 | spin_unlock(&_lock); | ||
38 | |||
39 | return 0; | ||
40 | } | ||
41 | |||
42 | static struct dirty_log_type *get_type(const char *type_name) | ||
43 | { | ||
44 | struct dirty_log_type *type; | ||
45 | |||
46 | spin_lock(&_lock); | ||
47 | list_for_each_entry (type, &_log_types, list) | ||
48 | if (!strcmp(type_name, type->name)) { | ||
49 | if (!type->use_count && !try_module_get(type->module)){ | ||
50 | spin_unlock(&_lock); | ||
51 | return NULL; | ||
52 | } | ||
53 | type->use_count++; | ||
54 | spin_unlock(&_lock); | ||
55 | return type; | ||
56 | } | ||
57 | |||
58 | spin_unlock(&_lock); | ||
59 | return NULL; | ||
60 | } | ||
61 | |||
62 | static void put_type(struct dirty_log_type *type) | ||
63 | { | ||
64 | spin_lock(&_lock); | ||
65 | if (!--type->use_count) | ||
66 | module_put(type->module); | ||
67 | spin_unlock(&_lock); | ||
68 | } | ||
69 | |||
70 | struct dirty_log *dm_create_dirty_log(const char *type_name, struct dm_target *ti, | ||
71 | unsigned int argc, char **argv) | ||
72 | { | ||
73 | struct dirty_log_type *type; | ||
74 | struct dirty_log *log; | ||
75 | |||
76 | log = kmalloc(sizeof(*log), GFP_KERNEL); | ||
77 | if (!log) | ||
78 | return NULL; | ||
79 | |||
80 | type = get_type(type_name); | ||
81 | if (!type) { | ||
82 | kfree(log); | ||
83 | return NULL; | ||
84 | } | ||
85 | |||
86 | log->type = type; | ||
87 | if (type->ctr(log, ti, argc, argv)) { | ||
88 | kfree(log); | ||
89 | put_type(type); | ||
90 | return NULL; | ||
91 | } | ||
92 | |||
93 | return log; | ||
94 | } | ||
95 | |||
96 | void dm_destroy_dirty_log(struct dirty_log *log) | ||
97 | { | ||
98 | log->type->dtr(log); | ||
99 | put_type(log->type); | ||
100 | kfree(log); | ||
101 | } | ||
102 | |||
103 | /*----------------------------------------------------------------- | ||
104 | * Persistent and core logs share a lot of their implementation. | ||
105 | * FIXME: need a reload method to be called from a resume | ||
106 | *---------------------------------------------------------------*/ | ||
107 | /* | ||
108 | * Magic for persistent mirrors: "MiRr" | ||
109 | */ | ||
110 | #define MIRROR_MAGIC 0x4D695272 | ||
111 | |||
112 | /* | ||
113 | * The on-disk version of the metadata. | ||
114 | */ | ||
115 | #define MIRROR_DISK_VERSION 1 | ||
116 | #define LOG_OFFSET 2 | ||
117 | |||
118 | struct log_header { | ||
119 | uint32_t magic; | ||
120 | |||
121 | /* | ||
122 | * Simple, incrementing version. no backward | ||
123 | * compatibility. | ||
124 | */ | ||
125 | uint32_t version; | ||
126 | sector_t nr_regions; | ||
127 | }; | ||
128 | |||
129 | struct log_c { | ||
130 | struct dm_target *ti; | ||
131 | int touched; | ||
132 | uint32_t region_size; | ||
133 | unsigned int region_count; | ||
134 | region_t sync_count; | ||
135 | |||
136 | unsigned bitset_uint32_count; | ||
137 | uint32_t *clean_bits; | ||
138 | uint32_t *sync_bits; | ||
139 | uint32_t *recovering_bits; /* FIXME: this seems excessive */ | ||
140 | |||
141 | int sync_search; | ||
142 | |||
143 | /* Resync flag */ | ||
144 | enum sync { | ||
145 | DEFAULTSYNC, /* Synchronize if necessary */ | ||
146 | NOSYNC, /* Devices known to be already in sync */ | ||
147 | FORCESYNC, /* Force a sync to happen */ | ||
148 | } sync; | ||
149 | |||
150 | /* | ||
151 | * Disk log fields | ||
152 | */ | ||
153 | struct dm_dev *log_dev; | ||
154 | struct log_header header; | ||
155 | |||
156 | struct io_region header_location; | ||
157 | struct log_header *disk_header; | ||
158 | |||
159 | struct io_region bits_location; | ||
160 | uint32_t *disk_bits; | ||
161 | }; | ||
162 | |||
163 | /* | ||
164 | * The touched member needs to be updated every time we access | ||
165 | * one of the bitsets. | ||
166 | */ | ||
167 | static inline int log_test_bit(uint32_t *bs, unsigned bit) | ||
168 | { | ||
169 | return test_bit(bit, (unsigned long *) bs) ? 1 : 0; | ||
170 | } | ||
171 | |||
172 | static inline void log_set_bit(struct log_c *l, | ||
173 | uint32_t *bs, unsigned bit) | ||
174 | { | ||
175 | set_bit(bit, (unsigned long *) bs); | ||
176 | l->touched = 1; | ||
177 | } | ||
178 | |||
179 | static inline void log_clear_bit(struct log_c *l, | ||
180 | uint32_t *bs, unsigned bit) | ||
181 | { | ||
182 | clear_bit(bit, (unsigned long *) bs); | ||
183 | l->touched = 1; | ||
184 | } | ||
185 | |||
186 | /*---------------------------------------------------------------- | ||
187 | * Header IO | ||
188 | *--------------------------------------------------------------*/ | ||
189 | static void header_to_disk(struct log_header *core, struct log_header *disk) | ||
190 | { | ||
191 | disk->magic = cpu_to_le32(core->magic); | ||
192 | disk->version = cpu_to_le32(core->version); | ||
193 | disk->nr_regions = cpu_to_le64(core->nr_regions); | ||
194 | } | ||
195 | |||
196 | static void header_from_disk(struct log_header *core, struct log_header *disk) | ||
197 | { | ||
198 | core->magic = le32_to_cpu(disk->magic); | ||
199 | core->version = le32_to_cpu(disk->version); | ||
200 | core->nr_regions = le64_to_cpu(disk->nr_regions); | ||
201 | } | ||
202 | |||
203 | static int read_header(struct log_c *log) | ||
204 | { | ||
205 | int r; | ||
206 | unsigned long ebits; | ||
207 | |||
208 | r = dm_io_sync_vm(1, &log->header_location, READ, | ||
209 | log->disk_header, &ebits); | ||
210 | if (r) | ||
211 | return r; | ||
212 | |||
213 | header_from_disk(&log->header, log->disk_header); | ||
214 | |||
215 | /* New log required? */ | ||
216 | if (log->sync != DEFAULTSYNC || log->header.magic != MIRROR_MAGIC) { | ||
217 | log->header.magic = MIRROR_MAGIC; | ||
218 | log->header.version = MIRROR_DISK_VERSION; | ||
219 | log->header.nr_regions = 0; | ||
220 | } | ||
221 | |||
222 | if (log->header.version != MIRROR_DISK_VERSION) { | ||
223 | DMWARN("incompatible disk log version"); | ||
224 | return -EINVAL; | ||
225 | } | ||
226 | |||
227 | return 0; | ||
228 | } | ||
229 | |||
230 | static inline int write_header(struct log_c *log) | ||
231 | { | ||
232 | unsigned long ebits; | ||
233 | |||
234 | header_to_disk(&log->header, log->disk_header); | ||
235 | return dm_io_sync_vm(1, &log->header_location, WRITE, | ||
236 | log->disk_header, &ebits); | ||
237 | } | ||
238 | |||
239 | /*---------------------------------------------------------------- | ||
240 | * Bits IO | ||
241 | *--------------------------------------------------------------*/ | ||
242 | static inline void bits_to_core(uint32_t *core, uint32_t *disk, unsigned count) | ||
243 | { | ||
244 | unsigned i; | ||
245 | |||
246 | for (i = 0; i < count; i++) | ||
247 | core[i] = le32_to_cpu(disk[i]); | ||
248 | } | ||
249 | |||
250 | static inline void bits_to_disk(uint32_t *core, uint32_t *disk, unsigned count) | ||
251 | { | ||
252 | unsigned i; | ||
253 | |||
254 | /* copy across the clean/dirty bitset */ | ||
255 | for (i = 0; i < count; i++) | ||
256 | disk[i] = cpu_to_le32(core[i]); | ||
257 | } | ||
258 | |||
259 | static int read_bits(struct log_c *log) | ||
260 | { | ||
261 | int r; | ||
262 | unsigned long ebits; | ||
263 | |||
264 | r = dm_io_sync_vm(1, &log->bits_location, READ, | ||
265 | log->disk_bits, &ebits); | ||
266 | if (r) | ||
267 | return r; | ||
268 | |||
269 | bits_to_core(log->clean_bits, log->disk_bits, | ||
270 | log->bitset_uint32_count); | ||
271 | return 0; | ||
272 | } | ||
273 | |||
274 | static int write_bits(struct log_c *log) | ||
275 | { | ||
276 | unsigned long ebits; | ||
277 | bits_to_disk(log->clean_bits, log->disk_bits, | ||
278 | log->bitset_uint32_count); | ||
279 | return dm_io_sync_vm(1, &log->bits_location, WRITE, | ||
280 | log->disk_bits, &ebits); | ||
281 | } | ||
282 | |||
283 | /*---------------------------------------------------------------- | ||
284 | * core log constructor/destructor | ||
285 | * | ||
286 | * argv contains region_size followed optionally by [no]sync | ||
287 | *--------------------------------------------------------------*/ | ||
288 | #define BYTE_SHIFT 3 | ||
289 | static int core_ctr(struct dirty_log *log, struct dm_target *ti, | ||
290 | unsigned int argc, char **argv) | ||
291 | { | ||
292 | enum sync sync = DEFAULTSYNC; | ||
293 | |||
294 | struct log_c *lc; | ||
295 | uint32_t region_size; | ||
296 | unsigned int region_count; | ||
297 | size_t bitset_size; | ||
298 | |||
299 | if (argc < 1 || argc > 2) { | ||
300 | DMWARN("wrong number of arguments to mirror log"); | ||
301 | return -EINVAL; | ||
302 | } | ||
303 | |||
304 | if (argc > 1) { | ||
305 | if (!strcmp(argv[1], "sync")) | ||
306 | sync = FORCESYNC; | ||
307 | else if (!strcmp(argv[1], "nosync")) | ||
308 | sync = NOSYNC; | ||
309 | else { | ||
310 | DMWARN("unrecognised sync argument to mirror log: %s", | ||
311 | argv[1]); | ||
312 | return -EINVAL; | ||
313 | } | ||
314 | } | ||
315 | |||
316 | if (sscanf(argv[0], "%u", ®ion_size) != 1) { | ||
317 | DMWARN("invalid region size string"); | ||
318 | return -EINVAL; | ||
319 | } | ||
320 | |||
321 | region_count = dm_sector_div_up(ti->len, region_size); | ||
322 | |||
323 | lc = kmalloc(sizeof(*lc), GFP_KERNEL); | ||
324 | if (!lc) { | ||
325 | DMWARN("couldn't allocate core log"); | ||
326 | return -ENOMEM; | ||
327 | } | ||
328 | |||
329 | lc->ti = ti; | ||
330 | lc->touched = 0; | ||
331 | lc->region_size = region_size; | ||
332 | lc->region_count = region_count; | ||
333 | lc->sync = sync; | ||
334 | |||
335 | /* | ||
336 | * Work out how many words we need to hold the bitset. | ||
337 | */ | ||
338 | bitset_size = dm_round_up(region_count, | ||
339 | sizeof(*lc->clean_bits) << BYTE_SHIFT); | ||
340 | bitset_size >>= BYTE_SHIFT; | ||
341 | |||
342 | lc->bitset_uint32_count = bitset_size / 4; | ||
343 | lc->clean_bits = vmalloc(bitset_size); | ||
344 | if (!lc->clean_bits) { | ||
345 | DMWARN("couldn't allocate clean bitset"); | ||
346 | kfree(lc); | ||
347 | return -ENOMEM; | ||
348 | } | ||
349 | memset(lc->clean_bits, -1, bitset_size); | ||
350 | |||
351 | lc->sync_bits = vmalloc(bitset_size); | ||
352 | if (!lc->sync_bits) { | ||
353 | DMWARN("couldn't allocate sync bitset"); | ||
354 | vfree(lc->clean_bits); | ||
355 | kfree(lc); | ||
356 | return -ENOMEM; | ||
357 | } | ||
358 | memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size); | ||
359 | lc->sync_count = (sync == NOSYNC) ? region_count : 0; | ||
360 | |||
361 | lc->recovering_bits = vmalloc(bitset_size); | ||
362 | if (!lc->recovering_bits) { | ||
363 | DMWARN("couldn't allocate sync bitset"); | ||
364 | vfree(lc->sync_bits); | ||
365 | vfree(lc->clean_bits); | ||
366 | kfree(lc); | ||
367 | return -ENOMEM; | ||
368 | } | ||
369 | memset(lc->recovering_bits, 0, bitset_size); | ||
370 | lc->sync_search = 0; | ||
371 | log->context = lc; | ||
372 | return 0; | ||
373 | } | ||
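A worked example of the bitset sizing above, written with a mask since the 32-bit word size is a power of two: with a hypothetical region_count of 1000, the count rounds up to 1024 bits, giving a 128-byte bitset and bitset_uint32_count = 32.

#include <stdio.h>
#include <stddef.h>

int main(void)
{
	unsigned int region_count = 1000;
	/* Round up to a whole number of 32-bit words, then convert bits to
	 * bytes, mirroring the dm_round_up()/BYTE_SHIFT arithmetic in
	 * core_ctr(). */
	size_t bits = (region_count + 31) & ~(size_t) 31;
	size_t bitset_size = bits >> 3;

	printf("%zu bytes, %zu uint32 words\n", bitset_size, bitset_size / 4);
	/* prints: 128 bytes, 32 uint32 words */
	return 0;
}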
374 | |||
375 | static void core_dtr(struct dirty_log *log) | ||
376 | { | ||
377 | struct log_c *lc = (struct log_c *) log->context; | ||
378 | vfree(lc->clean_bits); | ||
379 | vfree(lc->sync_bits); | ||
380 | vfree(lc->recovering_bits); | ||
381 | kfree(lc); | ||
382 | } | ||
383 | |||
384 | /*---------------------------------------------------------------- | ||
385 | * disk log constructor/destructor | ||
386 | * | ||
387 | * argv contains log_device region_size followed optionally by [no]sync | ||
388 | *--------------------------------------------------------------*/ | ||
389 | static int disk_ctr(struct dirty_log *log, struct dm_target *ti, | ||
390 | unsigned int argc, char **argv) | ||
391 | { | ||
392 | int r; | ||
393 | size_t size; | ||
394 | struct log_c *lc; | ||
395 | struct dm_dev *dev; | ||
396 | |||
397 | if (argc < 2 || argc > 3) { | ||
398 | DMWARN("wrong number of arguments to disk mirror log"); | ||
399 | return -EINVAL; | ||
400 | } | ||
401 | |||
402 | r = dm_get_device(ti, argv[0], 0, 0 /* FIXME */, | ||
403 | FMODE_READ | FMODE_WRITE, &dev); | ||
404 | if (r) | ||
405 | return r; | ||
406 | |||
407 | r = core_ctr(log, ti, argc - 1, argv + 1); | ||
408 | if (r) { | ||
409 | dm_put_device(ti, dev); | ||
410 | return r; | ||
411 | } | ||
412 | |||
413 | lc = (struct log_c *) log->context; | ||
414 | lc->log_dev = dev; | ||
415 | |||
416 | /* setup the disk header fields */ | ||
417 | lc->header_location.bdev = lc->log_dev->bdev; | ||
418 | lc->header_location.sector = 0; | ||
419 | lc->header_location.count = 1; | ||
420 | |||
421 | /* | ||
422 | * We can't read less than this amount, even though we'll | ||
423 | * not be using most of this space. | ||
424 | */ | ||
425 | lc->disk_header = vmalloc(1 << SECTOR_SHIFT); | ||
426 | if (!lc->disk_header) | ||
427 | goto bad; | ||
428 | |||
429 | /* setup the disk bitset fields */ | ||
430 | lc->bits_location.bdev = lc->log_dev->bdev; | ||
431 | lc->bits_location.sector = LOG_OFFSET; | ||
432 | |||
433 | size = dm_round_up(lc->bitset_uint32_count * sizeof(uint32_t), | ||
434 | 1 << SECTOR_SHIFT); | ||
435 | lc->bits_location.count = size >> SECTOR_SHIFT; | ||
436 | lc->disk_bits = vmalloc(size); | ||
437 | if (!lc->disk_bits) { | ||
438 | vfree(lc->disk_header); | ||
439 | goto bad; | ||
440 | } | ||
441 | return 0; | ||
442 | |||
443 | bad: | ||
444 | dm_put_device(ti, lc->log_dev); | ||
445 | core_dtr(log); | ||
446 | return -ENOMEM; | ||
447 | } | ||
448 | |||
449 | static void disk_dtr(struct dirty_log *log) | ||
450 | { | ||
451 | struct log_c *lc = (struct log_c *) log->context; | ||
452 | dm_put_device(lc->ti, lc->log_dev); | ||
453 | vfree(lc->disk_header); | ||
454 | vfree(lc->disk_bits); | ||
455 | core_dtr(log); | ||
456 | } | ||
457 | |||
458 | static int count_bits32(uint32_t *addr, unsigned size) | ||
459 | { | ||
460 | int count = 0, i; | ||
461 | |||
462 | for (i = 0; i < size; i++) { | ||
463 | count += hweight32(*(addr+i)); | ||
464 | } | ||
465 | return count; | ||
466 | } | ||
467 | |||
468 | static int disk_resume(struct dirty_log *log) | ||
469 | { | ||
470 | int r; | ||
471 | unsigned i; | ||
472 | struct log_c *lc = (struct log_c *) log->context; | ||
473 | size_t size = lc->bitset_uint32_count * sizeof(uint32_t); | ||
474 | |||
475 | /* read the disk header */ | ||
476 | r = read_header(lc); | ||
477 | if (r) | ||
478 | return r; | ||
479 | |||
480 | /* read the bits */ | ||
481 | r = read_bits(lc); | ||
482 | if (r) | ||
483 | return r; | ||
484 | |||
485 | /* set or clear any new bits */ | ||
486 | if (lc->sync == NOSYNC) | ||
487 | for (i = lc->header.nr_regions; i < lc->region_count; i++) | ||
488 | /* FIXME: amazingly inefficient */ | ||
489 | log_set_bit(lc, lc->clean_bits, i); | ||
490 | else | ||
491 | for (i = lc->header.nr_regions; i < lc->region_count; i++) | ||
492 | /* FIXME: amazingly inefficient */ | ||
493 | log_clear_bit(lc, lc->clean_bits, i); | ||
494 | |||
495 | /* copy clean across to sync */ | ||
496 | memcpy(lc->sync_bits, lc->clean_bits, size); | ||
497 | lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count); | ||
498 | |||
499 | /* write the bits */ | ||
500 | r = write_bits(lc); | ||
501 | if (r) | ||
502 | return r; | ||
503 | |||
504 | /* set the correct number of regions in the header */ | ||
505 | lc->header.nr_regions = lc->region_count; | ||
506 | |||
507 | /* write the new header */ | ||
508 | return write_header(lc); | ||
509 | } | ||
510 | |||
511 | static uint32_t core_get_region_size(struct dirty_log *log) | ||
512 | { | ||
513 | struct log_c *lc = (struct log_c *) log->context; | ||
514 | return lc->region_size; | ||
515 | } | ||
516 | |||
517 | static int core_is_clean(struct dirty_log *log, region_t region) | ||
518 | { | ||
519 | struct log_c *lc = (struct log_c *) log->context; | ||
520 | return log_test_bit(lc->clean_bits, region); | ||
521 | } | ||
522 | |||
523 | static int core_in_sync(struct dirty_log *log, region_t region, int block) | ||
524 | { | ||
525 | struct log_c *lc = (struct log_c *) log->context; | ||
526 | return log_test_bit(lc->sync_bits, region); | ||
527 | } | ||
528 | |||
529 | static int core_flush(struct dirty_log *log) | ||
530 | { | ||
531 | /* no op */ | ||
532 | return 0; | ||
533 | } | ||
534 | |||
535 | static int disk_flush(struct dirty_log *log) | ||
536 | { | ||
537 | int r; | ||
538 | struct log_c *lc = (struct log_c *) log->context; | ||
539 | |||
540 | /* only write if the log has changed */ | ||
541 | if (!lc->touched) | ||
542 | return 0; | ||
543 | |||
544 | r = write_bits(lc); | ||
545 | if (!r) | ||
546 | lc->touched = 0; | ||
547 | |||
548 | return r; | ||
549 | } | ||
550 | |||
551 | static void core_mark_region(struct dirty_log *log, region_t region) | ||
552 | { | ||
553 | struct log_c *lc = (struct log_c *) log->context; | ||
554 | log_clear_bit(lc, lc->clean_bits, region); | ||
555 | } | ||
556 | |||
557 | static void core_clear_region(struct dirty_log *log, region_t region) | ||
558 | { | ||
559 | struct log_c *lc = (struct log_c *) log->context; | ||
560 | log_set_bit(lc, lc->clean_bits, region); | ||
561 | } | ||
562 | |||
563 | static int core_get_resync_work(struct dirty_log *log, region_t *region) | ||
564 | { | ||
565 | struct log_c *lc = (struct log_c *) log->context; | ||
566 | |||
567 | if (lc->sync_search >= lc->region_count) | ||
568 | return 0; | ||
569 | |||
570 | do { | ||
571 | *region = find_next_zero_bit((unsigned long *) lc->sync_bits, | ||
572 | lc->region_count, | ||
573 | lc->sync_search); | ||
574 | lc->sync_search = *region + 1; | ||
575 | |||
576 | if (*region == lc->region_count) | ||
577 | return 0; | ||
578 | |||
579 | } while (log_test_bit(lc->recovering_bits, *region)); | ||
580 | |||
581 | log_set_bit(lc, lc->recovering_bits, *region); | ||
582 | return 1; | ||
583 | } | ||
584 | |||
585 | static void core_complete_resync_work(struct dirty_log *log, region_t region, | ||
586 | int success) | ||
587 | { | ||
588 | struct log_c *lc = (struct log_c *) log->context; | ||
589 | |||
590 | log_clear_bit(lc, lc->recovering_bits, region); | ||
591 | if (success) { | ||
592 | log_set_bit(lc, lc->sync_bits, region); | ||
593 | lc->sync_count++; | ||
594 | } | ||
595 | } | ||
596 | |||
597 | static region_t core_get_sync_count(struct dirty_log *log) | ||
598 | { | ||
599 | struct log_c *lc = (struct log_c *) log->context; | ||
600 | |||
601 | return lc->sync_count; | ||
602 | } | ||
603 | |||
604 | #define DMEMIT_SYNC \ | ||
605 | if (lc->sync != DEFAULTSYNC) \ | ||
606 | DMEMIT("%ssync ", lc->sync == NOSYNC ? "no" : "") | ||
607 | |||
608 | static int core_status(struct dirty_log *log, status_type_t status, | ||
609 | char *result, unsigned int maxlen) | ||
610 | { | ||
611 | int sz = 0; | ||
612 | struct log_c *lc = log->context; | ||
613 | |||
614 | switch(status) { | ||
615 | case STATUSTYPE_INFO: | ||
616 | break; | ||
617 | |||
618 | case STATUSTYPE_TABLE: | ||
619 | DMEMIT("%s %u %u ", log->type->name, | ||
620 | lc->sync == DEFAULTSYNC ? 1 : 2, lc->region_size); | ||
621 | DMEMIT_SYNC; | ||
622 | } | ||
623 | |||
624 | return sz; | ||
625 | } | ||
626 | |||
627 | static int disk_status(struct dirty_log *log, status_type_t status, | ||
628 | char *result, unsigned int maxlen) | ||
629 | { | ||
630 | int sz = 0; | ||
631 | char buffer[16]; | ||
632 | struct log_c *lc = log->context; | ||
633 | |||
634 | switch(status) { | ||
635 | case STATUSTYPE_INFO: | ||
636 | break; | ||
637 | |||
638 | case STATUSTYPE_TABLE: | ||
639 | format_dev_t(buffer, lc->log_dev->bdev->bd_dev); | ||
640 | DMEMIT("%s %u %s %u ", log->type->name, | ||
641 | lc->sync == DEFAULTSYNC ? 2 : 3, buffer, | ||
642 | lc->region_size); | ||
643 | DMEMIT_SYNC; | ||
644 | } | ||
645 | |||
646 | return sz; | ||
647 | } | ||
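
/*
 * Illustrative table-status output (the device number is made up): with a
 * 1024-sector region size and default sync behaviour the core log reports
 * "core 1 1024" and the disk log "disk 2 253:0 1024"; an explicit [no]sync
 * argument raises the emitted argument count and appends "sync" or "nosync".
 */
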
648 | |||
649 | static struct dirty_log_type _core_type = { | ||
650 | .name = "core", | ||
651 | .module = THIS_MODULE, | ||
652 | .ctr = core_ctr, | ||
653 | .dtr = core_dtr, | ||
654 | .get_region_size = core_get_region_size, | ||
655 | .is_clean = core_is_clean, | ||
656 | .in_sync = core_in_sync, | ||
657 | .flush = core_flush, | ||
658 | .mark_region = core_mark_region, | ||
659 | .clear_region = core_clear_region, | ||
660 | .get_resync_work = core_get_resync_work, | ||
661 | .complete_resync_work = core_complete_resync_work, | ||
662 | .get_sync_count = core_get_sync_count, | ||
663 | .status = core_status, | ||
664 | }; | ||
665 | |||
666 | static struct dirty_log_type _disk_type = { | ||
667 | .name = "disk", | ||
668 | .module = THIS_MODULE, | ||
669 | .ctr = disk_ctr, | ||
670 | .dtr = disk_dtr, | ||
671 | .suspend = disk_flush, | ||
672 | .resume = disk_resume, | ||
673 | .get_region_size = core_get_region_size, | ||
674 | .is_clean = core_is_clean, | ||
675 | .in_sync = core_in_sync, | ||
676 | .flush = disk_flush, | ||
677 | .mark_region = core_mark_region, | ||
678 | .clear_region = core_clear_region, | ||
679 | .get_resync_work = core_get_resync_work, | ||
680 | .complete_resync_work = core_complete_resync_work, | ||
681 | .get_sync_count = core_get_sync_count, | ||
682 | .status = disk_status, | ||
683 | }; | ||
684 | |||
685 | int __init dm_dirty_log_init(void) | ||
686 | { | ||
687 | int r; | ||
688 | |||
689 | r = dm_register_dirty_log_type(&_core_type); | ||
690 | if (r) | ||
691 | DMWARN("couldn't register core log"); | ||
692 | |||
693 | r = dm_register_dirty_log_type(&_disk_type); | ||
694 | if (r) { | ||
695 | DMWARN("couldn't register disk type"); | ||
696 | dm_unregister_dirty_log_type(&_core_type); | ||
697 | } | ||
698 | |||
699 | return r; | ||
700 | } | ||
701 | |||
702 | void dm_dirty_log_exit(void) | ||
703 | { | ||
704 | dm_unregister_dirty_log_type(&_disk_type); | ||
705 | dm_unregister_dirty_log_type(&_core_type); | ||
706 | } | ||
707 | |||
708 | EXPORT_SYMBOL(dm_register_dirty_log_type); | ||
709 | EXPORT_SYMBOL(dm_unregister_dirty_log_type); | ||
710 | EXPORT_SYMBOL(dm_create_dirty_log); | ||
711 | EXPORT_SYMBOL(dm_destroy_dirty_log); | ||
diff --git a/drivers/md/dm-log.h b/drivers/md/dm-log.h new file mode 100644 index 000000000000..5ae5309ebf28 --- /dev/null +++ b/drivers/md/dm-log.h | |||
@@ -0,0 +1,130 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Sistina Software | ||
3 | * | ||
4 | * This file is released under the LGPL. | ||
5 | */ | ||
6 | |||
7 | #ifndef DM_DIRTY_LOG | ||
8 | #define DM_DIRTY_LOG | ||
9 | |||
10 | #include "dm.h" | ||
11 | |||
12 | typedef sector_t region_t; | ||
13 | |||
14 | struct dirty_log_type; | ||
15 | |||
16 | struct dirty_log { | ||
17 | struct dirty_log_type *type; | ||
18 | void *context; | ||
19 | }; | ||
20 | |||
21 | struct dirty_log_type { | ||
22 | struct list_head list; | ||
23 | const char *name; | ||
24 | struct module *module; | ||
25 | unsigned int use_count; | ||
26 | |||
27 | int (*ctr)(struct dirty_log *log, struct dm_target *ti, | ||
28 | unsigned int argc, char **argv); | ||
29 | void (*dtr)(struct dirty_log *log); | ||
30 | |||
31 | /* | ||
32 | * There are times when we don't want the log to touch | ||
33 | * the disk. | ||
34 | */ | ||
35 | int (*suspend)(struct dirty_log *log); | ||
36 | int (*resume)(struct dirty_log *log); | ||
37 | |||
38 | /* | ||
39 | * Retrieves the smallest size of region that the log can | ||
40 | * deal with. | ||
41 | */ | ||
42 | uint32_t (*get_region_size)(struct dirty_log *log); | ||
43 | |||
44 | /* | ||
45 | * A predicate to say whether a region is clean or not. | ||
46 | * May block. | ||
47 | */ | ||
48 | int (*is_clean)(struct dirty_log *log, region_t region); | ||
49 | |||
50 | /* | ||
51 | * Returns: 0, 1, -EWOULDBLOCK, < 0 | ||
52 | * | ||
53 | 	 * A predicate function to check whether the given | ||
54 | 	 * region is in sync. | ||
55 | * | ||
56 | * If -EWOULDBLOCK is returned the state of the region is | ||
57 | * unknown, typically this will result in a read being | ||
58 | * passed to a daemon to deal with, since a daemon is | ||
59 | * allowed to block. | ||
60 | */ | ||
61 | int (*in_sync)(struct dirty_log *log, region_t region, int can_block); | ||
62 | |||
63 | /* | ||
64 | * Flush the current log state (eg, to disk). This | ||
65 | * function may block. | ||
66 | */ | ||
67 | int (*flush)(struct dirty_log *log); | ||
68 | |||
69 | /* | ||
70 | * Mark an area as clean or dirty. These functions may | ||
71 | * block, though for performance reasons blocking should | ||
72 | * be extremely rare (eg, allocating another chunk of | ||
73 | * memory for some reason). | ||
74 | */ | ||
75 | void (*mark_region)(struct dirty_log *log, region_t region); | ||
76 | void (*clear_region)(struct dirty_log *log, region_t region); | ||
77 | |||
78 | /* | ||
79 | * Returns: <0 (error), 0 (no region), 1 (region) | ||
80 | * | ||
81 | 	 * The mirrord will need to perform recovery on regions of | ||
82 | * the mirror that are in the NOSYNC state. This | ||
83 | * function asks the log to tell the caller about the | ||
84 | * next region that this machine should recover. | ||
85 | * | ||
86 | 	 * Do not confuse this function with 'in_sync()': one | ||
87 | 	 * tells you whether an area is synchronised, the other | ||
88 | 	 * assigns recovery work. | ||
89 | */ | ||
90 | int (*get_resync_work)(struct dirty_log *log, region_t *region); | ||
91 | |||
92 | /* | ||
93 | * This notifies the log that the resync of an area has | ||
94 | * been completed. The log should then mark this region | ||
95 | * as CLEAN. | ||
96 | */ | ||
97 | void (*complete_resync_work)(struct dirty_log *log, | ||
98 | region_t region, int success); | ||
99 | |||
100 | /* | ||
101 | * Returns the number of regions that are in sync. | ||
102 | */ | ||
103 | region_t (*get_sync_count)(struct dirty_log *log); | ||
104 | |||
105 | /* | ||
106 | * Support function for mirror status requests. | ||
107 | */ | ||
108 | int (*status)(struct dirty_log *log, status_type_t status_type, | ||
109 | char *result, unsigned int maxlen); | ||
110 | }; | ||
111 | |||
112 | int dm_register_dirty_log_type(struct dirty_log_type *type); | ||
113 | int dm_unregister_dirty_log_type(struct dirty_log_type *type); | ||
114 | |||
115 | |||
116 | /* | ||
117 | * Make sure you use these two functions, rather than calling | ||
118 | * type->constructor/destructor() directly. | ||
119 | */ | ||
120 | struct dirty_log *dm_create_dirty_log(const char *type_name, struct dm_target *ti, | ||
121 | unsigned int argc, char **argv); | ||
122 | void dm_destroy_dirty_log(struct dirty_log *log); | ||
123 | |||
124 | /* | ||
125 | * init/exit functions. | ||
126 | */ | ||
127 | int dm_dirty_log_init(void); | ||
128 | void dm_dirty_log_exit(void); | ||
129 | |||
130 | #endif | ||
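
The interface above is what the built-in 'core' and 'disk' logs implement; a third log type would hook in the same way. The skeleton below is a hypothetical illustration only (the "null-example" name, the fixed region size and the treat-everything-as-in-sync behaviour are assumptions, not code from this patch): it fills in the same hooks the core type does and registers itself with dm_register_dirty_log_type() from module init.

#include <linux/module.h>
#include <linux/init.h>
#include "dm-log.h"

/* Hypothetical no-op log: pretends every region is clean and in sync. */

static int null_ctr(struct dirty_log *log, struct dm_target *ti,
		    unsigned int argc, char **argv)
{
	log->context = NULL;
	return 0;
}

static void null_dtr(struct dirty_log *log) {}

static uint32_t null_get_region_size(struct dirty_log *log)
{
	return 1024;			/* made-up fixed region size, in sectors */
}

static int null_is_clean(struct dirty_log *log, region_t region) { return 1; }
static int null_in_sync(struct dirty_log *log, region_t region, int block) { return 1; }
static int null_flush(struct dirty_log *log) { return 0; }
static void null_mark_region(struct dirty_log *log, region_t region) {}
static void null_clear_region(struct dirty_log *log, region_t region) {}

static int null_get_resync_work(struct dirty_log *log, region_t *region)
{
	return 0;			/* never hands out recovery work */
}

static void null_complete_resync_work(struct dirty_log *log,
				      region_t region, int success) {}

static region_t null_get_sync_count(struct dirty_log *log)
{
	return 0;			/* purely illustrative; tracks nothing */
}

static int null_status(struct dirty_log *log, status_type_t type,
		       char *result, unsigned int maxlen)
{
	return 0;
}

static struct dirty_log_type _null_type = {
	.name = "null-example",
	.module = THIS_MODULE,
	.ctr = null_ctr,
	.dtr = null_dtr,
	.get_region_size = null_get_region_size,
	.is_clean = null_is_clean,
	.in_sync = null_in_sync,
	.flush = null_flush,
	.mark_region = null_mark_region,
	.clear_region = null_clear_region,
	.get_resync_work = null_get_resync_work,
	.complete_resync_work = null_complete_resync_work,
	.get_sync_count = null_get_sync_count,
	.status = null_status,
};

static int __init null_log_init(void)
{
	return dm_register_dirty_log_type(&_null_type);
}

static void __exit null_log_exit(void)
{
	dm_unregister_dirty_log_type(&_null_type);
}

module_init(null_log_init);
module_exit(null_log_exit);
MODULE_LICENSE("GPL");
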
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c new file mode 100644 index 000000000000..43763a0bd096 --- /dev/null +++ b/drivers/md/dm-mpath.c | |||
@@ -0,0 +1,1302 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Sistina Software Limited. | ||
3 | * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * This file is released under the GPL. | ||
6 | */ | ||
7 | |||
8 | #include "dm.h" | ||
9 | #include "dm-path-selector.h" | ||
10 | #include "dm-hw-handler.h" | ||
11 | #include "dm-bio-list.h" | ||
12 | #include "dm-bio-record.h" | ||
13 | |||
14 | #include <linux/ctype.h> | ||
15 | #include <linux/init.h> | ||
16 | #include <linux/mempool.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/pagemap.h> | ||
19 | #include <linux/slab.h> | ||
20 | #include <linux/time.h> | ||
21 | #include <linux/workqueue.h> | ||
22 | #include <asm/atomic.h> | ||
23 | |||
24 | #define MESG_STR(x) x, sizeof(x) | ||
25 | |||
26 | /* Path properties */ | ||
27 | struct pgpath { | ||
28 | struct list_head list; | ||
29 | |||
30 | struct priority_group *pg; /* Owning PG */ | ||
31 | unsigned fail_count; /* Cumulative failure count */ | ||
32 | |||
33 | struct path path; | ||
34 | }; | ||
35 | |||
36 | #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) | ||
37 | |||
38 | /* | ||
39 | * Paths are grouped into Priority Groups and numbered from 1 upwards. | ||
40 | * Each has a path selector which controls which path gets used. | ||
41 | */ | ||
42 | struct priority_group { | ||
43 | struct list_head list; | ||
44 | |||
45 | struct multipath *m; /* Owning multipath instance */ | ||
46 | struct path_selector ps; | ||
47 | |||
48 | unsigned pg_num; /* Reference number */ | ||
49 | unsigned bypassed; /* Temporarily bypass this PG? */ | ||
50 | |||
51 | unsigned nr_pgpaths; /* Number of paths in PG */ | ||
52 | struct list_head pgpaths; | ||
53 | }; | ||
54 | |||
55 | /* Multipath context */ | ||
56 | struct multipath { | ||
57 | struct list_head list; | ||
58 | struct dm_target *ti; | ||
59 | |||
60 | spinlock_t lock; | ||
61 | |||
62 | struct hw_handler hw_handler; | ||
63 | unsigned nr_priority_groups; | ||
64 | struct list_head priority_groups; | ||
65 | unsigned pg_init_required; /* pg_init needs calling? */ | ||
66 | |||
67 | unsigned nr_valid_paths; /* Total number of usable paths */ | ||
68 | struct pgpath *current_pgpath; | ||
69 | struct priority_group *current_pg; | ||
70 | struct priority_group *next_pg; /* Switch to this PG if set */ | ||
71 | unsigned repeat_count; /* I/Os left before calling PS again */ | ||
72 | |||
73 | unsigned queue_io; /* Must we queue all I/O? */ | ||
74 | unsigned queue_if_no_path; /* Queue I/O if last path fails? */ | ||
75 | unsigned suspended; /* Has dm core suspended our I/O? */ | ||
76 | |||
77 | struct work_struct process_queued_ios; | ||
78 | struct bio_list queued_ios; | ||
79 | unsigned queue_size; | ||
80 | |||
81 | struct work_struct trigger_event; | ||
82 | |||
83 | /* | ||
84 | * We must use a mempool of mpath_io structs so that we | ||
85 | * can resubmit bios on error. | ||
86 | */ | ||
87 | mempool_t *mpio_pool; | ||
88 | }; | ||
89 | |||
90 | /* | ||
91 | * Context information attached to each bio we process. | ||
92 | */ | ||
93 | struct mpath_io { | ||
94 | struct pgpath *pgpath; | ||
95 | struct dm_bio_details details; | ||
96 | }; | ||
97 | |||
98 | typedef int (*action_fn) (struct pgpath *pgpath); | ||
99 | |||
100 | #define MIN_IOS 256 /* Mempool size */ | ||
101 | |||
102 | static kmem_cache_t *_mpio_cache; | ||
103 | |||
104 | static void process_queued_ios(void *data); | ||
105 | static void trigger_event(void *data); | ||
106 | |||
107 | |||
108 | /*----------------------------------------------- | ||
109 | * Allocation routines | ||
110 | *-----------------------------------------------*/ | ||
111 | |||
112 | static struct pgpath *alloc_pgpath(void) | ||
113 | { | ||
114 | struct pgpath *pgpath = kmalloc(sizeof(*pgpath), GFP_KERNEL); | ||
115 | |||
116 | if (pgpath) { | ||
117 | memset(pgpath, 0, sizeof(*pgpath)); | ||
118 | pgpath->path.is_active = 1; | ||
119 | } | ||
120 | |||
121 | return pgpath; | ||
122 | } | ||
123 | |||
124 | static inline void free_pgpath(struct pgpath *pgpath) | ||
125 | { | ||
126 | kfree(pgpath); | ||
127 | } | ||
128 | |||
129 | static struct priority_group *alloc_priority_group(void) | ||
130 | { | ||
131 | struct priority_group *pg; | ||
132 | |||
133 | pg = kmalloc(sizeof(*pg), GFP_KERNEL); | ||
134 | if (!pg) | ||
135 | return NULL; | ||
136 | |||
137 | memset(pg, 0, sizeof(*pg)); | ||
138 | INIT_LIST_HEAD(&pg->pgpaths); | ||
139 | |||
140 | return pg; | ||
141 | } | ||
142 | |||
143 | static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) | ||
144 | { | ||
145 | struct pgpath *pgpath, *tmp; | ||
146 | |||
147 | list_for_each_entry_safe(pgpath, tmp, pgpaths, list) { | ||
148 | list_del(&pgpath->list); | ||
149 | dm_put_device(ti, pgpath->path.dev); | ||
150 | free_pgpath(pgpath); | ||
151 | } | ||
152 | } | ||
153 | |||
154 | static void free_priority_group(struct priority_group *pg, | ||
155 | struct dm_target *ti) | ||
156 | { | ||
157 | struct path_selector *ps = &pg->ps; | ||
158 | |||
159 | if (ps->type) { | ||
160 | ps->type->destroy(ps); | ||
161 | dm_put_path_selector(ps->type); | ||
162 | } | ||
163 | |||
164 | free_pgpaths(&pg->pgpaths, ti); | ||
165 | kfree(pg); | ||
166 | } | ||
167 | |||
168 | static struct multipath *alloc_multipath(void) | ||
169 | { | ||
170 | struct multipath *m; | ||
171 | |||
172 | m = kmalloc(sizeof(*m), GFP_KERNEL); | ||
173 | if (m) { | ||
174 | memset(m, 0, sizeof(*m)); | ||
175 | INIT_LIST_HEAD(&m->priority_groups); | ||
176 | spin_lock_init(&m->lock); | ||
177 | m->queue_io = 1; | ||
178 | INIT_WORK(&m->process_queued_ios, process_queued_ios, m); | ||
179 | INIT_WORK(&m->trigger_event, trigger_event, m); | ||
180 | m->mpio_pool = mempool_create(MIN_IOS, mempool_alloc_slab, | ||
181 | mempool_free_slab, _mpio_cache); | ||
182 | if (!m->mpio_pool) { | ||
183 | kfree(m); | ||
184 | return NULL; | ||
185 | } | ||
186 | } | ||
187 | |||
188 | return m; | ||
189 | } | ||
190 | |||
191 | static void free_multipath(struct multipath *m) | ||
192 | { | ||
193 | struct priority_group *pg, *tmp; | ||
194 | struct hw_handler *hwh = &m->hw_handler; | ||
195 | |||
196 | list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) { | ||
197 | list_del(&pg->list); | ||
198 | free_priority_group(pg, m->ti); | ||
199 | } | ||
200 | |||
201 | if (hwh->type) { | ||
202 | hwh->type->destroy(hwh); | ||
203 | dm_put_hw_handler(hwh->type); | ||
204 | } | ||
205 | |||
206 | mempool_destroy(m->mpio_pool); | ||
207 | kfree(m); | ||
208 | } | ||
209 | |||
210 | |||
211 | /*----------------------------------------------- | ||
212 | * Path selection | ||
213 | *-----------------------------------------------*/ | ||
214 | |||
215 | static void __switch_pg(struct multipath *m, struct pgpath *pgpath) | ||
216 | { | ||
217 | struct hw_handler *hwh = &m->hw_handler; | ||
218 | |||
219 | m->current_pg = pgpath->pg; | ||
220 | |||
221 | /* Must we initialise the PG first, and queue I/O till it's ready? */ | ||
222 | if (hwh->type && hwh->type->pg_init) { | ||
223 | m->pg_init_required = 1; | ||
224 | m->queue_io = 1; | ||
225 | } else { | ||
226 | m->pg_init_required = 0; | ||
227 | m->queue_io = 0; | ||
228 | } | ||
229 | } | ||
230 | |||
231 | static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) | ||
232 | { | ||
233 | struct path *path; | ||
234 | |||
235 | path = pg->ps.type->select_path(&pg->ps, &m->repeat_count); | ||
236 | if (!path) | ||
237 | return -ENXIO; | ||
238 | |||
239 | m->current_pgpath = path_to_pgpath(path); | ||
240 | |||
241 | if (m->current_pg != pg) | ||
242 | __switch_pg(m, m->current_pgpath); | ||
243 | |||
244 | return 0; | ||
245 | } | ||
246 | |||
247 | static void __choose_pgpath(struct multipath *m) | ||
248 | { | ||
249 | struct priority_group *pg; | ||
250 | unsigned bypassed = 1; | ||
251 | |||
252 | if (!m->nr_valid_paths) | ||
253 | goto failed; | ||
254 | |||
255 | /* Were we instructed to switch PG? */ | ||
256 | if (m->next_pg) { | ||
257 | pg = m->next_pg; | ||
258 | m->next_pg = NULL; | ||
259 | if (!__choose_path_in_pg(m, pg)) | ||
260 | return; | ||
261 | } | ||
262 | |||
263 | /* Don't change PG until it has no remaining paths */ | ||
264 | if (m->current_pg && !__choose_path_in_pg(m, m->current_pg)) | ||
265 | return; | ||
266 | |||
267 | /* | ||
268 | * Loop through priority groups until we find a valid path. | ||
269 | * First time we skip PGs marked 'bypassed'. | ||
270 | * Second time we only try the ones we skipped. | ||
271 | */ | ||
272 | do { | ||
273 | list_for_each_entry(pg, &m->priority_groups, list) { | ||
274 | if (pg->bypassed == bypassed) | ||
275 | continue; | ||
276 | if (!__choose_path_in_pg(m, pg)) | ||
277 | return; | ||
278 | } | ||
279 | } while (bypassed--); | ||
280 | |||
281 | failed: | ||
282 | m->current_pgpath = NULL; | ||
283 | m->current_pg = NULL; | ||
284 | } | ||
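
The do/while loop above makes exactly two passes: with bypassed starting at 1, the first pass skips bypassed groups, and the post-decrement buys one more pass in which only the previously skipped groups are tried. A toy userspace sketch of just that control flow (the data is made up):

#include <stdio.h>

struct toy_pg { int bypassed; int has_valid_path; };

int main(void)
{
	/* pretend PG1 is bypassed but healthy, PG2 is not bypassed but dead */
	struct toy_pg pgs[] = { { 1, 1 }, { 0, 0 } };
	unsigned bypassed = 1;
	int i;

	do {
		for (i = 0; i < 2; i++) {
			if (pgs[i].bypassed == bypassed)
				continue;	/* skip this class on this pass */
			printf("pass(bypassed=%u): trying PG%d\n", bypassed, i + 1);
			if (pgs[i].has_valid_path) {
				printf("chose PG%d\n", i + 1);
				return 0;
			}
		}
	} while (bypassed--);

	printf("no usable priority group\n");
	return 0;
}
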
285 | |||
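/*
 * map_io() returns 1 if the bio has been remapped and should be
 * dispatched, 0 if it has been queued for the daemon to resubmit,
 * or -EIO if there is no usable path and we are not queueing.
 */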
286 | static int map_io(struct multipath *m, struct bio *bio, struct mpath_io *mpio, | ||
287 | unsigned was_queued) | ||
288 | { | ||
289 | int r = 1; | ||
290 | unsigned long flags; | ||
291 | struct pgpath *pgpath; | ||
292 | |||
293 | spin_lock_irqsave(&m->lock, flags); | ||
294 | |||
295 | /* Do we need to select a new pgpath? */ | ||
296 | if (!m->current_pgpath || | ||
297 | (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) | ||
298 | __choose_pgpath(m); | ||
299 | |||
300 | pgpath = m->current_pgpath; | ||
301 | |||
302 | if (was_queued) | ||
303 | m->queue_size--; | ||
304 | |||
305 | if ((pgpath && m->queue_io) || | ||
306 | (!pgpath && m->queue_if_no_path && !m->suspended)) { | ||
307 | /* Queue for the daemon to resubmit */ | ||
308 | bio_list_add(&m->queued_ios, bio); | ||
309 | m->queue_size++; | ||
310 | if (m->pg_init_required || !m->queue_io) | ||
311 | schedule_work(&m->process_queued_ios); | ||
312 | pgpath = NULL; | ||
313 | r = 0; | ||
314 | } else if (!pgpath) | ||
315 | r = -EIO; /* Failed */ | ||
316 | else | ||
317 | bio->bi_bdev = pgpath->path.dev->bdev; | ||
318 | |||
319 | mpio->pgpath = pgpath; | ||
320 | |||
321 | spin_unlock_irqrestore(&m->lock, flags); | ||
322 | |||
323 | return r; | ||
324 | } | ||
325 | |||
326 | /* | ||
327 | * If we run out of usable paths, should we queue I/O or error it? | ||
328 | */ | ||
329 | static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path) | ||
330 | { | ||
331 | unsigned long flags; | ||
332 | |||
333 | spin_lock_irqsave(&m->lock, flags); | ||
334 | |||
335 | m->queue_if_no_path = queue_if_no_path; | ||
336 | if (!m->queue_if_no_path) | ||
337 | schedule_work(&m->process_queued_ios); | ||
338 | |||
339 | spin_unlock_irqrestore(&m->lock, flags); | ||
340 | |||
341 | return 0; | ||
342 | } | ||
343 | |||
344 | /*----------------------------------------------------------------- | ||
345 | * The multipath daemon is responsible for resubmitting queued ios. | ||
346 | *---------------------------------------------------------------*/ | ||
347 | |||
348 | static void dispatch_queued_ios(struct multipath *m) | ||
349 | { | ||
350 | int r; | ||
351 | unsigned long flags; | ||
352 | struct bio *bio = NULL, *next; | ||
353 | struct mpath_io *mpio; | ||
354 | union map_info *info; | ||
355 | |||
356 | spin_lock_irqsave(&m->lock, flags); | ||
357 | bio = bio_list_get(&m->queued_ios); | ||
358 | spin_unlock_irqrestore(&m->lock, flags); | ||
359 | |||
360 | while (bio) { | ||
361 | next = bio->bi_next; | ||
362 | bio->bi_next = NULL; | ||
363 | |||
364 | info = dm_get_mapinfo(bio); | ||
365 | mpio = info->ptr; | ||
366 | |||
367 | r = map_io(m, bio, mpio, 1); | ||
368 | if (r < 0) | ||
369 | bio_endio(bio, bio->bi_size, r); | ||
370 | else if (r == 1) | ||
371 | generic_make_request(bio); | ||
372 | |||
373 | bio = next; | ||
374 | } | ||
375 | } | ||
376 | |||
377 | static void process_queued_ios(void *data) | ||
378 | { | ||
379 | struct multipath *m = (struct multipath *) data; | ||
380 | struct hw_handler *hwh = &m->hw_handler; | ||
381 | struct pgpath *pgpath; | ||
382 | unsigned init_required, must_queue = 0; | ||
383 | unsigned long flags; | ||
384 | |||
385 | spin_lock_irqsave(&m->lock, flags); | ||
386 | |||
387 | if (!m->current_pgpath) | ||
388 | __choose_pgpath(m); | ||
389 | |||
390 | pgpath = m->current_pgpath; | ||
391 | |||
392 | if ((pgpath && m->queue_io) || | ||
393 | (!pgpath && m->queue_if_no_path && !m->suspended)) | ||
394 | must_queue = 1; | ||
395 | |||
396 | init_required = m->pg_init_required; | ||
397 | if (init_required) | ||
398 | m->pg_init_required = 0; | ||
399 | |||
400 | spin_unlock_irqrestore(&m->lock, flags); | ||
401 | |||
402 | if (init_required) | ||
403 | hwh->type->pg_init(hwh, pgpath->pg->bypassed, &pgpath->path); | ||
404 | |||
405 | if (!must_queue) | ||
406 | dispatch_queued_ios(m); | ||
407 | } | ||
408 | |||
409 | /* | ||
410 | * An event is triggered whenever a path is taken out of use. | ||
411 | * Includes path failure and PG bypass. | ||
412 | */ | ||
413 | static void trigger_event(void *data) | ||
414 | { | ||
415 | struct multipath *m = (struct multipath *) data; | ||
416 | |||
417 | dm_table_event(m->ti->table); | ||
418 | } | ||
419 | |||
420 | /*----------------------------------------------------------------- | ||
421 | * Constructor/argument parsing: | ||
422 | * <#multipath feature args> [<arg>]* | ||
423 | * <#hw_handler args> [hw_handler [<arg>]*] | ||
424 | * <#priority groups> | ||
425 | * <initial priority group> | ||
426 | * [<selector> <#selector args> [<arg>]* | ||
427 | * <#paths> <#per-path selector args> | ||
428 | * [<path> [<arg>]* ]+ ]+ | ||
429 | *---------------------------------------------------------------*/ | ||
430 | struct param { | ||
431 | unsigned min; | ||
432 | unsigned max; | ||
433 | char *error; | ||
434 | }; | ||
435 | |||
436 | #define ESTR(s) ("dm-multipath: " s) | ||
437 | |||
438 | static int read_param(struct param *param, char *str, unsigned *v, char **error) | ||
439 | { | ||
440 | if (!str || | ||
441 | (sscanf(str, "%u", v) != 1) || | ||
442 | (*v < param->min) || | ||
443 | (*v > param->max)) { | ||
444 | *error = param->error; | ||
445 | return -EINVAL; | ||
446 | } | ||
447 | |||
448 | return 0; | ||
449 | } | ||
450 | |||
451 | struct arg_set { | ||
452 | unsigned argc; | ||
453 | char **argv; | ||
454 | }; | ||
455 | |||
456 | static char *shift(struct arg_set *as) | ||
457 | { | ||
458 | char *r; | ||
459 | |||
460 | if (as->argc) { | ||
461 | as->argc--; | ||
462 | r = *as->argv; | ||
463 | as->argv++; | ||
464 | return r; | ||
465 | } | ||
466 | |||
467 | return NULL; | ||
468 | } | ||
469 | |||
470 | static void consume(struct arg_set *as, unsigned n) | ||
471 | { | ||
472 | 	BUG_ON(as->argc < n); | ||
473 | as->argc -= n; | ||
474 | as->argv += n; | ||
475 | } | ||
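
The table-front parsing done by multipath_ctr() is easiest to see on a concrete argument vector. The sketch below lifts shift() and read_param() into a standalone userspace program and walks them over a made-up argument list (0 feature args, 0 hw-handler args, 1 priority group, initial group 1); the values are illustrative only, not a recommended table.

#include <stdio.h>
#include <errno.h>

struct param { unsigned min, max; char *error; };
struct arg_set { unsigned argc; char **argv; };

static char *shift(struct arg_set *as)
{
	if (!as->argc)
		return NULL;
	as->argc--;
	return *as->argv++;	/* hand back the next argument */
}

static int read_param(struct param *param, char *str, unsigned *v, char **error)
{
	if (!str || sscanf(str, "%u", v) != 1 || *v < param->min || *v > param->max) {
		*error = param->error;
		return -EINVAL;
	}
	return 0;
}

int main(void)
{
	/* made-up table front: 0 feature args, 0 hw args, 1 PG, start at PG 1 */
	char *argv[] = { "0", "0", "1", "1" };
	struct arg_set as = { 4, argv };
	struct param p = { 0, 1024, "invalid count" };
	char *error;
	unsigned nr_features, nr_hw_args, nr_pgs, init_pg;

	read_param(&p, shift(&as), &nr_features, &error);
	read_param(&p, shift(&as), &nr_hw_args, &error);
	read_param(&p, shift(&as), &nr_pgs, &error);
	read_param(&p, shift(&as), &init_pg, &error);

	printf("features=%u hw_args=%u groups=%u initial=%u args_left=%u\n",
	       nr_features, nr_hw_args, nr_pgs, init_pg, as.argc);
	return 0;
}
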
476 | |||
477 | static int parse_path_selector(struct arg_set *as, struct priority_group *pg, | ||
478 | struct dm_target *ti) | ||
479 | { | ||
480 | int r; | ||
481 | struct path_selector_type *pst; | ||
482 | unsigned ps_argc; | ||
483 | |||
484 | static struct param _params[] = { | ||
485 | {0, 1024, ESTR("invalid number of path selector args")}, | ||
486 | }; | ||
487 | |||
488 | pst = dm_get_path_selector(shift(as)); | ||
489 | if (!pst) { | ||
490 | ti->error = ESTR("unknown path selector type"); | ||
491 | return -EINVAL; | ||
492 | } | ||
493 | |||
494 | r = read_param(_params, shift(as), &ps_argc, &ti->error); | ||
495 | if (r) | ||
496 | return -EINVAL; | ||
497 | |||
498 | r = pst->create(&pg->ps, ps_argc, as->argv); | ||
499 | if (r) { | ||
500 | dm_put_path_selector(pst); | ||
501 | ti->error = ESTR("path selector constructor failed"); | ||
502 | return r; | ||
503 | } | ||
504 | |||
505 | pg->ps.type = pst; | ||
506 | consume(as, ps_argc); | ||
507 | |||
508 | return 0; | ||
509 | } | ||
510 | |||
511 | static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, | ||
512 | struct dm_target *ti) | ||
513 | { | ||
514 | int r; | ||
515 | struct pgpath *p; | ||
516 | |||
517 | /* we need at least a path arg */ | ||
518 | if (as->argc < 1) { | ||
519 | ti->error = ESTR("no device given"); | ||
520 | return NULL; | ||
521 | } | ||
522 | |||
523 | p = alloc_pgpath(); | ||
524 | if (!p) | ||
525 | return NULL; | ||
526 | |||
527 | r = dm_get_device(ti, shift(as), ti->begin, ti->len, | ||
528 | dm_table_get_mode(ti->table), &p->path.dev); | ||
529 | if (r) { | ||
530 | ti->error = ESTR("error getting device"); | ||
531 | goto bad; | ||
532 | } | ||
533 | |||
534 | r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error); | ||
535 | if (r) { | ||
536 | dm_put_device(ti, p->path.dev); | ||
537 | goto bad; | ||
538 | } | ||
539 | |||
540 | return p; | ||
541 | |||
542 | bad: | ||
543 | free_pgpath(p); | ||
544 | return NULL; | ||
545 | } | ||
546 | |||
547 | static struct priority_group *parse_priority_group(struct arg_set *as, | ||
548 | struct multipath *m, | ||
549 | struct dm_target *ti) | ||
550 | { | ||
551 | static struct param _params[] = { | ||
552 | {1, 1024, ESTR("invalid number of paths")}, | ||
553 | {0, 1024, ESTR("invalid number of selector args")} | ||
554 | }; | ||
555 | |||
556 | int r; | ||
557 | unsigned i, nr_selector_args, nr_params; | ||
558 | struct priority_group *pg; | ||
559 | |||
560 | if (as->argc < 2) { | ||
561 | as->argc = 0; | ||
562 | 		ti->error = ESTR("not enough priority group arguments"); | ||
563 | return NULL; | ||
564 | } | ||
565 | |||
566 | pg = alloc_priority_group(); | ||
567 | if (!pg) { | ||
568 | ti->error = ESTR("couldn't allocate priority group"); | ||
569 | return NULL; | ||
570 | } | ||
571 | pg->m = m; | ||
572 | |||
573 | r = parse_path_selector(as, pg, ti); | ||
574 | if (r) | ||
575 | goto bad; | ||
576 | |||
577 | /* | ||
578 | * read the paths | ||
579 | */ | ||
580 | r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error); | ||
581 | if (r) | ||
582 | goto bad; | ||
583 | |||
584 | r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error); | ||
585 | if (r) | ||
586 | goto bad; | ||
587 | |||
588 | nr_params = 1 + nr_selector_args; | ||
589 | for (i = 0; i < pg->nr_pgpaths; i++) { | ||
590 | struct pgpath *pgpath; | ||
591 | struct arg_set path_args; | ||
592 | |||
593 | if (as->argc < nr_params) | ||
594 | goto bad; | ||
595 | |||
596 | path_args.argc = nr_params; | ||
597 | path_args.argv = as->argv; | ||
598 | |||
599 | pgpath = parse_path(&path_args, &pg->ps, ti); | ||
600 | if (!pgpath) | ||
601 | goto bad; | ||
602 | |||
603 | pgpath->pg = pg; | ||
604 | list_add_tail(&pgpath->list, &pg->pgpaths); | ||
605 | consume(as, nr_params); | ||
606 | } | ||
607 | |||
608 | return pg; | ||
609 | |||
610 | bad: | ||
611 | free_priority_group(pg, ti); | ||
612 | return NULL; | ||
613 | } | ||
614 | |||
615 | static int parse_hw_handler(struct arg_set *as, struct multipath *m, | ||
616 | struct dm_target *ti) | ||
617 | { | ||
618 | int r; | ||
619 | struct hw_handler_type *hwht; | ||
620 | unsigned hw_argc; | ||
621 | |||
622 | static struct param _params[] = { | ||
623 | {0, 1024, ESTR("invalid number of hardware handler args")}, | ||
624 | }; | ||
625 | |||
626 | r = read_param(_params, shift(as), &hw_argc, &ti->error); | ||
627 | if (r) | ||
628 | return -EINVAL; | ||
629 | |||
630 | if (!hw_argc) | ||
631 | return 0; | ||
632 | |||
633 | hwht = dm_get_hw_handler(shift(as)); | ||
634 | if (!hwht) { | ||
635 | ti->error = ESTR("unknown hardware handler type"); | ||
636 | return -EINVAL; | ||
637 | } | ||
638 | |||
639 | r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv); | ||
640 | if (r) { | ||
641 | dm_put_hw_handler(hwht); | ||
642 | ti->error = ESTR("hardware handler constructor failed"); | ||
643 | return r; | ||
644 | } | ||
645 | |||
646 | m->hw_handler.type = hwht; | ||
647 | consume(as, hw_argc - 1); | ||
648 | |||
649 | return 0; | ||
650 | } | ||
651 | |||
652 | static int parse_features(struct arg_set *as, struct multipath *m, | ||
653 | struct dm_target *ti) | ||
654 | { | ||
655 | int r; | ||
656 | unsigned argc; | ||
657 | |||
658 | static struct param _params[] = { | ||
659 | {0, 1, ESTR("invalid number of feature args")}, | ||
660 | }; | ||
661 | |||
662 | r = read_param(_params, shift(as), &argc, &ti->error); | ||
663 | if (r) | ||
664 | return -EINVAL; | ||
665 | |||
666 | if (!argc) | ||
667 | return 0; | ||
668 | |||
669 | if (!strnicmp(shift(as), MESG_STR("queue_if_no_path"))) | ||
670 | return queue_if_no_path(m, 1); | ||
671 | else { | ||
672 | ti->error = "Unrecognised multipath feature request"; | ||
673 | return -EINVAL; | ||
674 | } | ||
675 | } | ||
676 | |||
677 | static int multipath_ctr(struct dm_target *ti, unsigned int argc, | ||
678 | char **argv) | ||
679 | { | ||
680 | /* target parameters */ | ||
681 | static struct param _params[] = { | ||
682 | {1, 1024, ESTR("invalid number of priority groups")}, | ||
683 | {1, 1024, ESTR("invalid initial priority group number")}, | ||
684 | }; | ||
685 | |||
686 | int r; | ||
687 | struct multipath *m; | ||
688 | struct arg_set as; | ||
689 | unsigned pg_count = 0; | ||
690 | unsigned next_pg_num; | ||
691 | |||
692 | as.argc = argc; | ||
693 | as.argv = argv; | ||
694 | |||
695 | m = alloc_multipath(); | ||
696 | if (!m) { | ||
697 | ti->error = ESTR("can't allocate multipath"); | ||
698 | return -EINVAL; | ||
699 | } | ||
700 | |||
701 | r = parse_features(&as, m, ti); | ||
702 | if (r) | ||
703 | goto bad; | ||
704 | |||
705 | r = parse_hw_handler(&as, m, ti); | ||
706 | if (r) | ||
707 | goto bad; | ||
708 | |||
709 | r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error); | ||
710 | if (r) | ||
711 | goto bad; | ||
712 | |||
713 | r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error); | ||
714 | if (r) | ||
715 | goto bad; | ||
716 | |||
717 | /* parse the priority groups */ | ||
718 | while (as.argc) { | ||
719 | struct priority_group *pg; | ||
720 | |||
721 | pg = parse_priority_group(&as, m, ti); | ||
722 | if (!pg) { | ||
723 | r = -EINVAL; | ||
724 | goto bad; | ||
725 | } | ||
726 | |||
727 | m->nr_valid_paths += pg->nr_pgpaths; | ||
728 | list_add_tail(&pg->list, &m->priority_groups); | ||
729 | pg_count++; | ||
730 | pg->pg_num = pg_count; | ||
731 | if (!--next_pg_num) | ||
732 | m->next_pg = pg; | ||
733 | } | ||
734 | |||
735 | if (pg_count != m->nr_priority_groups) { | ||
736 | ti->error = ESTR("priority group count mismatch"); | ||
737 | r = -EINVAL; | ||
738 | goto bad; | ||
739 | } | ||
740 | |||
741 | ti->private = m; | ||
742 | m->ti = ti; | ||
743 | |||
744 | return 0; | ||
745 | |||
746 | bad: | ||
747 | free_multipath(m); | ||
748 | return r; | ||
749 | } | ||
750 | |||
751 | static void multipath_dtr(struct dm_target *ti) | ||
752 | { | ||
753 | struct multipath *m = (struct multipath *) ti->private; | ||
754 | free_multipath(m); | ||
755 | } | ||
756 | |||
757 | /* | ||
758 | * Map bios, recording original fields for later in case we have to resubmit | ||
759 | */ | ||
760 | static int multipath_map(struct dm_target *ti, struct bio *bio, | ||
761 | union map_info *map_context) | ||
762 | { | ||
763 | int r; | ||
764 | struct mpath_io *mpio; | ||
765 | struct multipath *m = (struct multipath *) ti->private; | ||
766 | |||
767 | mpio = mempool_alloc(m->mpio_pool, GFP_NOIO); | ||
768 | dm_bio_record(&mpio->details, bio); | ||
769 | |||
770 | map_context->ptr = mpio; | ||
771 | bio->bi_rw |= (1 << BIO_RW_FAILFAST); | ||
772 | r = map_io(m, bio, mpio, 0); | ||
773 | if (r < 0) | ||
774 | mempool_free(mpio, m->mpio_pool); | ||
775 | |||
776 | return r; | ||
777 | } | ||
778 | |||
779 | /* | ||
780 | * Take a path out of use. | ||
781 | */ | ||
782 | static int fail_path(struct pgpath *pgpath) | ||
783 | { | ||
784 | unsigned long flags; | ||
785 | struct multipath *m = pgpath->pg->m; | ||
786 | |||
787 | spin_lock_irqsave(&m->lock, flags); | ||
788 | |||
789 | if (!pgpath->path.is_active) | ||
790 | goto out; | ||
791 | |||
792 | DMWARN("dm-multipath: Failing path %s.", pgpath->path.dev->name); | ||
793 | |||
794 | pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path); | ||
795 | pgpath->path.is_active = 0; | ||
796 | pgpath->fail_count++; | ||
797 | |||
798 | m->nr_valid_paths--; | ||
799 | |||
800 | if (pgpath == m->current_pgpath) | ||
801 | m->current_pgpath = NULL; | ||
802 | |||
803 | schedule_work(&m->trigger_event); | ||
804 | |||
805 | out: | ||
806 | spin_unlock_irqrestore(&m->lock, flags); | ||
807 | |||
808 | return 0; | ||
809 | } | ||
810 | |||
811 | /* | ||
812 | * Reinstate a previously-failed path | ||
813 | */ | ||
814 | static int reinstate_path(struct pgpath *pgpath) | ||
815 | { | ||
816 | int r = 0; | ||
817 | unsigned long flags; | ||
818 | struct multipath *m = pgpath->pg->m; | ||
819 | |||
820 | spin_lock_irqsave(&m->lock, flags); | ||
821 | |||
822 | if (pgpath->path.is_active) | ||
823 | goto out; | ||
824 | |||
825 | if (!pgpath->pg->ps.type) { | ||
826 | 		DMWARN("Reinstate path not supported: " | ||
827 | 		       "no path selector set"); | ||
828 | r = -EINVAL; | ||
829 | goto out; | ||
830 | } | ||
831 | |||
832 | r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path); | ||
833 | if (r) | ||
834 | goto out; | ||
835 | |||
836 | pgpath->path.is_active = 1; | ||
837 | |||
838 | m->current_pgpath = NULL; | ||
839 | if (!m->nr_valid_paths++) | ||
840 | schedule_work(&m->process_queued_ios); | ||
841 | |||
842 | schedule_work(&m->trigger_event); | ||
843 | |||
844 | out: | ||
845 | spin_unlock_irqrestore(&m->lock, flags); | ||
846 | |||
847 | return r; | ||
848 | } | ||
849 | |||
850 | /* | ||
851 | * Fail or reinstate all paths that match the provided struct dm_dev. | ||
852 | */ | ||
853 | static int action_dev(struct multipath *m, struct dm_dev *dev, | ||
854 | action_fn action) | ||
855 | { | ||
856 | int r = 0; | ||
857 | struct pgpath *pgpath; | ||
858 | struct priority_group *pg; | ||
859 | |||
860 | list_for_each_entry(pg, &m->priority_groups, list) { | ||
861 | list_for_each_entry(pgpath, &pg->pgpaths, list) { | ||
862 | if (pgpath->path.dev == dev) | ||
863 | r = action(pgpath); | ||
864 | } | ||
865 | } | ||
866 | |||
867 | return r; | ||
868 | } | ||
869 | |||
870 | /* | ||
871 | * Temporarily try to avoid having to use the specified PG | ||
872 | */ | ||
873 | static void bypass_pg(struct multipath *m, struct priority_group *pg, | ||
874 | int bypassed) | ||
875 | { | ||
876 | unsigned long flags; | ||
877 | |||
878 | spin_lock_irqsave(&m->lock, flags); | ||
879 | |||
880 | pg->bypassed = bypassed; | ||
881 | m->current_pgpath = NULL; | ||
882 | m->current_pg = NULL; | ||
883 | |||
884 | spin_unlock_irqrestore(&m->lock, flags); | ||
885 | |||
886 | schedule_work(&m->trigger_event); | ||
887 | } | ||
888 | |||
889 | /* | ||
890 | * Switch to using the specified PG from the next I/O that gets mapped | ||
891 | */ | ||
892 | static int switch_pg_num(struct multipath *m, const char *pgstr) | ||
893 | { | ||
894 | struct priority_group *pg; | ||
895 | unsigned pgnum; | ||
896 | unsigned long flags; | ||
897 | |||
898 | if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || | ||
899 | (pgnum > m->nr_priority_groups)) { | ||
900 | DMWARN("invalid PG number supplied to switch_pg_num"); | ||
901 | return -EINVAL; | ||
902 | } | ||
903 | |||
904 | spin_lock_irqsave(&m->lock, flags); | ||
905 | list_for_each_entry(pg, &m->priority_groups, list) { | ||
906 | pg->bypassed = 0; | ||
907 | if (--pgnum) | ||
908 | continue; | ||
909 | |||
910 | m->current_pgpath = NULL; | ||
911 | m->current_pg = NULL; | ||
912 | m->next_pg = pg; | ||
913 | } | ||
914 | spin_unlock_irqrestore(&m->lock, flags); | ||
915 | |||
916 | schedule_work(&m->trigger_event); | ||
917 | return 0; | ||
918 | } | ||
919 | |||
920 | /* | ||
921 | * Set/clear bypassed status of a PG. | ||
922 | * PGs are numbered upwards from 1 in the order they were declared. | ||
923 | */ | ||
924 | static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed) | ||
925 | { | ||
926 | struct priority_group *pg; | ||
927 | unsigned pgnum; | ||
928 | |||
929 | if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || | ||
930 | (pgnum > m->nr_priority_groups)) { | ||
931 | DMWARN("invalid PG number supplied to bypass_pg"); | ||
932 | return -EINVAL; | ||
933 | } | ||
934 | |||
935 | list_for_each_entry(pg, &m->priority_groups, list) { | ||
936 | if (!--pgnum) | ||
937 | break; | ||
938 | } | ||
939 | |||
940 | bypass_pg(m, pg, bypassed); | ||
941 | return 0; | ||
942 | } | ||
943 | |||
944 | /* | ||
945 | * pg_init must call this when it has completed its initialisation | ||
946 | */ | ||
947 | void dm_pg_init_complete(struct path *path, unsigned err_flags) | ||
948 | { | ||
949 | struct pgpath *pgpath = path_to_pgpath(path); | ||
950 | struct priority_group *pg = pgpath->pg; | ||
951 | struct multipath *m = pg->m; | ||
952 | unsigned long flags; | ||
953 | |||
954 | /* We insist on failing the path if the PG is already bypassed. */ | ||
955 | if (err_flags && pg->bypassed) | ||
956 | err_flags |= MP_FAIL_PATH; | ||
957 | |||
958 | if (err_flags & MP_FAIL_PATH) | ||
959 | fail_path(pgpath); | ||
960 | |||
961 | if (err_flags & MP_BYPASS_PG) | ||
962 | bypass_pg(m, pg, 1); | ||
963 | |||
964 | spin_lock_irqsave(&m->lock, flags); | ||
965 | if (!err_flags) | ||
966 | m->queue_io = 0; | ||
967 | else { | ||
968 | m->current_pgpath = NULL; | ||
969 | m->current_pg = NULL; | ||
970 | } | ||
971 | schedule_work(&m->process_queued_ios); | ||
972 | spin_unlock_irqrestore(&m->lock, flags); | ||
973 | } | ||
974 | |||
975 | /* | ||
976 | * end_io handling | ||
977 | */ | ||
978 | static int do_end_io(struct multipath *m, struct bio *bio, | ||
979 | int error, struct mpath_io *mpio) | ||
980 | { | ||
981 | struct hw_handler *hwh = &m->hw_handler; | ||
982 | unsigned err_flags = MP_FAIL_PATH; /* Default behavior */ | ||
983 | |||
984 | if (!error) | ||
985 | return 0; /* I/O complete */ | ||
986 | |||
987 | spin_lock(&m->lock); | ||
988 | if (!m->nr_valid_paths) { | ||
989 | if (!m->queue_if_no_path || m->suspended) { | ||
990 | spin_unlock(&m->lock); | ||
991 | return -EIO; | ||
992 | } else { | ||
993 | spin_unlock(&m->lock); | ||
994 | goto requeue; | ||
995 | } | ||
996 | } | ||
997 | spin_unlock(&m->lock); | ||
998 | |||
999 | if (hwh->type && hwh->type->error) | ||
1000 | err_flags = hwh->type->error(hwh, bio); | ||
1001 | |||
1002 | if (mpio->pgpath) { | ||
1003 | if (err_flags & MP_FAIL_PATH) | ||
1004 | fail_path(mpio->pgpath); | ||
1005 | |||
1006 | if (err_flags & MP_BYPASS_PG) | ||
1007 | bypass_pg(m, mpio->pgpath->pg, 1); | ||
1008 | } | ||
1009 | |||
1010 | if (err_flags & MP_ERROR_IO) | ||
1011 | return -EIO; | ||
1012 | |||
1013 | requeue: | ||
1014 | dm_bio_restore(&mpio->details, bio); | ||
1015 | |||
1016 | /* queue for the daemon to resubmit or fail */ | ||
1017 | spin_lock(&m->lock); | ||
1018 | bio_list_add(&m->queued_ios, bio); | ||
1019 | m->queue_size++; | ||
1020 | if (!m->queue_io) | ||
1021 | schedule_work(&m->process_queued_ios); | ||
1022 | spin_unlock(&m->lock); | ||
1023 | |||
1024 | return 1; /* io not complete */ | ||
1025 | } | ||
1026 | |||
1027 | static int multipath_end_io(struct dm_target *ti, struct bio *bio, | ||
1028 | int error, union map_info *map_context) | ||
1029 | { | ||
1030 | struct multipath *m = (struct multipath *) ti->private; | ||
1031 | struct mpath_io *mpio = (struct mpath_io *) map_context->ptr; | ||
1032 | struct pgpath *pgpath = mpio->pgpath; | ||
1033 | struct path_selector *ps; | ||
1034 | int r; | ||
1035 | |||
1036 | r = do_end_io(m, bio, error, mpio); | ||
1037 | if (pgpath) { | ||
1038 | ps = &pgpath->pg->ps; | ||
1039 | if (ps->type->end_io) | ||
1040 | ps->type->end_io(ps, &pgpath->path); | ||
1041 | } | ||
1042 | if (r <= 0) | ||
1043 | mempool_free(mpio, m->mpio_pool); | ||
1044 | |||
1045 | return r; | ||
1046 | } | ||
1047 | |||
1048 | /* | ||
1049 | * Suspend can't complete until all the I/O is processed so if | ||
1050 | * the last path failed we will now error any queued I/O. | ||
1051 | */ | ||
1052 | static void multipath_presuspend(struct dm_target *ti) | ||
1053 | { | ||
1054 | struct multipath *m = (struct multipath *) ti->private; | ||
1055 | unsigned long flags; | ||
1056 | |||
1057 | spin_lock_irqsave(&m->lock, flags); | ||
1058 | m->suspended = 1; | ||
1059 | if (m->queue_if_no_path) | ||
1060 | schedule_work(&m->process_queued_ios); | ||
1061 | spin_unlock_irqrestore(&m->lock, flags); | ||
1062 | } | ||
1063 | |||
1064 | static void multipath_resume(struct dm_target *ti) | ||
1065 | { | ||
1066 | struct multipath *m = (struct multipath *) ti->private; | ||
1067 | unsigned long flags; | ||
1068 | |||
1069 | spin_lock_irqsave(&m->lock, flags); | ||
1070 | m->suspended = 0; | ||
1071 | spin_unlock_irqrestore(&m->lock, flags); | ||
1072 | } | ||
1073 | |||
1074 | /* | ||
1075 | * Info output has the following format: | ||
1076 | * num_multipath_feature_args [multipath_feature_args]* | ||
1077 | * num_handler_status_args [handler_status_args]* | ||
1078 | * num_groups init_group_number | ||
1079 | * [A|D|E num_ps_status_args [ps_status_args]* | ||
1080 | * num_paths num_selector_args | ||
1081 | * [path_dev A|F fail_count [selector_args]* ]+ ]+ | ||
1082 | * | ||
1083 | * Table output has the following format (identical to the constructor string): | ||
1084 | * num_feature_args [features_args]* | ||
1085 | * num_handler_args hw_handler [hw_handler_args]* | ||
1086 | * num_groups init_group_number | ||
1087 | * [priority selector-name num_ps_args [ps_args]* | ||
1088 | * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+ | ||
1089 | */ | ||
1090 | static int multipath_status(struct dm_target *ti, status_type_t type, | ||
1091 | char *result, unsigned int maxlen) | ||
1092 | { | ||
1093 | int sz = 0; | ||
1094 | unsigned long flags; | ||
1095 | struct multipath *m = (struct multipath *) ti->private; | ||
1096 | struct hw_handler *hwh = &m->hw_handler; | ||
1097 | struct priority_group *pg; | ||
1098 | struct pgpath *p; | ||
1099 | unsigned pg_num; | ||
1100 | char state; | ||
1101 | |||
1102 | spin_lock_irqsave(&m->lock, flags); | ||
1103 | |||
1104 | /* Features */ | ||
1105 | if (type == STATUSTYPE_INFO) | ||
1106 | DMEMIT("1 %u ", m->queue_size); | ||
1107 | else if (m->queue_if_no_path) | ||
1108 | DMEMIT("1 queue_if_no_path "); | ||
1109 | else | ||
1110 | DMEMIT("0 "); | ||
1111 | |||
1112 | if (hwh->type && hwh->type->status) | ||
1113 | sz += hwh->type->status(hwh, type, result + sz, maxlen - sz); | ||
1114 | else if (!hwh->type || type == STATUSTYPE_INFO) | ||
1115 | DMEMIT("0 "); | ||
1116 | else | ||
1117 | DMEMIT("1 %s ", hwh->type->name); | ||
1118 | |||
1119 | DMEMIT("%u ", m->nr_priority_groups); | ||
1120 | |||
1121 | if (m->next_pg) | ||
1122 | pg_num = m->next_pg->pg_num; | ||
1123 | else if (m->current_pg) | ||
1124 | pg_num = m->current_pg->pg_num; | ||
1125 | else | ||
1126 | pg_num = 1; | ||
1127 | |||
1128 | DMEMIT("%u ", pg_num); | ||
1129 | |||
1130 | switch (type) { | ||
1131 | case STATUSTYPE_INFO: | ||
1132 | list_for_each_entry(pg, &m->priority_groups, list) { | ||
1133 | if (pg->bypassed) | ||
1134 | state = 'D'; /* Disabled */ | ||
1135 | else if (pg == m->current_pg) | ||
1136 | state = 'A'; /* Currently Active */ | ||
1137 | else | ||
1138 | state = 'E'; /* Enabled */ | ||
1139 | |||
1140 | DMEMIT("%c ", state); | ||
1141 | |||
1142 | if (pg->ps.type->status) | ||
1143 | sz += pg->ps.type->status(&pg->ps, NULL, type, | ||
1144 | result + sz, | ||
1145 | maxlen - sz); | ||
1146 | else | ||
1147 | DMEMIT("0 "); | ||
1148 | |||
1149 | DMEMIT("%u %u ", pg->nr_pgpaths, | ||
1150 | pg->ps.type->info_args); | ||
1151 | |||
1152 | list_for_each_entry(p, &pg->pgpaths, list) { | ||
1153 | DMEMIT("%s %s %u ", p->path.dev->name, | ||
1154 | p->path.is_active ? "A" : "F", | ||
1155 | p->fail_count); | ||
1156 | if (pg->ps.type->status) | ||
1157 | sz += pg->ps.type->status(&pg->ps, | ||
1158 | &p->path, type, result + sz, | ||
1159 | maxlen - sz); | ||
1160 | } | ||
1161 | } | ||
1162 | break; | ||
1163 | |||
1164 | case STATUSTYPE_TABLE: | ||
1165 | list_for_each_entry(pg, &m->priority_groups, list) { | ||
1166 | DMEMIT("%s ", pg->ps.type->name); | ||
1167 | |||
1168 | if (pg->ps.type->status) | ||
1169 | sz += pg->ps.type->status(&pg->ps, NULL, type, | ||
1170 | result + sz, | ||
1171 | maxlen - sz); | ||
1172 | else | ||
1173 | DMEMIT("0 "); | ||
1174 | |||
1175 | DMEMIT("%u %u ", pg->nr_pgpaths, | ||
1176 | pg->ps.type->table_args); | ||
1177 | |||
1178 | list_for_each_entry(p, &pg->pgpaths, list) { | ||
1179 | DMEMIT("%s ", p->path.dev->name); | ||
1180 | if (pg->ps.type->status) | ||
1181 | sz += pg->ps.type->status(&pg->ps, | ||
1182 | &p->path, type, result + sz, | ||
1183 | maxlen - sz); | ||
1184 | } | ||
1185 | } | ||
1186 | break; | ||
1187 | } | ||
1188 | |||
1189 | spin_unlock_irqrestore(&m->lock, flags); | ||
1190 | |||
1191 | return 0; | ||
1192 | } | ||
1193 | |||
1194 | static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) | ||
1195 | { | ||
1196 | int r; | ||
1197 | struct dm_dev *dev; | ||
1198 | struct multipath *m = (struct multipath *) ti->private; | ||
1199 | action_fn action; | ||
1200 | |||
1201 | if (argc == 1) { | ||
1202 | if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) | ||
1203 | return queue_if_no_path(m, 1); | ||
1204 | else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) | ||
1205 | return queue_if_no_path(m, 0); | ||
1206 | } | ||
1207 | |||
1208 | if (argc != 2) | ||
1209 | goto error; | ||
1210 | |||
1211 | if (!strnicmp(argv[0], MESG_STR("disable_group"))) | ||
1212 | return bypass_pg_num(m, argv[1], 1); | ||
1213 | else if (!strnicmp(argv[0], MESG_STR("enable_group"))) | ||
1214 | return bypass_pg_num(m, argv[1], 0); | ||
1215 | else if (!strnicmp(argv[0], MESG_STR("switch_group"))) | ||
1216 | return switch_pg_num(m, argv[1]); | ||
1217 | else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) | ||
1218 | action = reinstate_path; | ||
1219 | else if (!strnicmp(argv[0], MESG_STR("fail_path"))) | ||
1220 | action = fail_path; | ||
1221 | else | ||
1222 | goto error; | ||
1223 | |||
1224 | r = dm_get_device(ti, argv[1], ti->begin, ti->len, | ||
1225 | dm_table_get_mode(ti->table), &dev); | ||
1226 | if (r) { | ||
1227 | DMWARN("dm-multipath message: error getting device %s", | ||
1228 | argv[1]); | ||
1229 | return -EINVAL; | ||
1230 | } | ||
1231 | |||
1232 | r = action_dev(m, dev, action); | ||
1233 | |||
1234 | dm_put_device(ti, dev); | ||
1235 | |||
1236 | return r; | ||
1237 | |||
1238 | error: | ||
1239 | DMWARN("Unrecognised multipath message received."); | ||
1240 | return -EINVAL; | ||
1241 | } | ||
1242 | |||
1243 | /*----------------------------------------------------------------- | ||
1244 | * Module setup | ||
1245 | *---------------------------------------------------------------*/ | ||
1246 | static struct target_type multipath_target = { | ||
1247 | .name = "multipath", | ||
1248 | .version = {1, 0, 4}, | ||
1249 | .module = THIS_MODULE, | ||
1250 | .ctr = multipath_ctr, | ||
1251 | .dtr = multipath_dtr, | ||
1252 | .map = multipath_map, | ||
1253 | .end_io = multipath_end_io, | ||
1254 | .presuspend = multipath_presuspend, | ||
1255 | .resume = multipath_resume, | ||
1256 | .status = multipath_status, | ||
1257 | .message = multipath_message, | ||
1258 | }; | ||
1259 | |||
1260 | static int __init dm_multipath_init(void) | ||
1261 | { | ||
1262 | int r; | ||
1263 | |||
1264 | /* allocate a slab for the dm_ios */ | ||
1265 | _mpio_cache = kmem_cache_create("dm_mpath", sizeof(struct mpath_io), | ||
1266 | 0, 0, NULL, NULL); | ||
1267 | if (!_mpio_cache) | ||
1268 | return -ENOMEM; | ||
1269 | |||
1270 | r = dm_register_target(&multipath_target); | ||
1271 | if (r < 0) { | ||
1272 | DMERR("%s: register failed %d", multipath_target.name, r); | ||
1273 | kmem_cache_destroy(_mpio_cache); | ||
1274 | return -EINVAL; | ||
1275 | } | ||
1276 | |||
1277 | DMINFO("dm-multipath version %u.%u.%u loaded", | ||
1278 | multipath_target.version[0], multipath_target.version[1], | ||
1279 | multipath_target.version[2]); | ||
1280 | |||
1281 | return r; | ||
1282 | } | ||
1283 | |||
1284 | static void __exit dm_multipath_exit(void) | ||
1285 | { | ||
1286 | int r; | ||
1287 | |||
1288 | r = dm_unregister_target(&multipath_target); | ||
1289 | if (r < 0) | ||
1290 | DMERR("%s: target unregister failed %d", | ||
1291 | multipath_target.name, r); | ||
1292 | kmem_cache_destroy(_mpio_cache); | ||
1293 | } | ||
1294 | |||
1295 | EXPORT_SYMBOL_GPL(dm_pg_init_complete); | ||
1296 | |||
1297 | module_init(dm_multipath_init); | ||
1298 | module_exit(dm_multipath_exit); | ||
1299 | |||
1300 | MODULE_DESCRIPTION(DM_NAME " multipath target"); | ||
1301 | MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>"); | ||
1302 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm-mpath.h b/drivers/md/dm-mpath.h new file mode 100644 index 000000000000..8a4bf2b6d52e --- /dev/null +++ b/drivers/md/dm-mpath.h | |||
@@ -0,0 +1,25 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2004 Red Hat, Inc. All rights reserved. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | * | ||
6 | * Multipath. | ||
7 | */ | ||
8 | |||
9 | #ifndef DM_MPATH_H | ||
10 | #define DM_MPATH_H | ||
11 | |||
12 | struct dm_dev; | ||
13 | |||
14 | struct path { | ||
15 | struct dm_dev *dev; /* Read-only */ | ||
16 | unsigned is_active; /* Read-only */ | ||
17 | |||
18 | void *pscontext; /* For path-selector use */ | ||
19 | void *hwhcontext; /* For hw-handler use */ | ||
20 | }; | ||
21 | |||
22 | /* Callback for hwh_pg_init_fn to use when complete */ | ||
23 | void dm_pg_init_complete(struct path *path, unsigned err_flags); | ||
24 | |||
25 | #endif | ||
diff --git a/drivers/md/dm-path-selector.c b/drivers/md/dm-path-selector.c new file mode 100644 index 000000000000..ac5c4bbec6c1 --- /dev/null +++ b/drivers/md/dm-path-selector.c | |||
@@ -0,0 +1,156 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Sistina Software. | ||
3 | * Copyright (C) 2004 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * Module Author: Heinz Mauelshagen | ||
6 | * | ||
7 | * This file is released under the GPL. | ||
8 | * | ||
9 | * Path selector registration. | ||
10 | */ | ||
11 | |||
12 | #include "dm.h" | ||
13 | #include "dm-path-selector.h" | ||
14 | |||
15 | #include <linux/slab.h> | ||
16 | |||
17 | struct ps_internal { | ||
18 | struct path_selector_type pst; | ||
19 | |||
20 | struct list_head list; | ||
21 | long use; | ||
22 | }; | ||
23 | |||
24 | #define pst_to_psi(__pst) container_of((__pst), struct ps_internal, pst) | ||
25 | |||
26 | static LIST_HEAD(_path_selectors); | ||
27 | static DECLARE_RWSEM(_ps_lock); | ||
28 | |||
29 | struct ps_internal *__find_path_selector_type(const char *name) | ||
30 | { | ||
31 | struct ps_internal *psi; | ||
32 | |||
33 | list_for_each_entry(psi, &_path_selectors, list) { | ||
34 | if (!strcmp(name, psi->pst.name)) | ||
35 | return psi; | ||
36 | } | ||
37 | |||
38 | return NULL; | ||
39 | } | ||
40 | |||
41 | static struct ps_internal *get_path_selector(const char *name) | ||
42 | { | ||
43 | struct ps_internal *psi; | ||
44 | |||
45 | down_read(&_ps_lock); | ||
46 | psi = __find_path_selector_type(name); | ||
47 | if (psi) { | ||
48 | if ((psi->use == 0) && !try_module_get(psi->pst.module)) | ||
49 | psi = NULL; | ||
50 | else | ||
51 | psi->use++; | ||
52 | } | ||
53 | up_read(&_ps_lock); | ||
54 | |||
55 | return psi; | ||
56 | } | ||
57 | |||
58 | struct path_selector_type *dm_get_path_selector(const char *name) | ||
59 | { | ||
60 | struct ps_internal *psi; | ||
61 | |||
62 | if (!name) | ||
63 | return NULL; | ||
64 | |||
65 | psi = get_path_selector(name); | ||
66 | if (!psi) { | ||
67 | request_module("dm-%s", name); | ||
68 | psi = get_path_selector(name); | ||
69 | } | ||
70 | |||
71 | return psi ? &psi->pst : NULL; | ||
72 | } | ||
73 | |||
74 | void dm_put_path_selector(struct path_selector_type *pst) | ||
75 | { | ||
76 | struct ps_internal *psi; | ||
77 | |||
78 | if (!pst) | ||
79 | return; | ||
80 | |||
81 | down_read(&_ps_lock); | ||
82 | psi = __find_path_selector_type(pst->name); | ||
83 | if (!psi) | ||
84 | goto out; | ||
85 | |||
86 | if (--psi->use == 0) | ||
87 | module_put(psi->pst.module); | ||
88 | |||
89 | if (psi->use < 0) | ||
90 | BUG(); | ||
91 | |||
92 | out: | ||
93 | up_read(&_ps_lock); | ||
94 | } | ||
95 | |||
96 | static struct ps_internal *_alloc_path_selector(struct path_selector_type *pst) | ||
97 | { | ||
98 | struct ps_internal *psi = kmalloc(sizeof(*psi), GFP_KERNEL); | ||
99 | |||
100 | if (psi) { | ||
101 | memset(psi, 0, sizeof(*psi)); | ||
102 | psi->pst = *pst; | ||
103 | } | ||
104 | |||
105 | return psi; | ||
106 | } | ||
107 | |||
108 | int dm_register_path_selector(struct path_selector_type *pst) | ||
109 | { | ||
110 | int r = 0; | ||
111 | struct ps_internal *psi = _alloc_path_selector(pst); | ||
112 | |||
113 | if (!psi) | ||
114 | return -ENOMEM; | ||
115 | |||
116 | down_write(&_ps_lock); | ||
117 | |||
118 | if (__find_path_selector_type(pst->name)) { | ||
119 | kfree(psi); | ||
120 | r = -EEXIST; | ||
121 | } else | ||
122 | list_add(&psi->list, &_path_selectors); | ||
123 | |||
124 | up_write(&_ps_lock); | ||
125 | |||
126 | return r; | ||
127 | } | ||
128 | |||
129 | int dm_unregister_path_selector(struct path_selector_type *pst) | ||
130 | { | ||
131 | struct ps_internal *psi; | ||
132 | |||
133 | down_write(&_ps_lock); | ||
134 | |||
135 | psi = __find_path_selector_type(pst->name); | ||
136 | if (!psi) { | ||
137 | up_write(&_ps_lock); | ||
138 | return -EINVAL; | ||
139 | } | ||
140 | |||
141 | if (psi->use) { | ||
142 | up_write(&_ps_lock); | ||
143 | return -ETXTBSY; | ||
144 | } | ||
145 | |||
146 | list_del(&psi->list); | ||
147 | |||
148 | up_write(&_ps_lock); | ||
149 | |||
150 | kfree(psi); | ||
151 | |||
152 | return 0; | ||
153 | } | ||
154 | |||
155 | EXPORT_SYMBOL_GPL(dm_register_path_selector); | ||
156 | EXPORT_SYMBOL_GPL(dm_unregister_path_selector); | ||
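The consumer of this registry is dm-mpath's table parser: it looks a selector type up by name (request_module() loads dm-<name> on demand), calls its create() hook, and drops the reference when the map is destroyed. A rough sketch of that consumer side; the helper name is invented, and only dm_get_path_selector(), dm_put_path_selector() and the create() op come from this patch:

	static int example_bind_selector(struct path_selector *ps, const char *name,
					 unsigned argc, char **argv)
	{
		struct path_selector_type *pst = dm_get_path_selector(name);

		if (!pst)
			return -EINVAL;

		if (pst->create(ps, argc, argv)) {
			dm_put_path_selector(pst);
			return -EINVAL;
		}

		ps->type = pst;
		return 0;
	}

	/* teardown mirrors this: ps->type->destroy(ps); dm_put_path_selector(ps->type); */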
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h new file mode 100644 index 000000000000..732d06a84f85 --- /dev/null +++ b/drivers/md/dm-path-selector.h | |||
@@ -0,0 +1,93 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Sistina Software. | ||
3 | * Copyright (C) 2004 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * Module Author: Heinz Mauelshagen | ||
6 | * | ||
7 | * This file is released under the GPL. | ||
8 | * | ||
9 | * Path-Selector registration. | ||
10 | */ | ||
11 | |||
12 | #ifndef DM_PATH_SELECTOR_H | ||
13 | #define DM_PATH_SELECTOR_H | ||
14 | |||
15 | #include <linux/device-mapper.h> | ||
16 | |||
17 | #include "dm-mpath.h" | ||
18 | |||
19 | /* | ||
20 | * We provide an abstraction for the code that chooses which path | ||
21 | * to send some io down. | ||
22 | */ | ||
23 | struct path_selector_type; | ||
24 | struct path_selector { | ||
25 | struct path_selector_type *type; | ||
26 | void *context; | ||
27 | }; | ||
28 | |||
29 | /* Information about a path selector type */ | ||
30 | struct path_selector_type { | ||
31 | char *name; | ||
32 | struct module *module; | ||
33 | |||
34 | unsigned int table_args; | ||
35 | unsigned int info_args; | ||
36 | |||
37 | /* | ||
38 | * Constructs a path selector object, takes custom arguments | ||
39 | */ | ||
40 | int (*create) (struct path_selector *ps, unsigned argc, char **argv); | ||
41 | void (*destroy) (struct path_selector *ps); | ||
42 | |||
43 | /* | ||
44 | * Add an opaque path object, along with some selector specific | ||
45 | * path args (eg, path priority). | ||
46 | */ | ||
47 | int (*add_path) (struct path_selector *ps, struct path *path, | ||
48 | int argc, char **argv, char **error); | ||
49 | |||
50 | /* | ||
51 | * Chooses a path for this io, if no paths are available then | ||
52 | * NULL will be returned. | ||
53 | * | ||
54 | * repeat_count is the number of times to use the path before | ||
55 | * calling the function again. 0 means don't call it again unless | ||
56 | * the path fails. | ||
57 | */ | ||
58 | struct path *(*select_path) (struct path_selector *ps, | ||
59 | unsigned *repeat_count); | ||
60 | |||
61 | /* | ||
62 | * Notify the selector that a path has failed. | ||
63 | */ | ||
64 | void (*fail_path) (struct path_selector *ps, struct path *p); | ||
65 | |||
66 | /* | ||
67 | * Ask selector to reinstate a path. | ||
68 | */ | ||
69 | int (*reinstate_path) (struct path_selector *ps, struct path *p); | ||
70 | |||
71 | /* | ||
72 | * Table content based on parameters added in ps_add_path_fn | ||
73 | * or path selector status | ||
74 | */ | ||
75 | int (*status) (struct path_selector *ps, struct path *path, | ||
76 | status_type_t type, char *result, unsigned int maxlen); | ||
77 | |||
78 | int (*end_io) (struct path_selector *ps, struct path *path); | ||
79 | }; | ||
80 | |||
81 | /* Register a path selector */ | ||
82 | int dm_register_path_selector(struct path_selector_type *type); | ||
83 | |||
84 | /* Unregister a path selector */ | ||
85 | int dm_unregister_path_selector(struct path_selector_type *type); | ||
86 | |||
87 | /* Returns a registered path selector type */ | ||
88 | struct path_selector_type *dm_get_path_selector(const char *name); | ||
89 | |||
90 | /* Releases a path selector */ | ||
91 | void dm_put_path_selector(struct path_selector_type *pst); | ||
92 | |||
93 | #endif | ||
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c new file mode 100644 index 000000000000..6e3cf7e13451 --- /dev/null +++ b/drivers/md/dm-raid1.c | |||
@@ -0,0 +1,1269 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Sistina Software Limited. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #include "dm.h" | ||
8 | #include "dm-bio-list.h" | ||
9 | #include "dm-io.h" | ||
10 | #include "dm-log.h" | ||
11 | #include "kcopyd.h" | ||
12 | |||
13 | #include <linux/ctype.h> | ||
14 | #include <linux/init.h> | ||
15 | #include <linux/mempool.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/pagemap.h> | ||
18 | #include <linux/slab.h> | ||
19 | #include <linux/time.h> | ||
20 | #include <linux/vmalloc.h> | ||
21 | #include <linux/workqueue.h> | ||
22 | |||
23 | static struct workqueue_struct *_kmirrord_wq; | ||
24 | static struct work_struct _kmirrord_work; | ||
25 | |||
26 | static inline void wake(void) | ||
27 | { | ||
28 | queue_work(_kmirrord_wq, &_kmirrord_work); | ||
29 | } | ||
30 | |||
31 | /*----------------------------------------------------------------- | ||
32 | * Region hash | ||
33 | * | ||
34 | * The mirror splits itself up into discrete regions. Each | ||
35 | * region can be in one of three states: clean, dirty, | ||
36 | * nosync. There is no need to put clean regions in the hash. | ||
37 | * | ||
38 | * In addition to being present in the hash table a region _may_ | ||
39 | * be present on one of three lists. | ||
40 | * | ||
41 | * clean_regions: Regions on this list have no io pending to | ||
42 | * them, they are in sync, we are no longer interested in them, | ||
43 | * they are dull. rh_update_states() will remove them from the | ||
44 | * hash table. | ||
45 | * | ||
46 | * quiesced_regions: These regions have been spun down, ready | ||
47 | * for recovery. rh_recovery_start() will remove regions from | ||
48 | * this list and hand them to kmirrord, which will schedule the | ||
49 | * recovery io with kcopyd. | ||
50 | * | ||
51 | * recovered_regions: Regions that kcopyd has successfully | ||
52 | * recovered. rh_update_states() will now schedule any delayed | ||
53 | * io, up the recovery_count, and remove the region from the | ||
54 | * hash. | ||
55 | * | ||
56 | * There are 2 locks: | ||
57 | * A rw spin lock 'hash_lock' protects just the hash table, | ||
58 | * this is never held in write mode from interrupt context, | ||
59 | * which I believe means that we only have to disable irqs when | ||
60 | * doing a write lock. | ||
61 | * | ||
62 | * An ordinary spin lock 'region_lock' that protects the three | ||
63 | * lists in the region_hash, with the 'state', 'list' and | ||
64 | * 'bhs_delayed' fields of the regions. This is used from irq | ||
65 | * context, so all other uses will have to suspend local irqs. | ||
66 | *---------------------------------------------------------------*/ | ||
67 | struct mirror_set; | ||
68 | struct region_hash { | ||
69 | struct mirror_set *ms; | ||
70 | uint32_t region_size; | ||
71 | unsigned region_shift; | ||
72 | |||
73 | /* holds persistent region state */ | ||
74 | struct dirty_log *log; | ||
75 | |||
76 | /* hash table */ | ||
77 | rwlock_t hash_lock; | ||
78 | mempool_t *region_pool; | ||
79 | unsigned int mask; | ||
80 | unsigned int nr_buckets; | ||
81 | struct list_head *buckets; | ||
82 | |||
83 | spinlock_t region_lock; | ||
84 | struct semaphore recovery_count; | ||
85 | struct list_head clean_regions; | ||
86 | struct list_head quiesced_regions; | ||
87 | struct list_head recovered_regions; | ||
88 | }; | ||
89 | |||
90 | enum { | ||
91 | RH_CLEAN, | ||
92 | RH_DIRTY, | ||
93 | RH_NOSYNC, | ||
94 | RH_RECOVERING | ||
95 | }; | ||
96 | |||
97 | struct region { | ||
98 | struct region_hash *rh; /* FIXME: can we get rid of this ? */ | ||
99 | region_t key; | ||
100 | int state; | ||
101 | |||
102 | struct list_head hash_list; | ||
103 | struct list_head list; | ||
104 | |||
105 | atomic_t pending; | ||
106 | struct bio_list delayed_bios; | ||
107 | }; | ||
108 | |||
109 | /* | ||
110 | * Conversion fns | ||
111 | */ | ||
112 | static inline region_t bio_to_region(struct region_hash *rh, struct bio *bio) | ||
113 | { | ||
114 | return bio->bi_sector >> rh->region_shift; | ||
115 | } | ||
116 | |||
117 | static inline sector_t region_to_sector(struct region_hash *rh, region_t region) | ||
118 | { | ||
119 | return region << rh->region_shift; | ||
120 | } | ||
121 | |||
122 | /* FIXME move this */ | ||
123 | static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw); | ||
124 | |||
125 | static void *region_alloc(unsigned int __nocast gfp_mask, void *pool_data) | ||
126 | { | ||
127 | return kmalloc(sizeof(struct region), gfp_mask); | ||
128 | } | ||
129 | |||
130 | static void region_free(void *element, void *pool_data) | ||
131 | { | ||
132 | kfree(element); | ||
133 | } | ||
134 | |||
135 | #define MIN_REGIONS 64 | ||
136 | #define MAX_RECOVERY 1 | ||
137 | static int rh_init(struct region_hash *rh, struct mirror_set *ms, | ||
138 | struct dirty_log *log, uint32_t region_size, | ||
139 | region_t nr_regions) | ||
140 | { | ||
141 | unsigned int nr_buckets, max_buckets; | ||
142 | size_t i; | ||
143 | |||
144 | /* | ||
145 | * Calculate a suitable number of buckets for our hash | ||
146 | * table. | ||
147 | */ | ||
148 | max_buckets = nr_regions >> 6; | ||
149 | for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1) | ||
150 | ; | ||
151 | nr_buckets >>= 1; | ||
152 | |||
153 | rh->ms = ms; | ||
154 | rh->log = log; | ||
155 | rh->region_size = region_size; | ||
156 | rh->region_shift = ffs(region_size) - 1; | ||
157 | rwlock_init(&rh->hash_lock); | ||
158 | rh->mask = nr_buckets - 1; | ||
159 | rh->nr_buckets = nr_buckets; | ||
160 | |||
161 | rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets)); | ||
162 | if (!rh->buckets) { | ||
163 | DMERR("unable to allocate region hash memory"); | ||
164 | return -ENOMEM; | ||
165 | } | ||
166 | |||
167 | for (i = 0; i < nr_buckets; i++) | ||
168 | INIT_LIST_HEAD(rh->buckets + i); | ||
169 | |||
170 | spin_lock_init(&rh->region_lock); | ||
171 | sema_init(&rh->recovery_count, 0); | ||
172 | INIT_LIST_HEAD(&rh->clean_regions); | ||
173 | INIT_LIST_HEAD(&rh->quiesced_regions); | ||
174 | INIT_LIST_HEAD(&rh->recovered_regions); | ||
175 | |||
176 | rh->region_pool = mempool_create(MIN_REGIONS, region_alloc, | ||
177 | region_free, NULL); | ||
178 | if (!rh->region_pool) { | ||
179 | vfree(rh->buckets); | ||
180 | rh->buckets = NULL; | ||
181 | return -ENOMEM; | ||
182 | } | ||
183 | |||
184 | return 0; | ||
185 | } | ||
186 | |||
187 | static void rh_exit(struct region_hash *rh) | ||
188 | { | ||
189 | unsigned int h; | ||
190 | struct region *reg, *nreg; | ||
191 | |||
192 | BUG_ON(!list_empty(&rh->quiesced_regions)); | ||
193 | for (h = 0; h < rh->nr_buckets; h++) { | ||
194 | list_for_each_entry_safe(reg, nreg, rh->buckets + h, hash_list) { | ||
195 | BUG_ON(atomic_read(®->pending)); | ||
196 | mempool_free(reg, rh->region_pool); | ||
197 | } | ||
198 | } | ||
199 | |||
200 | if (rh->log) | ||
201 | dm_destroy_dirty_log(rh->log); | ||
202 | if (rh->region_pool) | ||
203 | mempool_destroy(rh->region_pool); | ||
204 | vfree(rh->buckets); | ||
205 | } | ||
206 | |||
207 | #define RH_HASH_MULT 2654435387U | ||
208 | |||
209 | static inline unsigned int rh_hash(struct region_hash *rh, region_t region) | ||
210 | { | ||
211 | return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask; | ||
212 | } | ||
213 | |||
214 | static struct region *__rh_lookup(struct region_hash *rh, region_t region) | ||
215 | { | ||
216 | struct region *reg; | ||
217 | |||
218 | list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list) | ||
219 | if (reg->key == region) | ||
220 | return reg; | ||
221 | |||
222 | return NULL; | ||
223 | } | ||
224 | |||
225 | static void __rh_insert(struct region_hash *rh, struct region *reg) | ||
226 | { | ||
227 | unsigned int h = rh_hash(rh, reg->key); | ||
228 | list_add(®->hash_list, rh->buckets + h); | ||
229 | } | ||
230 | |||
231 | static struct region *__rh_alloc(struct region_hash *rh, region_t region) | ||
232 | { | ||
233 | struct region *reg, *nreg; | ||
234 | |||
235 | read_unlock(&rh->hash_lock); | ||
236 | nreg = mempool_alloc(rh->region_pool, GFP_NOIO); | ||
237 | nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? | ||
238 | RH_CLEAN : RH_NOSYNC; | ||
239 | nreg->rh = rh; | ||
240 | nreg->key = region; | ||
241 | |||
242 | INIT_LIST_HEAD(&nreg->list); | ||
243 | |||
244 | atomic_set(&nreg->pending, 0); | ||
245 | bio_list_init(&nreg->delayed_bios); | ||
246 | write_lock_irq(&rh->hash_lock); | ||
247 | |||
248 | reg = __rh_lookup(rh, region); | ||
249 | if (reg) | ||
250 | /* we lost the race */ | ||
251 | mempool_free(nreg, rh->region_pool); | ||
252 | |||
253 | else { | ||
254 | __rh_insert(rh, nreg); | ||
255 | if (nreg->state == RH_CLEAN) { | ||
256 | spin_lock(&rh->region_lock); | ||
257 | list_add(&nreg->list, &rh->clean_regions); | ||
258 | spin_unlock(&rh->region_lock); | ||
259 | } | ||
260 | reg = nreg; | ||
261 | } | ||
262 | write_unlock_irq(&rh->hash_lock); | ||
263 | read_lock(&rh->hash_lock); | ||
264 | |||
265 | return reg; | ||
266 | } | ||
267 | |||
268 | static inline struct region *__rh_find(struct region_hash *rh, region_t region) | ||
269 | { | ||
270 | struct region *reg; | ||
271 | |||
272 | reg = __rh_lookup(rh, region); | ||
273 | if (!reg) | ||
274 | reg = __rh_alloc(rh, region); | ||
275 | |||
276 | return reg; | ||
277 | } | ||
278 | |||
279 | static int rh_state(struct region_hash *rh, region_t region, int may_block) | ||
280 | { | ||
281 | int r; | ||
282 | struct region *reg; | ||
283 | |||
284 | read_lock(&rh->hash_lock); | ||
285 | reg = __rh_lookup(rh, region); | ||
286 | read_unlock(&rh->hash_lock); | ||
287 | |||
288 | if (reg) | ||
289 | return reg->state; | ||
290 | |||
291 | /* | ||
292 | * The region wasn't in the hash, so we fall back to the | ||
293 | * dirty log. | ||
294 | */ | ||
295 | r = rh->log->type->in_sync(rh->log, region, may_block); | ||
296 | |||
297 | /* | ||
298 | * Any error from the dirty log (eg. -EWOULDBLOCK) is | ||
299 | * treated as RH_NOSYNC. | ||
300 | */ | ||
301 | return r == 1 ? RH_CLEAN : RH_NOSYNC; | ||
302 | } | ||
303 | |||
304 | static inline int rh_in_sync(struct region_hash *rh, | ||
305 | region_t region, int may_block) | ||
306 | { | ||
307 | int state = rh_state(rh, region, may_block); | ||
308 | return state == RH_CLEAN || state == RH_DIRTY; | ||
309 | } | ||
310 | |||
311 | static void dispatch_bios(struct mirror_set *ms, struct bio_list *bio_list) | ||
312 | { | ||
313 | struct bio *bio; | ||
314 | |||
315 | while ((bio = bio_list_pop(bio_list))) { | ||
316 | queue_bio(ms, bio, WRITE); | ||
317 | } | ||
318 | } | ||
319 | |||
320 | static void rh_update_states(struct region_hash *rh) | ||
321 | { | ||
322 | struct region *reg, *next; | ||
323 | |||
324 | LIST_HEAD(clean); | ||
325 | LIST_HEAD(recovered); | ||
326 | |||
327 | /* | ||
328 | * Quickly grab the lists. | ||
329 | */ | ||
330 | write_lock_irq(&rh->hash_lock); | ||
331 | spin_lock(&rh->region_lock); | ||
332 | if (!list_empty(&rh->clean_regions)) { | ||
333 | list_splice(&rh->clean_regions, &clean); | ||
334 | INIT_LIST_HEAD(&rh->clean_regions); | ||
335 | |||
336 | list_for_each_entry (reg, &clean, list) { | ||
337 | rh->log->type->clear_region(rh->log, reg->key); | ||
338 | list_del(®->hash_list); | ||
339 | } | ||
340 | } | ||
341 | |||
342 | if (!list_empty(&rh->recovered_regions)) { | ||
343 | list_splice(&rh->recovered_regions, &recovered); | ||
344 | INIT_LIST_HEAD(&rh->recovered_regions); | ||
345 | |||
346 | list_for_each_entry (reg, &recovered, list) | ||
347 | list_del(®->hash_list); | ||
348 | } | ||
349 | spin_unlock(&rh->region_lock); | ||
350 | write_unlock_irq(&rh->hash_lock); | ||
351 | |||
352 | /* | ||
353 | * All the regions on the recovered and clean lists have | ||
354 | * now been pulled out of the system, so no need to do | ||
355 | * any more locking. | ||
356 | */ | ||
357 | list_for_each_entry_safe (reg, next, &recovered, list) { | ||
358 | rh->log->type->clear_region(rh->log, reg->key); | ||
359 | rh->log->type->complete_resync_work(rh->log, reg->key, 1); | ||
360 | dispatch_bios(rh->ms, ®->delayed_bios); | ||
361 | up(&rh->recovery_count); | ||
362 | mempool_free(reg, rh->region_pool); | ||
363 | } | ||
364 | |||
365 | if (!list_empty(&recovered)) | ||
366 | rh->log->type->flush(rh->log); | ||
367 | |||
368 | list_for_each_entry_safe (reg, next, &clean, list) | ||
369 | mempool_free(reg, rh->region_pool); | ||
370 | } | ||
371 | |||
372 | static void rh_inc(struct region_hash *rh, region_t region) | ||
373 | { | ||
374 | struct region *reg; | ||
375 | |||
376 | read_lock(&rh->hash_lock); | ||
377 | reg = __rh_find(rh, region); | ||
378 | if (reg->state == RH_CLEAN) { | ||
379 | rh->log->type->mark_region(rh->log, reg->key); | ||
380 | |||
381 | spin_lock_irq(&rh->region_lock); | ||
382 | reg->state = RH_DIRTY; | ||
383 | list_del_init(®->list); /* take off the clean list */ | ||
384 | spin_unlock_irq(&rh->region_lock); | ||
385 | } | ||
386 | |||
387 | atomic_inc(®->pending); | ||
388 | read_unlock(&rh->hash_lock); | ||
389 | } | ||
390 | |||
391 | static void rh_inc_pending(struct region_hash *rh, struct bio_list *bios) | ||
392 | { | ||
393 | struct bio *bio; | ||
394 | |||
395 | for (bio = bios->head; bio; bio = bio->bi_next) | ||
396 | rh_inc(rh, bio_to_region(rh, bio)); | ||
397 | } | ||
398 | |||
399 | static void rh_dec(struct region_hash *rh, region_t region) | ||
400 | { | ||
401 | unsigned long flags; | ||
402 | struct region *reg; | ||
403 | int should_wake = 0; | ||
404 | |||
405 | read_lock(&rh->hash_lock); | ||
406 | reg = __rh_lookup(rh, region); | ||
407 | read_unlock(&rh->hash_lock); | ||
408 | |||
409 | if (atomic_dec_and_test(®->pending)) { | ||
410 | spin_lock_irqsave(&rh->region_lock, flags); | ||
411 | if (reg->state == RH_RECOVERING) { | ||
412 | list_add_tail(®->list, &rh->quiesced_regions); | ||
413 | } else { | ||
414 | reg->state = RH_CLEAN; | ||
415 | list_add(®->list, &rh->clean_regions); | ||
416 | } | ||
417 | spin_unlock_irqrestore(&rh->region_lock, flags); | ||
418 | should_wake = 1; | ||
419 | } | ||
420 | |||
421 | if (should_wake) | ||
422 | wake(); | ||
423 | } | ||
424 | |||
425 | /* | ||
426 | * Starts quiescing a region in preparation for recovery. | ||
427 | */ | ||
428 | static int __rh_recovery_prepare(struct region_hash *rh) | ||
429 | { | ||
430 | int r; | ||
431 | struct region *reg; | ||
432 | region_t region; | ||
433 | |||
434 | /* | ||
435 | * Ask the dirty log what's next. | ||
436 | */ | ||
437 | r = rh->log->type->get_resync_work(rh->log, ®ion); | ||
438 | if (r <= 0) | ||
439 | return r; | ||
440 | |||
441 | /* | ||
442 | * Get this region, and start it quiescing by setting the | ||
443 | * recovering flag. | ||
444 | */ | ||
445 | read_lock(&rh->hash_lock); | ||
446 | reg = __rh_find(rh, region); | ||
447 | read_unlock(&rh->hash_lock); | ||
448 | |||
449 | spin_lock_irq(&rh->region_lock); | ||
450 | reg->state = RH_RECOVERING; | ||
451 | |||
452 | /* Already quiesced ? */ | ||
453 | if (atomic_read(®->pending)) | ||
454 | list_del_init(®->list); | ||
455 | |||
456 | else { | ||
457 | list_del_init(®->list); | ||
458 | list_add(®->list, &rh->quiesced_regions); | ||
459 | } | ||
460 | spin_unlock_irq(&rh->region_lock); | ||
461 | |||
462 | return 1; | ||
463 | } | ||
464 | |||
465 | static void rh_recovery_prepare(struct region_hash *rh) | ||
466 | { | ||
467 | while (!down_trylock(&rh->recovery_count)) | ||
468 | if (__rh_recovery_prepare(rh) <= 0) { | ||
469 | up(&rh->recovery_count); | ||
470 | break; | ||
471 | } | ||
472 | } | ||
473 | |||
474 | /* | ||
475 | * Returns any quiesced regions. | ||
476 | */ | ||
477 | static struct region *rh_recovery_start(struct region_hash *rh) | ||
478 | { | ||
479 | struct region *reg = NULL; | ||
480 | |||
481 | spin_lock_irq(&rh->region_lock); | ||
482 | if (!list_empty(&rh->quiesced_regions)) { | ||
483 | reg = list_entry(rh->quiesced_regions.next, | ||
484 | struct region, list); | ||
485 | list_del_init(®->list); /* remove from the quiesced list */ | ||
486 | } | ||
487 | spin_unlock_irq(&rh->region_lock); | ||
488 | |||
489 | return reg; | ||
490 | } | ||
491 | |||
492 | /* FIXME: success ignored for now */ | ||
493 | static void rh_recovery_end(struct region *reg, int success) | ||
494 | { | ||
495 | struct region_hash *rh = reg->rh; | ||
496 | |||
497 | spin_lock_irq(&rh->region_lock); | ||
498 | list_add(®->list, ®->rh->recovered_regions); | ||
499 | spin_unlock_irq(&rh->region_lock); | ||
500 | |||
501 | wake(); | ||
502 | } | ||
503 | |||
504 | static void rh_flush(struct region_hash *rh) | ||
505 | { | ||
506 | rh->log->type->flush(rh->log); | ||
507 | } | ||
508 | |||
509 | static void rh_delay(struct region_hash *rh, struct bio *bio) | ||
510 | { | ||
511 | struct region *reg; | ||
512 | |||
513 | read_lock(&rh->hash_lock); | ||
514 | reg = __rh_find(rh, bio_to_region(rh, bio)); | ||
515 | bio_list_add(®->delayed_bios, bio); | ||
516 | read_unlock(&rh->hash_lock); | ||
517 | } | ||
518 | |||
519 | static void rh_stop_recovery(struct region_hash *rh) | ||
520 | { | ||
521 | int i; | ||
522 | |||
523 | /* wait for any recovering regions */ | ||
524 | for (i = 0; i < MAX_RECOVERY; i++) | ||
525 | down(&rh->recovery_count); | ||
526 | } | ||
527 | |||
528 | static void rh_start_recovery(struct region_hash *rh) | ||
529 | { | ||
530 | int i; | ||
531 | |||
532 | for (i = 0; i < MAX_RECOVERY; i++) | ||
533 | up(&rh->recovery_count); | ||
534 | |||
535 | wake(); | ||
536 | } | ||
537 | |||
538 | /*----------------------------------------------------------------- | ||
539 | * Mirror set structures. | ||
540 | *---------------------------------------------------------------*/ | ||
541 | struct mirror { | ||
542 | atomic_t error_count; | ||
543 | struct dm_dev *dev; | ||
544 | sector_t offset; | ||
545 | }; | ||
546 | |||
547 | struct mirror_set { | ||
548 | struct dm_target *ti; | ||
549 | struct list_head list; | ||
550 | struct region_hash rh; | ||
551 | struct kcopyd_client *kcopyd_client; | ||
552 | |||
553 | spinlock_t lock; /* protects the next two lists */ | ||
554 | struct bio_list reads; | ||
555 | struct bio_list writes; | ||
556 | |||
557 | /* recovery */ | ||
558 | region_t nr_regions; | ||
559 | int in_sync; | ||
560 | |||
561 | unsigned int nr_mirrors; | ||
562 | struct mirror mirror[0]; | ||
563 | }; | ||
564 | |||
565 | /* | ||
566 | * Every mirror should look like this one. | ||
567 | */ | ||
568 | #define DEFAULT_MIRROR 0 | ||
569 | |||
570 | /* | ||
571 | * This is yucky. We squirrel the mirror_set struct away inside | ||
572 | * bi_next for write bios. This is safe since the bio | ||
573 | * doesn't get submitted to the lower levels of the block layer. | ||
574 | */ | ||
575 | static struct mirror_set *bio_get_ms(struct bio *bio) | ||
576 | { | ||
577 | return (struct mirror_set *) bio->bi_next; | ||
578 | } | ||
579 | |||
580 | static void bio_set_ms(struct bio *bio, struct mirror_set *ms) | ||
581 | { | ||
582 | bio->bi_next = (struct bio *) ms; | ||
583 | } | ||
584 | |||
585 | /*----------------------------------------------------------------- | ||
586 | * Recovery. | ||
587 | * | ||
588 | * When a mirror is first activated we may find that some regions | ||
589 | * are in the no-sync state. We have to recover these by | ||
590 | * recopying from the default mirror to all the others. | ||
591 | *---------------------------------------------------------------*/ | ||
592 | static void recovery_complete(int read_err, unsigned int write_err, | ||
593 | void *context) | ||
594 | { | ||
595 | struct region *reg = (struct region *) context; | ||
596 | |||
597 | /* FIXME: better error handling */ | ||
598 | rh_recovery_end(reg, read_err || write_err); | ||
599 | } | ||
600 | |||
601 | static int recover(struct mirror_set *ms, struct region *reg) | ||
602 | { | ||
603 | int r; | ||
604 | unsigned int i; | ||
605 | struct io_region from, to[KCOPYD_MAX_REGIONS], *dest; | ||
606 | struct mirror *m; | ||
607 | unsigned long flags = 0; | ||
608 | |||
609 | /* fill in the source */ | ||
610 | m = ms->mirror + DEFAULT_MIRROR; | ||
611 | from.bdev = m->dev->bdev; | ||
612 | from.sector = m->offset + region_to_sector(reg->rh, reg->key); | ||
613 | if (reg->key == (ms->nr_regions - 1)) { | ||
614 | /* | ||
615 | * The final region may be smaller than | ||
616 | * region_size. | ||
617 | */ | ||
618 | from.count = ms->ti->len & (reg->rh->region_size - 1); | ||
619 | if (!from.count) | ||
620 | from.count = reg->rh->region_size; | ||
621 | } else | ||
622 | from.count = reg->rh->region_size; | ||
623 | |||
624 | /* fill in the destinations */ | ||
625 | for (i = 0, dest = to; i < ms->nr_mirrors; i++) { | ||
626 | if (i == DEFAULT_MIRROR) | ||
627 | continue; | ||
628 | |||
629 | m = ms->mirror + i; | ||
630 | dest->bdev = m->dev->bdev; | ||
631 | dest->sector = m->offset + region_to_sector(reg->rh, reg->key); | ||
632 | dest->count = from.count; | ||
633 | dest++; | ||
634 | } | ||
635 | |||
636 | /* hand to kcopyd */ | ||
637 | set_bit(KCOPYD_IGNORE_ERROR, &flags); | ||
638 | r = kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags, | ||
639 | recovery_complete, reg); | ||
640 | |||
641 | return r; | ||
642 | } | ||
643 | |||
644 | static void do_recovery(struct mirror_set *ms) | ||
645 | { | ||
646 | int r; | ||
647 | struct region *reg; | ||
648 | struct dirty_log *log = ms->rh.log; | ||
649 | |||
650 | /* | ||
651 | * Start quiescing some regions. | ||
652 | */ | ||
653 | rh_recovery_prepare(&ms->rh); | ||
654 | |||
655 | /* | ||
656 | * Copy any already quiesced regions. | ||
657 | */ | ||
658 | while ((reg = rh_recovery_start(&ms->rh))) { | ||
659 | r = recover(ms, reg); | ||
660 | if (r) | ||
661 | rh_recovery_end(reg, 0); | ||
662 | } | ||
663 | |||
664 | /* | ||
665 | * Update the in sync flag. | ||
666 | */ | ||
667 | if (!ms->in_sync && | ||
668 | (log->type->get_sync_count(log) == ms->nr_regions)) { | ||
669 | /* the sync is complete */ | ||
670 | dm_table_event(ms->ti->table); | ||
671 | ms->in_sync = 1; | ||
672 | } | ||
673 | } | ||
674 | |||
675 | /*----------------------------------------------------------------- | ||
676 | * Reads | ||
677 | *---------------------------------------------------------------*/ | ||
678 | static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) | ||
679 | { | ||
680 | /* FIXME: add read balancing */ | ||
681 | return ms->mirror + DEFAULT_MIRROR; | ||
682 | } | ||
683 | |||
684 | /* | ||
685 | * Remap a bio to a particular mirror. | ||
686 | */ | ||
687 | static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio) | ||
688 | { | ||
689 | bio->bi_bdev = m->dev->bdev; | ||
690 | bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin); | ||
691 | } | ||
692 | |||
693 | static void do_reads(struct mirror_set *ms, struct bio_list *reads) | ||
694 | { | ||
695 | region_t region; | ||
696 | struct bio *bio; | ||
697 | struct mirror *m; | ||
698 | |||
699 | while ((bio = bio_list_pop(reads))) { | ||
700 | region = bio_to_region(&ms->rh, bio); | ||
701 | |||
702 | /* | ||
703 | * We can only read balance if the region is in sync. | ||
704 | */ | ||
705 | if (rh_in_sync(&ms->rh, region, 0)) | ||
706 | m = choose_mirror(ms, bio->bi_sector); | ||
707 | else | ||
708 | m = ms->mirror + DEFAULT_MIRROR; | ||
709 | |||
710 | map_bio(ms, m, bio); | ||
711 | generic_make_request(bio); | ||
712 | } | ||
713 | } | ||
714 | |||
715 | /*----------------------------------------------------------------- | ||
716 | * Writes. | ||
717 | * | ||
718 | * We do different things with the write io depending on the | ||
719 | * state of the region that it's in: | ||
720 | * | ||
721 | * SYNC: increment pending, use dm-io to write to *all* mirrors | ||
722 | * RECOVERING: delay the io until recovery completes | ||
723 | * NOSYNC: increment pending, just write to the default mirror | ||
724 | *---------------------------------------------------------------*/ | ||
725 | static void write_callback(unsigned long error, void *context) | ||
726 | { | ||
727 | unsigned int i; | ||
728 | int uptodate = 1; | ||
729 | struct bio *bio = (struct bio *) context; | ||
730 | struct mirror_set *ms; | ||
731 | |||
732 | ms = bio_get_ms(bio); | ||
733 | bio_set_ms(bio, NULL); | ||
734 | |||
735 | /* | ||
736 | * NOTE: We don't decrement the pending count here, | ||
737 | * instead it is done by the targets endio function. | ||
738 | * This way we handle both writes to SYNC and NOSYNC | ||
739 | * regions with the same code. | ||
740 | */ | ||
741 | |||
742 | if (error) { | ||
743 | /* | ||
744 | * only error the io if all mirrors failed. | ||
745 | * FIXME: bogus | ||
746 | */ | ||
747 | uptodate = 0; | ||
748 | for (i = 0; i < ms->nr_mirrors; i++) | ||
749 | if (!test_bit(i, &error)) { | ||
750 | uptodate = 1; | ||
751 | break; | ||
752 | } | ||
753 | } | ||
754 | bio_endio(bio, bio->bi_size, 0); | ||
755 | } | ||
756 | |||
757 | static void do_write(struct mirror_set *ms, struct bio *bio) | ||
758 | { | ||
759 | unsigned int i; | ||
760 | struct io_region io[KCOPYD_MAX_REGIONS+1]; | ||
761 | struct mirror *m; | ||
762 | |||
763 | for (i = 0; i < ms->nr_mirrors; i++) { | ||
764 | m = ms->mirror + i; | ||
765 | |||
766 | io[i].bdev = m->dev->bdev; | ||
767 | io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin); | ||
768 | io[i].count = bio->bi_size >> 9; | ||
769 | } | ||
770 | |||
771 | bio_set_ms(bio, ms); | ||
772 | dm_io_async_bvec(ms->nr_mirrors, io, WRITE, | ||
773 | bio->bi_io_vec + bio->bi_idx, | ||
774 | write_callback, bio); | ||
775 | } | ||
776 | |||
777 | static void do_writes(struct mirror_set *ms, struct bio_list *writes) | ||
778 | { | ||
779 | int state; | ||
780 | struct bio *bio; | ||
781 | struct bio_list sync, nosync, recover, *this_list = NULL; | ||
782 | |||
783 | if (!writes->head) | ||
784 | return; | ||
785 | |||
786 | /* | ||
787 | * Classify each write. | ||
788 | */ | ||
789 | bio_list_init(&sync); | ||
790 | bio_list_init(&nosync); | ||
791 | bio_list_init(&recover); | ||
792 | |||
793 | while ((bio = bio_list_pop(writes))) { | ||
794 | state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1); | ||
795 | switch (state) { | ||
796 | case RH_CLEAN: | ||
797 | case RH_DIRTY: | ||
798 | this_list = &sync; | ||
799 | break; | ||
800 | |||
801 | case RH_NOSYNC: | ||
802 | this_list = &nosync; | ||
803 | break; | ||
804 | |||
805 | case RH_RECOVERING: | ||
806 | this_list = &recover; | ||
807 | break; | ||
808 | } | ||
809 | |||
810 | bio_list_add(this_list, bio); | ||
811 | } | ||
812 | |||
813 | /* | ||
814 | * Increment the pending counts for any regions that will | ||
815 | * be written to (writes to recover regions are going to | ||
816 | * be delayed). | ||
817 | */ | ||
818 | rh_inc_pending(&ms->rh, &sync); | ||
819 | rh_inc_pending(&ms->rh, &nosync); | ||
820 | rh_flush(&ms->rh); | ||
821 | |||
822 | /* | ||
823 | * Dispatch io. | ||
824 | */ | ||
825 | while ((bio = bio_list_pop(&sync))) | ||
826 | do_write(ms, bio); | ||
827 | |||
828 | while ((bio = bio_list_pop(&recover))) | ||
829 | rh_delay(&ms->rh, bio); | ||
830 | |||
831 | while ((bio = bio_list_pop(&nosync))) { | ||
832 | map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio); | ||
833 | generic_make_request(bio); | ||
834 | } | ||
835 | } | ||
836 | |||
837 | /*----------------------------------------------------------------- | ||
838 | * kmirrord | ||
839 | *---------------------------------------------------------------*/ | ||
840 | static LIST_HEAD(_mirror_sets); | ||
841 | static DECLARE_RWSEM(_mirror_sets_lock); | ||
842 | |||
843 | static void do_mirror(struct mirror_set *ms) | ||
844 | { | ||
845 | struct bio_list reads, writes; | ||
846 | |||
847 | spin_lock(&ms->lock); | ||
848 | reads = ms->reads; | ||
849 | writes = ms->writes; | ||
850 | bio_list_init(&ms->reads); | ||
851 | bio_list_init(&ms->writes); | ||
852 | spin_unlock(&ms->lock); | ||
853 | |||
854 | rh_update_states(&ms->rh); | ||
855 | do_recovery(ms); | ||
856 | do_reads(ms, &reads); | ||
857 | do_writes(ms, &writes); | ||
858 | } | ||
859 | |||
860 | static void do_work(void *ignored) | ||
861 | { | ||
862 | struct mirror_set *ms; | ||
863 | |||
864 | down_read(&_mirror_sets_lock); | ||
865 | list_for_each_entry (ms, &_mirror_sets, list) | ||
866 | do_mirror(ms); | ||
867 | up_read(&_mirror_sets_lock); | ||
868 | } | ||
869 | |||
870 | /*----------------------------------------------------------------- | ||
871 | * Target functions | ||
872 | *---------------------------------------------------------------*/ | ||
873 | static struct mirror_set *alloc_context(unsigned int nr_mirrors, | ||
874 | uint32_t region_size, | ||
875 | struct dm_target *ti, | ||
876 | struct dirty_log *dl) | ||
877 | { | ||
878 | size_t len; | ||
879 | struct mirror_set *ms = NULL; | ||
880 | |||
881 | if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors)) | ||
882 | return NULL; | ||
883 | |||
884 | len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors); | ||
885 | |||
886 | ms = kmalloc(len, GFP_KERNEL); | ||
887 | if (!ms) { | ||
888 | ti->error = "dm-mirror: Cannot allocate mirror context"; | ||
889 | return NULL; | ||
890 | } | ||
891 | |||
892 | memset(ms, 0, len); | ||
893 | spin_lock_init(&ms->lock); | ||
894 | |||
895 | ms->ti = ti; | ||
896 | ms->nr_mirrors = nr_mirrors; | ||
897 | ms->nr_regions = dm_sector_div_up(ti->len, region_size); | ||
898 | ms->in_sync = 0; | ||
899 | |||
900 | if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { | ||
901 | ti->error = "dm-mirror: Error creating dirty region hash"; | ||
902 | kfree(ms); | ||
903 | return NULL; | ||
904 | } | ||
905 | |||
906 | return ms; | ||
907 | } | ||
908 | |||
909 | static void free_context(struct mirror_set *ms, struct dm_target *ti, | ||
910 | unsigned int m) | ||
911 | { | ||
912 | while (m--) | ||
913 | dm_put_device(ti, ms->mirror[m].dev); | ||
914 | |||
915 | rh_exit(&ms->rh); | ||
916 | kfree(ms); | ||
917 | } | ||
918 | |||
919 | static inline int _check_region_size(struct dm_target *ti, uint32_t size) | ||
920 | { | ||
921 | return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) || | ||
922 | size > ti->len); | ||
923 | } | ||
924 | |||
925 | static int get_mirror(struct mirror_set *ms, struct dm_target *ti, | ||
926 | unsigned int mirror, char **argv) | ||
927 | { | ||
928 | sector_t offset; | ||
929 | |||
930 | if (sscanf(argv[1], SECTOR_FORMAT, &offset) != 1) { | ||
931 | ti->error = "dm-mirror: Invalid offset"; | ||
932 | return -EINVAL; | ||
933 | } | ||
934 | |||
935 | if (dm_get_device(ti, argv[0], offset, ti->len, | ||
936 | dm_table_get_mode(ti->table), | ||
937 | &ms->mirror[mirror].dev)) { | ||
938 | ti->error = "dm-mirror: Device lookup failure"; | ||
939 | return -ENXIO; | ||
940 | } | ||
941 | |||
942 | ms->mirror[mirror].offset = offset; | ||
943 | |||
944 | return 0; | ||
945 | } | ||
946 | |||
947 | static int add_mirror_set(struct mirror_set *ms) | ||
948 | { | ||
949 | down_write(&_mirror_sets_lock); | ||
950 | list_add_tail(&ms->list, &_mirror_sets); | ||
951 | up_write(&_mirror_sets_lock); | ||
952 | wake(); | ||
953 | |||
954 | return 0; | ||
955 | } | ||
956 | |||
957 | static void del_mirror_set(struct mirror_set *ms) | ||
958 | { | ||
959 | down_write(&_mirror_sets_lock); | ||
960 | list_del(&ms->list); | ||
961 | up_write(&_mirror_sets_lock); | ||
962 | } | ||
963 | |||
964 | /* | ||
965 | * Create dirty log: log_type #log_params <log_params> | ||
966 | */ | ||
967 | static struct dirty_log *create_dirty_log(struct dm_target *ti, | ||
968 | unsigned int argc, char **argv, | ||
969 | unsigned int *args_used) | ||
970 | { | ||
971 | unsigned int param_count; | ||
972 | struct dirty_log *dl; | ||
973 | |||
974 | if (argc < 2) { | ||
975 | ti->error = "dm-mirror: Insufficient mirror log arguments"; | ||
976 | return NULL; | ||
977 | } | ||
978 | |||
979 | if (sscanf(argv[1], "%u", ¶m_count) != 1) { | ||
980 | ti->error = "dm-mirror: Invalid mirror log argument count"; | ||
981 | return NULL; | ||
982 | } | ||
983 | |||
984 | *args_used = 2 + param_count; | ||
985 | |||
986 | if (argc < *args_used) { | ||
987 | ti->error = "dm-mirror: Insufficient mirror log arguments"; | ||
988 | return NULL; | ||
989 | } | ||
990 | |||
991 | dl = dm_create_dirty_log(argv[0], ti, param_count, argv + 2); | ||
992 | if (!dl) { | ||
993 | ti->error = "dm-mirror: Error creating mirror dirty log"; | ||
994 | return NULL; | ||
995 | } | ||
996 | |||
997 | if (!_check_region_size(ti, dl->type->get_region_size(dl))) { | ||
998 | ti->error = "dm-mirror: Invalid region size"; | ||
999 | dm_destroy_dirty_log(dl); | ||
1000 | return NULL; | ||
1001 | } | ||
1002 | |||
1003 | return dl; | ||
1004 | } | ||
1005 | |||
1006 | /* | ||
1007 | * Construct a mirror mapping: | ||
1008 | * | ||
1009 | * log_type #log_params <log_params> | ||
1010 | * #mirrors [mirror_path offset]{2,} | ||
1011 | * | ||
1012 | * log_type is "core" or "disk" | ||
1013 | * #log_params is between 1 and 3 | ||
1014 | */ | ||
1015 | #define DM_IO_PAGES 64 | ||
1016 | static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) | ||
1017 | { | ||
1018 | int r; | ||
1019 | unsigned int nr_mirrors, m, args_used; | ||
1020 | struct mirror_set *ms; | ||
1021 | struct dirty_log *dl; | ||
1022 | |||
1023 | dl = create_dirty_log(ti, argc, argv, &args_used); | ||
1024 | if (!dl) | ||
1025 | return -EINVAL; | ||
1026 | |||
1027 | argv += args_used; | ||
1028 | argc -= args_used; | ||
1029 | |||
1030 | if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 || | ||
1031 | nr_mirrors < 2 || nr_mirrors > KCOPYD_MAX_REGIONS + 1) { | ||
1032 | ti->error = "dm-mirror: Invalid number of mirrors"; | ||
1033 | dm_destroy_dirty_log(dl); | ||
1034 | return -EINVAL; | ||
1035 | } | ||
1036 | |||
1037 | argv++, argc--; | ||
1038 | |||
1039 | if (argc != nr_mirrors * 2) { | ||
1040 | ti->error = "dm-mirror: Wrong number of mirror arguments"; | ||
1041 | dm_destroy_dirty_log(dl); | ||
1042 | return -EINVAL; | ||
1043 | } | ||
1044 | |||
1045 | ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl); | ||
1046 | if (!ms) { | ||
1047 | dm_destroy_dirty_log(dl); | ||
1048 | return -ENOMEM; | ||
1049 | } | ||
1050 | |||
1051 | /* Get the mirror parameter sets */ | ||
1052 | for (m = 0; m < nr_mirrors; m++) { | ||
1053 | r = get_mirror(ms, ti, m, argv); | ||
1054 | if (r) { | ||
1055 | free_context(ms, ti, m); | ||
1056 | return r; | ||
1057 | } | ||
1058 | argv += 2; | ||
1059 | argc -= 2; | ||
1060 | } | ||
1061 | |||
1062 | ti->private = ms; | ||
1063 | |||
1064 | r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client); | ||
1065 | if (r) { | ||
1066 | free_context(ms, ti, ms->nr_mirrors); | ||
1067 | return r; | ||
1068 | } | ||
1069 | |||
1070 | add_mirror_set(ms); | ||
1071 | return 0; | ||
1072 | } | ||
1073 | |||
1074 | static void mirror_dtr(struct dm_target *ti) | ||
1075 | { | ||
1076 | struct mirror_set *ms = (struct mirror_set *) ti->private; | ||
1077 | |||
1078 | del_mirror_set(ms); | ||
1079 | kcopyd_client_destroy(ms->kcopyd_client); | ||
1080 | free_context(ms, ti, ms->nr_mirrors); | ||
1081 | } | ||
1082 | |||
1083 | static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) | ||
1084 | { | ||
1085 | int should_wake = 0; | ||
1086 | struct bio_list *bl; | ||
1087 | |||
1088 | bl = (rw == WRITE) ? &ms->writes : &ms->reads; | ||
1089 | spin_lock(&ms->lock); | ||
1090 | should_wake = !(bl->head); | ||
1091 | bio_list_add(bl, bio); | ||
1092 | spin_unlock(&ms->lock); | ||
1093 | |||
1094 | if (should_wake) | ||
1095 | wake(); | ||
1096 | } | ||
1097 | |||
1098 | /* | ||
1099 | * Mirror mapping function | ||
1100 | */ | ||
1101 | static int mirror_map(struct dm_target *ti, struct bio *bio, | ||
1102 | union map_info *map_context) | ||
1103 | { | ||
1104 | int r, rw = bio_rw(bio); | ||
1105 | struct mirror *m; | ||
1106 | struct mirror_set *ms = ti->private; | ||
1107 | |||
1108 | map_context->ll = bio->bi_sector >> ms->rh.region_shift; | ||
1109 | |||
1110 | if (rw == WRITE) { | ||
1111 | queue_bio(ms, bio, rw); | ||
1112 | return 0; | ||
1113 | } | ||
1114 | |||
1115 | r = ms->rh.log->type->in_sync(ms->rh.log, | ||
1116 | bio_to_region(&ms->rh, bio), 0); | ||
1117 | if (r < 0 && r != -EWOULDBLOCK) | ||
1118 | return r; | ||
1119 | |||
1120 | if (r == -EWOULDBLOCK) /* FIXME: ugly */ | ||
1121 | r = 0; | ||
1122 | |||
1123 | /* | ||
1124 | * We don't want to fast track a recovery just for a read | ||
1125 | * ahead. So we just let it silently fail. | ||
1126 | * FIXME: get rid of this. | ||
1127 | */ | ||
1128 | if (!r && rw == READA) | ||
1129 | return -EIO; | ||
1130 | |||
1131 | if (!r) { | ||
1132 | /* Pass this io over to the daemon */ | ||
1133 | queue_bio(ms, bio, rw); | ||
1134 | return 0; | ||
1135 | } | ||
1136 | |||
1137 | m = choose_mirror(ms, bio->bi_sector); | ||
1138 | if (!m) | ||
1139 | return -EIO; | ||
1140 | |||
1141 | map_bio(ms, m, bio); | ||
1142 | return 1; | ||
1143 | } | ||
1144 | |||
1145 | static int mirror_end_io(struct dm_target *ti, struct bio *bio, | ||
1146 | int error, union map_info *map_context) | ||
1147 | { | ||
1148 | int rw = bio_rw(bio); | ||
1149 | struct mirror_set *ms = (struct mirror_set *) ti->private; | ||
1150 | region_t region = map_context->ll; | ||
1151 | |||
1152 | /* | ||
1153 | * We need to dec pending if this was a write. | ||
1154 | */ | ||
1155 | if (rw == WRITE) | ||
1156 | rh_dec(&ms->rh, region); | ||
1157 | |||
1158 | return 0; | ||
1159 | } | ||
1160 | |||
1161 | static void mirror_postsuspend(struct dm_target *ti) | ||
1162 | { | ||
1163 | struct mirror_set *ms = (struct mirror_set *) ti->private; | ||
1164 | struct dirty_log *log = ms->rh.log; | ||
1165 | |||
1166 | rh_stop_recovery(&ms->rh); | ||
1167 | if (log->type->suspend && log->type->suspend(log)) | ||
1168 | /* FIXME: need better error handling */ | ||
1169 | DMWARN("log suspend failed"); | ||
1170 | } | ||
1171 | |||
1172 | static void mirror_resume(struct dm_target *ti) | ||
1173 | { | ||
1174 | struct mirror_set *ms = (struct mirror_set *) ti->private; | ||
1175 | struct dirty_log *log = ms->rh.log; | ||
1176 | if (log->type->resume && log->type->resume(log)) | ||
1177 | /* FIXME: need better error handling */ | ||
1178 | DMWARN("log resume failed"); | ||
1179 | rh_start_recovery(&ms->rh); | ||
1180 | } | ||
1181 | |||
1182 | static int mirror_status(struct dm_target *ti, status_type_t type, | ||
1183 | char *result, unsigned int maxlen) | ||
1184 | { | ||
1185 | unsigned int m, sz; | ||
1186 | struct mirror_set *ms = (struct mirror_set *) ti->private; | ||
1187 | |||
1188 | sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen); | ||
1189 | |||
1190 | switch (type) { | ||
1191 | case STATUSTYPE_INFO: | ||
1192 | DMEMIT("%d ", ms->nr_mirrors); | ||
1193 | for (m = 0; m < ms->nr_mirrors; m++) | ||
1194 | DMEMIT("%s ", ms->mirror[m].dev->name); | ||
1195 | |||
1196 | DMEMIT(SECTOR_FORMAT "/" SECTOR_FORMAT, | ||
1197 | ms->rh.log->type->get_sync_count(ms->rh.log), | ||
1198 | ms->nr_regions); | ||
1199 | break; | ||
1200 | |||
1201 | case STATUSTYPE_TABLE: | ||
1202 | DMEMIT("%d ", ms->nr_mirrors); | ||
1203 | for (m = 0; m < ms->nr_mirrors; m++) | ||
1204 | DMEMIT("%s " SECTOR_FORMAT " ", | ||
1205 | ms->mirror[m].dev->name, ms->mirror[m].offset); | ||
1206 | } | ||
1207 | |||
1208 | return 0; | ||
1209 | } | ||
1210 | |||
1211 | static struct target_type mirror_target = { | ||
1212 | .name = "mirror", | ||
1213 | .version = {1, 0, 1}, | ||
1214 | .module = THIS_MODULE, | ||
1215 | .ctr = mirror_ctr, | ||
1216 | .dtr = mirror_dtr, | ||
1217 | .map = mirror_map, | ||
1218 | .end_io = mirror_end_io, | ||
1219 | .postsuspend = mirror_postsuspend, | ||
1220 | .resume = mirror_resume, | ||
1221 | .status = mirror_status, | ||
1222 | }; | ||
1223 | |||
1224 | static int __init dm_mirror_init(void) | ||
1225 | { | ||
1226 | int r; | ||
1227 | |||
1228 | r = dm_dirty_log_init(); | ||
1229 | if (r) | ||
1230 | return r; | ||
1231 | |||
1232 | _kmirrord_wq = create_workqueue("kmirrord"); | ||
1233 | if (!_kmirrord_wq) { | ||
1234 | DMERR("couldn't start kmirrord"); | ||
1235 | dm_dirty_log_exit(); | ||
1236 | return -ENOMEM; | ||
1237 | } | ||
1238 | INIT_WORK(&_kmirrord_work, do_work, NULL); | ||
1239 | |||
1240 | r = dm_register_target(&mirror_target); | ||
1241 | if (r < 0) { | ||
1242 | DMERR("%s: Failed to register mirror target", | ||
1243 | mirror_target.name); | ||
1244 | dm_dirty_log_exit(); | ||
1245 | destroy_workqueue(_kmirrord_wq); | ||
1246 | } | ||
1247 | |||
1248 | return r; | ||
1249 | } | ||
1250 | |||
1251 | static void __exit dm_mirror_exit(void) | ||
1252 | { | ||
1253 | int r; | ||
1254 | |||
1255 | r = dm_unregister_target(&mirror_target); | ||
1256 | if (r < 0) | ||
1257 | DMERR("%s: unregister failed %d", mirror_target.name, r); | ||
1258 | |||
1259 | destroy_workqueue(_kmirrord_wq); | ||
1260 | dm_dirty_log_exit(); | ||
1261 | } | ||
1262 | |||
1263 | /* Module hooks */ | ||
1264 | module_init(dm_mirror_init); | ||
1265 | module_exit(dm_mirror_exit); | ||
1266 | |||
1267 | MODULE_DESCRIPTION(DM_NAME " mirror target"); | ||
1268 | MODULE_AUTHOR("Joe Thornber"); | ||
1269 | MODULE_LICENSE("GPL"); | ||
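The constructor comment above ("log_type #log_params <log_params> #mirrors [mirror_path offset]{2,}") maps onto a device-mapper table line such as the one below. The device names are placeholders, and treating the single 'core' log parameter as the region size in sectors is an assumption (the core log constructor lives in dm-log.c, elsewhere in this patch):

	0 2097152 mirror core 1 1024 2 /dev/sdb1 0 /dev/sdc1 0

That is a 1 GiB (2097152-sector) two-way mirror of /dev/sdb1 and /dev/sdc1, both starting at offset 0, with an in-memory dirty log and 1024-sector regions, a value that satisfies _check_region_size(): a power of two, a multiple of PAGE_SIZE >> 9, and no larger than the target length.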
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c new file mode 100644 index 000000000000..d0024865a789 --- /dev/null +++ b/drivers/md/dm-round-robin.c | |||
@@ -0,0 +1,214 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Sistina Software. | ||
3 | * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * Module Author: Heinz Mauelshagen | ||
6 | * | ||
7 | * This file is released under the GPL. | ||
8 | * | ||
9 | * Round-robin path selector. | ||
10 | */ | ||
11 | |||
12 | #include "dm.h" | ||
13 | #include "dm-path-selector.h" | ||
14 | |||
15 | #include <linux/slab.h> | ||
16 | |||
17 | /*----------------------------------------------------------------- | ||
18 | * Path-handling code, paths are held in lists | ||
19 | *---------------------------------------------------------------*/ | ||
20 | struct path_info { | ||
21 | struct list_head list; | ||
22 | struct path *path; | ||
23 | unsigned repeat_count; | ||
24 | }; | ||
25 | |||
26 | static void free_paths(struct list_head *paths) | ||
27 | { | ||
28 | struct path_info *pi, *next; | ||
29 | |||
30 | list_for_each_entry_safe(pi, next, paths, list) { | ||
31 | list_del(&pi->list); | ||
32 | kfree(pi); | ||
33 | } | ||
34 | } | ||
35 | |||
36 | /*----------------------------------------------------------------- | ||
37 | * Round-robin selector | ||
38 | *---------------------------------------------------------------*/ | ||
39 | |||
40 | #define RR_MIN_IO 1000 | ||
41 | |||
42 | struct selector { | ||
43 | struct list_head valid_paths; | ||
44 | struct list_head invalid_paths; | ||
45 | }; | ||
46 | |||
47 | static struct selector *alloc_selector(void) | ||
48 | { | ||
49 | struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
50 | |||
51 | if (s) { | ||
52 | INIT_LIST_HEAD(&s->valid_paths); | ||
53 | INIT_LIST_HEAD(&s->invalid_paths); | ||
54 | } | ||
55 | |||
56 | return s; | ||
57 | } | ||
58 | |||
59 | static int rr_create(struct path_selector *ps, unsigned argc, char **argv) | ||
60 | { | ||
61 | struct selector *s; | ||
62 | |||
63 | s = alloc_selector(); | ||
64 | if (!s) | ||
65 | return -ENOMEM; | ||
66 | |||
67 | ps->context = s; | ||
68 | return 0; | ||
69 | } | ||
70 | |||
71 | static void rr_destroy(struct path_selector *ps) | ||
72 | { | ||
73 | struct selector *s = (struct selector *) ps->context; | ||
74 | |||
75 | free_paths(&s->valid_paths); | ||
76 | free_paths(&s->invalid_paths); | ||
77 | kfree(s); | ||
78 | ps->context = NULL; | ||
79 | } | ||
80 | |||
81 | static int rr_status(struct path_selector *ps, struct path *path, | ||
82 | status_type_t type, char *result, unsigned int maxlen) | ||
83 | { | ||
84 | struct path_info *pi; | ||
85 | int sz = 0; | ||
86 | |||
87 | if (!path) | ||
88 | DMEMIT("0 "); | ||
89 | else { | ||
90 | switch(type) { | ||
91 | case STATUSTYPE_INFO: | ||
92 | break; | ||
93 | case STATUSTYPE_TABLE: | ||
94 | pi = path->pscontext; | ||
95 | DMEMIT("%u ", pi->repeat_count); | ||
96 | break; | ||
97 | } | ||
98 | } | ||
99 | |||
100 | return sz; | ||
101 | } | ||
102 | |||
103 | /* | ||
104 | * Called during initialisation to register each path with an | ||
105 | * optional repeat_count. | ||
106 | */ | ||
107 | static int rr_add_path(struct path_selector *ps, struct path *path, | ||
108 | int argc, char **argv, char **error) | ||
109 | { | ||
110 | struct selector *s = (struct selector *) ps->context; | ||
111 | struct path_info *pi; | ||
112 | unsigned repeat_count = RR_MIN_IO; | ||
113 | |||
114 | if (argc > 1) { | ||
115 | *error = "round-robin ps: incorrect number of arguments"; | ||
116 | return -EINVAL; | ||
117 | } | ||
118 | |||
119 | /* First path argument is number of I/Os before switching path */ | ||
120 | if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { | ||
121 | *error = "round-robin ps: invalid repeat count"; | ||
122 | return -EINVAL; | ||
123 | } | ||
124 | |||
125 | /* allocate the path */ | ||
126 | pi = kmalloc(sizeof(*pi), GFP_KERNEL); | ||
127 | if (!pi) { | ||
128 | *error = "round-robin ps: Error allocating path context"; | ||
129 | return -ENOMEM; | ||
130 | } | ||
131 | |||
132 | pi->path = path; | ||
133 | pi->repeat_count = repeat_count; | ||
134 | |||
135 | path->pscontext = pi; | ||
136 | |||
137 | list_add(&pi->list, &s->valid_paths); | ||
138 | |||
139 | return 0; | ||
140 | } | ||
141 | |||
142 | static void rr_fail_path(struct path_selector *ps, struct path *p) | ||
143 | { | ||
144 | struct selector *s = (struct selector *) ps->context; | ||
145 | struct path_info *pi = p->pscontext; | ||
146 | |||
147 | list_move(&pi->list, &s->invalid_paths); | ||
148 | } | ||
149 | |||
150 | static int rr_reinstate_path(struct path_selector *ps, struct path *p) | ||
151 | { | ||
152 | struct selector *s = (struct selector *) ps->context; | ||
153 | struct path_info *pi = p->pscontext; | ||
154 | |||
155 | list_move(&pi->list, &s->valid_paths); | ||
156 | |||
157 | return 0; | ||
158 | } | ||
159 | |||
160 | static struct path *rr_select_path(struct path_selector *ps, | ||
161 | unsigned *repeat_count) | ||
162 | { | ||
163 | struct selector *s = (struct selector *) ps->context; | ||
164 | struct path_info *pi = NULL; | ||
165 | |||
166 | if (!list_empty(&s->valid_paths)) { | ||
167 | pi = list_entry(s->valid_paths.next, struct path_info, list); | ||
168 | list_move_tail(&pi->list, &s->valid_paths); | ||
169 | *repeat_count = pi->repeat_count; | ||
170 | } | ||
171 | |||
172 | return pi ? pi->path : NULL; | ||
173 | } | ||
174 | |||
175 | static struct path_selector_type rr_ps = { | ||
176 | .name = "round-robin", | ||
177 | .module = THIS_MODULE, | ||
178 | .table_args = 1, | ||
179 | .info_args = 0, | ||
180 | .create = rr_create, | ||
181 | .destroy = rr_destroy, | ||
182 | .status = rr_status, | ||
183 | .add_path = rr_add_path, | ||
184 | .fail_path = rr_fail_path, | ||
185 | .reinstate_path = rr_reinstate_path, | ||
186 | .select_path = rr_select_path, | ||
187 | }; | ||
188 | |||
189 | static int __init dm_rr_init(void) | ||
190 | { | ||
191 | int r = dm_register_path_selector(&rr_ps); | ||
192 | |||
193 | if (r < 0) | ||
194 | DMERR("round-robin: register failed %d", r); | ||
195 | |||
196 | DMINFO("dm-round-robin version 1.0.0 loaded"); | ||
197 | |||
198 | return r; | ||
199 | } | ||
200 | |||
201 | static void __exit dm_rr_exit(void) | ||
202 | { | ||
203 | int r = dm_unregister_path_selector(&rr_ps); | ||
204 | |||
205 | if (r < 0) | ||
206 | DMERR("round-robin: unregister failed %d", r); | ||
207 | } | ||
208 | |||
209 | module_init(dm_rr_init); | ||
210 | module_exit(dm_rr_exit); | ||
211 | |||
212 | MODULE_DESCRIPTION(DM_NAME " round-robin multipath path selector"); | ||
213 | MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>"); | ||
214 | MODULE_LICENSE("GPL"); | ||
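From the caller's point of view the selector above behaves as follows: list_move_tail() in rr_select_path() sends the chosen path to the back of valid_paths, so successive calls cycle through the healthy paths in turn, and the returned repeat_count (default RR_MIN_IO, i.e. 1000) tells the caller how many I/Os to issue before asking again. A small sketch of that caller side, with the function name invented:

	static struct path *example_next_path(struct path_selector *ps)
	{
		unsigned repeat_count;
		struct path *path = ps->type->select_path(ps, &repeat_count);

		/*
		 * A NULL return means every path is on the invalid list;
		 * otherwise dm-mpath would push up to repeat_count I/Os
		 * down 'path' before selecting again.
		 */
		return path;
	}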
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c new file mode 100644 index 000000000000..7e691ab9a748 --- /dev/null +++ b/drivers/md/dm-snap.c | |||
@@ -0,0 +1,1208 @@ | |||
1 | /* | ||
2 | * dm-snapshot.c | ||
3 | * | ||
4 | * Copyright (C) 2001-2002 Sistina Software (UK) Limited. | ||
5 | * | ||
6 | * This file is released under the GPL. | ||
7 | */ | ||
8 | |||
9 | #include <linux/blkdev.h> | ||
10 | #include <linux/config.h> | ||
11 | #include <linux/ctype.h> | ||
12 | #include <linux/device-mapper.h> | ||
13 | #include <linux/fs.h> | ||
14 | #include <linux/init.h> | ||
15 | #include <linux/kdev_t.h> | ||
16 | #include <linux/list.h> | ||
17 | #include <linux/mempool.h> | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/slab.h> | ||
20 | #include <linux/vmalloc.h> | ||
21 | |||
22 | #include "dm-snap.h" | ||
23 | #include "dm-bio-list.h" | ||
24 | #include "kcopyd.h" | ||
25 | |||
26 | /* | ||
27 | * The percentage increment we will wake up users at | ||
28 | */ | ||
29 | #define WAKE_UP_PERCENT 5 | ||
30 | |||
31 | /* | ||
32 | * kcopyd priority of snapshot operations | ||
33 | */ | ||
34 | #define SNAPSHOT_COPY_PRIORITY 2 | ||
35 | |||
36 | /* | ||
37 | * Each snapshot reserves this many pages for io | ||
38 | */ | ||
39 | #define SNAPSHOT_PAGES 256 | ||
40 | |||
41 | struct pending_exception { | ||
42 | struct exception e; | ||
43 | |||
44 | /* | ||
45 | * Origin buffers waiting for this to complete are held | ||
46 | * in a bio list | ||
47 | */ | ||
48 | struct bio_list origin_bios; | ||
49 | struct bio_list snapshot_bios; | ||
50 | |||
51 | /* | ||
52 | * Other pending_exceptions that are processing this | ||
53 | * chunk. When this list is empty, we know we can | ||
54 | * complete the origins. | ||
55 | */ | ||
56 | struct list_head siblings; | ||
57 | |||
58 | /* Pointer back to snapshot context */ | ||
59 | struct dm_snapshot *snap; | ||
60 | |||
61 | /* | ||
62 | * 1 indicates the exception has already been sent to | ||
63 | * kcopyd. | ||
64 | */ | ||
65 | int started; | ||
66 | }; | ||
67 | |||
68 | /* | ||
69 | * Hash table mapping origin volumes to lists of snapshots and | ||
70 | * a lock to protect it | ||
71 | */ | ||
72 | static kmem_cache_t *exception_cache; | ||
73 | static kmem_cache_t *pending_cache; | ||
74 | static mempool_t *pending_pool; | ||
75 | |||
76 | /* | ||
77 | * One of these per registered origin, held in the snapshot_origins hash | ||
78 | */ | ||
79 | struct origin { | ||
80 | /* The origin device */ | ||
81 | struct block_device *bdev; | ||
82 | |||
83 | struct list_head hash_list; | ||
84 | |||
85 | /* List of snapshots for this origin */ | ||
86 | struct list_head snapshots; | ||
87 | }; | ||
88 | |||
89 | /* | ||
90 | * Size of the hash table for origin volumes. If we make this | ||
91 | * the size of the minors list then it should be nearly perfect | ||
92 | */ | ||
93 | #define ORIGIN_HASH_SIZE 256 | ||
94 | #define ORIGIN_MASK 0xFF | ||
95 | static struct list_head *_origins; | ||
96 | static struct rw_semaphore _origins_lock; | ||
97 | |||
98 | static int init_origin_hash(void) | ||
99 | { | ||
100 | int i; | ||
101 | |||
102 | _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head), | ||
103 | GFP_KERNEL); | ||
104 | if (!_origins) { | ||
105 | DMERR("Device mapper: Snapshot: unable to allocate memory"); | ||
106 | return -ENOMEM; | ||
107 | } | ||
108 | |||
109 | for (i = 0; i < ORIGIN_HASH_SIZE; i++) | ||
110 | INIT_LIST_HEAD(_origins + i); | ||
111 | init_rwsem(&_origins_lock); | ||
112 | |||
113 | return 0; | ||
114 | } | ||
115 | |||
116 | static void exit_origin_hash(void) | ||
117 | { | ||
118 | kfree(_origins); | ||
119 | } | ||
120 | |||
121 | static inline unsigned int origin_hash(struct block_device *bdev) | ||
122 | { | ||
123 | return bdev->bd_dev & ORIGIN_MASK; | ||
124 | } | ||
125 | |||
126 | static struct origin *__lookup_origin(struct block_device *origin) | ||
127 | { | ||
128 | struct list_head *ol; | ||
129 | struct origin *o; | ||
130 | |||
131 | ol = &_origins[origin_hash(origin)]; | ||
132 | list_for_each_entry (o, ol, hash_list) | ||
133 | if (bdev_equal(o->bdev, origin)) | ||
134 | return o; | ||
135 | |||
136 | return NULL; | ||
137 | } | ||
138 | |||
139 | static void __insert_origin(struct origin *o) | ||
140 | { | ||
141 | struct list_head *sl = &_origins[origin_hash(o->bdev)]; | ||
142 | list_add_tail(&o->hash_list, sl); | ||
143 | } | ||
144 | |||
145 | /* | ||
146 | * Make a note of the snapshot and its origin so we can look it | ||
147 | * up when the origin has a write on it. | ||
148 | */ | ||
149 | static int register_snapshot(struct dm_snapshot *snap) | ||
150 | { | ||
151 | struct origin *o; | ||
152 | struct block_device *bdev = snap->origin->bdev; | ||
153 | |||
154 | down_write(&_origins_lock); | ||
155 | o = __lookup_origin(bdev); | ||
156 | |||
157 | if (!o) { | ||
158 | /* New origin */ | ||
159 | o = kmalloc(sizeof(*o), GFP_KERNEL); | ||
160 | if (!o) { | ||
161 | up_write(&_origins_lock); | ||
162 | return -ENOMEM; | ||
163 | } | ||
164 | |||
165 | /* Initialise the struct */ | ||
166 | INIT_LIST_HEAD(&o->snapshots); | ||
167 | o->bdev = bdev; | ||
168 | |||
169 | __insert_origin(o); | ||
170 | } | ||
171 | |||
172 | list_add_tail(&snap->list, &o->snapshots); | ||
173 | |||
174 | up_write(&_origins_lock); | ||
175 | return 0; | ||
176 | } | ||
177 | |||
178 | static void unregister_snapshot(struct dm_snapshot *s) | ||
179 | { | ||
180 | struct origin *o; | ||
181 | |||
182 | down_write(&_origins_lock); | ||
183 | o = __lookup_origin(s->origin->bdev); | ||
184 | |||
185 | list_del(&s->list); | ||
186 | if (list_empty(&o->snapshots)) { | ||
187 | list_del(&o->hash_list); | ||
188 | kfree(o); | ||
189 | } | ||
190 | |||
191 | up_write(&_origins_lock); | ||
192 | } | ||
193 | |||
194 | /* | ||
195 | * Implementation of the exception hash tables. | ||
196 | */ | ||
197 | static int init_exception_table(struct exception_table *et, uint32_t size) | ||
198 | { | ||
199 | unsigned int i; | ||
200 | |||
201 | et->hash_mask = size - 1; | ||
202 | et->table = dm_vcalloc(size, sizeof(struct list_head)); | ||
203 | if (!et->table) | ||
204 | return -ENOMEM; | ||
205 | |||
206 | for (i = 0; i < size; i++) | ||
207 | INIT_LIST_HEAD(et->table + i); | ||
208 | |||
209 | return 0; | ||
210 | } | ||
211 | |||
212 | static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem) | ||
213 | { | ||
214 | struct list_head *slot; | ||
215 | struct exception *ex, *next; | ||
216 | int i, size; | ||
217 | |||
218 | size = et->hash_mask + 1; | ||
219 | for (i = 0; i < size; i++) { | ||
220 | slot = et->table + i; | ||
221 | |||
222 | list_for_each_entry_safe (ex, next, slot, hash_list) | ||
223 | kmem_cache_free(mem, ex); | ||
224 | } | ||
225 | |||
226 | vfree(et->table); | ||
227 | } | ||
228 | |||
229 | static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk) | ||
230 | { | ||
231 | return chunk & et->hash_mask; | ||
232 | } | ||
233 | |||
234 | static void insert_exception(struct exception_table *eh, struct exception *e) | ||
235 | { | ||
236 | struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)]; | ||
237 | list_add(&e->hash_list, l); | ||
238 | } | ||
239 | |||
240 | static inline void remove_exception(struct exception *e) | ||
241 | { | ||
242 | list_del(&e->hash_list); | ||
243 | } | ||
244 | |||
245 | /* | ||
246 | * Return the exception data for a sector, or NULL if not | ||
247 | * remapped. | ||
248 | */ | ||
249 | static struct exception *lookup_exception(struct exception_table *et, | ||
250 | chunk_t chunk) | ||
251 | { | ||
252 | struct list_head *slot; | ||
253 | struct exception *e; | ||
254 | |||
255 | slot = &et->table[exception_hash(et, chunk)]; | ||
256 | list_for_each_entry (e, slot, hash_list) | ||
257 | if (e->old_chunk == chunk) | ||
258 | return e; | ||
259 | |||
260 | return NULL; | ||
261 | } | ||
262 | |||
263 | static inline struct exception *alloc_exception(void) | ||
264 | { | ||
265 | struct exception *e; | ||
266 | |||
267 | e = kmem_cache_alloc(exception_cache, GFP_NOIO); | ||
268 | if (!e) | ||
269 | e = kmem_cache_alloc(exception_cache, GFP_ATOMIC); | ||
270 | |||
271 | return e; | ||
272 | } | ||
273 | |||
274 | static inline void free_exception(struct exception *e) | ||
275 | { | ||
276 | kmem_cache_free(exception_cache, e); | ||
277 | } | ||
278 | |||
279 | static inline struct pending_exception *alloc_pending_exception(void) | ||
280 | { | ||
281 | return mempool_alloc(pending_pool, GFP_NOIO); | ||
282 | } | ||
283 | |||
284 | static inline void free_pending_exception(struct pending_exception *pe) | ||
285 | { | ||
286 | mempool_free(pe, pending_pool); | ||
287 | } | ||
288 | |||
289 | int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) | ||
290 | { | ||
291 | struct exception *e; | ||
292 | |||
293 | e = alloc_exception(); | ||
294 | if (!e) | ||
295 | return -ENOMEM; | ||
296 | |||
297 | e->old_chunk = old; | ||
298 | e->new_chunk = new; | ||
299 | insert_exception(&s->complete, e); | ||
300 | return 0; | ||
301 | } | ||
302 | |||
303 | /* | ||
304 | * Hard-coded cap on the size of the exception hash table. | ||
305 | */ | ||
306 | static int calc_max_buckets(void) | ||
307 | { | ||
308 | /* use a fixed size of 2MB */ | ||
309 | unsigned long mem = 2 * 1024 * 1024; | ||
310 | mem /= sizeof(struct list_head); | ||
311 | |||
312 | return mem; | ||
313 | } | ||
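As a rough worked figure (an assumption about the build, not stated in the code): on a 64-bit kernel where struct list_head is two 8-byte pointers, this works out to 2 MiB / 16 B = 131072 buckets at most.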
314 | |||
315 | /* | ||
316 | * Rounds a number down to a power of 2. | ||
317 | */ | ||
318 | static inline uint32_t round_down(uint32_t n) | ||
319 | { | ||
320 | while (n & (n - 1)) | ||
321 | n &= (n - 1); | ||
322 | return n; | ||
323 | } | ||
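For example, 100 (binary 1100100) is reduced 100 → 96 → 64 as the low set bits are cleared one at a time, leaving the largest power of two not exceeding the input.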
324 | |||
325 | /* | ||
326 | * Allocate room for a suitable hash table. | ||
327 | */ | ||
328 | static int init_hash_tables(struct dm_snapshot *s) | ||
329 | { | ||
330 | sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets; | ||
331 | |||
332 | /* | ||
333 | * Calculate based on the size of the original volume or | ||
334 | * the COW volume... | ||
335 | */ | ||
336 | cow_dev_size = get_dev_size(s->cow->bdev); | ||
337 | origin_dev_size = get_dev_size(s->origin->bdev); | ||
338 | max_buckets = calc_max_buckets(); | ||
339 | |||
340 | hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift; | ||
341 | hash_size = min(hash_size, max_buckets); | ||
342 | |||
343 | /* Round it down to a power of 2 */ | ||
344 | hash_size = round_down(hash_size); | ||
345 | if (init_exception_table(&s->complete, hash_size)) | ||
346 | return -ENOMEM; | ||
347 | |||
348 | /* | ||
349 | * Allocate hash table for in-flight exceptions | ||
350 | * Make this smaller than the real hash table | ||
351 | */ | ||
352 | hash_size >>= 3; | ||
353 | if (hash_size < 64) | ||
354 | hash_size = 64; | ||
355 | |||
356 | if (init_exception_table(&s->pending, hash_size)) { | ||
357 | exit_exception_table(&s->complete, exception_cache); | ||
358 | return -ENOMEM; | ||
359 | } | ||
360 | |||
361 | return 0; | ||
362 | } | ||
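To make the sizing concrete (hypothetical numbers, not from the code): with a 64 GiB origin that is smaller than the COW device and 16 KiB chunks (32 sectors, so chunk_shift is 5), min(origin, cow) >> chunk_shift gives 2^27 >> 5 = 4194304 chunks; calc_max_buckets() caps that at 131072 buckets (on the 64-bit assumption above) and round_down() leaves it unchanged, while the pending table gets 131072 >> 3 = 16384 buckets.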
363 | |||
364 | /* | ||
365 | * Round a number up to the nearest 'size' boundary. size must | ||
366 | * be a power of 2. | ||
367 | */ | ||
368 | static inline ulong round_up(ulong n, ulong size) | ||
369 | { | ||
370 | size--; | ||
371 | return (n + size) & ~size; | ||
372 | } | ||
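For instance, when snapshot_ctr() below rounds the chunk size up to PAGE_SIZE >> 9 (8 sectors, assuming 4 KiB pages), a requested chunk size of 9 sectors becomes (9 + 7) & ~7 = 16 sectors.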
373 | |||
374 | /* | ||
375 | * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> | ||
376 | */ | ||
377 | static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | ||
378 | { | ||
379 | struct dm_snapshot *s; | ||
380 | unsigned long chunk_size; | ||
381 | int r = -EINVAL; | ||
382 | char persistent; | ||
383 | char *origin_path; | ||
384 | char *cow_path; | ||
385 | char *value; | ||
386 | int blocksize; | ||
387 | |||
388 | if (argc < 4) { | ||
389 | ti->error = "dm-snapshot: requires exactly 4 arguments"; | ||
390 | r = -EINVAL; | ||
391 | goto bad1; | ||
392 | } | ||
393 | |||
394 | origin_path = argv[0]; | ||
395 | cow_path = argv[1]; | ||
396 | persistent = toupper(*argv[2]); | ||
397 | |||
398 | if (persistent != 'P' && persistent != 'N') { | ||
399 | ti->error = "Persistent flag is not P or N"; | ||
400 | r = -EINVAL; | ||
401 | goto bad1; | ||
402 | } | ||
403 | |||
404 | chunk_size = simple_strtoul(argv[3], &value, 10); | ||
405 | if (chunk_size == 0 || value == NULL) { | ||
406 | ti->error = "Invalid chunk size"; | ||
407 | r = -EINVAL; | ||
408 | goto bad1; | ||
409 | } | ||
410 | |||
411 | s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
412 | if (s == NULL) { | ||
413 | ti->error = "Cannot allocate snapshot context private " | ||
414 | "structure"; | ||
415 | r = -ENOMEM; | ||
416 | goto bad1; | ||
417 | } | ||
418 | |||
419 | r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin); | ||
420 | if (r) { | ||
421 | ti->error = "Cannot get origin device"; | ||
422 | goto bad2; | ||
423 | } | ||
424 | |||
425 | r = dm_get_device(ti, cow_path, 0, 0, | ||
426 | FMODE_READ | FMODE_WRITE, &s->cow); | ||
427 | if (r) { | ||
428 | dm_put_device(ti, s->origin); | ||
429 | ti->error = "Cannot get COW device"; | ||
430 | goto bad2; | ||
431 | } | ||
432 | |||
433 | /* | ||
434 | * Chunk size must be a multiple of the page size. Silently | ||
435 | * round up if it's not. | ||
436 | */ | ||
437 | chunk_size = round_up(chunk_size, PAGE_SIZE >> 9); | ||
438 | |||
439 | /* Validate the chunk size against the device block size */ | ||
440 | blocksize = s->cow->bdev->bd_disk->queue->hardsect_size; | ||
441 | if (chunk_size % (blocksize >> 9)) { | ||
442 | ti->error = "Chunk size is not a multiple of device blocksize"; | ||
443 | r = -EINVAL; | ||
444 | goto bad3; | ||
445 | } | ||
446 | |||
447 | /* Check chunk_size is a power of 2 */ | ||
448 | if (chunk_size & (chunk_size - 1)) { | ||
449 | ti->error = "Chunk size is not a power of 2"; | ||
450 | r = -EINVAL; | ||
451 | goto bad3; | ||
452 | } | ||
453 | |||
454 | s->chunk_size = chunk_size; | ||
455 | s->chunk_mask = chunk_size - 1; | ||
456 | s->type = persistent; | ||
457 | s->chunk_shift = ffs(chunk_size) - 1; | ||
458 | |||
459 | s->valid = 1; | ||
460 | s->have_metadata = 0; | ||
461 | s->last_percent = 0; | ||
462 | init_rwsem(&s->lock); | ||
463 | s->table = ti->table; | ||
464 | |||
465 | /* Allocate hash table for COW data */ | ||
466 | if (init_hash_tables(s)) { | ||
467 | ti->error = "Unable to allocate hash table space"; | ||
468 | r = -ENOMEM; | ||
469 | goto bad3; | ||
470 | } | ||
471 | |||
472 | /* | ||
473 | * Check the persistent flag - done here because we need the iobuf | ||
474 | * to check the LV header | ||
475 | */ | ||
476 | s->store.snap = s; | ||
477 | |||
478 | if (persistent == 'P') | ||
479 | r = dm_create_persistent(&s->store, chunk_size); | ||
480 | else | ||
481 | r = dm_create_transient(&s->store, s, blocksize); | ||
482 | |||
483 | if (r) { | ||
484 | ti->error = "Couldn't create exception store"; | ||
485 | r = -EINVAL; | ||
486 | goto bad4; | ||
487 | } | ||
488 | |||
489 | r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client); | ||
490 | if (r) { | ||
491 | ti->error = "Could not create kcopyd client"; | ||
492 | goto bad5; | ||
493 | } | ||
494 | |||
495 | /* Add snapshot to the list of snapshots for this origin */ | ||
496 | if (register_snapshot(s)) { | ||
497 | r = -EINVAL; | ||
498 | ti->error = "Cannot register snapshot origin"; | ||
499 | goto bad6; | ||
500 | } | ||
501 | |||
502 | ti->private = s; | ||
503 | ti->split_io = chunk_size; | ||
504 | |||
505 | return 0; | ||
506 | |||
507 | bad6: | ||
508 | kcopyd_client_destroy(s->kcopyd_client); | ||
509 | |||
510 | bad5: | ||
511 | s->store.destroy(&s->store); | ||
512 | |||
513 | bad4: | ||
514 | exit_exception_table(&s->pending, pending_cache); | ||
515 | exit_exception_table(&s->complete, exception_cache); | ||
516 | |||
517 | bad3: | ||
518 | dm_put_device(ti, s->cow); | ||
519 | dm_put_device(ti, s->origin); | ||
520 | |||
521 | bad2: | ||
522 | kfree(s); | ||
523 | |||
524 | bad1: | ||
525 | return r; | ||
526 | } | ||
527 | |||
528 | static void snapshot_dtr(struct dm_target *ti) | ||
529 | { | ||
530 | struct dm_snapshot *s = (struct dm_snapshot *) ti->private; | ||
531 | |||
532 | unregister_snapshot(s); | ||
533 | |||
534 | exit_exception_table(&s->pending, pending_cache); | ||
535 | exit_exception_table(&s->complete, exception_cache); | ||
536 | |||
537 | /* Deallocate memory used */ | ||
538 | s->store.destroy(&s->store); | ||
539 | |||
540 | dm_put_device(ti, s->origin); | ||
541 | dm_put_device(ti, s->cow); | ||
542 | kcopyd_client_destroy(s->kcopyd_client); | ||
543 | kfree(s); | ||
544 | } | ||
545 | |||
546 | /* | ||
547 | * Flush a list of buffers. | ||
548 | */ | ||
549 | static void flush_bios(struct bio *bio) | ||
550 | { | ||
551 | struct bio *n; | ||
552 | |||
553 | while (bio) { | ||
554 | n = bio->bi_next; | ||
555 | bio->bi_next = NULL; | ||
556 | generic_make_request(bio); | ||
557 | bio = n; | ||
558 | } | ||
559 | } | ||
560 | |||
561 | /* | ||
562 | * Error a list of buffers. | ||
563 | */ | ||
564 | static void error_bios(struct bio *bio) | ||
565 | { | ||
566 | struct bio *n; | ||
567 | |||
568 | while (bio) { | ||
569 | n = bio->bi_next; | ||
570 | bio->bi_next = NULL; | ||
571 | bio_io_error(bio, bio->bi_size); | ||
572 | bio = n; | ||
573 | } | ||
574 | } | ||
575 | |||
576 | static struct bio *__flush_bios(struct pending_exception *pe) | ||
577 | { | ||
578 | struct pending_exception *sibling; | ||
579 | |||
580 | if (list_empty(&pe->siblings)) | ||
581 | return bio_list_get(&pe->origin_bios); | ||
582 | |||
583 | sibling = list_entry(pe->siblings.next, | ||
584 | struct pending_exception, siblings); | ||
585 | |||
586 | list_del(&pe->siblings); | ||
587 | |||
588 | /* This is fine as long as kcopyd is single-threaded. If kcopyd | ||
589 | * becomes multi-threaded, we'll need some locking here. | ||
590 | */ | ||
591 | bio_list_merge(&sibling->origin_bios, &pe->origin_bios); | ||
592 | |||
593 | return NULL; | ||
594 | } | ||
595 | |||
596 | static void pending_complete(struct pending_exception *pe, int success) | ||
597 | { | ||
598 | struct exception *e; | ||
599 | struct dm_snapshot *s = pe->snap; | ||
600 | struct bio *flush = NULL; | ||
601 | |||
602 | if (success) { | ||
603 | e = alloc_exception(); | ||
604 | if (!e) { | ||
605 | DMWARN("Unable to allocate exception."); | ||
606 | down_write(&s->lock); | ||
607 | s->store.drop_snapshot(&s->store); | ||
608 | s->valid = 0; | ||
609 | flush = __flush_bios(pe); | ||
610 | up_write(&s->lock); | ||
611 | |||
612 | error_bios(bio_list_get(&pe->snapshot_bios)); | ||
613 | goto out; | ||
614 | } | ||
615 | *e = pe->e; | ||
616 | |||
617 | /* | ||
618 | * Add a proper exception, and remove the | ||
619 | * in-flight exception from the list. | ||
620 | */ | ||
621 | down_write(&s->lock); | ||
622 | insert_exception(&s->complete, e); | ||
623 | remove_exception(&pe->e); | ||
624 | flush = __flush_bios(pe); | ||
625 | |||
626 | /* Submit any pending write bios */ | ||
627 | up_write(&s->lock); | ||
628 | |||
629 | flush_bios(bio_list_get(&pe->snapshot_bios)); | ||
630 | } else { | ||
631 | /* Read/write error - snapshot is unusable */ | ||
632 | down_write(&s->lock); | ||
633 | if (s->valid) | ||
634 | DMERR("Error reading/writing snapshot"); | ||
635 | s->store.drop_snapshot(&s->store); | ||
636 | s->valid = 0; | ||
637 | remove_exception(&pe->e); | ||
638 | flush = __flush_bios(pe); | ||
639 | up_write(&s->lock); | ||
640 | |||
641 | error_bios(bio_list_get(&pe->snapshot_bios)); | ||
642 | |||
643 | dm_table_event(s->table); | ||
644 | } | ||
645 | |||
646 | out: | ||
647 | free_pending_exception(pe); | ||
648 | |||
649 | if (flush) | ||
650 | flush_bios(flush); | ||
651 | } | ||
652 | |||
653 | static void commit_callback(void *context, int success) | ||
654 | { | ||
655 | struct pending_exception *pe = (struct pending_exception *) context; | ||
656 | pending_complete(pe, success); | ||
657 | } | ||
658 | |||
659 | /* | ||
660 | * Called when the copy I/O has finished. kcopyd actually runs | ||
661 | * this code so don't block. | ||
662 | */ | ||
663 | static void copy_callback(int read_err, unsigned int write_err, void *context) | ||
664 | { | ||
665 | struct pending_exception *pe = (struct pending_exception *) context; | ||
666 | struct dm_snapshot *s = pe->snap; | ||
667 | |||
668 | if (read_err || write_err) | ||
669 | pending_complete(pe, 0); | ||
670 | |||
671 | else | ||
672 | /* Update the metadata if we are persistent */ | ||
673 | s->store.commit_exception(&s->store, &pe->e, commit_callback, | ||
674 | pe); | ||
675 | } | ||
676 | |||
677 | /* | ||
678 | * Dispatches the copy operation to kcopyd. | ||
679 | */ | ||
680 | static inline void start_copy(struct pending_exception *pe) | ||
681 | { | ||
682 | struct dm_snapshot *s = pe->snap; | ||
683 | struct io_region src, dest; | ||
684 | struct block_device *bdev = s->origin->bdev; | ||
685 | sector_t dev_size; | ||
686 | |||
687 | dev_size = get_dev_size(bdev); | ||
688 | |||
689 | src.bdev = bdev; | ||
690 | src.sector = chunk_to_sector(s, pe->e.old_chunk); | ||
691 | src.count = min(s->chunk_size, dev_size - src.sector); | ||
692 | |||
693 | dest.bdev = s->cow->bdev; | ||
694 | dest.sector = chunk_to_sector(s, pe->e.new_chunk); | ||
695 | dest.count = src.count; | ||
696 | |||
697 | /* Hand over to kcopyd */ | ||
698 | kcopyd_copy(s->kcopyd_client, | ||
699 | &src, 1, &dest, 0, copy_callback, pe); | ||
700 | } | ||
701 | |||
702 | /* | ||
703 | * Looks to see if this snapshot already has a pending exception | ||
704 | * for this chunk, otherwise it allocates a new one and inserts | ||
705 | * it into the pending table. | ||
706 | * | ||
707 | * NOTE: a write lock must be held on snap->lock before calling | ||
708 | * this. | ||
709 | */ | ||
710 | static struct pending_exception * | ||
711 | __find_pending_exception(struct dm_snapshot *s, struct bio *bio) | ||
712 | { | ||
713 | struct exception *e; | ||
714 | struct pending_exception *pe; | ||
715 | chunk_t chunk = sector_to_chunk(s, bio->bi_sector); | ||
716 | |||
717 | /* | ||
718 | * Is there a pending exception for this already ? | ||
719 | */ | ||
720 | e = lookup_exception(&s->pending, chunk); | ||
721 | if (e) { | ||
722 | /* cast the exception to a pending exception */ | ||
723 | pe = container_of(e, struct pending_exception, e); | ||
724 | |||
725 | } else { | ||
726 | /* | ||
727 | * Create a new pending exception, we don't want | ||
728 | * to hold the lock while we do this. | ||
729 | */ | ||
730 | up_write(&s->lock); | ||
731 | pe = alloc_pending_exception(); | ||
732 | down_write(&s->lock); | ||
733 | |||
734 | e = lookup_exception(&s->pending, chunk); | ||
735 | if (e) { | ||
736 | free_pending_exception(pe); | ||
737 | pe = container_of(e, struct pending_exception, e); | ||
738 | } else { | ||
739 | pe->e.old_chunk = chunk; | ||
740 | bio_list_init(&pe->origin_bios); | ||
741 | bio_list_init(&pe->snapshot_bios); | ||
742 | INIT_LIST_HEAD(&pe->siblings); | ||
743 | pe->snap = s; | ||
744 | pe->started = 0; | ||
745 | |||
746 | if (s->store.prepare_exception(&s->store, &pe->e)) { | ||
747 | free_pending_exception(pe); | ||
748 | s->valid = 0; | ||
749 | return NULL; | ||
750 | } | ||
751 | |||
752 | insert_exception(&s->pending, &pe->e); | ||
753 | } | ||
754 | } | ||
755 | |||
756 | return pe; | ||
757 | } | ||
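The unlock/allocate/re-lookup sequence above is a general pattern for moving a blocking allocation outside a lock. Below is a minimal userspace sketch of the same race handling, with hypothetical names, a pthreads rwlock standing in for the kernel rw_semaphore, and a single slot instead of a hash table; unlike the kernel function, the sketch drops the lock again before returning.

#include <pthread.h>
#include <stdlib.h>

struct item { int key; };

static struct item *slot;	/* protected by 'lock', like s->pending */
static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

static struct item *get_or_create(int key)
{
	struct item *it, *fresh;

	pthread_rwlock_wrlock(&lock);
	it = slot;			/* first lookup under the lock */
	if (!it) {
		pthread_rwlock_unlock(&lock);	/* the allocation may block */
		fresh = malloc(sizeof(*fresh));
		pthread_rwlock_wrlock(&lock);

		it = slot;		/* re-check: another thread may have won */
		if (it || !fresh) {
			free(fresh);	/* lost the race (or OOM): discard ours */
		} else {
			fresh->key = key;
			it = slot = fresh;	/* won: publish our allocation */
		}
	}
	pthread_rwlock_unlock(&lock);
	return it;
}

int main(void)
{
	return get_or_create(1) ? 0 : 1;
}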
758 | |||
759 | static inline void remap_exception(struct dm_snapshot *s, struct exception *e, | ||
760 | struct bio *bio) | ||
761 | { | ||
762 | bio->bi_bdev = s->cow->bdev; | ||
763 | bio->bi_sector = chunk_to_sector(s, e->new_chunk) + | ||
764 | (bio->bi_sector & s->chunk_mask); | ||
765 | } | ||
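The remapping is pure shift-and-mask arithmetic. A small standalone illustration with made-up values (the chunk parameters are derived the same way snapshot_ctr() derives them):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Assume a 16-sector chunk size (a power of two, as the target requires). */
	uint64_t chunk_size  = 16;
	uint64_t chunk_mask  = chunk_size - 1;	/* 0xF */
	unsigned chunk_shift = 4;		/* ffs(16) - 1 */

	uint64_t bi_sector = 1000;		/* sector written through the snapshot */
	uint64_t old_chunk = (bi_sector & ~chunk_mask) >> chunk_shift;	/* 62 */
	uint64_t new_chunk = 5;			/* wherever the store placed the copy */

	/* Same offset within the chunk, but relocated onto the COW device. */
	uint64_t new_sector = (new_chunk << chunk_shift) + (bi_sector & chunk_mask);

	printf("origin sector %llu -> chunk %llu -> cow sector %llu\n",
	       (unsigned long long)bi_sector,
	       (unsigned long long)old_chunk,
	       (unsigned long long)new_sector);	/* prints 1000 -> 62 -> 88 */
	return 0;
}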
766 | |||
767 | static int snapshot_map(struct dm_target *ti, struct bio *bio, | ||
768 | union map_info *map_context) | ||
769 | { | ||
770 | struct exception *e; | ||
771 | struct dm_snapshot *s = (struct dm_snapshot *) ti->private; | ||
772 | int r = 1; | ||
773 | chunk_t chunk; | ||
774 | struct pending_exception *pe; | ||
775 | |||
776 | chunk = sector_to_chunk(s, bio->bi_sector); | ||
777 | |||
778 | /* Full snapshots are not usable */ | ||
779 | if (!s->valid) | ||
780 | return -1; | ||
781 | |||
782 | /* | ||
783 | * Write to snapshot - higher level takes care of RW/RO | ||
784 | * flags so we should only get this if we are | ||
785 | * writeable. | ||
786 | */ | ||
787 | if (bio_rw(bio) == WRITE) { | ||
788 | |||
789 | /* FIXME: should only take write lock if we need | ||
790 | * to copy an exception */ | ||
791 | down_write(&s->lock); | ||
792 | |||
793 | /* If the block is already remapped - use that, else remap it */ | ||
794 | e = lookup_exception(&s->complete, chunk); | ||
795 | if (e) { | ||
796 | remap_exception(s, e, bio); | ||
797 | up_write(&s->lock); | ||
798 | |||
799 | } else { | ||
800 | pe = __find_pending_exception(s, bio); | ||
801 | |||
802 | if (!pe) { | ||
803 | if (s->store.drop_snapshot) | ||
804 | s->store.drop_snapshot(&s->store); | ||
805 | s->valid = 0; | ||
806 | r = -EIO; | ||
807 | up_write(&s->lock); | ||
808 | } else { | ||
809 | remap_exception(s, &pe->e, bio); | ||
810 | bio_list_add(&pe->snapshot_bios, bio); | ||
811 | |||
812 | if (!pe->started) { | ||
813 | /* this is protected by snap->lock */ | ||
814 | pe->started = 1; | ||
815 | up_write(&s->lock); | ||
816 | start_copy(pe); | ||
817 | } else | ||
818 | up_write(&s->lock); | ||
819 | r = 0; | ||
820 | } | ||
821 | } | ||
822 | |||
823 | } else { | ||
824 | /* | ||
825 | * FIXME: this read path scares me because we | ||
826 | * always use the origin when we have a pending | ||
827 | * exception. However I can't think of a | ||
828 | * situation where this is wrong - ejt. | ||
829 | */ | ||
830 | |||
831 | /* Do reads */ | ||
832 | down_read(&s->lock); | ||
833 | |||
834 | /* See if it has been remapped */ | ||
835 | e = lookup_exception(&s->complete, chunk); | ||
836 | if (e) | ||
837 | remap_exception(s, e, bio); | ||
838 | else | ||
839 | bio->bi_bdev = s->origin->bdev; | ||
840 | |||
841 | up_read(&s->lock); | ||
842 | } | ||
843 | |||
844 | return r; | ||
845 | } | ||
846 | |||
847 | static void snapshot_resume(struct dm_target *ti) | ||
848 | { | ||
849 | struct dm_snapshot *s = (struct dm_snapshot *) ti->private; | ||
850 | |||
851 | if (s->have_metadata) | ||
852 | return; | ||
853 | |||
854 | if (s->store.read_metadata(&s->store)) { | ||
855 | down_write(&s->lock); | ||
856 | s->valid = 0; | ||
857 | up_write(&s->lock); | ||
858 | } | ||
859 | |||
860 | s->have_metadata = 1; | ||
861 | } | ||
862 | |||
863 | static int snapshot_status(struct dm_target *ti, status_type_t type, | ||
864 | char *result, unsigned int maxlen) | ||
865 | { | ||
866 | struct dm_snapshot *snap = (struct dm_snapshot *) ti->private; | ||
867 | |||
868 | switch (type) { | ||
869 | case STATUSTYPE_INFO: | ||
870 | if (!snap->valid) | ||
871 | snprintf(result, maxlen, "Invalid"); | ||
872 | else { | ||
873 | if (snap->store.fraction_full) { | ||
874 | sector_t numerator, denominator; | ||
875 | snap->store.fraction_full(&snap->store, | ||
876 | &numerator, | ||
877 | &denominator); | ||
878 | snprintf(result, maxlen, | ||
879 | SECTOR_FORMAT "/" SECTOR_FORMAT, | ||
880 | numerator, denominator); | ||
881 | } | ||
882 | else | ||
883 | snprintf(result, maxlen, "Unknown"); | ||
884 | } | ||
885 | break; | ||
886 | |||
887 | case STATUSTYPE_TABLE: | ||
888 | /* | ||
889 | * kdevname returns a static pointer so we need | ||
890 | * to make private copies if the output is to | ||
891 | * make sense. | ||
892 | */ | ||
893 | snprintf(result, maxlen, "%s %s %c " SECTOR_FORMAT, | ||
894 | snap->origin->name, snap->cow->name, | ||
895 | snap->type, snap->chunk_size); | ||
896 | break; | ||
897 | } | ||
898 | |||
899 | return 0; | ||
900 | } | ||
901 | |||
902 | /*----------------------------------------------------------------- | ||
903 | * Origin methods | ||
904 | *---------------------------------------------------------------*/ | ||
905 | static void list_merge(struct list_head *l1, struct list_head *l2) | ||
906 | { | ||
907 | struct list_head *l1_n, *l2_p; | ||
908 | |||
909 | l1_n = l1->next; | ||
910 | l2_p = l2->prev; | ||
911 | |||
912 | l1->next = l2; | ||
913 | l2->prev = l1; | ||
914 | |||
915 | l2_p->next = l1_n; | ||
916 | l1_n->prev = l2_p; | ||
917 | } | ||
918 | |||
919 | static int __origin_write(struct list_head *snapshots, struct bio *bio) | ||
920 | { | ||
921 | int r = 1, first = 1; | ||
922 | struct dm_snapshot *snap; | ||
923 | struct exception *e; | ||
924 | struct pending_exception *pe, *last = NULL; | ||
925 | chunk_t chunk; | ||
926 | |||
927 | /* Do all the snapshots on this origin */ | ||
928 | list_for_each_entry (snap, snapshots, list) { | ||
929 | |||
930 | /* Only deal with valid snapshots */ | ||
931 | if (!snap->valid) | ||
932 | continue; | ||
933 | |||
934 | down_write(&snap->lock); | ||
935 | |||
936 | /* | ||
937 | * Remember, different snapshots can have | ||
938 | * different chunk sizes. | ||
939 | */ | ||
940 | chunk = sector_to_chunk(snap, bio->bi_sector); | ||
941 | |||
942 | /* | ||
943 | * Check exception table to see if block | ||
944 | * is already remapped in this snapshot | ||
945 | * and trigger an exception if not. | ||
946 | */ | ||
947 | e = lookup_exception(&snap->complete, chunk); | ||
948 | if (!e) { | ||
949 | pe = __find_pending_exception(snap, bio); | ||
950 | if (!pe) { | ||
951 | snap->store.drop_snapshot(&snap->store); | ||
952 | snap->valid = 0; | ||
953 | |||
954 | } else { | ||
955 | if (last) | ||
956 | list_merge(&pe->siblings, | ||
957 | &last->siblings); | ||
958 | |||
959 | last = pe; | ||
960 | r = 0; | ||
961 | } | ||
962 | } | ||
963 | |||
964 | up_write(&snap->lock); | ||
965 | } | ||
966 | |||
967 | /* | ||
968 | * Now that we have a complete pe list we can start the copying. | ||
969 | */ | ||
970 | if (last) { | ||
971 | pe = last; | ||
972 | do { | ||
973 | down_write(&pe->snap->lock); | ||
974 | if (first) | ||
975 | bio_list_add(&pe->origin_bios, bio); | ||
976 | if (!pe->started) { | ||
977 | pe->started = 1; | ||
978 | up_write(&pe->snap->lock); | ||
979 | start_copy(pe); | ||
980 | } else | ||
981 | up_write(&pe->snap->lock); | ||
982 | first = 0; | ||
983 | pe = list_entry(pe->siblings.next, | ||
984 | struct pending_exception, siblings); | ||
985 | |||
986 | } while (pe != last); | ||
987 | } | ||
988 | |||
989 | return r; | ||
990 | } | ||
991 | |||
992 | /* | ||
993 | * Called on a write from the origin driver. | ||
994 | */ | ||
995 | static int do_origin(struct dm_dev *origin, struct bio *bio) | ||
996 | { | ||
997 | struct origin *o; | ||
998 | int r = 1; | ||
999 | |||
1000 | down_read(&_origins_lock); | ||
1001 | o = __lookup_origin(origin->bdev); | ||
1002 | if (o) | ||
1003 | r = __origin_write(&o->snapshots, bio); | ||
1004 | up_read(&_origins_lock); | ||
1005 | |||
1006 | return r; | ||
1007 | } | ||
1008 | |||
1009 | /* | ||
1010 | * Origin: maps a linear range of a device, with hooks for snapshotting. | ||
1011 | */ | ||
1012 | |||
1013 | /* | ||
1014 | * Construct an origin mapping: <dev_path> | ||
1015 | * The context for an origin is merely a 'struct dm_dev *' | ||
1016 | * pointing to the real device. | ||
1017 | */ | ||
1018 | static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) | ||
1019 | { | ||
1020 | int r; | ||
1021 | struct dm_dev *dev; | ||
1022 | |||
1023 | if (argc != 1) { | ||
1024 | ti->error = "dm-origin: incorrect number of arguments"; | ||
1025 | return -EINVAL; | ||
1026 | } | ||
1027 | |||
1028 | r = dm_get_device(ti, argv[0], 0, ti->len, | ||
1029 | dm_table_get_mode(ti->table), &dev); | ||
1030 | if (r) { | ||
1031 | ti->error = "Cannot get target device"; | ||
1032 | return r; | ||
1033 | } | ||
1034 | |||
1035 | ti->private = dev; | ||
1036 | return 0; | ||
1037 | } | ||
1038 | |||
1039 | static void origin_dtr(struct dm_target *ti) | ||
1040 | { | ||
1041 | struct dm_dev *dev = (struct dm_dev *) ti->private; | ||
1042 | dm_put_device(ti, dev); | ||
1043 | } | ||
1044 | |||
1045 | static int origin_map(struct dm_target *ti, struct bio *bio, | ||
1046 | union map_info *map_context) | ||
1047 | { | ||
1048 | struct dm_dev *dev = (struct dm_dev *) ti->private; | ||
1049 | bio->bi_bdev = dev->bdev; | ||
1050 | |||
1051 | /* Only tell snapshots if this is a write */ | ||
1052 | return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : 1; | ||
1053 | } | ||
1054 | |||
1055 | #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | ||
1056 | |||
1057 | /* | ||
1058 | * Set the target "split_io" field to the minimum of all the snapshots' | ||
1059 | * chunk sizes. | ||
1060 | */ | ||
1061 | static void origin_resume(struct dm_target *ti) | ||
1062 | { | ||
1063 | struct dm_dev *dev = (struct dm_dev *) ti->private; | ||
1064 | struct dm_snapshot *snap; | ||
1065 | struct origin *o; | ||
1066 | chunk_t chunk_size = 0; | ||
1067 | |||
1068 | down_read(&_origins_lock); | ||
1069 | o = __lookup_origin(dev->bdev); | ||
1070 | if (o) | ||
1071 | list_for_each_entry (snap, &o->snapshots, list) | ||
1072 | chunk_size = min_not_zero(chunk_size, snap->chunk_size); | ||
1073 | up_read(&_origins_lock); | ||
1074 | |||
1075 | ti->split_io = chunk_size; | ||
1076 | } | ||
1077 | |||
1078 | static int origin_status(struct dm_target *ti, status_type_t type, char *result, | ||
1079 | unsigned int maxlen) | ||
1080 | { | ||
1081 | struct dm_dev *dev = (struct dm_dev *) ti->private; | ||
1082 | |||
1083 | switch (type) { | ||
1084 | case STATUSTYPE_INFO: | ||
1085 | result[0] = '\0'; | ||
1086 | break; | ||
1087 | |||
1088 | case STATUSTYPE_TABLE: | ||
1089 | snprintf(result, maxlen, "%s", dev->name); | ||
1090 | break; | ||
1091 | } | ||
1092 | |||
1093 | return 0; | ||
1094 | } | ||
1095 | |||
1096 | static struct target_type origin_target = { | ||
1097 | .name = "snapshot-origin", | ||
1098 | .version = {1, 0, 1}, | ||
1099 | .module = THIS_MODULE, | ||
1100 | .ctr = origin_ctr, | ||
1101 | .dtr = origin_dtr, | ||
1102 | .map = origin_map, | ||
1103 | .resume = origin_resume, | ||
1104 | .status = origin_status, | ||
1105 | }; | ||
1106 | |||
1107 | static struct target_type snapshot_target = { | ||
1108 | .name = "snapshot", | ||
1109 | .version = {1, 0, 1}, | ||
1110 | .module = THIS_MODULE, | ||
1111 | .ctr = snapshot_ctr, | ||
1112 | .dtr = snapshot_dtr, | ||
1113 | .map = snapshot_map, | ||
1114 | .resume = snapshot_resume, | ||
1115 | .status = snapshot_status, | ||
1116 | }; | ||
1117 | |||
1118 | static int __init dm_snapshot_init(void) | ||
1119 | { | ||
1120 | int r; | ||
1121 | |||
1122 | r = dm_register_target(&snapshot_target); | ||
1123 | if (r) { | ||
1124 | DMERR("snapshot target register failed %d", r); | ||
1125 | return r; | ||
1126 | } | ||
1127 | |||
1128 | r = dm_register_target(&origin_target); | ||
1129 | if (r < 0) { | ||
1130 | DMERR("Device mapper: Origin: register failed %d\n", r); | ||
1131 | goto bad1; | ||
1132 | } | ||
1133 | |||
1134 | r = init_origin_hash(); | ||
1135 | if (r) { | ||
1136 | DMERR("init_origin_hash failed."); | ||
1137 | goto bad2; | ||
1138 | } | ||
1139 | |||
1140 | exception_cache = kmem_cache_create("dm-snapshot-ex", | ||
1141 | sizeof(struct exception), | ||
1142 | __alignof__(struct exception), | ||
1143 | 0, NULL, NULL); | ||
1144 | if (!exception_cache) { | ||
1145 | DMERR("Couldn't create exception cache."); | ||
1146 | r = -ENOMEM; | ||
1147 | goto bad3; | ||
1148 | } | ||
1149 | |||
1150 | pending_cache = | ||
1151 | kmem_cache_create("dm-snapshot-in", | ||
1152 | sizeof(struct pending_exception), | ||
1153 | __alignof__(struct pending_exception), | ||
1154 | 0, NULL, NULL); | ||
1155 | if (!pending_cache) { | ||
1156 | DMERR("Couldn't create pending cache."); | ||
1157 | r = -ENOMEM; | ||
1158 | goto bad4; | ||
1159 | } | ||
1160 | |||
1161 | pending_pool = mempool_create(128, mempool_alloc_slab, | ||
1162 | mempool_free_slab, pending_cache); | ||
1163 | if (!pending_pool) { | ||
1164 | DMERR("Couldn't create pending pool."); | ||
1165 | r = -ENOMEM; | ||
1166 | goto bad5; | ||
1167 | } | ||
1168 | |||
1169 | return 0; | ||
1170 | |||
1171 | bad5: | ||
1172 | kmem_cache_destroy(pending_cache); | ||
1173 | bad4: | ||
1174 | kmem_cache_destroy(exception_cache); | ||
1175 | bad3: | ||
1176 | exit_origin_hash(); | ||
1177 | bad2: | ||
1178 | dm_unregister_target(&origin_target); | ||
1179 | bad1: | ||
1180 | dm_unregister_target(&snapshot_target); | ||
1181 | return r; | ||
1182 | } | ||
1183 | |||
1184 | static void __exit dm_snapshot_exit(void) | ||
1185 | { | ||
1186 | int r; | ||
1187 | |||
1188 | r = dm_unregister_target(&snapshot_target); | ||
1189 | if (r) | ||
1190 | DMERR("snapshot unregister failed %d", r); | ||
1191 | |||
1192 | r = dm_unregister_target(&origin_target); | ||
1193 | if (r) | ||
1194 | DMERR("origin unregister failed %d", r); | ||
1195 | |||
1196 | exit_origin_hash(); | ||
1197 | mempool_destroy(pending_pool); | ||
1198 | kmem_cache_destroy(pending_cache); | ||
1199 | kmem_cache_destroy(exception_cache); | ||
1200 | } | ||
1201 | |||
1202 | /* Module hooks */ | ||
1203 | module_init(dm_snapshot_init); | ||
1204 | module_exit(dm_snapshot_exit); | ||
1205 | |||
1206 | MODULE_DESCRIPTION(DM_NAME " snapshot target"); | ||
1207 | MODULE_AUTHOR("Joe Thornber"); | ||
1208 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h new file mode 100644 index 000000000000..375aa24d4d7d --- /dev/null +++ b/drivers/md/dm-snap.h | |||
@@ -0,0 +1,161 @@ | |||
1 | /* | ||
2 | * dm-snapshot.c | ||
3 | * | ||
4 | * Copyright (C) 2001-2002 Sistina Software (UK) Limited. | ||
5 | * | ||
6 | * This file is released under the GPL. | ||
7 | */ | ||
8 | |||
9 | #ifndef DM_SNAPSHOT_H | ||
10 | #define DM_SNAPSHOT_H | ||
11 | |||
12 | #include "dm.h" | ||
13 | #include <linux/blkdev.h> | ||
14 | |||
15 | struct exception_table { | ||
16 | uint32_t hash_mask; | ||
17 | struct list_head *table; | ||
18 | }; | ||
19 | |||
20 | /* | ||
21 | * The snapshot code deals with largish chunks of the disk at a | ||
22 | * time. Typically 64k - 256k. | ||
23 | */ | ||
24 | /* FIXME: can we get away with limiting these to a uint32_t ? */ | ||
25 | typedef sector_t chunk_t; | ||
26 | |||
27 | /* | ||
28 | * An exception is used where an old chunk of data has been | ||
29 | * replaced by a new one. | ||
30 | */ | ||
31 | struct exception { | ||
32 | struct list_head hash_list; | ||
33 | |||
34 | chunk_t old_chunk; | ||
35 | chunk_t new_chunk; | ||
36 | }; | ||
37 | |||
38 | /* | ||
39 | * Abstraction to handle the metadata and on-disk layout of exception stores (the | ||
40 | * COW device). | ||
41 | */ | ||
42 | struct exception_store { | ||
43 | |||
44 | /* | ||
45 | * Destroys this object when you've finished with it. | ||
46 | */ | ||
47 | void (*destroy) (struct exception_store *store); | ||
48 | |||
49 | /* | ||
50 | * The target shouldn't read the COW device until this is | ||
51 | * called. | ||
52 | */ | ||
53 | int (*read_metadata) (struct exception_store *store); | ||
54 | |||
55 | /* | ||
56 | * Find somewhere to store the next exception. | ||
57 | */ | ||
58 | int (*prepare_exception) (struct exception_store *store, | ||
59 | struct exception *e); | ||
60 | |||
61 | /* | ||
62 | * Update the metadata with this exception. | ||
63 | */ | ||
64 | void (*commit_exception) (struct exception_store *store, | ||
65 | struct exception *e, | ||
66 | void (*callback) (void *, int success), | ||
67 | void *callback_context); | ||
68 | |||
69 | /* | ||
70 | * The snapshot is invalid, note this in the metadata. | ||
71 | */ | ||
72 | void (*drop_snapshot) (struct exception_store *store); | ||
73 | |||
74 | /* | ||
75 | * Return how full the snapshot is. | ||
76 | */ | ||
77 | void (*fraction_full) (struct exception_store *store, | ||
78 | sector_t *numerator, | ||
79 | sector_t *denominator); | ||
80 | |||
81 | struct dm_snapshot *snap; | ||
82 | void *context; | ||
83 | }; | ||
84 | |||
85 | struct dm_snapshot { | ||
86 | struct rw_semaphore lock; | ||
87 | struct dm_table *table; | ||
88 | |||
89 | struct dm_dev *origin; | ||
90 | struct dm_dev *cow; | ||
91 | |||
92 | /* List of snapshots per Origin */ | ||
93 | struct list_head list; | ||
94 | |||
95 | /* Size of data blocks saved - must be a power of 2 */ | ||
96 | chunk_t chunk_size; | ||
97 | chunk_t chunk_mask; | ||
98 | chunk_t chunk_shift; | ||
99 | |||
100 | /* You can't use a snapshot if this is 0 (e.g. if full) */ | ||
101 | int valid; | ||
102 | int have_metadata; | ||
103 | |||
104 | /* Used for display of table */ | ||
105 | char type; | ||
106 | |||
107 | /* The last percentage we notified */ | ||
108 | int last_percent; | ||
109 | |||
110 | struct exception_table pending; | ||
111 | struct exception_table complete; | ||
112 | |||
113 | /* The on disk metadata handler */ | ||
114 | struct exception_store store; | ||
115 | |||
116 | struct kcopyd_client *kcopyd_client; | ||
117 | }; | ||
118 | |||
119 | /* | ||
120 | * Used by the exception stores to load exceptions when | ||
121 | * initialising. | ||
122 | */ | ||
123 | int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new); | ||
124 | |||
125 | /* | ||
126 | * Constructor and destructor for the default persistent | ||
127 | * store. | ||
128 | */ | ||
129 | int dm_create_persistent(struct exception_store *store, uint32_t chunk_size); | ||
130 | |||
131 | int dm_create_transient(struct exception_store *store, | ||
132 | struct dm_snapshot *s, int blocksize); | ||
133 | |||
134 | /* | ||
135 | * Return the number of sectors in the device. | ||
136 | */ | ||
137 | static inline sector_t get_dev_size(struct block_device *bdev) | ||
138 | { | ||
139 | return bdev->bd_inode->i_size >> SECTOR_SHIFT; | ||
140 | } | ||
141 | |||
142 | static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector) | ||
143 | { | ||
144 | return (sector & ~s->chunk_mask) >> s->chunk_shift; | ||
145 | } | ||
146 | |||
147 | static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk) | ||
148 | { | ||
149 | return chunk << s->chunk_shift; | ||
150 | } | ||
151 | |||
152 | static inline int bdev_equal(struct block_device *lhs, struct block_device *rhs) | ||
153 | { | ||
154 | /* | ||
155 | * There is only ever one instance of a particular block | ||
156 | * device so we can compare pointers safely. | ||
157 | */ | ||
158 | return lhs == rhs; | ||
159 | } | ||
160 | |||
161 | #endif | ||
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c new file mode 100644 index 000000000000..ab89278a56bf --- /dev/null +++ b/drivers/md/dm-stripe.c | |||
@@ -0,0 +1,234 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2001-2003 Sistina Software (UK) Limited. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #include "dm.h" | ||
8 | |||
9 | #include <linux/module.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/blkdev.h> | ||
12 | #include <linux/bio.h> | ||
13 | #include <linux/slab.h> | ||
14 | |||
15 | struct stripe { | ||
16 | struct dm_dev *dev; | ||
17 | sector_t physical_start; | ||
18 | }; | ||
19 | |||
20 | struct stripe_c { | ||
21 | uint32_t stripes; | ||
22 | |||
23 | /* The size of this target / num. stripes */ | ||
24 | sector_t stripe_width; | ||
25 | |||
26 | /* stripe chunk size */ | ||
27 | uint32_t chunk_shift; | ||
28 | sector_t chunk_mask; | ||
29 | |||
30 | struct stripe stripe[0]; | ||
31 | }; | ||
32 | |||
33 | static inline struct stripe_c *alloc_context(unsigned int stripes) | ||
34 | { | ||
35 | size_t len; | ||
36 | |||
37 | if (array_too_big(sizeof(struct stripe_c), sizeof(struct stripe), | ||
38 | stripes)) | ||
39 | return NULL; | ||
40 | |||
41 | len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes); | ||
42 | |||
43 | return kmalloc(len, GFP_KERNEL); | ||
44 | } | ||
45 | |||
46 | /* | ||
47 | * Parse a single <dev> <sector> pair | ||
48 | */ | ||
49 | static int get_stripe(struct dm_target *ti, struct stripe_c *sc, | ||
50 | unsigned int stripe, char **argv) | ||
51 | { | ||
52 | sector_t start; | ||
53 | |||
54 | if (sscanf(argv[1], SECTOR_FORMAT, &start) != 1) | ||
55 | return -EINVAL; | ||
56 | |||
57 | if (dm_get_device(ti, argv[0], start, sc->stripe_width, | ||
58 | dm_table_get_mode(ti->table), | ||
59 | &sc->stripe[stripe].dev)) | ||
60 | return -ENXIO; | ||
61 | |||
62 | sc->stripe[stripe].physical_start = start; | ||
63 | return 0; | ||
64 | } | ||
65 | |||
66 | /* | ||
67 | * Construct a striped mapping. | ||
68 | * <number of stripes> <chunk size (2^n)> [<dev_path> <offset>]+ | ||
69 | */ | ||
70 | static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) | ||
71 | { | ||
72 | struct stripe_c *sc; | ||
73 | sector_t width; | ||
74 | uint32_t stripes; | ||
75 | uint32_t chunk_size; | ||
76 | char *end; | ||
77 | int r; | ||
78 | unsigned int i; | ||
79 | |||
80 | if (argc < 2) { | ||
81 | ti->error = "dm-stripe: Not enough arguments"; | ||
82 | return -EINVAL; | ||
83 | } | ||
84 | |||
85 | stripes = simple_strtoul(argv[0], &end, 10); | ||
86 | if (*end) { | ||
87 | ti->error = "dm-stripe: Invalid stripe count"; | ||
88 | return -EINVAL; | ||
89 | } | ||
90 | |||
91 | chunk_size = simple_strtoul(argv[1], &end, 10); | ||
92 | if (*end) { | ||
93 | ti->error = "dm-stripe: Invalid chunk_size"; | ||
94 | return -EINVAL; | ||
95 | } | ||
96 | |||
97 | /* | ||
98 | * chunk_size must be a non-zero power of two, no smaller than a page | ||
99 | */ | ||
100 | if (!chunk_size || (chunk_size & (chunk_size - 1)) || | ||
101 | (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) { | ||
102 | ti->error = "dm-stripe: Invalid chunk size"; | ||
103 | return -EINVAL; | ||
104 | } | ||
105 | |||
106 | width = ti->len; | ||
107 | if (sector_div(width, stripes)) { | ||
108 | ti->error = "dm-stripe: Target length not divisible by " | ||
109 | "number of stripes"; | ||
110 | return -EINVAL; | ||
111 | } | ||
112 | |||
113 | /* | ||
114 | * Do we have enough arguments for that many stripes ? | ||
115 | */ | ||
116 | if (argc != (2 + 2 * stripes)) { | ||
117 | ti->error = "dm-stripe: Not enough destinations " | ||
118 | "specified"; | ||
119 | return -EINVAL; | ||
120 | } | ||
121 | |||
122 | sc = alloc_context(stripes); | ||
123 | if (!sc) { | ||
124 | ti->error = "dm-stripe: Memory allocation for striped context " | ||
125 | "failed"; | ||
126 | return -ENOMEM; | ||
127 | } | ||
128 | |||
129 | sc->stripes = stripes; | ||
130 | sc->stripe_width = width; | ||
131 | ti->split_io = chunk_size; | ||
132 | |||
133 | sc->chunk_mask = ((sector_t) chunk_size) - 1; | ||
134 | for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) | ||
135 | chunk_size >>= 1; | ||
136 | sc->chunk_shift--; | ||
137 | |||
138 | /* | ||
139 | * Get the stripe destinations. | ||
140 | */ | ||
141 | for (i = 0; i < stripes; i++) { | ||
142 | argv += 2; | ||
143 | |||
144 | r = get_stripe(ti, sc, i, argv); | ||
145 | if (r < 0) { | ||
146 | ti->error = "dm-stripe: Couldn't parse stripe " | ||
147 | "destination"; | ||
148 | while (i--) | ||
149 | dm_put_device(ti, sc->stripe[i].dev); | ||
150 | kfree(sc); | ||
151 | return r; | ||
152 | } | ||
153 | } | ||
154 | |||
155 | ti->private = sc; | ||
156 | return 0; | ||
157 | } | ||
158 | |||
159 | static void stripe_dtr(struct dm_target *ti) | ||
160 | { | ||
161 | unsigned int i; | ||
162 | struct stripe_c *sc = (struct stripe_c *) ti->private; | ||
163 | |||
164 | for (i = 0; i < sc->stripes; i++) | ||
165 | dm_put_device(ti, sc->stripe[i].dev); | ||
166 | |||
167 | kfree(sc); | ||
168 | } | ||
169 | |||
170 | static int stripe_map(struct dm_target *ti, struct bio *bio, | ||
171 | union map_info *map_context) | ||
172 | { | ||
173 | struct stripe_c *sc = (struct stripe_c *) ti->private; | ||
174 | |||
175 | sector_t offset = bio->bi_sector - ti->begin; | ||
176 | sector_t chunk = offset >> sc->chunk_shift; | ||
177 | uint32_t stripe = sector_div(chunk, sc->stripes); | ||
178 | |||
179 | bio->bi_bdev = sc->stripe[stripe].dev->bdev; | ||
180 | bio->bi_sector = sc->stripe[stripe].physical_start + | ||
181 | (chunk << sc->chunk_shift) + (offset & sc->chunk_mask); | ||
182 | return 1; | ||
183 | } | ||
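To see the arithmetic with concrete numbers, here is a hypothetical standalone walk-through of the same mapping (3 stripes, 8-sector chunks, ti->begin taken as 0); sector_div() in the kernel both divides 'chunk' and returns the remainder, which the sketch does in two steps:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t stripes     = 3;
	uint32_t chunk_shift = 3;			/* chunk size 8 == 1 << 3 */
	uint64_t chunk_mask  = (1ULL << chunk_shift) - 1;

	uint64_t offset = 100;				/* bio->bi_sector - ti->begin */
	uint64_t chunk  = offset >> chunk_shift;	/* 12 */
	uint32_t stripe = chunk % stripes;		/* remainder: stripe 0 */
	chunk /= stripes;				/* chunk index within that stripe: 4 */

	/* Sector offset from that stripe's physical_start. */
	uint64_t mapped = (chunk << chunk_shift) + (offset & chunk_mask);

	printf("offset %llu -> stripe %u, physical_start + %llu\n",
	       (unsigned long long)offset, (unsigned)stripe,
	       (unsigned long long)mapped);		/* prints stripe 0, +36 */
	return 0;
}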
184 | |||
185 | static int stripe_status(struct dm_target *ti, | ||
186 | status_type_t type, char *result, unsigned int maxlen) | ||
187 | { | ||
188 | struct stripe_c *sc = (struct stripe_c *) ti->private; | ||
189 | unsigned int sz = 0; | ||
190 | unsigned int i; | ||
191 | |||
192 | switch (type) { | ||
193 | case STATUSTYPE_INFO: | ||
194 | result[0] = '\0'; | ||
195 | break; | ||
196 | |||
197 | case STATUSTYPE_TABLE: | ||
198 | DMEMIT("%d " SECTOR_FORMAT, sc->stripes, sc->chunk_mask + 1); | ||
199 | for (i = 0; i < sc->stripes; i++) | ||
200 | DMEMIT(" %s " SECTOR_FORMAT, sc->stripe[i].dev->name, | ||
201 | sc->stripe[i].physical_start); | ||
202 | break; | ||
203 | } | ||
204 | return 0; | ||
205 | } | ||
206 | |||
207 | static struct target_type stripe_target = { | ||
208 | .name = "striped", | ||
209 | .version= {1, 0, 2}, | ||
210 | .module = THIS_MODULE, | ||
211 | .ctr = stripe_ctr, | ||
212 | .dtr = stripe_dtr, | ||
213 | .map = stripe_map, | ||
214 | .status = stripe_status, | ||
215 | }; | ||
216 | |||
217 | int __init dm_stripe_init(void) | ||
218 | { | ||
219 | int r; | ||
220 | |||
221 | r = dm_register_target(&stripe_target); | ||
222 | if (r < 0) | ||
223 | DMWARN("striped target registration failed"); | ||
224 | |||
225 | return r; | ||
226 | } | ||
227 | |||
228 | void dm_stripe_exit(void) | ||
229 | { | ||
230 | if (dm_unregister_target(&stripe_target)) | ||
231 | DMWARN("striped target unregistration failed"); | ||
232 | |||
233 | return; | ||
234 | } | ||
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c new file mode 100644 index 000000000000..ee175d4906c4 --- /dev/null +++ b/drivers/md/dm-table.c | |||
@@ -0,0 +1,950 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2001 Sistina Software (UK) Limited. | ||
3 | * Copyright (C) 2004 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * This file is released under the GPL. | ||
6 | */ | ||
7 | |||
8 | #include "dm.h" | ||
9 | |||
10 | #include <linux/module.h> | ||
11 | #include <linux/vmalloc.h> | ||
12 | #include <linux/blkdev.h> | ||
13 | #include <linux/namei.h> | ||
14 | #include <linux/ctype.h> | ||
15 | #include <linux/slab.h> | ||
16 | #include <linux/interrupt.h> | ||
17 | #include <asm/atomic.h> | ||
18 | |||
19 | #define MAX_DEPTH 16 | ||
20 | #define NODE_SIZE L1_CACHE_BYTES | ||
21 | #define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) | ||
22 | #define CHILDREN_PER_NODE (KEYS_PER_NODE + 1) | ||
23 | |||
24 | struct dm_table { | ||
25 | atomic_t holders; | ||
26 | |||
27 | /* btree table */ | ||
28 | unsigned int depth; | ||
29 | unsigned int counts[MAX_DEPTH]; /* in nodes */ | ||
30 | sector_t *index[MAX_DEPTH]; | ||
31 | |||
32 | unsigned int num_targets; | ||
33 | unsigned int num_allocated; | ||
34 | sector_t *highs; | ||
35 | struct dm_target *targets; | ||
36 | |||
37 | /* | ||
38 | * Indicates the rw permissions for the new logical | ||
39 | * device. This should be a combination of FMODE_READ | ||
40 | * and FMODE_WRITE. | ||
41 | */ | ||
42 | int mode; | ||
43 | |||
44 | /* a list of devices used by this table */ | ||
45 | struct list_head devices; | ||
46 | |||
47 | /* | ||
48 | * These are optimistic limits taken from all the | ||
49 | * targets, some targets will need smaller limits. | ||
50 | */ | ||
51 | struct io_restrictions limits; | ||
52 | |||
53 | /* events get handed up using this callback */ | ||
54 | void (*event_fn)(void *); | ||
55 | void *event_context; | ||
56 | }; | ||
57 | |||
58 | /* | ||
59 | * Similar to ceiling(log(n) / log(base)) | ||
60 | */ | ||
61 | static unsigned int int_log(unsigned int n, unsigned int base) | ||
62 | { | ||
63 | int result = 0; | ||
64 | |||
65 | while (n > 1) { | ||
66 | n = dm_div_up(n, base); | ||
67 | result++; | ||
68 | } | ||
69 | |||
70 | return result; | ||
71 | } | ||
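For example, int_log(1000, 256) is 2: the first pass reduces n to dm_div_up(1000, 256) = 4 and the second to 1, matching ceiling(log of 1000 to base 256).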
72 | |||
73 | /* | ||
74 | * Returns the minimum that is _not_ zero, unless both are zero. | ||
75 | */ | ||
76 | #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | ||
77 | |||
78 | /* | ||
79 | * Combine two io_restrictions, always taking the lower value. | ||
80 | */ | ||
81 | static void combine_restrictions_low(struct io_restrictions *lhs, | ||
82 | struct io_restrictions *rhs) | ||
83 | { | ||
84 | lhs->max_sectors = | ||
85 | min_not_zero(lhs->max_sectors, rhs->max_sectors); | ||
86 | |||
87 | lhs->max_phys_segments = | ||
88 | min_not_zero(lhs->max_phys_segments, rhs->max_phys_segments); | ||
89 | |||
90 | lhs->max_hw_segments = | ||
91 | min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments); | ||
92 | |||
93 | lhs->hardsect_size = max(lhs->hardsect_size, rhs->hardsect_size); | ||
94 | |||
95 | lhs->max_segment_size = | ||
96 | min_not_zero(lhs->max_segment_size, rhs->max_segment_size); | ||
97 | |||
98 | lhs->seg_boundary_mask = | ||
99 | min_not_zero(lhs->seg_boundary_mask, rhs->seg_boundary_mask); | ||
100 | } | ||
101 | |||
102 | /* | ||
103 | * Calculate the index of the child node for the n'th node's k'th key. | ||
104 | */ | ||
105 | static inline unsigned int get_child(unsigned int n, unsigned int k) | ||
106 | { | ||
107 | return (n * CHILDREN_PER_NODE) + k; | ||
108 | } | ||
109 | |||
110 | /* | ||
111 | * Return the n'th node of level l from table t. | ||
112 | */ | ||
113 | static inline sector_t *get_node(struct dm_table *t, | ||
114 | unsigned int l, unsigned int n) | ||
115 | { | ||
116 | return t->index[l] + (n * KEYS_PER_NODE); | ||
117 | } | ||
118 | |||
119 | /* | ||
120 | * Return the highest key that you could lookup from the n'th | ||
121 | * node on level l of the btree. | ||
122 | */ | ||
123 | static sector_t high(struct dm_table *t, unsigned int l, unsigned int n) | ||
124 | { | ||
125 | for (; l < t->depth - 1; l++) | ||
126 | n = get_child(n, CHILDREN_PER_NODE - 1); | ||
127 | |||
128 | if (n >= t->counts[l]) | ||
129 | return (sector_t) - 1; | ||
130 | |||
131 | return get_node(t, l, n)[KEYS_PER_NODE - 1]; | ||
132 | } | ||
133 | |||
134 | /* | ||
135 | * Fills in a level of the btree based on the highs of the level | ||
136 | * below it. | ||
137 | */ | ||
138 | static int setup_btree_index(unsigned int l, struct dm_table *t) | ||
139 | { | ||
140 | unsigned int n, k; | ||
141 | sector_t *node; | ||
142 | |||
143 | for (n = 0U; n < t->counts[l]; n++) { | ||
144 | node = get_node(t, l, n); | ||
145 | |||
146 | for (k = 0U; k < KEYS_PER_NODE; k++) | ||
147 | node[k] = high(t, l + 1, get_child(n, k)); | ||
148 | } | ||
149 | |||
150 | return 0; | ||
151 | } | ||
152 | |||
153 | void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size) | ||
154 | { | ||
155 | unsigned long size; | ||
156 | void *addr; | ||
157 | |||
158 | /* | ||
159 | * Check that we're not going to overflow. | ||
160 | */ | ||
161 | if (nmemb > (ULONG_MAX / elem_size)) | ||
162 | return NULL; | ||
163 | |||
164 | size = nmemb * elem_size; | ||
165 | addr = vmalloc(size); | ||
166 | if (addr) | ||
167 | memset(addr, 0, size); | ||
168 | |||
169 | return addr; | ||
170 | } | ||
171 | |||
172 | /* | ||
173 | * highs and targets are managed as dynamic arrays during a | ||
174 | * table load. | ||
175 | */ | ||
176 | static int alloc_targets(struct dm_table *t, unsigned int num) | ||
177 | { | ||
178 | sector_t *n_highs; | ||
179 | struct dm_target *n_targets; | ||
180 | int n = t->num_targets; | ||
181 | |||
182 | /* | ||
183 | * Allocate both the target array and offset array at once. | ||
184 | */ | ||
185 | n_highs = (sector_t *) dm_vcalloc(num, sizeof(struct dm_target) + | ||
186 | sizeof(sector_t)); | ||
187 | if (!n_highs) | ||
188 | return -ENOMEM; | ||
189 | |||
190 | n_targets = (struct dm_target *) (n_highs + num); | ||
191 | |||
192 | if (n) { | ||
193 | memcpy(n_highs, t->highs, sizeof(*n_highs) * n); | ||
194 | memcpy(n_targets, t->targets, sizeof(*n_targets) * n); | ||
195 | } | ||
196 | |||
197 | memset(n_highs + n, -1, sizeof(*n_highs) * (num - n)); | ||
198 | vfree(t->highs); | ||
199 | |||
200 | t->num_allocated = num; | ||
201 | t->highs = n_highs; | ||
202 | t->targets = n_targets; | ||
203 | |||
204 | return 0; | ||
205 | } | ||
206 | |||
207 | int dm_table_create(struct dm_table **result, int mode, unsigned num_targets) | ||
208 | { | ||
209 | struct dm_table *t = kmalloc(sizeof(*t), GFP_KERNEL); | ||
210 | |||
211 | if (!t) | ||
212 | return -ENOMEM; | ||
213 | |||
214 | memset(t, 0, sizeof(*t)); | ||
215 | INIT_LIST_HEAD(&t->devices); | ||
216 | atomic_set(&t->holders, 1); | ||
217 | |||
218 | if (!num_targets) | ||
219 | num_targets = KEYS_PER_NODE; | ||
220 | |||
221 | num_targets = dm_round_up(num_targets, KEYS_PER_NODE); | ||
222 | |||
223 | if (alloc_targets(t, num_targets)) { | ||
224 | kfree(t); | ||
225 | t = NULL; | ||
226 | return -ENOMEM; | ||
227 | } | ||
228 | |||
229 | t->mode = mode; | ||
230 | *result = t; | ||
231 | return 0; | ||
232 | } | ||
233 | |||
234 | static void free_devices(struct list_head *devices) | ||
235 | { | ||
236 | struct list_head *tmp, *next; | ||
237 | |||
238 | for (tmp = devices->next; tmp != devices; tmp = next) { | ||
239 | struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); | ||
240 | next = tmp->next; | ||
241 | kfree(dd); | ||
242 | } | ||
243 | } | ||
244 | |||
245 | void table_destroy(struct dm_table *t) | ||
246 | { | ||
247 | unsigned int i; | ||
248 | |||
249 | /* free the indexes (see dm_table_complete) */ | ||
250 | if (t->depth >= 2) | ||
251 | vfree(t->index[t->depth - 2]); | ||
252 | |||
253 | /* free the targets */ | ||
254 | for (i = 0; i < t->num_targets; i++) { | ||
255 | struct dm_target *tgt = t->targets + i; | ||
256 | |||
257 | if (tgt->type->dtr) | ||
258 | tgt->type->dtr(tgt); | ||
259 | |||
260 | dm_put_target_type(tgt->type); | ||
261 | } | ||
262 | |||
263 | vfree(t->highs); | ||
264 | |||
265 | /* free the device list */ | ||
266 | if (t->devices.next != &t->devices) { | ||
267 | DMWARN("devices still present during destroy: " | ||
268 | "dm_table_remove_device calls missing"); | ||
269 | |||
270 | free_devices(&t->devices); | ||
271 | } | ||
272 | |||
273 | kfree(t); | ||
274 | } | ||
275 | |||
276 | void dm_table_get(struct dm_table *t) | ||
277 | { | ||
278 | atomic_inc(&t->holders); | ||
279 | } | ||
280 | |||
281 | void dm_table_put(struct dm_table *t) | ||
282 | { | ||
283 | if (!t) | ||
284 | return; | ||
285 | |||
286 | if (atomic_dec_and_test(&t->holders)) | ||
287 | table_destroy(t); | ||
288 | } | ||
289 | |||
290 | /* | ||
291 | * Checks to see if we need to extend highs or targets. | ||
292 | */ | ||
293 | static inline int check_space(struct dm_table *t) | ||
294 | { | ||
295 | if (t->num_targets >= t->num_allocated) | ||
296 | return alloc_targets(t, t->num_allocated * 2); | ||
297 | |||
298 | return 0; | ||
299 | } | ||
300 | |||
301 | /* | ||
302 | * Convert a device path to a dev_t. | ||
303 | */ | ||
304 | static int lookup_device(const char *path, dev_t *dev) | ||
305 | { | ||
306 | int r; | ||
307 | struct nameidata nd; | ||
308 | struct inode *inode; | ||
309 | |||
310 | if ((r = path_lookup(path, LOOKUP_FOLLOW, &nd))) | ||
311 | return r; | ||
312 | |||
313 | inode = nd.dentry->d_inode; | ||
314 | if (!inode) { | ||
315 | r = -ENOENT; | ||
316 | goto out; | ||
317 | } | ||
318 | |||
319 | if (!S_ISBLK(inode->i_mode)) { | ||
320 | r = -ENOTBLK; | ||
321 | goto out; | ||
322 | } | ||
323 | |||
324 | *dev = inode->i_rdev; | ||
325 | |||
326 | out: | ||
327 | path_release(&nd); | ||
328 | return r; | ||
329 | } | ||
330 | |||
331 | /* | ||
332 | * See if we've already got a device in the list. | ||
333 | */ | ||
334 | static struct dm_dev *find_device(struct list_head *l, dev_t dev) | ||
335 | { | ||
336 | struct dm_dev *dd; | ||
337 | |||
338 | list_for_each_entry (dd, l, list) | ||
339 | if (dd->bdev->bd_dev == dev) | ||
340 | return dd; | ||
341 | |||
342 | return NULL; | ||
343 | } | ||
344 | |||
345 | /* | ||
346 | * Open a device so we can use it as a map destination. | ||
347 | */ | ||
348 | static int open_dev(struct dm_dev *d, dev_t dev) | ||
349 | { | ||
350 | static char *_claim_ptr = "I belong to device-mapper"; | ||
351 | struct block_device *bdev; | ||
352 | |||
353 | int r; | ||
354 | |||
355 | if (d->bdev) | ||
356 | BUG(); | ||
357 | |||
358 | bdev = open_by_devnum(dev, d->mode); | ||
359 | if (IS_ERR(bdev)) | ||
360 | return PTR_ERR(bdev); | ||
361 | r = bd_claim(bdev, _claim_ptr); | ||
362 | if (r) | ||
363 | blkdev_put(bdev); | ||
364 | else | ||
365 | d->bdev = bdev; | ||
366 | return r; | ||
367 | } | ||
368 | |||
369 | /* | ||
370 | * Close a device that we've been using. | ||
371 | */ | ||
372 | static void close_dev(struct dm_dev *d) | ||
373 | { | ||
374 | if (!d->bdev) | ||
375 | return; | ||
376 | |||
377 | bd_release(d->bdev); | ||
378 | blkdev_put(d->bdev); | ||
379 | d->bdev = NULL; | ||
380 | } | ||
381 | |||
382 | /* | ||
383 | * Checks that an area (start, len) of a destination device is | ||
384 | * valid, i.e. lies within the device's current size. | ||
385 | */ | ||
386 | static int check_device_area(struct dm_dev *dd, sector_t start, sector_t len) | ||
387 | { | ||
388 | sector_t dev_size; | ||
389 | dev_size = dd->bdev->bd_inode->i_size >> SECTOR_SHIFT; | ||
390 | return ((start < dev_size) && (len <= (dev_size - start))); | ||
391 | } | ||
392 | |||
393 | /* | ||
394 | * This upgrades the mode on an already open dm_dev, taking care | ||
395 | * to leave things as they were if we fail to reopen the | ||
396 | * device. | ||
397 | */ | ||
398 | static int upgrade_mode(struct dm_dev *dd, int new_mode) | ||
399 | { | ||
400 | int r; | ||
401 | struct dm_dev dd_copy; | ||
402 | dev_t dev = dd->bdev->bd_dev; | ||
403 | |||
404 | dd_copy = *dd; | ||
405 | |||
406 | dd->mode |= new_mode; | ||
407 | dd->bdev = NULL; | ||
408 | r = open_dev(dd, dev); | ||
409 | if (!r) | ||
410 | close_dev(&dd_copy); | ||
411 | else | ||
412 | *dd = dd_copy; | ||
413 | |||
414 | return r; | ||
415 | } | ||
416 | |||
417 | /* | ||
418 | * Add a device to the list, or just increment the usage count if | ||
419 | * it's already present. | ||
420 | */ | ||
421 | static int __table_get_device(struct dm_table *t, struct dm_target *ti, | ||
422 | const char *path, sector_t start, sector_t len, | ||
423 | int mode, struct dm_dev **result) | ||
424 | { | ||
425 | int r; | ||
426 | dev_t dev; | ||
427 | struct dm_dev *dd; | ||
428 | unsigned int major, minor; | ||
429 | |||
430 | if (!t) | ||
431 | BUG(); | ||
432 | |||
433 | if (sscanf(path, "%u:%u", &major, &minor) == 2) { | ||
434 | /* Extract the major/minor numbers */ | ||
435 | dev = MKDEV(major, minor); | ||
436 | if (MAJOR(dev) != major || MINOR(dev) != minor) | ||
437 | return -EOVERFLOW; | ||
438 | } else { | ||
439 | /* convert the path to a device */ | ||
440 | if ((r = lookup_device(path, &dev))) | ||
441 | return r; | ||
442 | } | ||
443 | |||
444 | dd = find_device(&t->devices, dev); | ||
445 | if (!dd) { | ||
446 | dd = kmalloc(sizeof(*dd), GFP_KERNEL); | ||
447 | if (!dd) | ||
448 | return -ENOMEM; | ||
449 | |||
450 | dd->mode = mode; | ||
451 | dd->bdev = NULL; | ||
452 | |||
453 | if ((r = open_dev(dd, dev))) { | ||
454 | kfree(dd); | ||
455 | return r; | ||
456 | } | ||
457 | |||
458 | format_dev_t(dd->name, dev); | ||
459 | |||
460 | atomic_set(&dd->count, 0); | ||
461 | list_add(&dd->list, &t->devices); | ||
462 | |||
463 | } else if (dd->mode != (mode | dd->mode)) { | ||
464 | r = upgrade_mode(dd, mode); | ||
465 | if (r) | ||
466 | return r; | ||
467 | } | ||
468 | atomic_inc(&dd->count); | ||
469 | |||
470 | if (!check_device_area(dd, start, len)) { | ||
471 | DMWARN("device %s too small for target", path); | ||
472 | dm_put_device(ti, dd); | ||
473 | return -EINVAL; | ||
474 | } | ||
475 | |||
476 | *result = dd; | ||
477 | |||
478 | return 0; | ||
479 | } | ||
480 | |||
481 | |||
482 | int dm_get_device(struct dm_target *ti, const char *path, sector_t start, | ||
483 | sector_t len, int mode, struct dm_dev **result) | ||
484 | { | ||
485 | int r = __table_get_device(ti->table, ti, path, | ||
486 | start, len, mode, result); | ||
487 | if (!r) { | ||
488 | request_queue_t *q = bdev_get_queue((*result)->bdev); | ||
489 | struct io_restrictions *rs = &ti->limits; | ||
490 | |||
491 | /* | ||
492 | * Combine this device's queue limits into the target's restrictions. | ||
493 | * | ||
494 | * FIXME: if we move an io_restriction struct | ||
495 | * into q this would just be a call to | ||
496 | * combine_restrictions_low() | ||
497 | */ | ||
498 | rs->max_sectors = | ||
499 | min_not_zero(rs->max_sectors, q->max_sectors); | ||
500 | |||
501 | /* FIXME: Device-Mapper on top of RAID-0 breaks because DM | ||
502 | * currently doesn't honor MD's merge_bvec_fn routine. | ||
503 | * In this case, we'll force DM to use PAGE_SIZE or | ||
504 | * smaller I/O, just to be safe. A better fix is in the | ||
505 | * works, but add this for the time being so it will at | ||
506 | * least operate correctly. | ||
507 | */ | ||
508 | if (q->merge_bvec_fn) | ||
509 | rs->max_sectors = | ||
510 | min_not_zero(rs->max_sectors, | ||
511 | (unsigned short)(PAGE_SIZE >> 9)); | ||
512 | |||
513 | rs->max_phys_segments = | ||
514 | min_not_zero(rs->max_phys_segments, | ||
515 | q->max_phys_segments); | ||
516 | |||
517 | rs->max_hw_segments = | ||
518 | min_not_zero(rs->max_hw_segments, q->max_hw_segments); | ||
519 | |||
520 | rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size); | ||
521 | |||
522 | rs->max_segment_size = | ||
523 | min_not_zero(rs->max_segment_size, q->max_segment_size); | ||
524 | |||
525 | rs->seg_boundary_mask = | ||
526 | min_not_zero(rs->seg_boundary_mask, | ||
527 | q->seg_boundary_mask); | ||
528 | } | ||
529 | |||
530 | return r; | ||
531 | } | ||
532 | |||
533 | /* | ||
534 | * Decrement a device's use count and remove it if necessary. | ||
535 | */ | ||
536 | void dm_put_device(struct dm_target *ti, struct dm_dev *dd) | ||
537 | { | ||
538 | if (atomic_dec_and_test(&dd->count)) { | ||
539 | close_dev(dd); | ||
540 | list_del(&dd->list); | ||
541 | kfree(dd); | ||
542 | } | ||
543 | } | ||
544 | |||
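The pair of calls above (dm_get_device() in a target constructor, dm_put_device() in its destructor) is the intended usage pattern. Below is a minimal, hypothetical constructor/destructor sketch showing that pattern; the example_* names are made up and this is an illustration, not code from this patch:

static int example_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct dm_dev *dev;
	int r;

	if (argc != 1) {
		ti->error = "example: exactly one device path required";
		return -EINVAL;
	}

	/* Open argv[0] for the whole target range, honouring the table mode. */
	r = dm_get_device(ti, argv[0], 0, ti->len,
			  dm_table_get_mode(ti->table), &dev);
	if (r) {
		ti->error = "example: device lookup failed";
		return r;
	}

	ti->private = dev;
	return 0;
}

static void example_dtr(struct dm_target *ti)
{
	/* Drop the reference taken by dm_get_device() in the constructor. */
	dm_put_device(ti, (struct dm_dev *) ti->private);
}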
545 | /* | ||
546 | * Checks to see if the target joins onto the end of the table. | ||
547 | */ | ||
548 | static int adjoin(struct dm_table *table, struct dm_target *ti) | ||
549 | { | ||
550 | struct dm_target *prev; | ||
551 | |||
552 | if (!table->num_targets) | ||
553 | return !ti->begin; | ||
554 | |||
555 | prev = &table->targets[table->num_targets - 1]; | ||
556 | return (ti->begin == (prev->begin + prev->len)); | ||
557 | } | ||
558 | |||
559 | /* | ||
560 | * Used to dynamically allocate the arg array. | ||
561 | */ | ||
562 | static char **realloc_argv(unsigned *array_size, char **old_argv) | ||
563 | { | ||
564 | char **argv; | ||
565 | unsigned new_size; | ||
566 | |||
567 | new_size = *array_size ? *array_size * 2 : 64; | ||
568 | argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL); | ||
569 | if (argv) { | ||
570 | memcpy(argv, old_argv, *array_size * sizeof(*argv)); | ||
571 | *array_size = new_size; | ||
572 | } | ||
573 | |||
574 | kfree(old_argv); | ||
575 | return argv; | ||
576 | } | ||
577 | |||
578 | /* | ||
579 | * Destructively splits up the argument list to pass to ctr. | ||
580 | */ | ||
581 | int dm_split_args(int *argc, char ***argvp, char *input) | ||
582 | { | ||
583 | char *start, *end = input, *out, **argv = NULL; | ||
584 | unsigned array_size = 0; | ||
585 | |||
586 | *argc = 0; | ||
587 | argv = realloc_argv(&array_size, argv); | ||
588 | if (!argv) | ||
589 | return -ENOMEM; | ||
590 | |||
591 | while (1) { | ||
592 | start = end; | ||
593 | |||
594 | /* Skip whitespace */ | ||
595 | while (*start && isspace(*start)) | ||
596 | start++; | ||
597 | |||
598 | if (!*start) | ||
599 | break; /* success, we hit the end */ | ||
600 | |||
601 | /* 'out' is used to strip the backslash escapes */ | ||
602 | end = out = start; | ||
603 | while (*end) { | ||
604 | /* Everything apart from '\0' can be quoted */ | ||
605 | if (*end == '\\' && *(end + 1)) { | ||
606 | *out++ = *(end + 1); | ||
607 | end += 2; | ||
608 | continue; | ||
609 | } | ||
610 | |||
611 | if (isspace(*end)) | ||
612 | break; /* end of token */ | ||
613 | |||
614 | *out++ = *end++; | ||
615 | } | ||
616 | |||
617 | /* have we already filled the array ? */ | ||
618 | if ((*argc + 1) > array_size) { | ||
619 | argv = realloc_argv(&array_size, argv); | ||
620 | if (!argv) | ||
621 | return -ENOMEM; | ||
622 | } | ||
623 | |||
624 | /* we know this is whitespace */ | ||
625 | if (*end) | ||
626 | end++; | ||
627 | |||
628 | /* terminate the string and put it in the array */ | ||
629 | *out = '\0'; | ||
630 | argv[*argc] = start; | ||
631 | (*argc)++; | ||
632 | } | ||
633 | |||
634 | *argvp = argv; | ||
635 | return 0; | ||
636 | } | ||
637 | |||
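Since dm_split_args() modifies the input buffer in place and the returned argv[] points into it, the buffer must stay around until the tokens have been consumed, and the caller kfree()s argv when done (as dm_table_add_target() does below). A hypothetical illustration of the tokenizing and escape rules, not part of this patch:

static void example_split(void)
{
	/* In memory this reads: 254:0 8192 path\ with\ spaces */
	static char params[] = "254:0 8192 path\\ with\\ spaces";
	char **argv;
	int argc;

	if (dm_split_args(&argc, &argv, params))
		return;

	/* argc == 3; argv[2] is "path with spaces" (escapes stripped). */
	kfree(argv);
}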
638 | static void check_for_valid_limits(struct io_restrictions *rs) | ||
639 | { | ||
640 | if (!rs->max_sectors) | ||
641 | rs->max_sectors = MAX_SECTORS; | ||
642 | if (!rs->max_phys_segments) | ||
643 | rs->max_phys_segments = MAX_PHYS_SEGMENTS; | ||
644 | if (!rs->max_hw_segments) | ||
645 | rs->max_hw_segments = MAX_HW_SEGMENTS; | ||
646 | if (!rs->hardsect_size) | ||
647 | rs->hardsect_size = 1 << SECTOR_SHIFT; | ||
648 | if (!rs->max_segment_size) | ||
649 | rs->max_segment_size = MAX_SEGMENT_SIZE; | ||
650 | if (!rs->seg_boundary_mask) | ||
651 | rs->seg_boundary_mask = -1; | ||
652 | } | ||
653 | |||
654 | int dm_table_add_target(struct dm_table *t, const char *type, | ||
655 | sector_t start, sector_t len, char *params) | ||
656 | { | ||
657 | int r = -EINVAL, argc; | ||
658 | char **argv; | ||
659 | struct dm_target *tgt; | ||
660 | |||
661 | if ((r = check_space(t))) | ||
662 | return r; | ||
663 | |||
664 | tgt = t->targets + t->num_targets; | ||
665 | memset(tgt, 0, sizeof(*tgt)); | ||
666 | |||
667 | if (!len) { | ||
668 | tgt->error = "zero-length target"; | ||
669 | DMERR("%s", tgt->error); | ||
670 | return -EINVAL; | ||
671 | } | ||
672 | |||
673 | tgt->type = dm_get_target_type(type); | ||
674 | if (!tgt->type) { | ||
675 | tgt->error = "unknown target type"; | ||
676 | DMERR("%s", tgt->error); | ||
677 | return -EINVAL; | ||
678 | } | ||
679 | |||
680 | tgt->table = t; | ||
681 | tgt->begin = start; | ||
682 | tgt->len = len; | ||
683 | tgt->error = "Unknown error"; | ||
684 | |||
685 | /* | ||
686 | * Does this target adjoin the previous one ? | ||
687 | */ | ||
688 | if (!adjoin(t, tgt)) { | ||
689 | tgt->error = "Gap in table"; | ||
690 | r = -EINVAL; | ||
691 | goto bad; | ||
692 | } | ||
693 | |||
694 | r = dm_split_args(&argc, &argv, params); | ||
695 | if (r) { | ||
696 | tgt->error = "couldn't split parameters (insufficient memory)"; | ||
697 | goto bad; | ||
698 | } | ||
699 | |||
700 | r = tgt->type->ctr(tgt, argc, argv); | ||
701 | kfree(argv); | ||
702 | if (r) | ||
703 | goto bad; | ||
704 | |||
705 | t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; | ||
706 | |||
707 | /* FIXME: the plan is to combine the highs here and then have | ||
708 | * the merge fn apply the target-level restrictions. */ | ||
709 | combine_restrictions_low(&t->limits, &tgt->limits); | ||
710 | return 0; | ||
711 | |||
712 | bad: | ||
713 | DMERR("%s", tgt->error); | ||
714 | dm_put_target_type(tgt->type); | ||
715 | return r; | ||
716 | } | ||
717 | |||
718 | static int setup_indexes(struct dm_table *t) | ||
719 | { | ||
720 | int i; | ||
721 | unsigned int total = 0; | ||
722 | sector_t *indexes; | ||
723 | |||
724 | /* allocate the space for *all* the indexes */ | ||
725 | for (i = t->depth - 2; i >= 0; i--) { | ||
726 | t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE); | ||
727 | total += t->counts[i]; | ||
728 | } | ||
729 | |||
730 | indexes = (sector_t *) dm_vcalloc(total, (unsigned long) NODE_SIZE); | ||
731 | if (!indexes) | ||
732 | return -ENOMEM; | ||
733 | |||
734 | /* set up internal nodes, bottom-up */ | ||
735 | for (i = t->depth - 2, total = 0; i >= 0; i--) { | ||
736 | t->index[i] = indexes; | ||
737 | indexes += (KEYS_PER_NODE * t->counts[i]); | ||
738 | setup_btree_index(i, t); | ||
739 | } | ||
740 | |||
741 | return 0; | ||
742 | } | ||
743 | |||
744 | /* | ||
745 | * Builds the btree to index the map. | ||
746 | */ | ||
747 | int dm_table_complete(struct dm_table *t) | ||
748 | { | ||
749 | int r = 0; | ||
750 | unsigned int leaf_nodes; | ||
751 | |||
752 | check_for_valid_limits(&t->limits); | ||
753 | |||
754 | /* how many indexes will the btree have ? */ | ||
755 | leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); | ||
756 | t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); | ||
757 | |||
758 | /* leaf layer has already been set up */ | ||
759 | t->counts[t->depth - 1] = leaf_nodes; | ||
760 | t->index[t->depth - 1] = t->highs; | ||
761 | |||
762 | if (t->depth >= 2) | ||
763 | r = setup_indexes(t); | ||
764 | |||
765 | return r; | ||
766 | } | ||
767 | |||
768 | static DECLARE_MUTEX(_event_lock); | ||
769 | void dm_table_event_callback(struct dm_table *t, | ||
770 | void (*fn)(void *), void *context) | ||
771 | { | ||
772 | down(&_event_lock); | ||
773 | t->event_fn = fn; | ||
774 | t->event_context = context; | ||
775 | up(&_event_lock); | ||
776 | } | ||
777 | |||
778 | void dm_table_event(struct dm_table *t) | ||
779 | { | ||
780 | /* | ||
781 | * You can no longer call dm_table_event() from interrupt | ||
782 | * context; use a bottom half instead. | ||
783 | */ | ||
784 | BUG_ON(in_interrupt()); | ||
785 | |||
786 | down(&_event_lock); | ||
787 | if (t->event_fn) | ||
788 | t->event_fn(t->event_context); | ||
789 | up(&_event_lock); | ||
790 | } | ||
791 | |||
792 | sector_t dm_table_get_size(struct dm_table *t) | ||
793 | { | ||
794 | return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; | ||
795 | } | ||
796 | |||
797 | struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) | ||
798 | { | ||
799 | if (index >= t->num_targets) | ||
800 | return NULL; | ||
801 | |||
802 | return t->targets + index; | ||
803 | } | ||
804 | |||
805 | /* | ||
806 | * Search the btree for the correct target. | ||
807 | */ | ||
808 | struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) | ||
809 | { | ||
810 | unsigned int l, n = 0, k = 0; | ||
811 | sector_t *node; | ||
812 | |||
813 | for (l = 0; l < t->depth; l++) { | ||
814 | n = get_child(n, k); | ||
815 | node = get_node(t, l, n); | ||
816 | |||
817 | for (k = 0; k < KEYS_PER_NODE; k++) | ||
818 | if (node[k] >= sector) | ||
819 | break; | ||
820 | } | ||
821 | |||
822 | return &t->targets[(KEYS_PER_NODE * n) + k]; | ||
823 | } | ||
824 | |||
825 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) | ||
826 | { | ||
827 | /* | ||
828 | * Make sure we obey the optimistic restrictions gathered | ||
829 | * from the sub-devices. | ||
830 | */ | ||
831 | blk_queue_max_sectors(q, t->limits.max_sectors); | ||
832 | q->max_phys_segments = t->limits.max_phys_segments; | ||
833 | q->max_hw_segments = t->limits.max_hw_segments; | ||
834 | q->hardsect_size = t->limits.hardsect_size; | ||
835 | q->max_segment_size = t->limits.max_segment_size; | ||
836 | q->seg_boundary_mask = t->limits.seg_boundary_mask; | ||
837 | } | ||
838 | |||
839 | unsigned int dm_table_get_num_targets(struct dm_table *t) | ||
840 | { | ||
841 | return t->num_targets; | ||
842 | } | ||
843 | |||
844 | struct list_head *dm_table_get_devices(struct dm_table *t) | ||
845 | { | ||
846 | return &t->devices; | ||
847 | } | ||
848 | |||
849 | int dm_table_get_mode(struct dm_table *t) | ||
850 | { | ||
851 | return t->mode; | ||
852 | } | ||
853 | |||
854 | static void suspend_targets(struct dm_table *t, unsigned postsuspend) | ||
855 | { | ||
856 | int i = t->num_targets; | ||
857 | struct dm_target *ti = t->targets; | ||
858 | |||
859 | while (i--) { | ||
860 | if (postsuspend) { | ||
861 | if (ti->type->postsuspend) | ||
862 | ti->type->postsuspend(ti); | ||
863 | } else if (ti->type->presuspend) | ||
864 | ti->type->presuspend(ti); | ||
865 | |||
866 | ti++; | ||
867 | } | ||
868 | } | ||
869 | |||
870 | void dm_table_presuspend_targets(struct dm_table *t) | ||
871 | { | ||
872 | return suspend_targets(t, 0); | ||
873 | } | ||
874 | |||
875 | void dm_table_postsuspend_targets(struct dm_table *t) | ||
876 | { | ||
877 | return suspend_targets(t, 1); | ||
878 | } | ||
879 | |||
880 | void dm_table_resume_targets(struct dm_table *t) | ||
881 | { | ||
882 | int i; | ||
883 | |||
884 | for (i = 0; i < t->num_targets; i++) { | ||
885 | struct dm_target *ti = t->targets + i; | ||
886 | |||
887 | if (ti->type->resume) | ||
888 | ti->type->resume(ti); | ||
889 | } | ||
890 | } | ||
891 | |||
892 | int dm_table_any_congested(struct dm_table *t, int bdi_bits) | ||
893 | { | ||
894 | struct list_head *d, *devices; | ||
895 | int r = 0; | ||
896 | |||
897 | devices = dm_table_get_devices(t); | ||
898 | for (d = devices->next; d != devices; d = d->next) { | ||
899 | struct dm_dev *dd = list_entry(d, struct dm_dev, list); | ||
900 | request_queue_t *q = bdev_get_queue(dd->bdev); | ||
901 | r |= bdi_congested(&q->backing_dev_info, bdi_bits); | ||
902 | } | ||
903 | |||
904 | return r; | ||
905 | } | ||
906 | |||
907 | void dm_table_unplug_all(struct dm_table *t) | ||
908 | { | ||
909 | struct list_head *d, *devices = dm_table_get_devices(t); | ||
910 | |||
911 | for (d = devices->next; d != devices; d = d->next) { | ||
912 | struct dm_dev *dd = list_entry(d, struct dm_dev, list); | ||
913 | request_queue_t *q = bdev_get_queue(dd->bdev); | ||
914 | |||
915 | if (q->unplug_fn) | ||
916 | q->unplug_fn(q); | ||
917 | } | ||
918 | } | ||
919 | |||
920 | int dm_table_flush_all(struct dm_table *t) | ||
921 | { | ||
922 | struct list_head *d, *devices = dm_table_get_devices(t); | ||
923 | int ret = 0; | ||
924 | |||
925 | for (d = devices->next; d != devices; d = d->next) { | ||
926 | struct dm_dev *dd = list_entry(d, struct dm_dev, list); | ||
927 | request_queue_t *q = bdev_get_queue(dd->bdev); | ||
928 | int err; | ||
929 | |||
930 | if (!q->issue_flush_fn) | ||
931 | err = -EOPNOTSUPP; | ||
932 | else | ||
933 | err = q->issue_flush_fn(q, dd->bdev->bd_disk, NULL); | ||
934 | |||
935 | if (!ret) | ||
936 | ret = err; | ||
937 | } | ||
938 | |||
939 | return ret; | ||
940 | } | ||
941 | |||
942 | EXPORT_SYMBOL(dm_vcalloc); | ||
943 | EXPORT_SYMBOL(dm_get_device); | ||
944 | EXPORT_SYMBOL(dm_put_device); | ||
945 | EXPORT_SYMBOL(dm_table_event); | ||
946 | EXPORT_SYMBOL(dm_table_get_mode); | ||
947 | EXPORT_SYMBOL(dm_table_put); | ||
948 | EXPORT_SYMBOL(dm_table_get); | ||
949 | EXPORT_SYMBOL(dm_table_unplug_all); | ||
950 | EXPORT_SYMBOL(dm_table_flush_all); | ||
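Taken together, dm_table_create(), dm_table_add_target(), dm_table_complete() and dm_table_put() form the table life-cycle; in the real driver this sequence is driven from the ioctl interface. The sketch below is hypothetical (the function name, device numbers and sizes are made up) and only illustrates how a single-target table would be built and indexed:

static int example_build_table(struct dm_table **result)
{
	struct dm_table *t;
	/* "linear" takes <dev> <offset>; params are parsed destructively. */
	char params[] = "253:0 0";
	int r;

	r = dm_table_create(&t, FMODE_READ | FMODE_WRITE, 1);
	if (r)
		return r;

	/* One linear target mapping sectors 0..8191 onto device 253:0. */
	r = dm_table_add_target(t, "linear", 0, 8192, params);
	if (!r)
		r = dm_table_complete(t);	/* build the b-tree index */

	if (r) {
		dm_table_put(t);		/* drops the initial holder */
		return r;
	}

	*result = t;
	return 0;
}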
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c new file mode 100644 index 000000000000..aecd9e0c2616 --- /dev/null +++ b/drivers/md/dm-target.c | |||
@@ -0,0 +1,196 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2001 Sistina Software (UK) Limited | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #include "dm.h" | ||
8 | |||
9 | #include <linux/module.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/kmod.h> | ||
12 | #include <linux/bio.h> | ||
13 | #include <linux/slab.h> | ||
14 | |||
15 | struct tt_internal { | ||
16 | struct target_type tt; | ||
17 | |||
18 | struct list_head list; | ||
19 | long use; | ||
20 | }; | ||
21 | |||
22 | static LIST_HEAD(_targets); | ||
23 | static DECLARE_RWSEM(_lock); | ||
24 | |||
25 | #define DM_MOD_NAME_SIZE 32 | ||
26 | |||
27 | static inline struct tt_internal *__find_target_type(const char *name) | ||
28 | { | ||
29 | struct tt_internal *ti; | ||
30 | |||
31 | list_for_each_entry (ti, &_targets, list) | ||
32 | if (!strcmp(name, ti->tt.name)) | ||
33 | return ti; | ||
34 | |||
35 | return NULL; | ||
36 | } | ||
37 | |||
38 | static struct tt_internal *get_target_type(const char *name) | ||
39 | { | ||
40 | struct tt_internal *ti; | ||
41 | |||
42 | down_read(&_lock); | ||
43 | |||
44 | ti = __find_target_type(name); | ||
45 | if (ti) { | ||
46 | if ((ti->use == 0) && !try_module_get(ti->tt.module)) | ||
47 | ti = NULL; | ||
48 | else | ||
49 | ti->use++; | ||
50 | } | ||
51 | |||
52 | up_read(&_lock); | ||
53 | return ti; | ||
54 | } | ||
55 | |||
56 | static void load_module(const char *name) | ||
57 | { | ||
58 | request_module("dm-%s", name); | ||
59 | } | ||
60 | |||
61 | struct target_type *dm_get_target_type(const char *name) | ||
62 | { | ||
63 | struct tt_internal *ti = get_target_type(name); | ||
64 | |||
65 | if (!ti) { | ||
66 | load_module(name); | ||
67 | ti = get_target_type(name); | ||
68 | } | ||
69 | |||
70 | return ti ? &ti->tt : NULL; | ||
71 | } | ||
72 | |||
73 | void dm_put_target_type(struct target_type *t) | ||
74 | { | ||
75 | struct tt_internal *ti = (struct tt_internal *) t; | ||
76 | |||
77 | down_read(&_lock); | ||
78 | if (--ti->use == 0) | ||
79 | module_put(ti->tt.module); | ||
80 | |||
81 | if (ti->use < 0) | ||
82 | BUG(); | ||
83 | up_read(&_lock); | ||
84 | |||
85 | return; | ||
86 | } | ||
87 | |||
88 | static struct tt_internal *alloc_target(struct target_type *t) | ||
89 | { | ||
90 | struct tt_internal *ti = kmalloc(sizeof(*ti), GFP_KERNEL); | ||
91 | |||
92 | if (ti) { | ||
93 | memset(ti, 0, sizeof(*ti)); | ||
94 | ti->tt = *t; | ||
95 | } | ||
96 | |||
97 | return ti; | ||
98 | } | ||
99 | |||
100 | |||
101 | int dm_target_iterate(void (*iter_func)(struct target_type *tt, | ||
102 | void *param), void *param) | ||
103 | { | ||
104 | struct tt_internal *ti; | ||
105 | |||
106 | down_read(&_lock); | ||
107 | list_for_each_entry (ti, &_targets, list) | ||
108 | iter_func(&ti->tt, param); | ||
109 | up_read(&_lock); | ||
110 | |||
111 | return 0; | ||
112 | } | ||
113 | |||
114 | int dm_register_target(struct target_type *t) | ||
115 | { | ||
116 | int rv = 0; | ||
117 | struct tt_internal *ti = alloc_target(t); | ||
118 | |||
119 | if (!ti) | ||
120 | return -ENOMEM; | ||
121 | |||
122 | down_write(&_lock); | ||
123 | if (__find_target_type(t->name)) | ||
124 | rv = -EEXIST; | ||
125 | else | ||
126 | list_add(&ti->list, &_targets); | ||
127 | |||
128 | up_write(&_lock); | ||
129 | if (rv) | ||
130 | kfree(ti); | ||
131 | return rv; | ||
132 | } | ||
133 | |||
134 | int dm_unregister_target(struct target_type *t) | ||
135 | { | ||
136 | struct tt_internal *ti; | ||
137 | |||
138 | down_write(&_lock); | ||
139 | if (!(ti = __find_target_type(t->name))) { | ||
140 | up_write(&_lock); | ||
141 | return -EINVAL; | ||
142 | } | ||
143 | |||
144 | if (ti->use) { | ||
145 | up_write(&_lock); | ||
146 | return -ETXTBSY; | ||
147 | } | ||
148 | |||
149 | list_del(&ti->list); | ||
150 | kfree(ti); | ||
151 | |||
152 | up_write(&_lock); | ||
153 | return 0; | ||
154 | } | ||
155 | |||
156 | /* | ||
157 | * io-err: always fails an io, useful for bringing | ||
158 | * up LVs that have holes in them. | ||
159 | */ | ||
160 | static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args) | ||
161 | { | ||
162 | return 0; | ||
163 | } | ||
164 | |||
165 | static void io_err_dtr(struct dm_target *ti) | ||
166 | { | ||
167 | /* empty */ | ||
168 | } | ||
169 | |||
170 | static int io_err_map(struct dm_target *ti, struct bio *bio, | ||
171 | union map_info *map_context) | ||
172 | { | ||
173 | return -EIO; | ||
174 | } | ||
175 | |||
176 | static struct target_type error_target = { | ||
177 | .name = "error", | ||
178 | .version = {1, 0, 1}, | ||
179 | .ctr = io_err_ctr, | ||
180 | .dtr = io_err_dtr, | ||
181 | .map = io_err_map, | ||
182 | }; | ||
183 | |||
184 | int __init dm_target_init(void) | ||
185 | { | ||
186 | return dm_register_target(&error_target); | ||
187 | } | ||
188 | |||
189 | void dm_target_exit(void) | ||
190 | { | ||
191 | if (dm_unregister_target(&error_target)) | ||
192 | DMWARN("error target unregistration failed"); | ||
193 | } | ||
194 | |||
195 | EXPORT_SYMBOL(dm_register_target); | ||
196 | EXPORT_SYMBOL(dm_unregister_target); | ||
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c new file mode 100644 index 000000000000..7febc2cac73d --- /dev/null +++ b/drivers/md/dm-zero.c | |||
@@ -0,0 +1,81 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Christophe Saout <christophe@saout.de> | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #include "dm.h" | ||
8 | |||
9 | #include <linux/module.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/bio.h> | ||
12 | |||
13 | /* | ||
14 | * Construct a dummy mapping that only returns zeros | ||
15 | */ | ||
16 | static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv) | ||
17 | { | ||
18 | if (argc != 0) { | ||
19 | ti->error = "dm-zero: No arguments required"; | ||
20 | return -EINVAL; | ||
21 | } | ||
22 | |||
23 | return 0; | ||
24 | } | ||
25 | |||
26 | /* | ||
27 | * Return zeros only on reads | ||
28 | */ | ||
29 | static int zero_map(struct dm_target *ti, struct bio *bio, | ||
30 | union map_info *map_context) | ||
31 | { | ||
32 | switch(bio_rw(bio)) { | ||
33 | case READ: | ||
34 | zero_fill_bio(bio); | ||
35 | break; | ||
36 | case READA: | ||
37 | /* readahead of null bytes only wastes buffer cache */ | ||
38 | return -EIO; | ||
39 | case WRITE: | ||
40 | /* writes get silently dropped */ | ||
41 | break; | ||
42 | } | ||
43 | |||
44 | bio_endio(bio, bio->bi_size, 0); | ||
45 | |||
46 | /* accepted bio, don't make new request */ | ||
47 | return 0; | ||
48 | } | ||
49 | |||
50 | static struct target_type zero_target = { | ||
51 | .name = "zero", | ||
52 | .version = {1, 0, 0}, | ||
53 | .module = THIS_MODULE, | ||
54 | .ctr = zero_ctr, | ||
55 | .map = zero_map, | ||
56 | }; | ||
57 | |||
58 | int __init dm_zero_init(void) | ||
59 | { | ||
60 | int r = dm_register_target(&zero_target); | ||
61 | |||
62 | if (r < 0) | ||
63 | DMERR("zero: register failed %d", r); | ||
64 | |||
65 | return r; | ||
66 | } | ||
67 | |||
68 | void __exit dm_zero_exit(void) | ||
69 | { | ||
70 | int r = dm_unregister_target(&zero_target); | ||
71 | |||
72 | if (r < 0) | ||
73 | DMERR("zero: unregister failed %d", r); | ||
74 | } | ||
75 | |||
76 | module_init(dm_zero_init) | ||
77 | module_exit(dm_zero_exit) | ||
78 | |||
79 | MODULE_AUTHOR("Christophe Saout <christophe@saout.de>"); | ||
80 | MODULE_DESCRIPTION(DM_NAME " dummy target returning zeros"); | ||
81 | MODULE_LICENSE("GPL"); | ||
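From user space, a device based on this target is typically created with the dmsetup tool from the device-mapper user-space package; for example, something like

	echo "0 2097152 zero" | dmsetup create zero0

should yield a 1 GiB block device (2097152 512-byte sectors) that reads back zeros and silently discards writes. The device name and size here are purely illustrative.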
diff --git a/drivers/md/dm.c b/drivers/md/dm.c new file mode 100644 index 000000000000..243ff6884e83 --- /dev/null +++ b/drivers/md/dm.c | |||
@@ -0,0 +1,1194 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. | ||
3 | * Copyright (C) 2004 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * This file is released under the GPL. | ||
6 | */ | ||
7 | |||
8 | #include "dm.h" | ||
9 | #include "dm-bio-list.h" | ||
10 | |||
11 | #include <linux/init.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/moduleparam.h> | ||
14 | #include <linux/blkpg.h> | ||
15 | #include <linux/bio.h> | ||
16 | #include <linux/buffer_head.h> | ||
17 | #include <linux/mempool.h> | ||
18 | #include <linux/slab.h> | ||
19 | #include <linux/idr.h> | ||
20 | |||
21 | static const char *_name = DM_NAME; | ||
22 | |||
23 | static unsigned int major = 0; | ||
24 | static unsigned int _major = 0; | ||
25 | |||
26 | /* | ||
27 | * One of these is allocated per bio. | ||
28 | */ | ||
29 | struct dm_io { | ||
30 | struct mapped_device *md; | ||
31 | int error; | ||
32 | struct bio *bio; | ||
33 | atomic_t io_count; | ||
34 | }; | ||
35 | |||
36 | /* | ||
37 | * One of these is allocated per target within a bio. Hopefully | ||
38 | * this will be simplified out one day. | ||
39 | */ | ||
40 | struct target_io { | ||
41 | struct dm_io *io; | ||
42 | struct dm_target *ti; | ||
43 | union map_info info; | ||
44 | }; | ||
45 | |||
46 | union map_info *dm_get_mapinfo(struct bio *bio) | ||
47 | { | ||
48 | if (bio && bio->bi_private) | ||
49 | return &((struct target_io *)bio->bi_private)->info; | ||
50 | return NULL; | ||
51 | } | ||
52 | |||
53 | /* | ||
54 | * Bits for the md->flags field. | ||
55 | */ | ||
56 | #define DMF_BLOCK_IO 0 | ||
57 | #define DMF_SUSPENDED 1 | ||
58 | #define DMF_FS_LOCKED 2 | ||
59 | |||
60 | struct mapped_device { | ||
61 | struct rw_semaphore lock; | ||
62 | rwlock_t map_lock; | ||
63 | atomic_t holders; | ||
64 | |||
65 | unsigned long flags; | ||
66 | |||
67 | request_queue_t *queue; | ||
68 | struct gendisk *disk; | ||
69 | |||
70 | void *interface_ptr; | ||
71 | |||
72 | /* | ||
73 | * A list of ios that arrived while we were suspended. | ||
74 | */ | ||
75 | atomic_t pending; | ||
76 | wait_queue_head_t wait; | ||
77 | struct bio_list deferred; | ||
78 | |||
79 | /* | ||
80 | * The current mapping. | ||
81 | */ | ||
82 | struct dm_table *map; | ||
83 | |||
84 | /* | ||
85 | * io objects are allocated from here. | ||
86 | */ | ||
87 | mempool_t *io_pool; | ||
88 | mempool_t *tio_pool; | ||
89 | |||
90 | /* | ||
91 | * Event handling. | ||
92 | */ | ||
93 | atomic_t event_nr; | ||
94 | wait_queue_head_t eventq; | ||
95 | |||
96 | /* | ||
97 | * freeze/thaw support requires holding onto a super block | ||
98 | */ | ||
99 | struct super_block *frozen_sb; | ||
100 | }; | ||
101 | |||
102 | #define MIN_IOS 256 | ||
103 | static kmem_cache_t *_io_cache; | ||
104 | static kmem_cache_t *_tio_cache; | ||
105 | |||
106 | static struct bio_set *dm_set; | ||
107 | |||
108 | static int __init local_init(void) | ||
109 | { | ||
110 | int r; | ||
111 | |||
112 | dm_set = bioset_create(16, 16, 4); | ||
113 | if (!dm_set) | ||
114 | return -ENOMEM; | ||
115 | |||
116 | /* allocate a slab for the dm_ios */ | ||
117 | _io_cache = kmem_cache_create("dm_io", | ||
118 | sizeof(struct dm_io), 0, 0, NULL, NULL); | ||
119 | if (!_io_cache) | ||
120 | return -ENOMEM; | ||
121 | |||
122 | /* allocate a slab for the target ios */ | ||
123 | _tio_cache = kmem_cache_create("dm_tio", sizeof(struct target_io), | ||
124 | 0, 0, NULL, NULL); | ||
125 | if (!_tio_cache) { | ||
126 | kmem_cache_destroy(_io_cache); | ||
127 | return -ENOMEM; | ||
128 | } | ||
129 | |||
130 | _major = major; | ||
131 | r = register_blkdev(_major, _name); | ||
132 | if (r < 0) { | ||
133 | kmem_cache_destroy(_tio_cache); | ||
134 | kmem_cache_destroy(_io_cache); | ||
135 | return r; | ||
136 | } | ||
137 | |||
138 | if (!_major) | ||
139 | _major = r; | ||
140 | |||
141 | return 0; | ||
142 | } | ||
143 | |||
144 | static void local_exit(void) | ||
145 | { | ||
146 | kmem_cache_destroy(_tio_cache); | ||
147 | kmem_cache_destroy(_io_cache); | ||
148 | |||
149 | bioset_free(dm_set); | ||
150 | |||
151 | if (unregister_blkdev(_major, _name) < 0) | ||
152 | DMERR("devfs_unregister_blkdev failed"); | ||
153 | |||
154 | _major = 0; | ||
155 | |||
156 | DMINFO("cleaned up"); | ||
157 | } | ||
158 | |||
159 | int (*_inits[])(void) __initdata = { | ||
160 | local_init, | ||
161 | dm_target_init, | ||
162 | dm_linear_init, | ||
163 | dm_stripe_init, | ||
164 | dm_interface_init, | ||
165 | }; | ||
166 | |||
167 | void (*_exits[])(void) = { | ||
168 | local_exit, | ||
169 | dm_target_exit, | ||
170 | dm_linear_exit, | ||
171 | dm_stripe_exit, | ||
172 | dm_interface_exit, | ||
173 | }; | ||
174 | |||
175 | static int __init dm_init(void) | ||
176 | { | ||
177 | const int count = ARRAY_SIZE(_inits); | ||
178 | |||
179 | int r, i; | ||
180 | |||
181 | for (i = 0; i < count; i++) { | ||
182 | r = _inits[i](); | ||
183 | if (r) | ||
184 | goto bad; | ||
185 | } | ||
186 | |||
187 | return 0; | ||
188 | |||
189 | bad: | ||
190 | while (i--) | ||
191 | _exits[i](); | ||
192 | |||
193 | return r; | ||
194 | } | ||
195 | |||
196 | static void __exit dm_exit(void) | ||
197 | { | ||
198 | int i = ARRAY_SIZE(_exits); | ||
199 | |||
200 | while (i--) | ||
201 | _exits[i](); | ||
202 | } | ||
203 | |||
204 | /* | ||
205 | * Block device functions | ||
206 | */ | ||
207 | static int dm_blk_open(struct inode *inode, struct file *file) | ||
208 | { | ||
209 | struct mapped_device *md; | ||
210 | |||
211 | md = inode->i_bdev->bd_disk->private_data; | ||
212 | dm_get(md); | ||
213 | return 0; | ||
214 | } | ||
215 | |||
216 | static int dm_blk_close(struct inode *inode, struct file *file) | ||
217 | { | ||
218 | struct mapped_device *md; | ||
219 | |||
220 | md = inode->i_bdev->bd_disk->private_data; | ||
221 | dm_put(md); | ||
222 | return 0; | ||
223 | } | ||
224 | |||
225 | static inline struct dm_io *alloc_io(struct mapped_device *md) | ||
226 | { | ||
227 | return mempool_alloc(md->io_pool, GFP_NOIO); | ||
228 | } | ||
229 | |||
230 | static inline void free_io(struct mapped_device *md, struct dm_io *io) | ||
231 | { | ||
232 | mempool_free(io, md->io_pool); | ||
233 | } | ||
234 | |||
235 | static inline struct target_io *alloc_tio(struct mapped_device *md) | ||
236 | { | ||
237 | return mempool_alloc(md->tio_pool, GFP_NOIO); | ||
238 | } | ||
239 | |||
240 | static inline void free_tio(struct mapped_device *md, struct target_io *tio) | ||
241 | { | ||
242 | mempool_free(tio, md->tio_pool); | ||
243 | } | ||
244 | |||
245 | /* | ||
246 | * Add the bio to the list of deferred io. | ||
247 | */ | ||
248 | static int queue_io(struct mapped_device *md, struct bio *bio) | ||
249 | { | ||
250 | down_write(&md->lock); | ||
251 | |||
252 | if (!test_bit(DMF_BLOCK_IO, &md->flags)) { | ||
253 | up_write(&md->lock); | ||
254 | return 1; | ||
255 | } | ||
256 | |||
257 | bio_list_add(&md->deferred, bio); | ||
258 | |||
259 | up_write(&md->lock); | ||
260 | return 0; /* deferred successfully */ | ||
261 | } | ||
262 | |||
263 | /* | ||
264 | * Everyone (including functions in this file) should use this | ||
265 | * function to access the md->map field, and make sure they call | ||
266 | * dm_table_put() when finished. | ||
267 | */ | ||
268 | struct dm_table *dm_get_table(struct mapped_device *md) | ||
269 | { | ||
270 | struct dm_table *t; | ||
271 | |||
272 | read_lock(&md->map_lock); | ||
273 | t = md->map; | ||
274 | if (t) | ||
275 | dm_table_get(t); | ||
276 | read_unlock(&md->map_lock); | ||
277 | |||
278 | return t; | ||
279 | } | ||
280 | |||
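A minimal sketch of the reference-counting rule stated above: take the table with dm_get_table() and drop it with dm_table_put() once you are finished with it. The helper name is hypothetical; only the get/put pattern matters:

static sector_t example_device_size(struct mapped_device *md)
{
	struct dm_table *map = dm_get_table(md);
	sector_t size = 0;

	if (map) {
		size = dm_table_get_size(map);
		dm_table_put(map);
	}

	return size;
}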
281 | /*----------------------------------------------------------------- | ||
282 | * CRUD START: | ||
283 | * A more elegant solution that uses the queue merge fn is in | ||
284 | * the works; unfortunately it needs a couple of changes to | ||
285 | * the block layer that I still want to make. So, in the | ||
286 | * interests of getting something for people to use, I give | ||
287 | * you this clearly demarcated crap. | ||
288 | *---------------------------------------------------------------*/ | ||
289 | |||
290 | /* | ||
291 | * Decrements the number of outstanding ios that a bio has been | ||
292 | * cloned into, completing the original io if necessary. | ||
293 | */ | ||
294 | static inline void dec_pending(struct dm_io *io, int error) | ||
295 | { | ||
296 | if (error) | ||
297 | io->error = error; | ||
298 | |||
299 | if (atomic_dec_and_test(&io->io_count)) { | ||
300 | if (atomic_dec_and_test(&io->md->pending)) | ||
301 | /* nudge anyone waiting on suspend queue */ | ||
302 | wake_up(&io->md->wait); | ||
303 | |||
304 | bio_endio(io->bio, io->bio->bi_size, io->error); | ||
305 | free_io(io->md, io); | ||
306 | } | ||
307 | } | ||
308 | |||
309 | static int clone_endio(struct bio *bio, unsigned int done, int error) | ||
310 | { | ||
311 | int r = 0; | ||
312 | struct target_io *tio = bio->bi_private; | ||
313 | struct dm_io *io = tio->io; | ||
314 | dm_endio_fn endio = tio->ti->type->end_io; | ||
315 | |||
316 | if (bio->bi_size) | ||
317 | return 1; | ||
318 | |||
319 | if (!bio_flagged(bio, BIO_UPTODATE) && !error) | ||
320 | error = -EIO; | ||
321 | |||
322 | if (endio) { | ||
323 | r = endio(tio->ti, bio, error, &tio->info); | ||
324 | if (r < 0) | ||
325 | error = r; | ||
326 | |||
327 | else if (r > 0) | ||
328 | /* the target wants another shot at the io */ | ||
329 | return 1; | ||
330 | } | ||
331 | |||
332 | free_tio(io->md, tio); | ||
333 | dec_pending(io, error); | ||
334 | bio_put(bio); | ||
335 | return r; | ||
336 | } | ||
337 | |||
338 | static sector_t max_io_len(struct mapped_device *md, | ||
339 | sector_t sector, struct dm_target *ti) | ||
340 | { | ||
341 | sector_t offset = sector - ti->begin; | ||
342 | sector_t len = ti->len - offset; | ||
343 | |||
344 | /* | ||
345 | * Does the target need to split even further ? | ||
346 | */ | ||
347 | if (ti->split_io) { | ||
348 | sector_t boundary; | ||
349 | boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) | ||
350 | - offset; | ||
351 | if (len > boundary) | ||
352 | len = boundary; | ||
353 | } | ||
354 | |||
355 | return len; | ||
356 | } | ||
357 | |||
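As a worked example of the boundary arithmetic above (illustrative numbers only, and note the masking assumes split_io is a power of two): with ti->split_io = 8 and an io starting 3 sectors into the target, boundary = ((3 + 8) & ~7) - 3 = 5, so the io is clipped to at most 5 sectors and therefore ends exactly on the next 8-sector split_io boundary.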
358 | static void __map_bio(struct dm_target *ti, struct bio *clone, | ||
359 | struct target_io *tio) | ||
360 | { | ||
361 | int r; | ||
362 | |||
363 | /* | ||
364 | * Sanity checks. | ||
365 | */ | ||
366 | BUG_ON(!clone->bi_size); | ||
367 | |||
368 | clone->bi_end_io = clone_endio; | ||
369 | clone->bi_private = tio; | ||
370 | |||
371 | /* | ||
372 | * Map the clone. If r == 0 we don't need to do | ||
373 | * anything, the target has assumed ownership of | ||
374 | * this io. | ||
375 | */ | ||
376 | atomic_inc(&tio->io->io_count); | ||
377 | r = ti->type->map(ti, clone, &tio->info); | ||
378 | if (r > 0) | ||
379 | /* the bio has been remapped so dispatch it */ | ||
380 | generic_make_request(clone); | ||
381 | |||
382 | else if (r < 0) { | ||
383 | /* error the io and bail out */ | ||
384 | struct dm_io *io = tio->io; | ||
385 | free_tio(tio->io->md, tio); | ||
386 | dec_pending(io, -EIO); | ||
387 | bio_put(clone); | ||
388 | } | ||
389 | } | ||
390 | |||
391 | struct clone_info { | ||
392 | struct mapped_device *md; | ||
393 | struct dm_table *map; | ||
394 | struct bio *bio; | ||
395 | struct dm_io *io; | ||
396 | sector_t sector; | ||
397 | sector_t sector_count; | ||
398 | unsigned short idx; | ||
399 | }; | ||
400 | |||
401 | /* | ||
402 | * Creates a little bio that just does part of a bvec. | ||
403 | */ | ||
404 | static struct bio *split_bvec(struct bio *bio, sector_t sector, | ||
405 | unsigned short idx, unsigned int offset, | ||
406 | unsigned int len) | ||
407 | { | ||
408 | struct bio *clone; | ||
409 | struct bio_vec *bv = bio->bi_io_vec + idx; | ||
410 | |||
411 | clone = bio_alloc_bioset(GFP_NOIO, 1, dm_set); | ||
412 | *clone->bi_io_vec = *bv; | ||
413 | |||
414 | clone->bi_sector = sector; | ||
415 | clone->bi_bdev = bio->bi_bdev; | ||
416 | clone->bi_rw = bio->bi_rw; | ||
417 | clone->bi_vcnt = 1; | ||
418 | clone->bi_size = to_bytes(len); | ||
419 | clone->bi_io_vec->bv_offset = offset; | ||
420 | clone->bi_io_vec->bv_len = clone->bi_size; | ||
421 | |||
422 | return clone; | ||
423 | } | ||
424 | |||
425 | /* | ||
426 | * Creates a bio that consists of a range of complete bvecs. | ||
427 | */ | ||
428 | static struct bio *clone_bio(struct bio *bio, sector_t sector, | ||
429 | unsigned short idx, unsigned short bv_count, | ||
430 | unsigned int len) | ||
431 | { | ||
432 | struct bio *clone; | ||
433 | |||
434 | clone = bio_clone(bio, GFP_NOIO); | ||
435 | clone->bi_sector = sector; | ||
436 | clone->bi_idx = idx; | ||
437 | clone->bi_vcnt = idx + bv_count; | ||
438 | clone->bi_size = to_bytes(len); | ||
439 | clone->bi_flags &= ~(1 << BIO_SEG_VALID); | ||
440 | |||
441 | return clone; | ||
442 | } | ||
443 | |||
444 | static void __clone_and_map(struct clone_info *ci) | ||
445 | { | ||
446 | struct bio *clone, *bio = ci->bio; | ||
447 | struct dm_target *ti = dm_table_find_target(ci->map, ci->sector); | ||
448 | sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti); | ||
449 | struct target_io *tio; | ||
450 | |||
451 | /* | ||
452 | * Allocate a target io object. | ||
453 | */ | ||
454 | tio = alloc_tio(ci->md); | ||
455 | tio->io = ci->io; | ||
456 | tio->ti = ti; | ||
457 | memset(&tio->info, 0, sizeof(tio->info)); | ||
458 | |||
459 | if (ci->sector_count <= max) { | ||
460 | /* | ||
461 | * Optimise for the simple case where we can do all of | ||
462 | * the remaining io with a single clone. | ||
463 | */ | ||
464 | clone = clone_bio(bio, ci->sector, ci->idx, | ||
465 | bio->bi_vcnt - ci->idx, ci->sector_count); | ||
466 | __map_bio(ti, clone, tio); | ||
467 | ci->sector_count = 0; | ||
468 | |||
469 | } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { | ||
470 | /* | ||
471 | * There are some bvecs that don't span targets. | ||
472 | * Do as many of these as possible. | ||
473 | */ | ||
474 | int i; | ||
475 | sector_t remaining = max; | ||
476 | sector_t bv_len; | ||
477 | |||
478 | for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { | ||
479 | bv_len = to_sector(bio->bi_io_vec[i].bv_len); | ||
480 | |||
481 | if (bv_len > remaining) | ||
482 | break; | ||
483 | |||
484 | remaining -= bv_len; | ||
485 | len += bv_len; | ||
486 | } | ||
487 | |||
488 | clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len); | ||
489 | __map_bio(ti, clone, tio); | ||
490 | |||
491 | ci->sector += len; | ||
492 | ci->sector_count -= len; | ||
493 | ci->idx = i; | ||
494 | |||
495 | } else { | ||
496 | /* | ||
497 | * Create two copy bios to deal with io that has | ||
498 | * been split across a target boundary. | ||
499 | */ | ||
500 | struct bio_vec *bv = bio->bi_io_vec + ci->idx; | ||
501 | |||
502 | clone = split_bvec(bio, ci->sector, ci->idx, | ||
503 | bv->bv_offset, max); | ||
504 | __map_bio(ti, clone, tio); | ||
505 | |||
506 | ci->sector += max; | ||
507 | ci->sector_count -= max; | ||
508 | ti = dm_table_find_target(ci->map, ci->sector); | ||
509 | |||
510 | len = to_sector(bv->bv_len) - max; | ||
511 | clone = split_bvec(bio, ci->sector, ci->idx, | ||
512 | bv->bv_offset + to_bytes(max), len); | ||
513 | tio = alloc_tio(ci->md); | ||
514 | tio->io = ci->io; | ||
515 | tio->ti = ti; | ||
516 | memset(&tio->info, 0, sizeof(tio->info)); | ||
517 | __map_bio(ti, clone, tio); | ||
518 | |||
519 | ci->sector += len; | ||
520 | ci->sector_count -= len; | ||
521 | ci->idx++; | ||
522 | } | ||
523 | } | ||
524 | |||
525 | /* | ||
526 | * Split the bio into several clones. | ||
527 | */ | ||
528 | static void __split_bio(struct mapped_device *md, struct bio *bio) | ||
529 | { | ||
530 | struct clone_info ci; | ||
531 | |||
532 | ci.map = dm_get_table(md); | ||
533 | if (!ci.map) { | ||
534 | bio_io_error(bio, bio->bi_size); | ||
535 | return; | ||
536 | } | ||
537 | |||
538 | ci.md = md; | ||
539 | ci.bio = bio; | ||
540 | ci.io = alloc_io(md); | ||
541 | ci.io->error = 0; | ||
542 | atomic_set(&ci.io->io_count, 1); | ||
543 | ci.io->bio = bio; | ||
544 | ci.io->md = md; | ||
545 | ci.sector = bio->bi_sector; | ||
546 | ci.sector_count = bio_sectors(bio); | ||
547 | ci.idx = bio->bi_idx; | ||
548 | |||
549 | atomic_inc(&md->pending); | ||
550 | while (ci.sector_count) | ||
551 | __clone_and_map(&ci); | ||
552 | |||
553 | /* drop the extra reference count */ | ||
554 | dec_pending(ci.io, 0); | ||
555 | dm_table_put(ci.map); | ||
556 | } | ||
557 | /*----------------------------------------------------------------- | ||
558 | * CRUD END | ||
559 | *---------------------------------------------------------------*/ | ||
560 | |||
561 | /* | ||
562 | * The request function that just remaps the bio built up by | ||
563 | * dm_merge_bvec. | ||
564 | */ | ||
565 | static int dm_request(request_queue_t *q, struct bio *bio) | ||
566 | { | ||
567 | int r; | ||
568 | struct mapped_device *md = q->queuedata; | ||
569 | |||
570 | down_read(&md->lock); | ||
571 | |||
572 | /* | ||
573 | * If we're suspended we have to queue | ||
574 | * this io for later. | ||
575 | */ | ||
576 | while (test_bit(DMF_BLOCK_IO, &md->flags)) { | ||
577 | up_read(&md->lock); | ||
578 | |||
579 | if (bio_rw(bio) == READA) { | ||
580 | bio_io_error(bio, bio->bi_size); | ||
581 | return 0; | ||
582 | } | ||
583 | |||
584 | r = queue_io(md, bio); | ||
585 | if (r < 0) { | ||
586 | bio_io_error(bio, bio->bi_size); | ||
587 | return 0; | ||
588 | |||
589 | } else if (r == 0) | ||
590 | return 0; /* deferred successfully */ | ||
591 | |||
592 | /* | ||
593 | * We're in a while loop, because someone could suspend | ||
594 | * before we get to the following read lock. | ||
595 | */ | ||
596 | down_read(&md->lock); | ||
597 | } | ||
598 | |||
599 | __split_bio(md, bio); | ||
600 | up_read(&md->lock); | ||
601 | return 0; | ||
602 | } | ||
603 | |||
604 | static int dm_flush_all(request_queue_t *q, struct gendisk *disk, | ||
605 | sector_t *error_sector) | ||
606 | { | ||
607 | struct mapped_device *md = q->queuedata; | ||
608 | struct dm_table *map = dm_get_table(md); | ||
609 | int ret = -ENXIO; | ||
610 | |||
611 | if (map) { | ||
612 | ret = dm_table_flush_all(map); | ||
613 | dm_table_put(map); | ||
614 | } | ||
615 | |||
616 | return ret; | ||
617 | } | ||
618 | |||
619 | static void dm_unplug_all(request_queue_t *q) | ||
620 | { | ||
621 | struct mapped_device *md = q->queuedata; | ||
622 | struct dm_table *map = dm_get_table(md); | ||
623 | |||
624 | if (map) { | ||
625 | dm_table_unplug_all(map); | ||
626 | dm_table_put(map); | ||
627 | } | ||
628 | } | ||
629 | |||
630 | static int dm_any_congested(void *congested_data, int bdi_bits) | ||
631 | { | ||
632 | int r; | ||
633 | struct mapped_device *md = (struct mapped_device *) congested_data; | ||
634 | struct dm_table *map = dm_get_table(md); | ||
635 | |||
636 | if (!map || test_bit(DMF_BLOCK_IO, &md->flags)) | ||
637 | r = bdi_bits; | ||
638 | else | ||
639 | r = dm_table_any_congested(map, bdi_bits); | ||
640 | |||
641 | dm_table_put(map); | ||
642 | return r; | ||
643 | } | ||
644 | |||
645 | /*----------------------------------------------------------------- | ||
646 | * An IDR is used to keep track of allocated minor numbers. | ||
647 | *---------------------------------------------------------------*/ | ||
648 | static DECLARE_MUTEX(_minor_lock); | ||
649 | static DEFINE_IDR(_minor_idr); | ||
650 | |||
651 | static void free_minor(unsigned int minor) | ||
652 | { | ||
653 | down(&_minor_lock); | ||
654 | idr_remove(&_minor_idr, minor); | ||
655 | up(&_minor_lock); | ||
656 | } | ||
657 | |||
658 | /* | ||
659 | * See if the device with a specific minor # is free. | ||
660 | */ | ||
661 | static int specific_minor(struct mapped_device *md, unsigned int minor) | ||
662 | { | ||
663 | int r, m; | ||
664 | |||
665 | if (minor >= (1 << MINORBITS)) | ||
666 | return -EINVAL; | ||
667 | |||
668 | down(&_minor_lock); | ||
669 | |||
670 | if (idr_find(&_minor_idr, minor)) { | ||
671 | r = -EBUSY; | ||
672 | goto out; | ||
673 | } | ||
674 | |||
675 | r = idr_pre_get(&_minor_idr, GFP_KERNEL); | ||
676 | if (!r) { | ||
677 | r = -ENOMEM; | ||
678 | goto out; | ||
679 | } | ||
680 | |||
681 | r = idr_get_new_above(&_minor_idr, md, minor, &m); | ||
682 | if (r) { | ||
683 | goto out; | ||
684 | } | ||
685 | |||
686 | if (m != minor) { | ||
687 | idr_remove(&_minor_idr, m); | ||
688 | r = -EBUSY; | ||
689 | goto out; | ||
690 | } | ||
691 | |||
692 | out: | ||
693 | up(&_minor_lock); | ||
694 | return r; | ||
695 | } | ||
696 | |||
697 | static int next_free_minor(struct mapped_device *md, unsigned int *minor) | ||
698 | { | ||
699 | int r; | ||
700 | unsigned int m; | ||
701 | |||
702 | down(&_minor_lock); | ||
703 | |||
704 | r = idr_pre_get(&_minor_idr, GFP_KERNEL); | ||
705 | if (!r) { | ||
706 | r = -ENOMEM; | ||
707 | goto out; | ||
708 | } | ||
709 | |||
710 | r = idr_get_new(&_minor_idr, md, &m); | ||
711 | if (r) { | ||
712 | goto out; | ||
713 | } | ||
714 | |||
715 | if (m >= (1 << MINORBITS)) { | ||
716 | idr_remove(&_minor_idr, m); | ||
717 | r = -ENOSPC; | ||
718 | goto out; | ||
719 | } | ||
720 | |||
721 | *minor = m; | ||
722 | |||
723 | out: | ||
724 | up(&_minor_lock); | ||
725 | return r; | ||
726 | } | ||
727 | |||
728 | static struct block_device_operations dm_blk_dops; | ||
729 | |||
730 | /* | ||
731 | * Allocate and initialise a blank device with a given minor. | ||
732 | */ | ||
733 | static struct mapped_device *alloc_dev(unsigned int minor, int persistent) | ||
734 | { | ||
735 | int r; | ||
736 | struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL); | ||
737 | |||
738 | if (!md) { | ||
739 | DMWARN("unable to allocate device, out of memory."); | ||
740 | return NULL; | ||
741 | } | ||
742 | |||
743 | /* get a minor number for the dev */ | ||
744 | r = persistent ? specific_minor(md, minor) : next_free_minor(md, &minor); | ||
745 | if (r < 0) | ||
746 | goto bad1; | ||
747 | |||
748 | memset(md, 0, sizeof(*md)); | ||
749 | init_rwsem(&md->lock); | ||
750 | rwlock_init(&md->map_lock); | ||
751 | atomic_set(&md->holders, 1); | ||
752 | atomic_set(&md->event_nr, 0); | ||
753 | |||
754 | md->queue = blk_alloc_queue(GFP_KERNEL); | ||
755 | if (!md->queue) | ||
756 | goto bad1; | ||
757 | |||
758 | md->queue->queuedata = md; | ||
759 | md->queue->backing_dev_info.congested_fn = dm_any_congested; | ||
760 | md->queue->backing_dev_info.congested_data = md; | ||
761 | blk_queue_make_request(md->queue, dm_request); | ||
762 | md->queue->unplug_fn = dm_unplug_all; | ||
763 | md->queue->issue_flush_fn = dm_flush_all; | ||
764 | |||
765 | md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab, | ||
766 | mempool_free_slab, _io_cache); | ||
767 | if (!md->io_pool) | ||
768 | goto bad2; | ||
769 | |||
770 | md->tio_pool = mempool_create(MIN_IOS, mempool_alloc_slab, | ||
771 | mempool_free_slab, _tio_cache); | ||
772 | if (!md->tio_pool) | ||
773 | goto bad3; | ||
774 | |||
775 | md->disk = alloc_disk(1); | ||
776 | if (!md->disk) | ||
777 | goto bad4; | ||
778 | |||
779 | md->disk->major = _major; | ||
780 | md->disk->first_minor = minor; | ||
781 | md->disk->fops = &dm_blk_dops; | ||
782 | md->disk->queue = md->queue; | ||
783 | md->disk->private_data = md; | ||
784 | sprintf(md->disk->disk_name, "dm-%d", minor); | ||
785 | add_disk(md->disk); | ||
786 | |||
787 | atomic_set(&md->pending, 0); | ||
788 | init_waitqueue_head(&md->wait); | ||
789 | init_waitqueue_head(&md->eventq); | ||
790 | |||
791 | return md; | ||
792 | |||
793 | bad4: | ||
794 | mempool_destroy(md->tio_pool); | ||
795 | bad3: | ||
796 | mempool_destroy(md->io_pool); | ||
797 | bad2: | ||
798 | blk_put_queue(md->queue); | ||
799 | free_minor(minor); | ||
800 | bad1: | ||
801 | kfree(md); | ||
802 | return NULL; | ||
803 | } | ||
804 | |||
805 | static void free_dev(struct mapped_device *md) | ||
806 | { | ||
807 | free_minor(md->disk->first_minor); | ||
808 | mempool_destroy(md->tio_pool); | ||
809 | mempool_destroy(md->io_pool); | ||
810 | del_gendisk(md->disk); | ||
811 | put_disk(md->disk); | ||
812 | blk_put_queue(md->queue); | ||
813 | kfree(md); | ||
814 | } | ||
815 | |||
816 | /* | ||
817 | * Bind a table to the device. | ||
818 | */ | ||
819 | static void event_callback(void *context) | ||
820 | { | ||
821 | struct mapped_device *md = (struct mapped_device *) context; | ||
822 | |||
823 | atomic_inc(&md->event_nr); | ||
824 | wake_up(&md->eventq); | ||
825 | } | ||
826 | |||
827 | static void __set_size(struct gendisk *disk, sector_t size) | ||
828 | { | ||
829 | struct block_device *bdev; | ||
830 | |||
831 | set_capacity(disk, size); | ||
832 | bdev = bdget_disk(disk, 0); | ||
833 | if (bdev) { | ||
834 | down(&bdev->bd_inode->i_sem); | ||
835 | i_size_write(bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); | ||
836 | up(&bdev->bd_inode->i_sem); | ||
837 | bdput(bdev); | ||
838 | } | ||
839 | } | ||
840 | |||
841 | static int __bind(struct mapped_device *md, struct dm_table *t) | ||
842 | { | ||
843 | request_queue_t *q = md->queue; | ||
844 | sector_t size; | ||
845 | |||
846 | size = dm_table_get_size(t); | ||
847 | __set_size(md->disk, size); | ||
848 | if (size == 0) | ||
849 | return 0; | ||
850 | |||
851 | write_lock(&md->map_lock); | ||
852 | md->map = t; | ||
853 | write_unlock(&md->map_lock); | ||
854 | |||
855 | dm_table_get(t); | ||
856 | dm_table_event_callback(md->map, event_callback, md); | ||
857 | dm_table_set_restrictions(t, q); | ||
858 | return 0; | ||
859 | } | ||
860 | |||
861 | static void __unbind(struct mapped_device *md) | ||
862 | { | ||
863 | struct dm_table *map = md->map; | ||
864 | |||
865 | if (!map) | ||
866 | return; | ||
867 | |||
868 | dm_table_event_callback(map, NULL, NULL); | ||
869 | write_lock(&md->map_lock); | ||
870 | md->map = NULL; | ||
871 | write_unlock(&md->map_lock); | ||
872 | dm_table_put(map); | ||
873 | } | ||
874 | |||
875 | /* | ||
876 | * Constructor for a new device. | ||
877 | */ | ||
878 | static int create_aux(unsigned int minor, int persistent, | ||
879 | struct mapped_device **result) | ||
880 | { | ||
881 | struct mapped_device *md; | ||
882 | |||
883 | md = alloc_dev(minor, persistent); | ||
884 | if (!md) | ||
885 | return -ENXIO; | ||
886 | |||
887 | *result = md; | ||
888 | return 0; | ||
889 | } | ||
890 | |||
891 | int dm_create(struct mapped_device **result) | ||
892 | { | ||
893 | return create_aux(0, 0, result); | ||
894 | } | ||
895 | |||
896 | int dm_create_with_minor(unsigned int minor, struct mapped_device **result) | ||
897 | { | ||
898 | return create_aux(minor, 1, result); | ||
899 | } | ||
900 | |||
901 | void *dm_get_mdptr(dev_t dev) | ||
902 | { | ||
903 | struct mapped_device *md; | ||
904 | void *mdptr = NULL; | ||
905 | unsigned minor = MINOR(dev); | ||
906 | |||
907 | if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) | ||
908 | return NULL; | ||
909 | |||
910 | down(&_minor_lock); | ||
911 | |||
912 | md = idr_find(&_minor_idr, minor); | ||
913 | |||
914 | if (md && (dm_disk(md)->first_minor == minor)) | ||
915 | mdptr = md->interface_ptr; | ||
916 | |||
917 | up(&_minor_lock); | ||
918 | |||
919 | return mdptr; | ||
920 | } | ||
921 | |||
922 | void dm_set_mdptr(struct mapped_device *md, void *ptr) | ||
923 | { | ||
924 | md->interface_ptr = ptr; | ||
925 | } | ||
926 | |||
927 | void dm_get(struct mapped_device *md) | ||
928 | { | ||
929 | atomic_inc(&md->holders); | ||
930 | } | ||
931 | |||
932 | void dm_put(struct mapped_device *md) | ||
933 | { | ||
934 | struct dm_table *map = dm_get_table(md); | ||
935 | |||
936 | if (atomic_dec_and_test(&md->holders)) { | ||
937 | if (!test_bit(DMF_SUSPENDED, &md->flags) && map) { | ||
938 | dm_table_presuspend_targets(map); | ||
939 | dm_table_postsuspend_targets(map); | ||
940 | } | ||
941 | __unbind(md); | ||
942 | free_dev(md); | ||
943 | } | ||
944 | |||
945 | dm_table_put(map); | ||
946 | } | ||
947 | |||
948 | /* | ||
949 | * Process the deferred bios | ||
950 | */ | ||
951 | static void __flush_deferred_io(struct mapped_device *md, struct bio *c) | ||
952 | { | ||
953 | struct bio *n; | ||
954 | |||
955 | while (c) { | ||
956 | n = c->bi_next; | ||
957 | c->bi_next = NULL; | ||
958 | __split_bio(md, c); | ||
959 | c = n; | ||
960 | } | ||
961 | } | ||
962 | |||
963 | /* | ||
964 | * Swap in a new table (destroying old one). | ||
965 | */ | ||
966 | int dm_swap_table(struct mapped_device *md, struct dm_table *table) | ||
967 | { | ||
968 | int r; | ||
969 | |||
970 | down_write(&md->lock); | ||
971 | |||
972 | /* device must be suspended */ | ||
973 | if (!test_bit(DMF_SUSPENDED, &md->flags)) { | ||
974 | up_write(&md->lock); | ||
975 | return -EPERM; | ||
976 | } | ||
977 | |||
978 | __unbind(md); | ||
979 | r = __bind(md, table); | ||
980 | |||
981 | /* release the lock before returning, even if __bind() failed */ | ||
982 | up_write(&md->lock); | ||
983 | |||
984 | return r; | ||
985 | } | ||
986 | |||
987 | /* | ||
988 | * Functions to lock and unlock any filesystem running on the | ||
989 | * device. | ||
990 | */ | ||
991 | static int __lock_fs(struct mapped_device *md) | ||
992 | { | ||
993 | struct block_device *bdev; | ||
994 | |||
995 | if (test_and_set_bit(DMF_FS_LOCKED, &md->flags)) | ||
996 | return 0; | ||
997 | |||
998 | bdev = bdget_disk(md->disk, 0); | ||
999 | if (!bdev) { | ||
1000 | DMWARN("bdget failed in __lock_fs"); | ||
1001 | return -ENOMEM; | ||
1002 | } | ||
1003 | |||
1004 | WARN_ON(md->frozen_sb); | ||
1005 | md->frozen_sb = freeze_bdev(bdev); | ||
1006 | /* don't bdput right now, we don't want the bdev | ||
1007 | * to go away while it is locked. We'll bdput | ||
1008 | * in __unlock_fs | ||
1009 | */ | ||
1010 | return 0; | ||
1011 | } | ||
1012 | |||
1013 | static int __unlock_fs(struct mapped_device *md) | ||
1014 | { | ||
1015 | struct block_device *bdev; | ||
1016 | |||
1017 | if (!test_and_clear_bit(DMF_FS_LOCKED, &md->flags)) | ||
1018 | return 0; | ||
1019 | |||
1020 | bdev = bdget_disk(md->disk, 0); | ||
1021 | if (!bdev) { | ||
1022 | DMWARN("bdget failed in __unlock_fs"); | ||
1023 | return -ENOMEM; | ||
1024 | } | ||
1025 | |||
1026 | thaw_bdev(bdev, md->frozen_sb); | ||
1027 | md->frozen_sb = NULL; | ||
1028 | bdput(bdev); | ||
1029 | bdput(bdev); | ||
1030 | return 0; | ||
1031 | } | ||
1032 | |||
1033 | /* | ||
1034 | * We need to be able to change a mapping table under a mounted | ||
1035 | * filesystem. For example we might want to move some data in | ||
1036 | * the background. Before the table can be swapped with | ||
1037 | * dm_swap_table, dm_suspend must be called to flush any in | ||
1038 | * flight bios and ensure that any further io gets deferred. | ||
1039 | */ | ||
1040 | int dm_suspend(struct mapped_device *md) | ||
1041 | { | ||
1042 | struct dm_table *map; | ||
1043 | DECLARE_WAITQUEUE(wait, current); | ||
1044 | |||
1045 | /* Flush I/O to the device. */ | ||
1046 | down_read(&md->lock); | ||
1047 | if (test_bit(DMF_BLOCK_IO, &md->flags)) { | ||
1048 | up_read(&md->lock); | ||
1049 | return -EINVAL; | ||
1050 | } | ||
1051 | |||
1052 | map = dm_get_table(md); | ||
1053 | if (map) | ||
1054 | dm_table_presuspend_targets(map); | ||
1055 | __lock_fs(md); | ||
1056 | |||
1057 | up_read(&md->lock); | ||
1058 | |||
1059 | /* | ||
1060 | * First we set the BLOCK_IO flag so no more ios will be | ||
1061 | * mapped. | ||
1062 | */ | ||
1063 | down_write(&md->lock); | ||
1064 | if (test_bit(DMF_BLOCK_IO, &md->flags)) { | ||
1065 | /* | ||
1066 | * If we get here we know another thread is | ||
1067 | * trying to suspend as well, so we leave the fs | ||
1068 | * locked for this thread. | ||
1069 | */ | ||
1070 | up_write(&md->lock); | ||
1071 | return -EINVAL; | ||
1072 | } | ||
1073 | |||
1074 | set_bit(DMF_BLOCK_IO, &md->flags); | ||
1075 | add_wait_queue(&md->wait, &wait); | ||
1076 | up_write(&md->lock); | ||
1077 | |||
1078 | /* unplug */ | ||
1079 | if (map) { | ||
1080 | dm_table_unplug_all(map); | ||
1081 | dm_table_put(map); | ||
1082 | } | ||
1083 | |||
1084 | /* | ||
1085 | * Then we wait for the already mapped ios to | ||
1086 | * complete. | ||
1087 | */ | ||
1088 | while (1) { | ||
1089 | set_current_state(TASK_INTERRUPTIBLE); | ||
1090 | |||
1091 | if (!atomic_read(&md->pending) || signal_pending(current)) | ||
1092 | break; | ||
1093 | |||
1094 | io_schedule(); | ||
1095 | } | ||
1096 | set_current_state(TASK_RUNNING); | ||
1097 | |||
1098 | down_write(&md->lock); | ||
1099 | remove_wait_queue(&md->wait, &wait); | ||
1100 | |||
1101 | /* were we interrupted? */ | ||
1102 | if (atomic_read(&md->pending)) { | ||
1103 | __unlock_fs(md); | ||
1104 | clear_bit(DMF_BLOCK_IO, &md->flags); | ||
1105 | up_write(&md->lock); | ||
1106 | return -EINTR; | ||
1107 | } | ||
1108 | |||
1109 | set_bit(DMF_SUSPENDED, &md->flags); | ||
1110 | |||
1111 | map = dm_get_table(md); | ||
1112 | if (map) | ||
1113 | dm_table_postsuspend_targets(map); | ||
1114 | dm_table_put(map); | ||
1115 | up_write(&md->lock); | ||
1116 | |||
1117 | return 0; | ||
1118 | } | ||
1119 | |||
1120 | int dm_resume(struct mapped_device *md) | ||
1121 | { | ||
1122 | struct bio *def; | ||
1123 | struct dm_table *map = dm_get_table(md); | ||
1124 | |||
1125 | down_write(&md->lock); | ||
1126 | if (!map || | ||
1127 | !test_bit(DMF_SUSPENDED, &md->flags) || | ||
1128 | !dm_table_get_size(map)) { | ||
1129 | up_write(&md->lock); | ||
1130 | dm_table_put(map); | ||
1131 | return -EINVAL; | ||
1132 | } | ||
1133 | |||
1134 | dm_table_resume_targets(map); | ||
1135 | clear_bit(DMF_SUSPENDED, &md->flags); | ||
1136 | clear_bit(DMF_BLOCK_IO, &md->flags); | ||
1137 | |||
1138 | def = bio_list_get(&md->deferred); | ||
1139 | __flush_deferred_io(md, def); | ||
1140 | up_write(&md->lock); | ||
1141 | __unlock_fs(md); | ||
1142 | dm_table_unplug_all(map); | ||
1143 | dm_table_put(map); | ||
1144 | |||
1145 | return 0; | ||
1146 | } | ||
1147 | |||
1148 | /*----------------------------------------------------------------- | ||
1149 | * Event notification. | ||
1150 | *---------------------------------------------------------------*/ | ||
1151 | uint32_t dm_get_event_nr(struct mapped_device *md) | ||
1152 | { | ||
1153 | return atomic_read(&md->event_nr); | ||
1154 | } | ||
1155 | |||
1156 | int dm_wait_event(struct mapped_device *md, int event_nr) | ||
1157 | { | ||
1158 | return wait_event_interruptible(md->eventq, | ||
1159 | (event_nr != atomic_read(&md->event_nr))); | ||
1160 | } | ||
1161 | |||
1162 | /* | ||
1163 | * The gendisk is only valid as long as you have a reference | ||
1164 | * count on 'md'. | ||
1165 | */ | ||
1166 | struct gendisk *dm_disk(struct mapped_device *md) | ||
1167 | { | ||
1168 | return md->disk; | ||
1169 | } | ||
1170 | |||
1171 | int dm_suspended(struct mapped_device *md) | ||
1172 | { | ||
1173 | return test_bit(DMF_SUSPENDED, &md->flags); | ||
1174 | } | ||
1175 | |||
1176 | static struct block_device_operations dm_blk_dops = { | ||
1177 | .open = dm_blk_open, | ||
1178 | .release = dm_blk_close, | ||
1179 | .owner = THIS_MODULE | ||
1180 | }; | ||
1181 | |||
1182 | EXPORT_SYMBOL(dm_get_mapinfo); | ||
1183 | |||
1184 | /* | ||
1185 | * module hooks | ||
1186 | */ | ||
1187 | module_init(dm_init); | ||
1188 | module_exit(dm_exit); | ||
1189 | |||
1190 | module_param(major, uint, 0); | ||
1191 | MODULE_PARM_DESC(major, "The major number of the device mapper"); | ||
1192 | MODULE_DESCRIPTION(DM_NAME " driver"); | ||
1193 | MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); | ||
1194 | MODULE_LICENSE("GPL"); | ||
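
A minimal caller-side sketch of the ordering these entry points expect may help: suspend first so in-flight bios drain and new ones are deferred, swap the table while DMF_SUSPENDED is set, then resume to replay the deferred io against the new map. The helper name replace_table() is hypothetical; only the dm_suspend/dm_swap_table/dm_resume calls defined above are assumed.

#include "dm.h"

/* Hypothetical helper: install a new table on an active device. */
static int replace_table(struct mapped_device *md, struct dm_table *t)
{
	int r;

	r = dm_suspend(md);		/* flush in-flight bios, defer new ones */
	if (r)
		return r;

	r = dm_swap_table(md, t);	/* only permitted while suspended */
	if (r)
		return r;		/* device is left suspended on failure */

	return dm_resume(md);		/* replay deferred bios onto the new table */
}
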
diff --git a/drivers/md/dm.h b/drivers/md/dm.h new file mode 100644 index 000000000000..e38c3fc1a1db --- /dev/null +++ b/drivers/md/dm.h | |||
@@ -0,0 +1,195 @@ | |||
1 | /* | ||
2 | * Internal header file for device mapper | ||
3 | * | ||
4 | * Copyright (C) 2001, 2002 Sistina Software | ||
5 | * Copyright (C) 2004 Red Hat, Inc. All rights reserved. | ||
6 | * | ||
7 | * This file is released under the LGPL. | ||
8 | */ | ||
9 | |||
10 | #ifndef DM_INTERNAL_H | ||
11 | #define DM_INTERNAL_H | ||
12 | |||
13 | #include <linux/fs.h> | ||
14 | #include <linux/device-mapper.h> | ||
15 | #include <linux/list.h> | ||
16 | #include <linux/blkdev.h> | ||
17 | |||
18 | #define DM_NAME "device-mapper" | ||
19 | #define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x) | ||
20 | #define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x) | ||
21 | #define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x) | ||
22 | |||
23 | #define DMEMIT(x...) sz += ((sz >= maxlen) ? \ | ||
24 | 0 : scnprintf(result + sz, maxlen - sz, x)) | ||
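
As a rough illustration of the convention DMEMIT relies on (a hedged sketch, not part of the interface): the macro expects locals named result, maxlen and sz to be in scope, and silently stops appending once the buffer is full. The function example_status() below is hypothetical.

/* Hypothetical status helper showing the DMEMIT calling convention. */
static int example_status(char *result, unsigned int maxlen)
{
	int sz = 0;

	DMEMIT("state=%s", "clean");	/* appends, clamped to maxlen */
	DMEMIT(" errors=%u", 0u);
	return sz;			/* bytes written so far */
}
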
25 | |||
26 | /* | ||
27 | * FIXME: I think this should be with the definition of sector_t | ||
28 | * in types.h. | ||
29 | */ | ||
30 | #ifdef CONFIG_LBD | ||
31 | #define SECTOR_FORMAT "%Lu" | ||
32 | #else | ||
33 | #define SECTOR_FORMAT "%lu" | ||
34 | #endif | ||
35 | |||
36 | #define SECTOR_SHIFT 9 | ||
37 | |||
38 | /* | ||
39 | * List of devices that a metadevice uses and should open/close. | ||
40 | */ | ||
41 | struct dm_dev { | ||
42 | struct list_head list; | ||
43 | |||
44 | atomic_t count; | ||
45 | int mode; | ||
46 | struct block_device *bdev; | ||
47 | char name[16]; | ||
48 | }; | ||
49 | |||
50 | struct dm_table; | ||
51 | struct mapped_device; | ||
52 | |||
53 | /*----------------------------------------------------------------- | ||
54 | * Functions for manipulating a struct mapped_device. | ||
55 | * Drop the reference with dm_put when you finish with the object. | ||
56 | *---------------------------------------------------------------*/ | ||
57 | int dm_create(struct mapped_device **md); | ||
58 | int dm_create_with_minor(unsigned int minor, struct mapped_device **md); | ||
59 | void dm_set_mdptr(struct mapped_device *md, void *ptr); | ||
60 | void *dm_get_mdptr(dev_t dev); | ||
61 | |||
62 | /* | ||
63 | * Reference counting for md. | ||
64 | */ | ||
65 | void dm_get(struct mapped_device *md); | ||
66 | void dm_put(struct mapped_device *md); | ||
67 | |||
68 | /* | ||
69 | * A device can still be used while suspended, but I/O is deferred. | ||
70 | */ | ||
71 | int dm_suspend(struct mapped_device *md); | ||
72 | int dm_resume(struct mapped_device *md); | ||
73 | |||
74 | /* | ||
75 | * The device must be suspended before calling this method. | ||
76 | */ | ||
77 | int dm_swap_table(struct mapped_device *md, struct dm_table *t); | ||
78 | |||
79 | /* | ||
80 | * Drop a reference on the table when you've finished with the | ||
81 | * result. | ||
82 | */ | ||
83 | struct dm_table *dm_get_table(struct mapped_device *md); | ||
84 | |||
85 | /* | ||
86 | * Event functions. | ||
87 | */ | ||
88 | uint32_t dm_get_event_nr(struct mapped_device *md); | ||
89 | int dm_wait_event(struct mapped_device *md, int event_nr); | ||
90 | |||
91 | /* | ||
92 | * Info functions. | ||
93 | */ | ||
94 | struct gendisk *dm_disk(struct mapped_device *md); | ||
95 | int dm_suspended(struct mapped_device *md); | ||
96 | |||
97 | /*----------------------------------------------------------------- | ||
98 | * Functions for manipulating a table. Tables are also reference | ||
99 | * counted. | ||
100 | *---------------------------------------------------------------*/ | ||
101 | int dm_table_create(struct dm_table **result, int mode, unsigned num_targets); | ||
102 | |||
103 | void dm_table_get(struct dm_table *t); | ||
104 | void dm_table_put(struct dm_table *t); | ||
105 | |||
106 | int dm_table_add_target(struct dm_table *t, const char *type, | ||
107 | sector_t start, sector_t len, char *params); | ||
108 | int dm_table_complete(struct dm_table *t); | ||
109 | void dm_table_event_callback(struct dm_table *t, | ||
110 | void (*fn)(void *), void *context); | ||
111 | void dm_table_event(struct dm_table *t); | ||
112 | sector_t dm_table_get_size(struct dm_table *t); | ||
113 | struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); | ||
114 | struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); | ||
115 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q); | ||
116 | unsigned int dm_table_get_num_targets(struct dm_table *t); | ||
117 | struct list_head *dm_table_get_devices(struct dm_table *t); | ||
118 | int dm_table_get_mode(struct dm_table *t); | ||
119 | void dm_table_presuspend_targets(struct dm_table *t); | ||
120 | void dm_table_postsuspend_targets(struct dm_table *t); | ||
121 | void dm_table_resume_targets(struct dm_table *t); | ||
122 | int dm_table_any_congested(struct dm_table *t, int bdi_bits); | ||
123 | void dm_table_unplug_all(struct dm_table *t); | ||
124 | int dm_table_flush_all(struct dm_table *t); | ||
125 | |||
126 | /*----------------------------------------------------------------- | ||
127 | * A registry of target types. | ||
128 | *---------------------------------------------------------------*/ | ||
129 | int dm_target_init(void); | ||
130 | void dm_target_exit(void); | ||
131 | struct target_type *dm_get_target_type(const char *name); | ||
132 | void dm_put_target_type(struct target_type *t); | ||
133 | int dm_target_iterate(void (*iter_func)(struct target_type *tt, | ||
134 | void *param), void *param); | ||
135 | |||
136 | |||
137 | /*----------------------------------------------------------------- | ||
138 | * Useful inlines. | ||
139 | *---------------------------------------------------------------*/ | ||
140 | static inline int array_too_big(unsigned long fixed, unsigned long obj, | ||
141 | unsigned long num) | ||
142 | { | ||
143 | return (num > (ULONG_MAX - fixed) / obj); | ||
144 | } | ||
145 | |||
146 | /* | ||
147 | * Ceiling(n / sz) | ||
148 | */ | ||
149 | #define dm_div_up(n, sz) (((n) + (sz) - 1) / (sz)) | ||
150 | |||
151 | #define dm_sector_div_up(n, sz) ( \ | ||
152 | { \ | ||
153 | sector_t _r = ((n) + (sz) - 1); \ | ||
154 | sector_div(_r, (sz)); \ | ||
155 | _r; \ | ||
156 | } \ | ||
157 | ) | ||
158 | |||
159 | /* | ||
161 | * ceiling(n / sz) * sz | ||
161 | */ | ||
162 | #define dm_round_up(n, sz) (dm_div_up((n), (sz)) * (sz)) | ||
163 | |||
164 | static inline sector_t to_sector(unsigned long n) | ||
165 | { | ||
166 | return (n >> 9); | ||
167 | } | ||
168 | |||
169 | static inline unsigned long to_bytes(sector_t n) | ||
170 | { | ||
171 | return (n << 9); | ||
172 | } | ||
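
A few concrete values, following directly from the definitions above (SECTOR_SHIFT is 9, so one sector is 512 bytes):

/*
 * dm_div_up(1000, 512)   == 2      ceiling division
 * dm_round_up(1000, 512) == 1024   next multiple of sz
 * to_sector(4096)        == 8      bytes   -> sectors
 * to_bytes(8)            == 4096   sectors -> bytes
 */
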
173 | |||
174 | int dm_split_args(int *argc, char ***argvp, char *input); | ||
175 | |||
176 | /* | ||
177 | * The device-mapper can be driven through one of two interfaces: | ||
178 | * ioctl or filesystem, depending on which patch you have applied. | ||
179 | */ | ||
180 | int dm_interface_init(void); | ||
181 | void dm_interface_exit(void); | ||
182 | |||
183 | /* | ||
184 | * Targets for linear and striped mappings | ||
185 | */ | ||
186 | int dm_linear_init(void); | ||
187 | void dm_linear_exit(void); | ||
188 | |||
189 | int dm_stripe_init(void); | ||
190 | void dm_stripe_exit(void); | ||
191 | |||
192 | void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size); | ||
193 | union map_info *dm_get_mapinfo(struct bio *bio); | ||
194 | |||
195 | #endif | ||
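
For orientation, a hedged sketch of how this table interface fits together, in the spirit of the in-kernel callers: create a table, populate it, complete it, then swap it into a suspended device. The function name, target type and parameter string below are illustrative placeholders, not part of the interface.

#include "dm.h"

static int build_and_install(struct mapped_device *md, const char *type,
			     char *params, sector_t len)
{
	struct dm_table *t;
	int r;

	r = dm_table_create(&t, FMODE_READ | FMODE_WRITE, 1);
	if (r)
		return r;

	r = dm_table_add_target(t, type, 0, len, params);
	if (!r)
		r = dm_table_complete(t);	/* build the lookup index */
	if (!r && dm_suspended(md))
		r = dm_swap_table(md, t);	/* md takes its own reference */

	dm_table_put(t);			/* drop ours in every case */
	return r;
}
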
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c new file mode 100644 index 000000000000..0248f8e7eac0 --- /dev/null +++ b/drivers/md/faulty.c | |||
@@ -0,0 +1,343 @@ | |||
1 | /* | ||
2 | * faulty.c : Multiple Devices driver for Linux | ||
3 | * | ||
4 | * Copyright (C) 2004 Neil Brown | ||
5 | * | ||
6 | * faulty-device-simulator personality for md | ||
7 | * | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License as published by | ||
11 | * the Free Software Foundation; either version 2, or (at your option) | ||
12 | * any later version. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * (for example /usr/src/linux/COPYING); if not, write to the Free | ||
16 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
17 | */ | ||
18 | |||
19 | |||
20 | /* | ||
21 | * The "faulty" personality causes some requests to fail. | ||
22 | * | ||
23 | * Possible failure modes are: | ||
24 | * reads fail "randomly" but succeed on retry | ||
25 | * writes fail "randomly" but succeed on retry | ||
26 | * reads for some address fail and then persist until a write | ||
27 | * reads for some address fail and then persist irrespective of write | ||
28 | * writes for some address fail and persist | ||
29 | * all writes fail | ||
30 | * | ||
31 | * Different modes can be active at a time, but only | ||
32 | * one can be set at array creation. Others can be added later. | ||
33 | * A mode can be one-shot or recurrent, with the recurrence being | ||
34 | * once in every N requests. | ||
35 | * The bottom 5 bits of the "layout" indicate the mode. The | ||
36 | * remainder indicate a period, or 0 for one-shot. | ||
37 | * | ||
38 | * There is an implementation limit on the number of concurrently | ||
39 | * persisting-faulty blocks. When a new fault is requested that would | ||
40 | * exceed the limit, it is ignored. | ||
41 | * All current faults can be cleared using a layout of "0". | ||
42 | * | ||
43 | * Requests are always sent to the device. If they are to fail, | ||
44 | * we clone the bio and insert a new bi_end_io into the chain. | ||
45 | */ | ||
46 | |||
47 | #define WriteTransient 0 | ||
48 | #define ReadTransient 1 | ||
49 | #define WritePersistent 2 | ||
50 | #define ReadPersistent 3 | ||
51 | #define WriteAll 4 /* doesn't go to device */ | ||
52 | #define ReadFixable 5 | ||
53 | #define Modes 6 | ||
54 | |||
55 | #define ClearErrors 31 | ||
56 | #define ClearFaults 30 | ||
57 | |||
58 | #define AllPersist 100 /* internal use only */ | ||
59 | #define NoPersist 101 | ||
60 | |||
61 | #define ModeMask 0x1f | ||
62 | #define ModeShift 5 | ||
63 | |||
64 | #define MaxFault 50 | ||
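
A hedged worked example of the layout encoding described in the header comment, using the constants just defined:

/*
 * layout = (10 << ModeShift) | ReadTransient  ==  321
 *
 *   mode   = 321 & ModeMask   == 1  (ReadTransient)
 *   period = 321 >> ModeShift == 10
 *
 * i.e. one read in every 10 fails transiently; a period of 0 would make
 * the fault one-shot instead.
 */
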
65 | #include <linux/raid/md.h> | ||
66 | |||
67 | |||
68 | static int faulty_fail(struct bio *bio, unsigned int bytes_done, int error) | ||
69 | { | ||
70 | struct bio *b = bio->bi_private; | ||
71 | |||
72 | b->bi_size = bio->bi_size; | ||
73 | b->bi_sector = bio->bi_sector; | ||
74 | |||
75 | if (bio->bi_size == 0) | ||
76 | bio_put(bio); | ||
77 | |||
78 | clear_bit(BIO_UPTODATE, &b->bi_flags); | ||
79 | return (b->bi_end_io)(b, bytes_done, -EIO); | ||
80 | } | ||
81 | |||
82 | typedef struct faulty_conf { | ||
83 | int period[Modes]; | ||
84 | atomic_t counters[Modes]; | ||
85 | sector_t faults[MaxFault]; | ||
86 | int modes[MaxFault]; | ||
87 | int nfaults; | ||
88 | mdk_rdev_t *rdev; | ||
89 | } conf_t; | ||
90 | |||
91 | static int check_mode(conf_t *conf, int mode) | ||
92 | { | ||
93 | if (conf->period[mode] == 0 && | ||
94 | atomic_read(&conf->counters[mode]) <= 0) | ||
95 | return 0; /* no failure, no decrement */ | ||
96 | |||
97 | |||
98 | if (atomic_dec_and_test(&conf->counters[mode])) { | ||
99 | if (conf->period[mode]) | ||
100 | atomic_set(&conf->counters[mode], conf->period[mode]); | ||
101 | return 1; | ||
102 | } | ||
103 | return 0; | ||
104 | } | ||
105 | |||
106 | static int check_sector(conf_t *conf, sector_t start, sector_t end, int dir) | ||
107 | { | ||
108 | /* If we find a ReadFixable sector, we fix it ... */ | ||
109 | int i; | ||
110 | for (i=0; i<conf->nfaults; i++) | ||
111 | if (conf->faults[i] >= start && | ||
112 | conf->faults[i] < end) { | ||
113 | /* found it ... */ | ||
114 | switch (conf->modes[i] * 2 + dir) { | ||
115 | case WritePersistent*2+WRITE: return 1; | ||
116 | case ReadPersistent*2+READ: return 1; | ||
117 | case ReadFixable*2+READ: return 1; | ||
118 | case ReadFixable*2+WRITE: | ||
119 | conf->modes[i] = NoPersist; | ||
120 | return 0; | ||
121 | case AllPersist*2+READ: | ||
122 | case AllPersist*2+WRITE: return 1; | ||
123 | default: | ||
124 | return 0; | ||
125 | } | ||
126 | } | ||
127 | return 0; | ||
128 | } | ||
129 | |||
130 | static void add_sector(conf_t *conf, sector_t start, int mode) | ||
131 | { | ||
132 | int i; | ||
133 | int n = conf->nfaults; | ||
134 | for (i=0; i<conf->nfaults; i++) | ||
135 | if (conf->faults[i] == start) { | ||
136 | switch(mode) { | ||
137 | case NoPersist: conf->modes[i] = mode; return; | ||
138 | case WritePersistent: | ||
139 | if (conf->modes[i] == ReadPersistent || | ||
140 | conf->modes[i] == ReadFixable) | ||
141 | conf->modes[i] = AllPersist; | ||
142 | else | ||
143 | conf->modes[i] = WritePersistent; | ||
144 | return; | ||
145 | case ReadPersistent: | ||
146 | if (conf->modes[i] == WritePersistent) | ||
147 | conf->modes[i] = AllPersist; | ||
148 | else | ||
149 | conf->modes[i] = ReadPersistent; | ||
150 | return; | ||
151 | case ReadFixable: | ||
152 | if (conf->modes[i] == WritePersistent || | ||
153 | conf->modes[i] == ReadPersistent) | ||
154 | conf->modes[i] = AllPersist; | ||
155 | else | ||
156 | conf->modes[i] = ReadFixable; | ||
157 | return; | ||
158 | } | ||
159 | } else if (conf->modes[i] == NoPersist) | ||
160 | n = i; | ||
161 | |||
162 | if (n >= MaxFault) | ||
163 | return; | ||
164 | conf->faults[n] = start; | ||
165 | conf->modes[n] = mode; | ||
166 | if (conf->nfaults == n) | ||
167 | conf->nfaults = n+1; | ||
168 | } | ||
169 | |||
170 | static int make_request(request_queue_t *q, struct bio *bio) | ||
171 | { | ||
172 | mddev_t *mddev = q->queuedata; | ||
173 | conf_t *conf = (conf_t*)mddev->private; | ||
174 | int failit = 0; | ||
175 | |||
176 | if (bio->bi_rw & 1) { | ||
177 | /* write request */ | ||
178 | if (atomic_read(&conf->counters[WriteAll])) { | ||
179 | /* special case - don't decrement, don't generic_make_request, | ||
180 | * just fail immediately | ||
181 | */ | ||
182 | bio_endio(bio, bio->bi_size, -EIO); | ||
183 | return 0; | ||
184 | } | ||
185 | |||
186 | if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9), | ||
187 | WRITE)) | ||
188 | failit = 1; | ||
189 | if (check_mode(conf, WritePersistent)) { | ||
190 | add_sector(conf, bio->bi_sector, WritePersistent); | ||
191 | failit = 1; | ||
192 | } | ||
193 | if (check_mode(conf, WriteTransient)) | ||
194 | failit = 1; | ||
195 | } else { | ||
196 | /* read request */ | ||
197 | if (check_sector(conf, bio->bi_sector, bio->bi_sector + (bio->bi_size>>9), | ||
198 | READ)) | ||
199 | failit = 1; | ||
200 | if (check_mode(conf, ReadTransient)) | ||
201 | failit = 1; | ||
202 | if (check_mode(conf, ReadPersistent)) { | ||
203 | add_sector(conf, bio->bi_sector, ReadPersistent); | ||
204 | failit = 1; | ||
205 | } | ||
206 | if (check_mode(conf, ReadFixable)) { | ||
207 | add_sector(conf, bio->bi_sector, ReadFixable); | ||
208 | failit = 1; | ||
209 | } | ||
210 | } | ||
211 | if (failit) { | ||
212 | struct bio *b = bio_clone(bio, GFP_NOIO); | ||
213 | b->bi_bdev = conf->rdev->bdev; | ||
214 | b->bi_private = bio; | ||
215 | b->bi_end_io = faulty_fail; | ||
216 | generic_make_request(b); | ||
217 | return 0; | ||
218 | } else { | ||
219 | bio->bi_bdev = conf->rdev->bdev; | ||
220 | return 1; | ||
221 | } | ||
222 | } | ||
223 | |||
224 | static void status(struct seq_file *seq, mddev_t *mddev) | ||
225 | { | ||
226 | conf_t *conf = (conf_t*)mddev->private; | ||
227 | int n; | ||
228 | |||
229 | if ((n=atomic_read(&conf->counters[WriteTransient])) != 0) | ||
230 | seq_printf(seq, " WriteTransient=%d(%d)", | ||
231 | n, conf->period[WriteTransient]); | ||
232 | |||
233 | if ((n=atomic_read(&conf->counters[ReadTransient])) != 0) | ||
234 | seq_printf(seq, " ReadTransient=%d(%d)", | ||
235 | n, conf->period[ReadTransient]); | ||
236 | |||
237 | if ((n=atomic_read(&conf->counters[WritePersistent])) != 0) | ||
238 | seq_printf(seq, " WritePersistent=%d(%d)", | ||
239 | n, conf->period[WritePersistent]); | ||
240 | |||
241 | if ((n=atomic_read(&conf->counters[ReadPersistent])) != 0) | ||
242 | seq_printf(seq, " ReadPersistent=%d(%d)", | ||
243 | n, conf->period[ReadPersistent]); | ||
244 | |||
245 | |||
246 | if ((n=atomic_read(&conf->counters[ReadFixable])) != 0) | ||
247 | seq_printf(seq, " ReadFixable=%d(%d)", | ||
248 | n, conf->period[ReadFixable]); | ||
249 | |||
250 | if ((n=atomic_read(&conf->counters[WriteAll])) != 0) | ||
251 | seq_printf(seq, " WriteAll"); | ||
252 | |||
253 | seq_printf(seq, " nfaults=%d", conf->nfaults); | ||
254 | } | ||
255 | |||
256 | |||
257 | static int reconfig(mddev_t *mddev, int layout, int chunk_size) | ||
258 | { | ||
259 | int mode = layout & ModeMask; | ||
260 | int count = layout >> ModeShift; | ||
261 | conf_t *conf = mddev->private; | ||
262 | |||
263 | if (chunk_size != -1) | ||
264 | return -EINVAL; | ||
265 | |||
266 | /* new layout */ | ||
267 | if (mode == ClearFaults) | ||
268 | conf->nfaults = 0; | ||
269 | else if (mode == ClearErrors) { | ||
270 | int i; | ||
271 | for (i=0 ; i < Modes ; i++) { | ||
272 | conf->period[i] = 0; | ||
273 | atomic_set(&conf->counters[i], 0); | ||
274 | } | ||
275 | } else if (mode < Modes) { | ||
276 | conf->period[mode] = count; | ||
277 | if (!count) count++; | ||
278 | atomic_set(&conf->counters[mode], count); | ||
279 | } else | ||
280 | return -EINVAL; | ||
281 | mddev->layout = -1; /* makes sure further changes come through */ | ||
282 | return 0; | ||
283 | } | ||
284 | |||
285 | static int run(mddev_t *mddev) | ||
286 | { | ||
287 | mdk_rdev_t *rdev; | ||
288 | struct list_head *tmp; | ||
289 | int i; | ||
290 | |||
291 | conf_t *conf = kmalloc(sizeof(*conf), GFP_KERNEL); | ||
292 | |||
293 | for (i=0; i<Modes; i++) { | ||
294 | atomic_set(&conf->counters[i], 0); | ||
295 | conf->period[i] = 0; | ||
296 | } | ||
297 | conf->nfaults = 0; | ||
298 | |||
299 | ITERATE_RDEV(mddev, rdev, tmp) | ||
300 | conf->rdev = rdev; | ||
301 | |||
302 | mddev->array_size = mddev->size; | ||
303 | mddev->private = conf; | ||
304 | |||
305 | reconfig(mddev, mddev->layout, -1); | ||
306 | |||
307 | return 0; | ||
308 | } | ||
309 | |||
310 | static int stop(mddev_t *mddev) | ||
311 | { | ||
312 | conf_t *conf = (conf_t *)mddev->private; | ||
313 | |||
314 | kfree(conf); | ||
315 | mddev->private = NULL; | ||
316 | return 0; | ||
317 | } | ||
318 | |||
319 | static mdk_personality_t faulty_personality = | ||
320 | { | ||
321 | .name = "faulty", | ||
322 | .owner = THIS_MODULE, | ||
323 | .make_request = make_request, | ||
324 | .run = run, | ||
325 | .stop = stop, | ||
326 | .status = status, | ||
327 | .reconfig = reconfig, | ||
328 | }; | ||
329 | |||
330 | static int __init raid_init(void) | ||
331 | { | ||
332 | return register_md_personality(FAULTY, &faulty_personality); | ||
333 | } | ||
334 | |||
335 | static void raid_exit(void) | ||
336 | { | ||
337 | unregister_md_personality(FAULTY); | ||
338 | } | ||
339 | |||
340 | module_init(raid_init); | ||
341 | module_exit(raid_exit); | ||
342 | MODULE_LICENSE("GPL"); | ||
343 | MODULE_ALIAS("md-personality-10"); /* faulty */ | ||
diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c new file mode 100644 index 000000000000..eb7036485975 --- /dev/null +++ b/drivers/md/kcopyd.c | |||
@@ -0,0 +1,687 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2002 Sistina Software (UK) Limited. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | * | ||
6 | * Kcopyd provides a simple interface for copying an area of one | ||
7 | * block-device to one or more other block-devices, with an asynchronous | ||
8 | * completion notification. | ||
9 | */ | ||
10 | |||
11 | #include <asm/atomic.h> | ||
12 | |||
13 | #include <linux/blkdev.h> | ||
14 | #include <linux/config.h> | ||
15 | #include <linux/fs.h> | ||
16 | #include <linux/init.h> | ||
17 | #include <linux/list.h> | ||
18 | #include <linux/mempool.h> | ||
19 | #include <linux/module.h> | ||
20 | #include <linux/pagemap.h> | ||
21 | #include <linux/slab.h> | ||
22 | #include <linux/vmalloc.h> | ||
23 | #include <linux/workqueue.h> | ||
24 | |||
25 | #include "kcopyd.h" | ||
26 | |||
27 | static struct workqueue_struct *_kcopyd_wq; | ||
28 | static struct work_struct _kcopyd_work; | ||
29 | |||
30 | static inline void wake(void) | ||
31 | { | ||
32 | queue_work(_kcopyd_wq, &_kcopyd_work); | ||
33 | } | ||
34 | |||
35 | /*----------------------------------------------------------------- | ||
36 | * Each kcopyd client has its own little pool of preallocated | ||
37 | * pages for kcopyd io. | ||
38 | *---------------------------------------------------------------*/ | ||
39 | struct kcopyd_client { | ||
40 | struct list_head list; | ||
41 | |||
42 | spinlock_t lock; | ||
43 | struct page_list *pages; | ||
44 | unsigned int nr_pages; | ||
45 | unsigned int nr_free_pages; | ||
46 | }; | ||
47 | |||
48 | static struct page_list *alloc_pl(void) | ||
49 | { | ||
50 | struct page_list *pl; | ||
51 | |||
52 | pl = kmalloc(sizeof(*pl), GFP_KERNEL); | ||
53 | if (!pl) | ||
54 | return NULL; | ||
55 | |||
56 | pl->page = alloc_page(GFP_KERNEL); | ||
57 | if (!pl->page) { | ||
58 | kfree(pl); | ||
59 | return NULL; | ||
60 | } | ||
61 | |||
62 | return pl; | ||
63 | } | ||
64 | |||
65 | static void free_pl(struct page_list *pl) | ||
66 | { | ||
67 | __free_page(pl->page); | ||
68 | kfree(pl); | ||
69 | } | ||
70 | |||
71 | static int kcopyd_get_pages(struct kcopyd_client *kc, | ||
72 | unsigned int nr, struct page_list **pages) | ||
73 | { | ||
74 | struct page_list *pl; | ||
75 | |||
76 | spin_lock(&kc->lock); | ||
77 | if (kc->nr_free_pages < nr) { | ||
78 | spin_unlock(&kc->lock); | ||
79 | return -ENOMEM; | ||
80 | } | ||
81 | |||
82 | kc->nr_free_pages -= nr; | ||
83 | for (*pages = pl = kc->pages; --nr; pl = pl->next) | ||
84 | ; | ||
85 | |||
86 | kc->pages = pl->next; | ||
87 | pl->next = NULL; | ||
88 | |||
89 | spin_unlock(&kc->lock); | ||
90 | |||
91 | return 0; | ||
92 | } | ||
93 | |||
94 | static void kcopyd_put_pages(struct kcopyd_client *kc, struct page_list *pl) | ||
95 | { | ||
96 | struct page_list *cursor; | ||
97 | |||
98 | spin_lock(&kc->lock); | ||
99 | for (cursor = pl; cursor->next; cursor = cursor->next) | ||
100 | kc->nr_free_pages++; | ||
101 | |||
102 | kc->nr_free_pages++; | ||
103 | cursor->next = kc->pages; | ||
104 | kc->pages = pl; | ||
105 | spin_unlock(&kc->lock); | ||
106 | } | ||
107 | |||
108 | /* | ||
109 | * These three functions resize the page pool. | ||
110 | */ | ||
111 | static void drop_pages(struct page_list *pl) | ||
112 | { | ||
113 | struct page_list *next; | ||
114 | |||
115 | while (pl) { | ||
116 | next = pl->next; | ||
117 | free_pl(pl); | ||
118 | pl = next; | ||
119 | } | ||
120 | } | ||
121 | |||
122 | static int client_alloc_pages(struct kcopyd_client *kc, unsigned int nr) | ||
123 | { | ||
124 | unsigned int i; | ||
125 | struct page_list *pl = NULL, *next; | ||
126 | |||
127 | for (i = 0; i < nr; i++) { | ||
128 | next = alloc_pl(); | ||
129 | if (!next) { | ||
130 | if (pl) | ||
131 | drop_pages(pl); | ||
132 | return -ENOMEM; | ||
133 | } | ||
134 | next->next = pl; | ||
135 | pl = next; | ||
136 | } | ||
137 | |||
138 | kcopyd_put_pages(kc, pl); | ||
139 | kc->nr_pages += nr; | ||
140 | return 0; | ||
141 | } | ||
142 | |||
143 | static void client_free_pages(struct kcopyd_client *kc) | ||
144 | { | ||
145 | BUG_ON(kc->nr_free_pages != kc->nr_pages); | ||
146 | drop_pages(kc->pages); | ||
147 | kc->pages = NULL; | ||
148 | kc->nr_free_pages = kc->nr_pages = 0; | ||
149 | } | ||
150 | |||
151 | /*----------------------------------------------------------------- | ||
152 | * kcopyd_jobs need to be allocated by the *clients* of kcopyd; | ||
153 | * for this reason we use a mempool to prevent the client from | ||
154 | * ever having to do io (which could cause a deadlock). | ||
155 | *---------------------------------------------------------------*/ | ||
156 | struct kcopyd_job { | ||
157 | struct kcopyd_client *kc; | ||
158 | struct list_head list; | ||
159 | unsigned long flags; | ||
160 | |||
161 | /* | ||
162 | * Error state of the job. | ||
163 | */ | ||
164 | int read_err; | ||
165 | unsigned int write_err; | ||
166 | |||
167 | /* | ||
168 | * Either READ or WRITE | ||
169 | */ | ||
170 | int rw; | ||
171 | struct io_region source; | ||
172 | |||
173 | /* | ||
174 | * The destinations for the transfer. | ||
175 | */ | ||
176 | unsigned int num_dests; | ||
177 | struct io_region dests[KCOPYD_MAX_REGIONS]; | ||
178 | |||
179 | sector_t offset; | ||
180 | unsigned int nr_pages; | ||
181 | struct page_list *pages; | ||
182 | |||
183 | /* | ||
184 | * Set this to ensure you are notified when the job has | ||
185 | * completed. 'context' is for callback to use. | ||
186 | */ | ||
187 | kcopyd_notify_fn fn; | ||
188 | void *context; | ||
189 | |||
190 | /* | ||
191 | * These fields are only used if the job has been split | ||
192 | * into more manageable parts. | ||
193 | */ | ||
194 | struct semaphore lock; | ||
195 | atomic_t sub_jobs; | ||
196 | sector_t progress; | ||
197 | }; | ||
198 | |||
199 | /* FIXME: this should scale with the number of pages */ | ||
200 | #define MIN_JOBS 512 | ||
201 | |||
202 | static kmem_cache_t *_job_cache; | ||
203 | static mempool_t *_job_pool; | ||
204 | |||
205 | /* | ||
206 | * We maintain three lists of jobs: | ||
207 | * | ||
208 | * i) jobs waiting for pages | ||
209 | * ii) jobs that have pages, and are waiting for the io to be issued. | ||
210 | * iii) jobs that have completed. | ||
211 | * | ||
212 | * All three of these are protected by job_lock. | ||
213 | */ | ||
214 | static DEFINE_SPINLOCK(_job_lock); | ||
215 | |||
216 | static LIST_HEAD(_complete_jobs); | ||
217 | static LIST_HEAD(_io_jobs); | ||
218 | static LIST_HEAD(_pages_jobs); | ||
219 | |||
220 | static int jobs_init(void) | ||
221 | { | ||
222 | _job_cache = kmem_cache_create("kcopyd-jobs", | ||
223 | sizeof(struct kcopyd_job), | ||
224 | __alignof__(struct kcopyd_job), | ||
225 | 0, NULL, NULL); | ||
226 | if (!_job_cache) | ||
227 | return -ENOMEM; | ||
228 | |||
229 | _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab, | ||
230 | mempool_free_slab, _job_cache); | ||
231 | if (!_job_pool) { | ||
232 | kmem_cache_destroy(_job_cache); | ||
233 | return -ENOMEM; | ||
234 | } | ||
235 | |||
236 | return 0; | ||
237 | } | ||
238 | |||
239 | static void jobs_exit(void) | ||
240 | { | ||
241 | BUG_ON(!list_empty(&_complete_jobs)); | ||
242 | BUG_ON(!list_empty(&_io_jobs)); | ||
243 | BUG_ON(!list_empty(&_pages_jobs)); | ||
244 | |||
245 | mempool_destroy(_job_pool); | ||
246 | kmem_cache_destroy(_job_cache); | ||
247 | _job_pool = NULL; | ||
248 | _job_cache = NULL; | ||
249 | } | ||
250 | |||
251 | /* | ||
252 | * Functions to push a job onto the tail of a given job list, | ||
253 | * and to pop one off the head. | ||
254 | */ | ||
255 | static inline struct kcopyd_job *pop(struct list_head *jobs) | ||
256 | { | ||
257 | struct kcopyd_job *job = NULL; | ||
258 | unsigned long flags; | ||
259 | |||
260 | spin_lock_irqsave(&_job_lock, flags); | ||
261 | |||
262 | if (!list_empty(jobs)) { | ||
263 | job = list_entry(jobs->next, struct kcopyd_job, list); | ||
264 | list_del(&job->list); | ||
265 | } | ||
266 | spin_unlock_irqrestore(&_job_lock, flags); | ||
267 | |||
268 | return job; | ||
269 | } | ||
270 | |||
271 | static inline void push(struct list_head *jobs, struct kcopyd_job *job) | ||
272 | { | ||
273 | unsigned long flags; | ||
274 | |||
275 | spin_lock_irqsave(&_job_lock, flags); | ||
276 | list_add_tail(&job->list, jobs); | ||
277 | spin_unlock_irqrestore(&_job_lock, flags); | ||
278 | } | ||
279 | |||
280 | /* | ||
281 | * These three functions process 1 item from the corresponding | ||
282 | * job list. | ||
283 | * | ||
284 | * They return: | ||
285 | * < 0: error | ||
286 | * 0: success | ||
287 | * > 0: can't process yet. | ||
288 | */ | ||
289 | static int run_complete_job(struct kcopyd_job *job) | ||
290 | { | ||
291 | void *context = job->context; | ||
292 | int read_err = job->read_err; | ||
293 | unsigned int write_err = job->write_err; | ||
294 | kcopyd_notify_fn fn = job->fn; | ||
295 | |||
296 | kcopyd_put_pages(job->kc, job->pages); | ||
297 | mempool_free(job, _job_pool); | ||
298 | fn(read_err, write_err, context); | ||
299 | return 0; | ||
300 | } | ||
301 | |||
302 | static void complete_io(unsigned long error, void *context) | ||
303 | { | ||
304 | struct kcopyd_job *job = (struct kcopyd_job *) context; | ||
305 | |||
306 | if (error) { | ||
307 | if (job->rw == WRITE) | ||
308 | job->write_err &= error; | ||
309 | else | ||
310 | job->read_err = 1; | ||
311 | |||
312 | if (!test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) { | ||
313 | push(&_complete_jobs, job); | ||
314 | wake(); | ||
315 | return; | ||
316 | } | ||
317 | } | ||
318 | |||
319 | if (job->rw == WRITE) | ||
320 | push(&_complete_jobs, job); | ||
321 | |||
322 | else { | ||
323 | job->rw = WRITE; | ||
324 | push(&_io_jobs, job); | ||
325 | } | ||
326 | |||
327 | wake(); | ||
328 | } | ||
329 | |||
330 | /* | ||
331 | * Issue the io for a particular job: either the initial read | ||
332 | * from the source, or the writes to all destinations. | ||
333 | */ | ||
334 | static int run_io_job(struct kcopyd_job *job) | ||
335 | { | ||
336 | int r; | ||
337 | |||
338 | if (job->rw == READ) | ||
339 | r = dm_io_async(1, &job->source, job->rw, | ||
340 | job->pages, | ||
341 | job->offset, complete_io, job); | ||
342 | |||
343 | else | ||
344 | r = dm_io_async(job->num_dests, job->dests, job->rw, | ||
345 | job->pages, | ||
346 | job->offset, complete_io, job); | ||
347 | |||
348 | return r; | ||
349 | } | ||
350 | |||
351 | static int run_pages_job(struct kcopyd_job *job) | ||
352 | { | ||
353 | int r; | ||
354 | |||
355 | job->nr_pages = dm_div_up(job->dests[0].count + job->offset, | ||
356 | PAGE_SIZE >> 9); | ||
357 | r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages); | ||
358 | if (!r) { | ||
359 | /* this job is ready for io */ | ||
360 | push(&_io_jobs, job); | ||
361 | return 0; | ||
362 | } | ||
363 | |||
364 | if (r == -ENOMEM) | ||
365 | /* can't complete now */ | ||
366 | return 1; | ||
367 | |||
368 | return r; | ||
369 | } | ||
370 | |||
371 | /* | ||
372 | * Run through a list for as long as possible. Returns the count | ||
373 | * of successful jobs. | ||
374 | */ | ||
375 | static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *)) | ||
376 | { | ||
377 | struct kcopyd_job *job; | ||
378 | int r, count = 0; | ||
379 | |||
380 | while ((job = pop(jobs))) { | ||
381 | |||
382 | r = fn(job); | ||
383 | |||
384 | if (r < 0) { | ||
385 | /* error this rogue job */ | ||
386 | if (job->rw == WRITE) | ||
387 | job->write_err = (unsigned int) -1; | ||
388 | else | ||
389 | job->read_err = 1; | ||
390 | push(&_complete_jobs, job); | ||
391 | break; | ||
392 | } | ||
393 | |||
394 | if (r > 0) { | ||
395 | /* | ||
396 | * We couldn't service this job ATM, so | ||
397 | * push this job back onto the list. | ||
398 | */ | ||
399 | push(jobs, job); | ||
400 | break; | ||
401 | } | ||
402 | |||
403 | count++; | ||
404 | } | ||
405 | |||
406 | return count; | ||
407 | } | ||
408 | |||
409 | /* | ||
410 | * kcopyd does this every time it's woken up. | ||
411 | */ | ||
412 | static void do_work(void *ignored) | ||
413 | { | ||
414 | /* | ||
415 | * The order that these are called is *very* important. | ||
416 | * complete jobs can free some pages for pages jobs. | ||
417 | * Pages jobs when successful will jump onto the io jobs | ||
418 | * list. io jobs call wake when they complete and it all | ||
419 | * starts again. | ||
420 | */ | ||
421 | process_jobs(&_complete_jobs, run_complete_job); | ||
422 | process_jobs(&_pages_jobs, run_pages_job); | ||
423 | process_jobs(&_io_jobs, run_io_job); | ||
424 | } | ||
425 | |||
426 | /* | ||
427 | * If we are copying a small region we just dispatch a single job | ||
428 | * to do the copy; otherwise the io has to be split up into many | ||
429 | * jobs. | ||
430 | */ | ||
431 | static void dispatch_job(struct kcopyd_job *job) | ||
432 | { | ||
433 | push(&_pages_jobs, job); | ||
434 | wake(); | ||
435 | } | ||
436 | |||
437 | #define SUB_JOB_SIZE 128 | ||
438 | static void segment_complete(int read_err, | ||
439 | unsigned int write_err, void *context) | ||
440 | { | ||
441 | /* FIXME: tidy this function */ | ||
442 | sector_t progress = 0; | ||
443 | sector_t count = 0; | ||
444 | struct kcopyd_job *job = (struct kcopyd_job *) context; | ||
445 | |||
446 | down(&job->lock); | ||
447 | |||
448 | /* update the error */ | ||
449 | if (read_err) | ||
450 | job->read_err = 1; | ||
451 | |||
452 | if (write_err) | ||
453 | job->write_err &= write_err; | ||
454 | |||
455 | /* | ||
456 | * Only dispatch more work if there hasn't been an error. | ||
457 | */ | ||
458 | if ((!job->read_err && !job->write_err) || | ||
459 | test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) { | ||
460 | /* get the next chunk of work */ | ||
461 | progress = job->progress; | ||
462 | count = job->source.count - progress; | ||
463 | if (count) { | ||
464 | if (count > SUB_JOB_SIZE) | ||
465 | count = SUB_JOB_SIZE; | ||
466 | |||
467 | job->progress += count; | ||
468 | } | ||
469 | } | ||
470 | up(&job->lock); | ||
471 | |||
472 | if (count) { | ||
473 | int i; | ||
474 | struct kcopyd_job *sub_job = mempool_alloc(_job_pool, GFP_NOIO); | ||
475 | |||
476 | *sub_job = *job; | ||
477 | sub_job->source.sector += progress; | ||
478 | sub_job->source.count = count; | ||
479 | |||
480 | for (i = 0; i < job->num_dests; i++) { | ||
481 | sub_job->dests[i].sector += progress; | ||
482 | sub_job->dests[i].count = count; | ||
483 | } | ||
484 | |||
485 | sub_job->fn = segment_complete; | ||
486 | sub_job->context = job; | ||
487 | dispatch_job(sub_job); | ||
488 | |||
489 | } else if (atomic_dec_and_test(&job->sub_jobs)) { | ||
490 | |||
491 | /* | ||
492 | * To avoid a race we must keep the job around | ||
493 | * until after the notify function has completed. | ||
494 | * Otherwise the client may try and stop the job | ||
495 | * after we've completed. | ||
496 | */ | ||
497 | job->fn(read_err, write_err, job->context); | ||
498 | mempool_free(job, _job_pool); | ||
499 | } | ||
500 | } | ||
501 | |||
502 | /* | ||
503 | * Create some little jobs that between them will do | ||
504 | * the whole move. | ||
505 | */ | ||
506 | #define SPLIT_COUNT 8 | ||
507 | static void split_job(struct kcopyd_job *job) | ||
508 | { | ||
509 | int i; | ||
510 | |||
511 | atomic_set(&job->sub_jobs, SPLIT_COUNT); | ||
512 | for (i = 0; i < SPLIT_COUNT; i++) | ||
513 | segment_complete(0, 0u, job); | ||
514 | } | ||
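
To make the splitting arithmetic concrete (a sketch based only on the constants above): a large copy is carved into SUB_JOB_SIZE-sector pieces, with SPLIT_COUNT of them in flight at once; each completion re-arms the next piece from segment_complete() until job->progress reaches job->source.count.

/*
 * Example: a 10240-sector (5 MiB) source is handled as
 * 10240 / SUB_JOB_SIZE == 80 sub-jobs of 128 sectors each;
 * split_job() primes SPLIT_COUNT == 8 of them, and every completion
 * dispatches the next chunk until progress == 10240.
 */
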
515 | |||
516 | int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, | ||
517 | unsigned int num_dests, struct io_region *dests, | ||
518 | unsigned int flags, kcopyd_notify_fn fn, void *context) | ||
519 | { | ||
520 | struct kcopyd_job *job; | ||
521 | |||
522 | /* | ||
523 | * Allocate a new job. | ||
524 | */ | ||
525 | job = mempool_alloc(_job_pool, GFP_NOIO); | ||
526 | |||
527 | /* | ||
528 | * set up for the read. | ||
529 | */ | ||
530 | job->kc = kc; | ||
531 | job->flags = flags; | ||
532 | job->read_err = 0; | ||
533 | job->write_err = 0; | ||
534 | job->rw = READ; | ||
535 | |||
536 | job->source = *from; | ||
537 | |||
538 | job->num_dests = num_dests; | ||
539 | memcpy(&job->dests, dests, sizeof(*dests) * num_dests); | ||
540 | |||
541 | job->offset = 0; | ||
542 | job->nr_pages = 0; | ||
543 | job->pages = NULL; | ||
544 | |||
545 | job->fn = fn; | ||
546 | job->context = context; | ||
547 | |||
548 | if (job->source.count < SUB_JOB_SIZE) | ||
549 | dispatch_job(job); | ||
550 | |||
551 | else { | ||
552 | init_MUTEX(&job->lock); | ||
553 | job->progress = 0; | ||
554 | split_job(job); | ||
555 | } | ||
556 | |||
557 | return 0; | ||
558 | } | ||
559 | |||
560 | /* | ||
561 | * Cancels a kcopyd job, e.g. someone might be deactivating a | ||
562 | * mirror. | ||
563 | */ | ||
564 | int kcopyd_cancel(struct kcopyd_job *job, int block) | ||
565 | { | ||
566 | /* FIXME: finish */ | ||
567 | return -1; | ||
568 | } | ||
569 | |||
570 | /*----------------------------------------------------------------- | ||
571 | * Unit setup | ||
572 | *---------------------------------------------------------------*/ | ||
573 | static DECLARE_MUTEX(_client_lock); | ||
574 | static LIST_HEAD(_clients); | ||
575 | |||
576 | static void client_add(struct kcopyd_client *kc) | ||
577 | { | ||
578 | down(&_client_lock); | ||
579 | list_add(&kc->list, &_clients); | ||
580 | up(&_client_lock); | ||
581 | } | ||
582 | |||
583 | static void client_del(struct kcopyd_client *kc) | ||
584 | { | ||
585 | down(&_client_lock); | ||
586 | list_del(&kc->list); | ||
587 | up(&_client_lock); | ||
588 | } | ||
589 | |||
590 | static DECLARE_MUTEX(kcopyd_init_lock); | ||
591 | static int kcopyd_clients = 0; | ||
592 | |||
593 | static int kcopyd_init(void) | ||
594 | { | ||
595 | int r; | ||
596 | |||
597 | down(&kcopyd_init_lock); | ||
598 | |||
599 | if (kcopyd_clients) { | ||
600 | /* Already initialized. */ | ||
601 | kcopyd_clients++; | ||
602 | up(&kcopyd_init_lock); | ||
603 | return 0; | ||
604 | } | ||
605 | |||
606 | r = jobs_init(); | ||
607 | if (r) { | ||
608 | up(&kcopyd_init_lock); | ||
609 | return r; | ||
610 | } | ||
611 | |||
612 | _kcopyd_wq = create_singlethread_workqueue("kcopyd"); | ||
613 | if (!_kcopyd_wq) { | ||
614 | jobs_exit(); | ||
615 | up(&kcopyd_init_lock); | ||
616 | return -ENOMEM; | ||
617 | } | ||
618 | |||
619 | kcopyd_clients++; | ||
620 | INIT_WORK(&_kcopyd_work, do_work, NULL); | ||
621 | up(&kcopyd_init_lock); | ||
622 | return 0; | ||
623 | } | ||
624 | |||
625 | static void kcopyd_exit(void) | ||
626 | { | ||
627 | down(&kcopyd_init_lock); | ||
628 | kcopyd_clients--; | ||
629 | if (!kcopyd_clients) { | ||
630 | jobs_exit(); | ||
631 | destroy_workqueue(_kcopyd_wq); | ||
632 | _kcopyd_wq = NULL; | ||
633 | } | ||
634 | up(&kcopyd_init_lock); | ||
635 | } | ||
636 | |||
637 | int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result) | ||
638 | { | ||
639 | int r = 0; | ||
640 | struct kcopyd_client *kc; | ||
641 | |||
642 | r = kcopyd_init(); | ||
643 | if (r) | ||
644 | return r; | ||
645 | |||
646 | kc = kmalloc(sizeof(*kc), GFP_KERNEL); | ||
647 | if (!kc) { | ||
648 | kcopyd_exit(); | ||
649 | return -ENOMEM; | ||
650 | } | ||
651 | |||
652 | spin_lock_init(&kc->lock); | ||
653 | kc->pages = NULL; | ||
654 | kc->nr_pages = kc->nr_free_pages = 0; | ||
655 | r = client_alloc_pages(kc, nr_pages); | ||
656 | if (r) { | ||
657 | kfree(kc); | ||
658 | kcopyd_exit(); | ||
659 | return r; | ||
660 | } | ||
661 | |||
662 | r = dm_io_get(nr_pages); | ||
663 | if (r) { | ||
664 | client_free_pages(kc); | ||
665 | kfree(kc); | ||
666 | kcopyd_exit(); | ||
667 | return r; | ||
668 | } | ||
669 | |||
670 | client_add(kc); | ||
671 | *result = kc; | ||
672 | return 0; | ||
673 | } | ||
674 | |||
675 | void kcopyd_client_destroy(struct kcopyd_client *kc) | ||
676 | { | ||
677 | dm_io_put(kc->nr_pages); | ||
678 | client_free_pages(kc); | ||
679 | client_del(kc); | ||
680 | kfree(kc); | ||
681 | kcopyd_exit(); | ||
682 | } | ||
683 | |||
684 | EXPORT_SYMBOL(kcopyd_client_create); | ||
685 | EXPORT_SYMBOL(kcopyd_client_destroy); | ||
686 | EXPORT_SYMBOL(kcopyd_copy); | ||
687 | EXPORT_SYMBOL(kcopyd_cancel); | ||
diff --git a/drivers/md/kcopyd.h b/drivers/md/kcopyd.h new file mode 100644 index 000000000000..4621ea055c0e --- /dev/null +++ b/drivers/md/kcopyd.h | |||
@@ -0,0 +1,42 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2001 Sistina Software | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | * | ||
6 | * Kcopyd provides a simple interface for copying an area of one | ||
7 | * block-device to one or more other block-devices, with an asynchronous | ||
8 | * completion notification. | ||
9 | */ | ||
10 | |||
11 | #ifndef DM_KCOPYD_H | ||
12 | #define DM_KCOPYD_H | ||
13 | |||
14 | #include "dm-io.h" | ||
15 | |||
16 | /* FIXME: make this configurable */ | ||
17 | #define KCOPYD_MAX_REGIONS 8 | ||
18 | |||
19 | #define KCOPYD_IGNORE_ERROR 1 | ||
20 | |||
21 | /* | ||
22 | * To use kcopyd you must first create a kcopyd client object. | ||
23 | */ | ||
24 | struct kcopyd_client; | ||
25 | int kcopyd_client_create(unsigned int num_pages, struct kcopyd_client **result); | ||
26 | void kcopyd_client_destroy(struct kcopyd_client *kc); | ||
27 | |||
28 | /* | ||
29 | * Submit a copy job to kcopyd. This is built on top of the | ||
30 | * previous three fns. | ||
31 | * | ||
32 | * read_err is a boolean, | ||
33 | * write_err is a bitset, with 1 bit for each destination region | ||
34 | */ | ||
35 | typedef void (*kcopyd_notify_fn)(int read_err, | ||
36 | unsigned int write_err, void *context); | ||
37 | |||
38 | int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, | ||
39 | unsigned int num_dests, struct io_region *dests, | ||
40 | unsigned int flags, kcopyd_notify_fn fn, void *context); | ||
41 | |||
42 | #endif | ||
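
A hedged end-to-end sketch of driving this interface, using only the declarations above plus the io_region layout from dm-io.h (bdev/sector/count); the function names and the page-pool size of 32 are illustrative assumptions, not part of the API.

#include <linux/blkdev.h>
#include <linux/completion.h>

#include "kcopyd.h"

/* Completion callback: read_err is a boolean, write_err a per-destination bitset. */
static void copy_done(int read_err, unsigned int write_err, void *context)
{
	complete((struct completion *) context);
}

/* Hypothetical synchronous wrapper: copy 'count' sectors between two devices. */
static int copy_region(struct block_device *from, sector_t from_sector,
		       struct block_device *to, sector_t to_sector,
		       sector_t count)
{
	DECLARE_COMPLETION(done);
	struct kcopyd_client *kc;
	struct io_region src, dest;
	int r;

	r = kcopyd_client_create(32, &kc);
	if (r)
		return r;

	src.bdev = from;
	src.sector = from_sector;
	src.count = count;

	dest.bdev = to;
	dest.sector = to_sector;
	dest.count = count;

	r = kcopyd_copy(kc, &src, 1, &dest, 0, copy_done, &done);
	if (!r)
		wait_for_completion(&done);

	kcopyd_client_destroy(kc);
	return r;
}
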
diff --git a/drivers/md/linear.c b/drivers/md/linear.c new file mode 100644 index 000000000000..161e9aa87291 --- /dev/null +++ b/drivers/md/linear.c | |||
@@ -0,0 +1,343 @@ | |||
1 | /* | ||
2 | linear.c : Multiple Devices driver for Linux | ||
3 | Copyright (C) 1994-96 Marc ZYNGIER | ||
4 | <zyngier@ufr-info-p7.ibp.fr> or | ||
5 | <maz@gloups.fdn.fr> | ||
6 | |||
7 | Linear mode management functions. | ||
8 | |||
9 | This program is free software; you can redistribute it and/or modify | ||
10 | it under the terms of the GNU General Public License as published by | ||
11 | the Free Software Foundation; either version 2, or (at your option) | ||
12 | any later version. | ||
13 | |||
14 | You should have received a copy of the GNU General Public License | ||
15 | (for example /usr/src/linux/COPYING); if not, write to the Free | ||
16 | Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/module.h> | ||
20 | |||
21 | #include <linux/raid/md.h> | ||
22 | #include <linux/slab.h> | ||
23 | #include <linux/raid/linear.h> | ||
24 | |||
25 | #define MAJOR_NR MD_MAJOR | ||
26 | #define MD_DRIVER | ||
27 | #define MD_PERSONALITY | ||
28 | |||
29 | /* | ||
30 | * find which device holds a particular offset | ||
31 | */ | ||
32 | static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) | ||
33 | { | ||
34 | dev_info_t *hash; | ||
35 | linear_conf_t *conf = mddev_to_conf(mddev); | ||
36 | sector_t block = sector >> 1; | ||
37 | |||
38 | /* | ||
39 | * sector_div(a,b) returns the remainder and sets a to a/b | ||
40 | */ | ||
41 | (void)sector_div(block, conf->smallest->size); | ||
42 | hash = conf->hash_table[block]; | ||
43 | |||
44 | while ((sector>>1) >= (hash->size + hash->offset)) | ||
45 | hash++; | ||
46 | return hash; | ||
47 | } | ||
48 | |||
49 | /** | ||
50 | * linear_mergeable_bvec -- tell bio layer if two requests can be merged | ||
51 | * @q: request queue | ||
52 | * @bio: the buffer head that's been built up so far | ||
53 | * @biovec: the request that could be merged to it. | ||
54 | * | ||
55 | * Return amount of bytes we can take at this offset | ||
56 | */ | ||
57 | static int linear_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *biovec) | ||
58 | { | ||
59 | mddev_t *mddev = q->queuedata; | ||
60 | dev_info_t *dev0; | ||
61 | unsigned long maxsectors, bio_sectors = bio->bi_size >> 9; | ||
62 | sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); | ||
63 | |||
64 | dev0 = which_dev(mddev, sector); | ||
65 | maxsectors = (dev0->size << 1) - (sector - (dev0->offset<<1)); | ||
66 | |||
67 | if (maxsectors < bio_sectors) | ||
68 | maxsectors = 0; | ||
69 | else | ||
70 | maxsectors -= bio_sectors; | ||
71 | |||
72 | if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0) | ||
73 | return biovec->bv_len; | ||
74 | /* The bytes available at this offset could be really big, | ||
75 | * so we cap at 2^31 to avoid overflow */ | ||
76 | if (maxsectors > (1 << (31-9))) | ||
77 | return 1<<31; | ||
78 | return maxsectors << 9; | ||
79 | } | ||
80 | |||
81 | static void linear_unplug(request_queue_t *q) | ||
82 | { | ||
83 | mddev_t *mddev = q->queuedata; | ||
84 | linear_conf_t *conf = mddev_to_conf(mddev); | ||
85 | int i; | ||
86 | |||
87 | for (i=0; i < mddev->raid_disks; i++) { | ||
88 | request_queue_t *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev); | ||
89 | if (r_queue->unplug_fn) | ||
90 | r_queue->unplug_fn(r_queue); | ||
91 | } | ||
92 | } | ||
93 | |||
94 | static int linear_issue_flush(request_queue_t *q, struct gendisk *disk, | ||
95 | sector_t *error_sector) | ||
96 | { | ||
97 | mddev_t *mddev = q->queuedata; | ||
98 | linear_conf_t *conf = mddev_to_conf(mddev); | ||
99 | int i, ret = 0; | ||
100 | |||
101 | for (i=0; i < mddev->raid_disks && ret == 0; i++) { | ||
102 | struct block_device *bdev = conf->disks[i].rdev->bdev; | ||
103 | request_queue_t *r_queue = bdev_get_queue(bdev); | ||
104 | |||
105 | if (!r_queue->issue_flush_fn) | ||
106 | ret = -EOPNOTSUPP; | ||
107 | else | ||
108 | ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); | ||
109 | } | ||
110 | return ret; | ||
111 | } | ||
112 | |||
113 | static int linear_run (mddev_t *mddev) | ||
114 | { | ||
115 | linear_conf_t *conf; | ||
116 | dev_info_t **table; | ||
117 | mdk_rdev_t *rdev; | ||
118 | int i, nb_zone, cnt; | ||
119 | sector_t start; | ||
120 | sector_t curr_offset; | ||
121 | struct list_head *tmp; | ||
122 | |||
123 | conf = kmalloc (sizeof (*conf) + mddev->raid_disks*sizeof(dev_info_t), | ||
124 | GFP_KERNEL); | ||
125 | if (!conf) | ||
126 | goto out; | ||
127 | memset(conf, 0, sizeof(*conf) + mddev->raid_disks*sizeof(dev_info_t)); | ||
128 | mddev->private = conf; | ||
129 | |||
130 | /* | ||
131 | * Find the smallest device. | ||
132 | */ | ||
133 | |||
134 | conf->smallest = NULL; | ||
135 | cnt = 0; | ||
136 | mddev->array_size = 0; | ||
137 | |||
138 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
139 | int j = rdev->raid_disk; | ||
140 | dev_info_t *disk = conf->disks + j; | ||
141 | |||
142 | if (j < 0 || j > mddev->raid_disks || disk->rdev) { | ||
143 | printk("linear: disk numbering problem. Aborting!\n"); | ||
144 | goto out; | ||
145 | } | ||
146 | |||
147 | disk->rdev = rdev; | ||
148 | |||
149 | blk_queue_stack_limits(mddev->queue, | ||
150 | rdev->bdev->bd_disk->queue); | ||
151 | /* as we don't honour merge_bvec_fn, we must never risk | ||
152 | * violating it, so limit ->max_sector to one PAGE, as | ||
153 | * a one page request is never in violation. | ||
154 | */ | ||
155 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | ||
156 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | ||
157 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | ||
158 | |||
159 | disk->size = rdev->size; | ||
160 | mddev->array_size += rdev->size; | ||
161 | |||
162 | if (!conf->smallest || (disk->size < conf->smallest->size)) | ||
163 | conf->smallest = disk; | ||
164 | cnt++; | ||
165 | } | ||
166 | if (cnt != mddev->raid_disks) { | ||
167 | printk("linear: not enough drives present. Aborting!\n"); | ||
168 | goto out; | ||
169 | } | ||
170 | |||
171 | /* | ||
172 | * This code was restructured to work around a gcc-2.95.3 internal | ||
173 | * compiler error. Alter it with care. | ||
174 | */ | ||
175 | { | ||
176 | sector_t sz; | ||
177 | unsigned round; | ||
178 | unsigned long base; | ||
179 | |||
180 | sz = mddev->array_size; | ||
181 | base = conf->smallest->size; | ||
182 | round = sector_div(sz, base); | ||
183 | nb_zone = conf->nr_zones = sz + (round ? 1 : 0); | ||
184 | } | ||
185 | |||
186 | conf->hash_table = kmalloc (sizeof (dev_info_t*) * nb_zone, | ||
187 | GFP_KERNEL); | ||
188 | if (!conf->hash_table) | ||
189 | goto out; | ||
190 | |||
191 | /* | ||
192 | * Here we generate the linear hash table | ||
193 | */ | ||
194 | table = conf->hash_table; | ||
195 | start = 0; | ||
196 | curr_offset = 0; | ||
197 | for (i = 0; i < cnt; i++) { | ||
198 | dev_info_t *disk = conf->disks + i; | ||
199 | |||
200 | disk->offset = curr_offset; | ||
201 | curr_offset += disk->size; | ||
202 | |||
203 | /* 'curr_offset' is the end of this disk | ||
204 | * 'start' is the start of table | ||
205 | */ | ||
206 | while (start < curr_offset) { | ||
207 | *table++ = disk; | ||
208 | start += conf->smallest->size; | ||
209 | } | ||
210 | } | ||
211 | if (table-conf->hash_table != nb_zone) | ||
212 | BUG(); | ||
213 | |||
214 | blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); | ||
215 | mddev->queue->unplug_fn = linear_unplug; | ||
216 | mddev->queue->issue_flush_fn = linear_issue_flush; | ||
217 | return 0; | ||
218 | |||
219 | out: | ||
220 | if (conf) | ||
221 | kfree(conf); | ||
222 | return 1; | ||
223 | } | ||
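
A hedged numeric walk-through of the zoning above, assuming two hypothetical members (sizes in 1K blocks, as rdev->size is):

/*
 * disk0 = 102400 (100 MB), disk1 = 307200 (300 MB)
 * array_size = 409600, smallest->size = 102400, so
 * nb_zone = ceil(409600 / 102400) = 4 hash entries, one per
 * smallest-sized stretch of the array:
 *
 *   entry 0 -> disk0   (blocks      0 .. 102399)
 *   entry 1 -> disk1   (blocks 102400 .. 204799)
 *   entry 2 -> disk1   (blocks 204800 .. 307199)
 *   entry 3 -> disk1   (blocks 307200 .. 409599)
 *
 * which_dev() divides a block number by smallest->size to pick an
 * entry, then steps forward while the block lies past that disk's end.
 */
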
224 | |||
225 | static int linear_stop (mddev_t *mddev) | ||
226 | { | ||
227 | linear_conf_t *conf = mddev_to_conf(mddev); | ||
228 | |||
229 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | ||
230 | kfree(conf->hash_table); | ||
231 | kfree(conf); | ||
232 | |||
233 | return 0; | ||
234 | } | ||
235 | |||
236 | static int linear_make_request (request_queue_t *q, struct bio *bio) | ||
237 | { | ||
238 | mddev_t *mddev = q->queuedata; | ||
239 | dev_info_t *tmp_dev; | ||
240 | sector_t block; | ||
241 | |||
242 | if (bio_data_dir(bio)==WRITE) { | ||
243 | disk_stat_inc(mddev->gendisk, writes); | ||
244 | disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); | ||
245 | } else { | ||
246 | disk_stat_inc(mddev->gendisk, reads); | ||
247 | disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio)); | ||
248 | } | ||
249 | |||
250 | tmp_dev = which_dev(mddev, bio->bi_sector); | ||
251 | block = bio->bi_sector >> 1; | ||
252 | |||
253 | if (unlikely(block >= (tmp_dev->size + tmp_dev->offset) | ||
254 | || block < tmp_dev->offset)) { | ||
255 | char b[BDEVNAME_SIZE]; | ||
256 | |||
257 | printk("linear_make_request: Block %llu out of bounds on " | ||
258 | "dev %s size %llu offset %llu\n", | ||
259 | (unsigned long long)block, | ||
260 | bdevname(tmp_dev->rdev->bdev, b), | ||
261 | (unsigned long long)tmp_dev->size, | ||
262 | (unsigned long long)tmp_dev->offset); | ||
263 | bio_io_error(bio, bio->bi_size); | ||
264 | return 0; | ||
265 | } | ||
266 | if (unlikely(bio->bi_sector + (bio->bi_size >> 9) > | ||
267 | (tmp_dev->offset + tmp_dev->size)<<1)) { | ||
268 | /* This bio crosses a device boundary, so we have to | ||
269 | * split it. | ||
270 | */ | ||
271 | struct bio_pair *bp; | ||
272 | bp = bio_split(bio, bio_split_pool, | ||
273 | (bio->bi_sector + (bio->bi_size >> 9) - | ||
274 | (tmp_dev->offset + tmp_dev->size))<<1); | ||
275 | if (linear_make_request(q, &bp->bio1)) | ||
276 | generic_make_request(&bp->bio1); | ||
277 | if (linear_make_request(q, &bp->bio2)) | ||
278 | generic_make_request(&bp->bio2); | ||
279 | bio_pair_release(bp); | ||
280 | return 0; | ||
281 | } | ||
282 | |||
283 | bio->bi_bdev = tmp_dev->rdev->bdev; | ||
284 | bio->bi_sector = bio->bi_sector - (tmp_dev->offset << 1) + tmp_dev->rdev->data_offset; | ||
285 | |||
286 | return 1; | ||
287 | } | ||
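/*
 * Hedged worked example (not part of linear.c): suppose tmp_dev ends at
 * 1000K, i.e. sector 2000, and a 16-sector bio arrives at sector 1996.
 * It ends at sector 2012, 12 sectors past the boundary, so bio_split()
 * above is asked for a 4-sector first fragment (sectors 1996-1999); the
 * 12-sector remainder starts at sector 2000 and is remapped to the next
 * member on the recursive linear_make_request() call.
 */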
288 | |||
289 | static void linear_status (struct seq_file *seq, mddev_t *mddev) | ||
290 | { | ||
291 | |||
292 | #undef MD_DEBUG | ||
293 | #ifdef MD_DEBUG | ||
294 | int j; | ||
295 | linear_conf_t *conf = mddev_to_conf(mddev); | ||
296 | sector_t s = 0; | ||
297 | |||
298 | seq_printf(seq, " "); | ||
299 | for (j = 0; j < conf->nr_zones; j++) | ||
300 | { | ||
301 | char b[BDEVNAME_SIZE]; | ||
302 | s += conf->smallest_size; | ||
303 | seq_printf(seq, "[%s", | ||
304 | bdevname(conf->hash_table[j][0].rdev->bdev,b)); | ||
305 | |||
306 | if (s > conf->hash_table[j][0].offset + | ||
307 | conf->hash_table[j][0].size) | ||
308 | seq_printf(seq, "/%s] ", | ||
309 | bdevname(conf->hash_table[j][1].rdev->bdev,b)); | ||
310 | else | ||
311 | seq_printf(seq, "] "); | ||
312 | } | ||
313 | seq_printf(seq, "\n"); | ||
314 | #endif | ||
315 | seq_printf(seq, " %dk rounding", mddev->chunk_size/1024); | ||
316 | } | ||
317 | |||
318 | |||
319 | static mdk_personality_t linear_personality= | ||
320 | { | ||
321 | .name = "linear", | ||
322 | .owner = THIS_MODULE, | ||
323 | .make_request = linear_make_request, | ||
324 | .run = linear_run, | ||
325 | .stop = linear_stop, | ||
326 | .status = linear_status, | ||
327 | }; | ||
328 | |||
329 | static int __init linear_init (void) | ||
330 | { | ||
331 | return register_md_personality (LINEAR, &linear_personality); | ||
332 | } | ||
333 | |||
334 | static void linear_exit (void) | ||
335 | { | ||
336 | unregister_md_personality (LINEAR); | ||
337 | } | ||
338 | |||
339 | |||
340 | module_init(linear_init); | ||
341 | module_exit(linear_exit); | ||
342 | MODULE_LICENSE("GPL"); | ||
343 | MODULE_ALIAS("md-personality-1"); /* LINEAR */ | ||
diff --git a/drivers/md/md.c b/drivers/md/md.c new file mode 100644 index 000000000000..04562add1920 --- /dev/null +++ b/drivers/md/md.c | |||
@@ -0,0 +1,3766 @@ | |||
1 | /* | ||
2 | md.c : Multiple Devices driver for Linux | ||
3 | Copyright (C) 1998, 1999, 2000 Ingo Molnar | ||
4 | |||
5 | completely rewritten, based on the MD driver code from Marc Zyngier | ||
6 | |||
7 | Changes: | ||
8 | |||
9 | - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar | ||
10 | - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> | ||
11 | - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> | ||
12 | - kerneld support by Boris Tobotras <boris@xtalk.msk.su> | ||
13 | - kmod support by: Cyrus Durgin | ||
14 | - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> | ||
15 | - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> | ||
16 | |||
17 | - lots of fixes and improvements to the RAID1/RAID5 and generic | ||
18 | RAID code (such as request based resynchronization): | ||
19 | |||
20 | Neil Brown <neilb@cse.unsw.edu.au>. | ||
21 | |||
22 | This program is free software; you can redistribute it and/or modify | ||
23 | it under the terms of the GNU General Public License as published by | ||
24 | the Free Software Foundation; either version 2, or (at your option) | ||
25 | any later version. | ||
26 | |||
27 | You should have received a copy of the GNU General Public License | ||
28 | (for example /usr/src/linux/COPYING); if not, write to the Free | ||
29 | Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
30 | */ | ||
31 | |||
32 | #include <linux/module.h> | ||
33 | #include <linux/config.h> | ||
34 | #include <linux/linkage.h> | ||
35 | #include <linux/raid/md.h> | ||
36 | #include <linux/sysctl.h> | ||
37 | #include <linux/devfs_fs_kernel.h> | ||
38 | #include <linux/buffer_head.h> /* for invalidate_bdev */ | ||
39 | #include <linux/suspend.h> | ||
40 | |||
41 | #include <linux/init.h> | ||
42 | |||
43 | #ifdef CONFIG_KMOD | ||
44 | #include <linux/kmod.h> | ||
45 | #endif | ||
46 | |||
47 | #include <asm/unaligned.h> | ||
48 | |||
49 | #define MAJOR_NR MD_MAJOR | ||
50 | #define MD_DRIVER | ||
51 | |||
52 | /* 63 partitions with the alternate major number (mdp) */ | ||
53 | #define MdpMinorShift 6 | ||
54 | |||
55 | #define DEBUG 0 | ||
56 | #define dprintk(x...) ((void)(DEBUG && printk(x))) | ||
57 | |||
58 | |||
59 | #ifndef MODULE | ||
60 | static void autostart_arrays (int part); | ||
61 | #endif | ||
62 | |||
63 | static mdk_personality_t *pers[MAX_PERSONALITY]; | ||
64 | static DEFINE_SPINLOCK(pers_lock); | ||
65 | |||
66 | /* | ||
67 | * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' | ||
68 | * is 1000 KB/sec, so the extra system load does not show up that much. | ||
69 | * Increase it if you want to have more _guaranteed_ speed. Note that | ||
70 | * the RAID driver will use the maximum available bandwidth if the IO | ||
71 | * subsystem is idle. There is also an 'absolute maximum' reconstruction | ||
72 | * speed limit - in case reconstruction slows down your system despite | ||
73 | * idle IO detection. | ||
74 | * | ||
75 | * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. | ||
76 | */ | ||
77 | |||
78 | static int sysctl_speed_limit_min = 1000; | ||
79 | static int sysctl_speed_limit_max = 200000; | ||
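/*
 * Hedged user-space illustration (not part of md.c): the two limits
 * above are exported through procfs as noted in the comment, so raising
 * the guaranteed resync speed is a plain write to that file (values are
 * in KB/sec and the write needs root).
 */
#if 0
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/dev/raid/speed_limit_min", "w");

	if (!f)
		return 1;
	fprintf(f, "%d\n", 5000);	/* guarantee at least ~5 MB/sec */
	return fclose(f) ? 1 : 0;
}
#endif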
80 | |||
81 | static struct ctl_table_header *raid_table_header; | ||
82 | |||
83 | static ctl_table raid_table[] = { | ||
84 | { | ||
85 | .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, | ||
86 | .procname = "speed_limit_min", | ||
87 | .data = &sysctl_speed_limit_min, | ||
88 | .maxlen = sizeof(int), | ||
89 | .mode = 0644, | ||
90 | .proc_handler = &proc_dointvec, | ||
91 | }, | ||
92 | { | ||
93 | .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, | ||
94 | .procname = "speed_limit_max", | ||
95 | .data = &sysctl_speed_limit_max, | ||
96 | .maxlen = sizeof(int), | ||
97 | .mode = 0644, | ||
98 | .proc_handler = &proc_dointvec, | ||
99 | }, | ||
100 | { .ctl_name = 0 } | ||
101 | }; | ||
102 | |||
103 | static ctl_table raid_dir_table[] = { | ||
104 | { | ||
105 | .ctl_name = DEV_RAID, | ||
106 | .procname = "raid", | ||
107 | .maxlen = 0, | ||
108 | .mode = 0555, | ||
109 | .child = raid_table, | ||
110 | }, | ||
111 | { .ctl_name = 0 } | ||
112 | }; | ||
113 | |||
114 | static ctl_table raid_root_table[] = { | ||
115 | { | ||
116 | .ctl_name = CTL_DEV, | ||
117 | .procname = "dev", | ||
118 | .maxlen = 0, | ||
119 | .mode = 0555, | ||
120 | .child = raid_dir_table, | ||
121 | }, | ||
122 | { .ctl_name = 0 } | ||
123 | }; | ||
124 | |||
125 | static struct block_device_operations md_fops; | ||
126 | |||
127 | /* | ||
128 | * Allows iteration over all existing md arrays. | ||
129 | * all_mddevs_lock protects this list. | ||
130 | */ | ||
131 | static LIST_HEAD(all_mddevs); | ||
132 | static DEFINE_SPINLOCK(all_mddevs_lock); | ||
133 | |||
134 | |||
135 | /* | ||
136 | * iterates through all used mddevs in the system. | ||
137 | * We take care to grab the all_mddevs_lock whenever navigating | ||
138 | * the list, and to always hold a refcount when unlocked. | ||
139 | * Any code which breaks out of this loop will own | ||
140 | * a reference to the current mddev and must mddev_put it. | ||
141 | */ | ||
142 | #define ITERATE_MDDEV(mddev,tmp) \ | ||
143 | \ | ||
144 | for (({ spin_lock(&all_mddevs_lock); \ | ||
145 | tmp = all_mddevs.next; \ | ||
146 | mddev = NULL;}); \ | ||
147 | ({ if (tmp != &all_mddevs) \ | ||
148 | mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ | ||
149 | spin_unlock(&all_mddevs_lock); \ | ||
150 | if (mddev) mddev_put(mddev); \ | ||
151 | mddev = list_entry(tmp, mddev_t, all_mddevs); \ | ||
152 | tmp != &all_mddevs;}); \ | ||
153 | ({ spin_lock(&all_mddevs_lock); \ | ||
154 | tmp = tmp->next;}) \ | ||
155 | ) | ||
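/*
 * Hedged usage sketch (not part of md.c): walking every registered
 * array with ITERATE_MDDEV.  'tmp' is the list cursor the macro
 * advances; the macro takes a reference around each iteration, so a
 * body that runs to completion needs no explicit mddev_put(), while a
 * body that breaks out early still owns the current mddev, as the
 * comment above warns.  The function name is invented for illustration.
 */
#if 0
static void example_count_arrays(void)
{
	mddev_t *mddev;
	struct list_head *tmp;
	int n = 0;

	ITERATE_MDDEV(mddev, tmp)
		n++;
	printk(KERN_INFO "md: %d array(s) registered\n", n);
}
#endif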
156 | |||
157 | |||
158 | static int md_fail_request (request_queue_t *q, struct bio *bio) | ||
159 | { | ||
160 | bio_io_error(bio, bio->bi_size); | ||
161 | return 0; | ||
162 | } | ||
163 | |||
164 | static inline mddev_t *mddev_get(mddev_t *mddev) | ||
165 | { | ||
166 | atomic_inc(&mddev->active); | ||
167 | return mddev; | ||
168 | } | ||
169 | |||
170 | static void mddev_put(mddev_t *mddev) | ||
171 | { | ||
172 | if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) | ||
173 | return; | ||
174 | if (!mddev->raid_disks && list_empty(&mddev->disks)) { | ||
175 | list_del(&mddev->all_mddevs); | ||
176 | blk_put_queue(mddev->queue); | ||
177 | kfree(mddev); | ||
178 | } | ||
179 | spin_unlock(&all_mddevs_lock); | ||
180 | } | ||
181 | |||
182 | static mddev_t * mddev_find(dev_t unit) | ||
183 | { | ||
184 | mddev_t *mddev, *new = NULL; | ||
185 | |||
186 | retry: | ||
187 | spin_lock(&all_mddevs_lock); | ||
188 | list_for_each_entry(mddev, &all_mddevs, all_mddevs) | ||
189 | if (mddev->unit == unit) { | ||
190 | mddev_get(mddev); | ||
191 | spin_unlock(&all_mddevs_lock); | ||
192 | if (new) | ||
193 | kfree(new); | ||
194 | return mddev; | ||
195 | } | ||
196 | |||
197 | if (new) { | ||
198 | list_add(&new->all_mddevs, &all_mddevs); | ||
199 | spin_unlock(&all_mddevs_lock); | ||
200 | return new; | ||
201 | } | ||
202 | spin_unlock(&all_mddevs_lock); | ||
203 | |||
204 | new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL); | ||
205 | if (!new) | ||
206 | return NULL; | ||
207 | |||
208 | memset(new, 0, sizeof(*new)); | ||
209 | |||
210 | new->unit = unit; | ||
211 | if (MAJOR(unit) == MD_MAJOR) | ||
212 | new->md_minor = MINOR(unit); | ||
213 | else | ||
214 | new->md_minor = MINOR(unit) >> MdpMinorShift; | ||
215 | |||
216 | init_MUTEX(&new->reconfig_sem); | ||
217 | INIT_LIST_HEAD(&new->disks); | ||
218 | INIT_LIST_HEAD(&new->all_mddevs); | ||
219 | init_timer(&new->safemode_timer); | ||
220 | atomic_set(&new->active, 1); | ||
221 | |||
222 | new->queue = blk_alloc_queue(GFP_KERNEL); | ||
223 | if (!new->queue) { | ||
224 | kfree(new); | ||
225 | return NULL; | ||
226 | } | ||
227 | |||
228 | blk_queue_make_request(new->queue, md_fail_request); | ||
229 | |||
230 | goto retry; | ||
231 | } | ||
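/*
 * Hedged note (not part of the original source): mddev_find() allocates
 * 'new' outside all_mddevs_lock and then jumps back to 'retry', so the
 * list is re-scanned under the lock before the new entry is inserted.
 * If another CPU created the same unit in the meantime, the fresh
 * allocation is discarded and the existing, already-referenced mddev is
 * returned instead.
 */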
232 | |||
233 | static inline int mddev_lock(mddev_t * mddev) | ||
234 | { | ||
235 | return down_interruptible(&mddev->reconfig_sem); | ||
236 | } | ||
237 | |||
238 | static inline void mddev_lock_uninterruptible(mddev_t * mddev) | ||
239 | { | ||
240 | down(&mddev->reconfig_sem); | ||
241 | } | ||
242 | |||
243 | static inline int mddev_trylock(mddev_t * mddev) | ||
244 | { | ||
245 | return down_trylock(&mddev->reconfig_sem); | ||
246 | } | ||
247 | |||
248 | static inline void mddev_unlock(mddev_t * mddev) | ||
249 | { | ||
250 | up(&mddev->reconfig_sem); | ||
251 | |||
252 | if (mddev->thread) | ||
253 | md_wakeup_thread(mddev->thread); | ||
254 | } | ||
255 | |||
256 | mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) | ||
257 | { | ||
258 | mdk_rdev_t * rdev; | ||
259 | struct list_head *tmp; | ||
260 | |||
261 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
262 | if (rdev->desc_nr == nr) | ||
263 | return rdev; | ||
264 | } | ||
265 | return NULL; | ||
266 | } | ||
267 | |||
268 | static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) | ||
269 | { | ||
270 | struct list_head *tmp; | ||
271 | mdk_rdev_t *rdev; | ||
272 | |||
273 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
274 | if (rdev->bdev->bd_dev == dev) | ||
275 | return rdev; | ||
276 | } | ||
277 | return NULL; | ||
278 | } | ||
279 | |||
280 | inline static sector_t calc_dev_sboffset(struct block_device *bdev) | ||
281 | { | ||
282 | sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; | ||
283 | return MD_NEW_SIZE_BLOCKS(size); | ||
284 | } | ||
285 | |||
286 | static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) | ||
287 | { | ||
288 | sector_t size; | ||
289 | |||
290 | size = rdev->sb_offset; | ||
291 | |||
292 | if (chunk_size) | ||
293 | size &= ~((sector_t)chunk_size/1024 - 1); | ||
294 | return size; | ||
295 | } | ||
296 | |||
297 | static int alloc_disk_sb(mdk_rdev_t * rdev) | ||
298 | { | ||
299 | if (rdev->sb_page) | ||
300 | MD_BUG(); | ||
301 | |||
302 | rdev->sb_page = alloc_page(GFP_KERNEL); | ||
303 | if (!rdev->sb_page) { | ||
304 | printk(KERN_ALERT "md: out of memory.\n"); | ||
305 | return -EINVAL; | ||
306 | } | ||
307 | |||
308 | return 0; | ||
309 | } | ||
310 | |||
311 | static void free_disk_sb(mdk_rdev_t * rdev) | ||
312 | { | ||
313 | if (rdev->sb_page) { | ||
314 | page_cache_release(rdev->sb_page); | ||
315 | rdev->sb_loaded = 0; | ||
316 | rdev->sb_page = NULL; | ||
317 | rdev->sb_offset = 0; | ||
318 | rdev->size = 0; | ||
319 | } | ||
320 | } | ||
321 | |||
322 | |||
323 | static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) | ||
324 | { | ||
325 | if (bio->bi_size) | ||
326 | return 1; | ||
327 | |||
328 | complete((struct completion*)bio->bi_private); | ||
329 | return 0; | ||
330 | } | ||
331 | |||
332 | static int sync_page_io(struct block_device *bdev, sector_t sector, int size, | ||
333 | struct page *page, int rw) | ||
334 | { | ||
335 | struct bio *bio = bio_alloc(GFP_KERNEL, 1); | ||
336 | struct completion event; | ||
337 | int ret; | ||
338 | |||
339 | rw |= (1 << BIO_RW_SYNC); | ||
340 | |||
341 | bio->bi_bdev = bdev; | ||
342 | bio->bi_sector = sector; | ||
343 | bio_add_page(bio, page, size, 0); | ||
344 | init_completion(&event); | ||
345 | bio->bi_private = &event; | ||
346 | bio->bi_end_io = bi_complete; | ||
347 | submit_bio(rw, bio); | ||
348 | wait_for_completion(&event); | ||
349 | |||
350 | ret = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
351 | bio_put(bio); | ||
352 | return ret; | ||
353 | } | ||
354 | |||
355 | static int read_disk_sb(mdk_rdev_t * rdev) | ||
356 | { | ||
357 | char b[BDEVNAME_SIZE]; | ||
358 | if (!rdev->sb_page) { | ||
359 | MD_BUG(); | ||
360 | return -EINVAL; | ||
361 | } | ||
362 | if (rdev->sb_loaded) | ||
363 | return 0; | ||
364 | |||
365 | |||
366 | if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) | ||
367 | goto fail; | ||
368 | rdev->sb_loaded = 1; | ||
369 | return 0; | ||
370 | |||
371 | fail: | ||
372 | printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", | ||
373 | bdevname(rdev->bdev,b)); | ||
374 | return -EINVAL; | ||
375 | } | ||
376 | |||
377 | static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) | ||
378 | { | ||
379 | if ( (sb1->set_uuid0 == sb2->set_uuid0) && | ||
380 | (sb1->set_uuid1 == sb2->set_uuid1) && | ||
381 | (sb1->set_uuid2 == sb2->set_uuid2) && | ||
382 | (sb1->set_uuid3 == sb2->set_uuid3)) | ||
383 | |||
384 | return 1; | ||
385 | |||
386 | return 0; | ||
387 | } | ||
388 | |||
389 | |||
390 | static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) | ||
391 | { | ||
392 | int ret; | ||
393 | mdp_super_t *tmp1, *tmp2; | ||
394 | |||
395 | tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); | ||
396 | tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); | ||
397 | |||
398 | if (!tmp1 || !tmp2) { | ||
399 | ret = 0; | ||
400 | printk(KERN_INFO "md.c: out of memory while comparing superblocks!\n"); | ||
401 | goto abort; | ||
402 | } | ||
403 | |||
404 | *tmp1 = *sb1; | ||
405 | *tmp2 = *sb2; | ||
406 | |||
407 | /* | ||
408 | * nr_disks is not constant | ||
409 | */ | ||
410 | tmp1->nr_disks = 0; | ||
411 | tmp2->nr_disks = 0; | ||
412 | |||
413 | if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) | ||
414 | ret = 0; | ||
415 | else | ||
416 | ret = 1; | ||
417 | |||
418 | abort: | ||
419 | if (tmp1) | ||
420 | kfree(tmp1); | ||
421 | if (tmp2) | ||
422 | kfree(tmp2); | ||
423 | |||
424 | return ret; | ||
425 | } | ||
426 | |||
427 | static unsigned int calc_sb_csum(mdp_super_t * sb) | ||
428 | { | ||
429 | unsigned int disk_csum, csum; | ||
430 | |||
431 | disk_csum = sb->sb_csum; | ||
432 | sb->sb_csum = 0; | ||
433 | csum = csum_partial((void *)sb, MD_SB_BYTES, 0); | ||
434 | sb->sb_csum = disk_csum; | ||
435 | return csum; | ||
436 | } | ||
437 | |||
438 | |||
439 | /* | ||
440 | * Handle superblock details. | ||
441 | * We want to be able to handle multiple superblock formats | ||
442 | * so we have a common interface to them all, and an array of | ||
443 | * different handlers. | ||
444 | * We rely on user-space to write the initial superblock, and support | ||
445 | * reading and updating of superblocks. | ||
446 | * Interface methods are: | ||
447 | * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) | ||
448 | * loads and validates a superblock on dev. | ||
449 | * if refdev != NULL, compare superblocks on both devices | ||
450 | * Return: | ||
451 | * 0 - dev has a superblock that is compatible with refdev | ||
452 | * 1 - dev has a superblock that is compatible and newer than refdev | ||
453 | * so dev should be used as the refdev in future | ||
454 | * -EINVAL superblock incompatible or invalid | ||
455 | * -othererror e.g. -EIO | ||
456 | * | ||
457 | * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) | ||
458 | * Verify that dev is acceptable into mddev. | ||
459 | * The first time, mddev->raid_disks will be 0, and data from | ||
460 | * dev should be merged in. Subsequent calls check that dev | ||
461 | * is new enough. Return 0 or -EINVAL | ||
462 | * | ||
463 | * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) | ||
464 | * Update the superblock for rdev with data in mddev | ||
465 | * This does not write to disc. | ||
466 | * | ||
467 | */ | ||
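/*
 * Hedged sketch (not part of md.c): how the handler table declared
 * below is meant to be driven.  The index is the superblock major
 * version, so reading a device's metadata and merging it into a fresh
 * mddev is a load_super() followed by validate_super(); the helper
 * name is invented for illustration only.
 */
#if 0
static int example_read_metadata(mddev_t *mddev, mdk_rdev_t *rdev, int vers)
{
	int err;

	err = super_types[vers].load_super(rdev, NULL, mddev->minor_version);
	if (err < 0)
		return err;	/* -EINVAL or an I/O error */
	return super_types[vers].validate_super(mddev, rdev);
}
#endif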
468 | |||
469 | struct super_type { | ||
470 | char *name; | ||
471 | struct module *owner; | ||
472 | int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); | ||
473 | int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); | ||
474 | void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); | ||
475 | }; | ||
476 | |||
477 | /* | ||
478 | * load_super for 0.90.0 | ||
479 | */ | ||
480 | static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | ||
481 | { | ||
482 | char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; | ||
483 | mdp_super_t *sb; | ||
484 | int ret; | ||
485 | sector_t sb_offset; | ||
486 | |||
487 | /* | ||
488 | * Calculate the position of the superblock, | ||
489 | * it's at the end of the disk. | ||
490 | * | ||
491 | * It also happens to be a multiple of 4Kb. | ||
492 | */ | ||
493 | sb_offset = calc_dev_sboffset(rdev->bdev); | ||
494 | rdev->sb_offset = sb_offset; | ||
495 | |||
496 | ret = read_disk_sb(rdev); | ||
497 | if (ret) return ret; | ||
498 | |||
499 | ret = -EINVAL; | ||
500 | |||
501 | bdevname(rdev->bdev, b); | ||
502 | sb = (mdp_super_t*)page_address(rdev->sb_page); | ||
503 | |||
504 | if (sb->md_magic != MD_SB_MAGIC) { | ||
505 | printk(KERN_ERR "md: invalid raid superblock magic on %s\n", | ||
506 | b); | ||
507 | goto abort; | ||
508 | } | ||
509 | |||
510 | if (sb->major_version != 0 || | ||
511 | sb->minor_version != 90) { | ||
512 | printk(KERN_WARNING "Bad version number %d.%d on %s\n", | ||
513 | sb->major_version, sb->minor_version, | ||
514 | b); | ||
515 | goto abort; | ||
516 | } | ||
517 | |||
518 | if (sb->raid_disks <= 0) | ||
519 | goto abort; | ||
520 | |||
521 | if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) { | ||
522 | printk(KERN_WARNING "md: invalid superblock checksum on %s\n", | ||
523 | b); | ||
524 | goto abort; | ||
525 | } | ||
526 | |||
527 | rdev->preferred_minor = sb->md_minor; | ||
528 | rdev->data_offset = 0; | ||
529 | |||
530 | if (sb->level == LEVEL_MULTIPATH) | ||
531 | rdev->desc_nr = -1; | ||
532 | else | ||
533 | rdev->desc_nr = sb->this_disk.number; | ||
534 | |||
535 | if (refdev == 0) | ||
536 | ret = 1; | ||
537 | else { | ||
538 | __u64 ev1, ev2; | ||
539 | mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); | ||
540 | if (!uuid_equal(refsb, sb)) { | ||
541 | printk(KERN_WARNING "md: %s has different UUID to %s\n", | ||
542 | b, bdevname(refdev->bdev,b2)); | ||
543 | goto abort; | ||
544 | } | ||
545 | if (!sb_equal(refsb, sb)) { | ||
546 | printk(KERN_WARNING "md: %s has same UUID" | ||
547 | " but different superblock to %s\n", | ||
548 | b, bdevname(refdev->bdev, b2)); | ||
549 | goto abort; | ||
550 | } | ||
551 | ev1 = md_event(sb); | ||
552 | ev2 = md_event(refsb); | ||
553 | if (ev1 > ev2) | ||
554 | ret = 1; | ||
555 | else | ||
556 | ret = 0; | ||
557 | } | ||
558 | rdev->size = calc_dev_size(rdev, sb->chunk_size); | ||
559 | |||
560 | abort: | ||
561 | return ret; | ||
562 | } | ||
563 | |||
564 | /* | ||
565 | * validate_super for 0.90.0 | ||
566 | */ | ||
567 | static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | ||
568 | { | ||
569 | mdp_disk_t *desc; | ||
570 | mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); | ||
571 | |||
572 | if (mddev->raid_disks == 0) { | ||
573 | mddev->major_version = 0; | ||
574 | mddev->minor_version = sb->minor_version; | ||
575 | mddev->patch_version = sb->patch_version; | ||
576 | mddev->persistent = ! sb->not_persistent; | ||
577 | mddev->chunk_size = sb->chunk_size; | ||
578 | mddev->ctime = sb->ctime; | ||
579 | mddev->utime = sb->utime; | ||
580 | mddev->level = sb->level; | ||
581 | mddev->layout = sb->layout; | ||
582 | mddev->raid_disks = sb->raid_disks; | ||
583 | mddev->size = sb->size; | ||
584 | mddev->events = md_event(sb); | ||
585 | |||
586 | if (sb->state & (1<<MD_SB_CLEAN)) | ||
587 | mddev->recovery_cp = MaxSector; | ||
588 | else { | ||
589 | if (sb->events_hi == sb->cp_events_hi && | ||
590 | sb->events_lo == sb->cp_events_lo) { | ||
591 | mddev->recovery_cp = sb->recovery_cp; | ||
592 | } else | ||
593 | mddev->recovery_cp = 0; | ||
594 | } | ||
595 | |||
596 | memcpy(mddev->uuid+0, &sb->set_uuid0, 4); | ||
597 | memcpy(mddev->uuid+4, &sb->set_uuid1, 4); | ||
598 | memcpy(mddev->uuid+8, &sb->set_uuid2, 4); | ||
599 | memcpy(mddev->uuid+12,&sb->set_uuid3, 4); | ||
600 | |||
601 | mddev->max_disks = MD_SB_DISKS; | ||
602 | } else { | ||
603 | __u64 ev1; | ||
604 | ev1 = md_event(sb); | ||
605 | ++ev1; | ||
606 | if (ev1 < mddev->events) | ||
607 | return -EINVAL; | ||
608 | } | ||
609 | if (mddev->level != LEVEL_MULTIPATH) { | ||
610 | rdev->raid_disk = -1; | ||
611 | rdev->in_sync = rdev->faulty = 0; | ||
612 | desc = sb->disks + rdev->desc_nr; | ||
613 | |||
614 | if (desc->state & (1<<MD_DISK_FAULTY)) | ||
615 | rdev->faulty = 1; | ||
616 | else if (desc->state & (1<<MD_DISK_SYNC) && | ||
617 | desc->raid_disk < mddev->raid_disks) { | ||
618 | rdev->in_sync = 1; | ||
619 | rdev->raid_disk = desc->raid_disk; | ||
620 | } | ||
621 | } | ||
622 | return 0; | ||
623 | } | ||
624 | |||
625 | /* | ||
626 | * sync_super for 0.90.0 | ||
627 | */ | ||
628 | static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) | ||
629 | { | ||
630 | mdp_super_t *sb; | ||
631 | struct list_head *tmp; | ||
632 | mdk_rdev_t *rdev2; | ||
633 | int next_spare = mddev->raid_disks; | ||
634 | |||
635 | /* make rdev->sb match mddev data.. | ||
636 | * | ||
637 | * 1/ zero out disks | ||
638 | * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); | ||
639 | * 3/ any empty disks < next_spare become removed | ||
640 | * | ||
641 | * disks[0] gets initialised to REMOVED because | ||
642 | * we cannot be sure from other fields if it has | ||
643 | * been initialised or not. | ||
644 | */ | ||
645 | int i; | ||
646 | int active=0, working=0,failed=0,spare=0,nr_disks=0; | ||
647 | |||
648 | sb = (mdp_super_t*)page_address(rdev->sb_page); | ||
649 | |||
650 | memset(sb, 0, sizeof(*sb)); | ||
651 | |||
652 | sb->md_magic = MD_SB_MAGIC; | ||
653 | sb->major_version = mddev->major_version; | ||
654 | sb->minor_version = mddev->minor_version; | ||
655 | sb->patch_version = mddev->patch_version; | ||
656 | sb->gvalid_words = 0; /* ignored */ | ||
657 | memcpy(&sb->set_uuid0, mddev->uuid+0, 4); | ||
658 | memcpy(&sb->set_uuid1, mddev->uuid+4, 4); | ||
659 | memcpy(&sb->set_uuid2, mddev->uuid+8, 4); | ||
660 | memcpy(&sb->set_uuid3, mddev->uuid+12,4); | ||
661 | |||
662 | sb->ctime = mddev->ctime; | ||
663 | sb->level = mddev->level; | ||
664 | sb->size = mddev->size; | ||
665 | sb->raid_disks = mddev->raid_disks; | ||
666 | sb->md_minor = mddev->md_minor; | ||
667 | sb->not_persistent = !mddev->persistent; | ||
668 | sb->utime = mddev->utime; | ||
669 | sb->state = 0; | ||
670 | sb->events_hi = (mddev->events>>32); | ||
671 | sb->events_lo = (u32)mddev->events; | ||
672 | |||
673 | if (mddev->in_sync) | ||
674 | { | ||
675 | sb->recovery_cp = mddev->recovery_cp; | ||
676 | sb->cp_events_hi = (mddev->events>>32); | ||
677 | sb->cp_events_lo = (u32)mddev->events; | ||
678 | if (mddev->recovery_cp == MaxSector) | ||
679 | sb->state = (1<< MD_SB_CLEAN); | ||
680 | } else | ||
681 | sb->recovery_cp = 0; | ||
682 | |||
683 | sb->layout = mddev->layout; | ||
684 | sb->chunk_size = mddev->chunk_size; | ||
685 | |||
686 | sb->disks[0].state = (1<<MD_DISK_REMOVED); | ||
687 | ITERATE_RDEV(mddev,rdev2,tmp) { | ||
688 | mdp_disk_t *d; | ||
689 | if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty) | ||
690 | rdev2->desc_nr = rdev2->raid_disk; | ||
691 | else | ||
692 | rdev2->desc_nr = next_spare++; | ||
693 | d = &sb->disks[rdev2->desc_nr]; | ||
694 | nr_disks++; | ||
695 | d->number = rdev2->desc_nr; | ||
696 | d->major = MAJOR(rdev2->bdev->bd_dev); | ||
697 | d->minor = MINOR(rdev2->bdev->bd_dev); | ||
698 | if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty) | ||
699 | d->raid_disk = rdev2->raid_disk; | ||
700 | else | ||
701 | d->raid_disk = rdev2->desc_nr; /* compatibility */ | ||
702 | if (rdev2->faulty) { | ||
703 | d->state = (1<<MD_DISK_FAULTY); | ||
704 | failed++; | ||
705 | } else if (rdev2->in_sync) { | ||
706 | d->state = (1<<MD_DISK_ACTIVE); | ||
707 | d->state |= (1<<MD_DISK_SYNC); | ||
708 | active++; | ||
709 | working++; | ||
710 | } else { | ||
711 | d->state = 0; | ||
712 | spare++; | ||
713 | working++; | ||
714 | } | ||
715 | } | ||
716 | |||
717 | /* now set the "removed" and "faulty" bits on any missing devices */ | ||
718 | for (i=0 ; i < mddev->raid_disks ; i++) { | ||
719 | mdp_disk_t *d = &sb->disks[i]; | ||
720 | if (d->state == 0 && d->number == 0) { | ||
721 | d->number = i; | ||
722 | d->raid_disk = i; | ||
723 | d->state = (1<<MD_DISK_REMOVED); | ||
724 | d->state |= (1<<MD_DISK_FAULTY); | ||
725 | failed++; | ||
726 | } | ||
727 | } | ||
728 | sb->nr_disks = nr_disks; | ||
729 | sb->active_disks = active; | ||
730 | sb->working_disks = working; | ||
731 | sb->failed_disks = failed; | ||
732 | sb->spare_disks = spare; | ||
733 | |||
734 | sb->this_disk = sb->disks[rdev->desc_nr]; | ||
735 | sb->sb_csum = calc_sb_csum(sb); | ||
736 | } | ||
737 | |||
738 | /* | ||
739 | * version 1 superblock | ||
740 | */ | ||
741 | |||
742 | static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) | ||
743 | { | ||
744 | unsigned int disk_csum, csum; | ||
745 | unsigned long long newcsum; | ||
746 | int size = 256 + le32_to_cpu(sb->max_dev)*2; | ||
747 | unsigned int *isuper = (unsigned int*)sb; | ||
748 | int i; | ||
749 | |||
750 | disk_csum = sb->sb_csum; | ||
751 | sb->sb_csum = 0; | ||
752 | newcsum = 0; | ||
753 | for (i=0; size>=4; size -= 4 ) | ||
754 | newcsum += le32_to_cpu(*isuper++); | ||
755 | |||
756 | if (size == 2) | ||
757 | newcsum += le16_to_cpu(*(unsigned short*) isuper); | ||
758 | |||
759 | csum = (newcsum & 0xffffffff) + (newcsum >> 32); | ||
760 | sb->sb_csum = disk_csum; | ||
761 | return cpu_to_le32(csum); | ||
762 | } | ||
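/*
 * Hedged note (not part of md.c): the v1 checksum above is a plain sum
 * of little-endian 32-bit words over the 256-byte header plus two bytes
 * per device role, accumulated in 64 bits and folded once at the end,
 * e.g. newcsum = 0x123456789 folds to 0x23456789 + 0x1 = 0x2345678a.
 */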
763 | |||
764 | static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | ||
765 | { | ||
766 | struct mdp_superblock_1 *sb; | ||
767 | int ret; | ||
768 | sector_t sb_offset; | ||
769 | char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; | ||
770 | |||
771 | /* | ||
772 | * Calculate the position of the superblock. | ||
773 | * It is always aligned to a 4K boundary and | ||
774 | * depending on minor_version, it can be: | ||
775 | * 0: At least 8K, but less than 12K, from end of device | ||
776 | * 1: At start of device | ||
777 | * 2: 4K from start of device. | ||
778 | */ | ||
779 | switch(minor_version) { | ||
780 | case 0: | ||
781 | sb_offset = rdev->bdev->bd_inode->i_size >> 9; | ||
782 | sb_offset -= 8*2; | ||
783 | sb_offset &= ~(4*2-1); | ||
784 | /* convert from sectors to K */ | ||
785 | sb_offset /= 2; | ||
786 | break; | ||
787 | case 1: | ||
788 | sb_offset = 0; | ||
789 | break; | ||
790 | case 2: | ||
791 | sb_offset = 4; | ||
792 | break; | ||
793 | default: | ||
794 | return -EINVAL; | ||
795 | } | ||
796 | rdev->sb_offset = sb_offset; | ||
797 | |||
798 | ret = read_disk_sb(rdev); | ||
799 | if (ret) return ret; | ||
800 | |||
801 | |||
802 | sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); | ||
803 | |||
804 | if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || | ||
805 | sb->major_version != cpu_to_le32(1) || | ||
806 | le32_to_cpu(sb->max_dev) > (4096-256)/2 || | ||
807 | le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || | ||
808 | sb->feature_map != 0) | ||
809 | return -EINVAL; | ||
810 | |||
811 | if (calc_sb_1_csum(sb) != sb->sb_csum) { | ||
812 | printk("md: invalid superblock checksum on %s\n", | ||
813 | bdevname(rdev->bdev,b)); | ||
814 | return -EINVAL; | ||
815 | } | ||
816 | if (le64_to_cpu(sb->data_size) < 10) { | ||
817 | printk("md: data_size too small on %s\n", | ||
818 | bdevname(rdev->bdev,b)); | ||
819 | return -EINVAL; | ||
820 | } | ||
821 | rdev->preferred_minor = 0xffff; | ||
822 | rdev->data_offset = le64_to_cpu(sb->data_offset); | ||
823 | |||
824 | if (refdev == 0) | ||
825 | return 1; | ||
826 | else { | ||
827 | __u64 ev1, ev2; | ||
828 | struct mdp_superblock_1 *refsb = | ||
829 | (struct mdp_superblock_1*)page_address(refdev->sb_page); | ||
830 | |||
831 | if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || | ||
832 | sb->level != refsb->level || | ||
833 | sb->layout != refsb->layout || | ||
834 | sb->chunksize != refsb->chunksize) { | ||
835 | printk(KERN_WARNING "md: %s has strangely different" | ||
836 | " superblock to %s\n", | ||
837 | bdevname(rdev->bdev,b), | ||
838 | bdevname(refdev->bdev,b2)); | ||
839 | return -EINVAL; | ||
840 | } | ||
841 | ev1 = le64_to_cpu(sb->events); | ||
842 | ev2 = le64_to_cpu(refsb->events); | ||
843 | |||
844 | if (ev1 > ev2) | ||
845 | return 1; | ||
846 | } | ||
847 | if (minor_version) | ||
848 | rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; | ||
849 | else | ||
850 | rdev->size = rdev->sb_offset; | ||
851 | if (rdev->size < le64_to_cpu(sb->data_size)/2) | ||
852 | return -EINVAL; | ||
853 | rdev->size = le64_to_cpu(sb->data_size)/2; | ||
854 | if (le32_to_cpu(sb->chunksize)) | ||
855 | rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); | ||
856 | return 0; | ||
857 | } | ||
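/*
 * Hedged worked example (not part of md.c): for minor_version 0 on a
 * 1,000,000-sector device the arithmetic in super_1_load() above gives
 * sb_offset = ((1000000 - 16) & ~7) / 2 = 499992K, i.e. the superblock
 * sits 8K below the end of the device on a 4K boundary, matching the
 * "at least 8K, but less than 12K, from end of device" rule.
 */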
858 | |||
859 | static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | ||
860 | { | ||
861 | struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); | ||
862 | |||
863 | if (mddev->raid_disks == 0) { | ||
864 | mddev->major_version = 1; | ||
865 | mddev->patch_version = 0; | ||
866 | mddev->persistent = 1; | ||
867 | mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; | ||
868 | mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); | ||
869 | mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); | ||
870 | mddev->level = le32_to_cpu(sb->level); | ||
871 | mddev->layout = le32_to_cpu(sb->layout); | ||
872 | mddev->raid_disks = le32_to_cpu(sb->raid_disks); | ||
873 | mddev->size = le64_to_cpu(sb->size)/2; | ||
874 | mddev->events = le64_to_cpu(sb->events); | ||
875 | |||
876 | mddev->recovery_cp = le64_to_cpu(sb->resync_offset); | ||
877 | memcpy(mddev->uuid, sb->set_uuid, 16); | ||
878 | |||
879 | mddev->max_disks = (4096-256)/2; | ||
880 | } else { | ||
881 | __u64 ev1; | ||
882 | ev1 = le64_to_cpu(sb->events); | ||
883 | ++ev1; | ||
884 | if (ev1 < mddev->events) | ||
885 | return -EINVAL; | ||
886 | } | ||
887 | |||
888 | if (mddev->level != LEVEL_MULTIPATH) { | ||
889 | int role; | ||
890 | rdev->desc_nr = le32_to_cpu(sb->dev_number); | ||
891 | role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); | ||
892 | switch(role) { | ||
893 | case 0xffff: /* spare */ | ||
894 | rdev->in_sync = 0; | ||
895 | rdev->faulty = 0; | ||
896 | rdev->raid_disk = -1; | ||
897 | break; | ||
898 | case 0xfffe: /* faulty */ | ||
899 | rdev->in_sync = 0; | ||
900 | rdev->faulty = 1; | ||
901 | rdev->raid_disk = -1; | ||
902 | break; | ||
903 | default: | ||
904 | rdev->in_sync = 1; | ||
905 | rdev->faulty = 0; | ||
906 | rdev->raid_disk = role; | ||
907 | break; | ||
908 | } | ||
909 | } | ||
910 | return 0; | ||
911 | } | ||
912 | |||
913 | static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | ||
914 | { | ||
915 | struct mdp_superblock_1 *sb; | ||
916 | struct list_head *tmp; | ||
917 | mdk_rdev_t *rdev2; | ||
918 | int max_dev, i; | ||
919 | /* make rdev->sb match mddev and rdev data. */ | ||
920 | |||
921 | sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); | ||
922 | |||
923 | sb->feature_map = 0; | ||
924 | sb->pad0 = 0; | ||
925 | memset(sb->pad1, 0, sizeof(sb->pad1)); | ||
926 | memset(sb->pad2, 0, sizeof(sb->pad2)); | ||
927 | memset(sb->pad3, 0, sizeof(sb->pad3)); | ||
928 | |||
929 | sb->utime = cpu_to_le64((__u64)mddev->utime); | ||
930 | sb->events = cpu_to_le64(mddev->events); | ||
931 | if (mddev->in_sync) | ||
932 | sb->resync_offset = cpu_to_le64(mddev->recovery_cp); | ||
933 | else | ||
934 | sb->resync_offset = cpu_to_le64(0); | ||
935 | |||
936 | max_dev = 0; | ||
937 | ITERATE_RDEV(mddev,rdev2,tmp) | ||
938 | if (rdev2->desc_nr+1 > max_dev) | ||
939 | max_dev = rdev2->desc_nr+1; | ||
940 | |||
941 | sb->max_dev = cpu_to_le32(max_dev); | ||
942 | for (i=0; i<max_dev;i++) | ||
943 | sb->dev_roles[i] = cpu_to_le16(0xfffe); | ||
944 | |||
945 | ITERATE_RDEV(mddev,rdev2,tmp) { | ||
946 | i = rdev2->desc_nr; | ||
947 | if (rdev2->faulty) | ||
948 | sb->dev_roles[i] = cpu_to_le16(0xfffe); | ||
949 | else if (rdev2->in_sync) | ||
950 | sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); | ||
951 | else | ||
952 | sb->dev_roles[i] = cpu_to_le16(0xffff); | ||
953 | } | ||
954 | |||
955 | sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ | ||
956 | sb->sb_csum = calc_sb_1_csum(sb); | ||
957 | } | ||
958 | |||
959 | |||
960 | struct super_type super_types[] = { | ||
961 | [0] = { | ||
962 | .name = "0.90.0", | ||
963 | .owner = THIS_MODULE, | ||
964 | .load_super = super_90_load, | ||
965 | .validate_super = super_90_validate, | ||
966 | .sync_super = super_90_sync, | ||
967 | }, | ||
968 | [1] = { | ||
969 | .name = "md-1", | ||
970 | .owner = THIS_MODULE, | ||
971 | .load_super = super_1_load, | ||
972 | .validate_super = super_1_validate, | ||
973 | .sync_super = super_1_sync, | ||
974 | }, | ||
975 | }; | ||
976 | |||
977 | static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) | ||
978 | { | ||
979 | struct list_head *tmp; | ||
980 | mdk_rdev_t *rdev; | ||
981 | |||
982 | ITERATE_RDEV(mddev,rdev,tmp) | ||
983 | if (rdev->bdev->bd_contains == dev->bdev->bd_contains) | ||
984 | return rdev; | ||
985 | |||
986 | return NULL; | ||
987 | } | ||
988 | |||
989 | static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) | ||
990 | { | ||
991 | struct list_head *tmp; | ||
992 | mdk_rdev_t *rdev; | ||
993 | |||
994 | ITERATE_RDEV(mddev1,rdev,tmp) | ||
995 | if (match_dev_unit(mddev2, rdev)) | ||
996 | return 1; | ||
997 | |||
998 | return 0; | ||
999 | } | ||
1000 | |||
1001 | static LIST_HEAD(pending_raid_disks); | ||
1002 | |||
1003 | static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) | ||
1004 | { | ||
1005 | mdk_rdev_t *same_pdev; | ||
1006 | char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; | ||
1007 | |||
1008 | if (rdev->mddev) { | ||
1009 | MD_BUG(); | ||
1010 | return -EINVAL; | ||
1011 | } | ||
1012 | same_pdev = match_dev_unit(mddev, rdev); | ||
1013 | if (same_pdev) | ||
1014 | printk(KERN_WARNING | ||
1015 | "%s: WARNING: %s appears to be on the same physical" | ||
1016 | " disk as %s. True\n protection against single-disk" | ||
1017 | " failure might be compromised.\n", | ||
1018 | mdname(mddev), bdevname(rdev->bdev,b), | ||
1019 | bdevname(same_pdev->bdev,b2)); | ||
1020 | |||
1021 | /* Verify rdev->desc_nr is unique. | ||
1022 | * If it is -1, assign a free number, else | ||
1023 | * check number is not in use | ||
1024 | */ | ||
1025 | if (rdev->desc_nr < 0) { | ||
1026 | int choice = 0; | ||
1027 | if (mddev->pers) choice = mddev->raid_disks; | ||
1028 | while (find_rdev_nr(mddev, choice)) | ||
1029 | choice++; | ||
1030 | rdev->desc_nr = choice; | ||
1031 | } else { | ||
1032 | if (find_rdev_nr(mddev, rdev->desc_nr)) | ||
1033 | return -EBUSY; | ||
1034 | } | ||
1035 | |||
1036 | list_add(&rdev->same_set, &mddev->disks); | ||
1037 | rdev->mddev = mddev; | ||
1038 | printk(KERN_INFO "md: bind<%s>\n", bdevname(rdev->bdev,b)); | ||
1039 | return 0; | ||
1040 | } | ||
1041 | |||
1042 | static void unbind_rdev_from_array(mdk_rdev_t * rdev) | ||
1043 | { | ||
1044 | char b[BDEVNAME_SIZE]; | ||
1045 | if (!rdev->mddev) { | ||
1046 | MD_BUG(); | ||
1047 | return; | ||
1048 | } | ||
1049 | list_del_init(&rdev->same_set); | ||
1050 | printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); | ||
1051 | rdev->mddev = NULL; | ||
1052 | } | ||
1053 | |||
1054 | /* | ||
1055 | * prevent the device from being mounted, repartitioned or | ||
1056 | * otherwise reused by a RAID array (or any other kernel | ||
1057 | * subsystem), by bd_claiming the device. | ||
1058 | */ | ||
1059 | static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) | ||
1060 | { | ||
1061 | int err = 0; | ||
1062 | struct block_device *bdev; | ||
1063 | char b[BDEVNAME_SIZE]; | ||
1064 | |||
1065 | bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); | ||
1066 | if (IS_ERR(bdev)) { | ||
1067 | printk(KERN_ERR "md: could not open %s.\n", | ||
1068 | __bdevname(dev, b)); | ||
1069 | return PTR_ERR(bdev); | ||
1070 | } | ||
1071 | err = bd_claim(bdev, rdev); | ||
1072 | if (err) { | ||
1073 | printk(KERN_ERR "md: could not bd_claim %s.\n", | ||
1074 | bdevname(bdev, b)); | ||
1075 | blkdev_put(bdev); | ||
1076 | return err; | ||
1077 | } | ||
1078 | rdev->bdev = bdev; | ||
1079 | return err; | ||
1080 | } | ||
1081 | |||
1082 | static void unlock_rdev(mdk_rdev_t *rdev) | ||
1083 | { | ||
1084 | struct block_device *bdev = rdev->bdev; | ||
1085 | rdev->bdev = NULL; | ||
1086 | if (!bdev) | ||
1087 | MD_BUG(); | ||
1088 | bd_release(bdev); | ||
1089 | blkdev_put(bdev); | ||
1090 | } | ||
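/*
 * Hedged sketch (not part of md.c): the claim/release pairing the two
 * helpers above implement.  lock_rdev() opens the device by number and
 * bd_claim()s it with the rdev as holder, so it cannot be mounted or
 * repartitioned underneath the array; unlock_rdev() undoes both steps.
 * The function name is invented for illustration.
 */
#if 0
static int example_probe_device(mdk_rdev_t *rdev, dev_t dev)
{
	int err = lock_rdev(rdev, dev);	/* open_by_devnum + bd_claim */

	if (err)
		return err;
	/* ... rdev->bdev is now exclusively held by this rdev ... */
	unlock_rdev(rdev);		/* bd_release + blkdev_put */
	return 0;
}
#endif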
1091 | |||
1092 | void md_autodetect_dev(dev_t dev); | ||
1093 | |||
1094 | static void export_rdev(mdk_rdev_t * rdev) | ||
1095 | { | ||
1096 | char b[BDEVNAME_SIZE]; | ||
1097 | printk(KERN_INFO "md: export_rdev(%s)\n", | ||
1098 | bdevname(rdev->bdev,b)); | ||
1099 | if (rdev->mddev) | ||
1100 | MD_BUG(); | ||
1101 | free_disk_sb(rdev); | ||
1102 | list_del_init(&rdev->same_set); | ||
1103 | #ifndef MODULE | ||
1104 | md_autodetect_dev(rdev->bdev->bd_dev); | ||
1105 | #endif | ||
1106 | unlock_rdev(rdev); | ||
1107 | kfree(rdev); | ||
1108 | } | ||
1109 | |||
1110 | static void kick_rdev_from_array(mdk_rdev_t * rdev) | ||
1111 | { | ||
1112 | unbind_rdev_from_array(rdev); | ||
1113 | export_rdev(rdev); | ||
1114 | } | ||
1115 | |||
1116 | static void export_array(mddev_t *mddev) | ||
1117 | { | ||
1118 | struct list_head *tmp; | ||
1119 | mdk_rdev_t *rdev; | ||
1120 | |||
1121 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
1122 | if (!rdev->mddev) { | ||
1123 | MD_BUG(); | ||
1124 | continue; | ||
1125 | } | ||
1126 | kick_rdev_from_array(rdev); | ||
1127 | } | ||
1128 | if (!list_empty(&mddev->disks)) | ||
1129 | MD_BUG(); | ||
1130 | mddev->raid_disks = 0; | ||
1131 | mddev->major_version = 0; | ||
1132 | } | ||
1133 | |||
1134 | static void print_desc(mdp_disk_t *desc) | ||
1135 | { | ||
1136 | printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, | ||
1137 | desc->major,desc->minor,desc->raid_disk,desc->state); | ||
1138 | } | ||
1139 | |||
1140 | static void print_sb(mdp_super_t *sb) | ||
1141 | { | ||
1142 | int i; | ||
1143 | |||
1144 | printk(KERN_INFO | ||
1145 | "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", | ||
1146 | sb->major_version, sb->minor_version, sb->patch_version, | ||
1147 | sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, | ||
1148 | sb->ctime); | ||
1149 | printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", | ||
1150 | sb->level, sb->size, sb->nr_disks, sb->raid_disks, | ||
1151 | sb->md_minor, sb->layout, sb->chunk_size); | ||
1152 | printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" | ||
1153 | " FD:%d SD:%d CSUM:%08x E:%08lx\n", | ||
1154 | sb->utime, sb->state, sb->active_disks, sb->working_disks, | ||
1155 | sb->failed_disks, sb->spare_disks, | ||
1156 | sb->sb_csum, (unsigned long)sb->events_lo); | ||
1157 | |||
1158 | printk(KERN_INFO); | ||
1159 | for (i = 0; i < MD_SB_DISKS; i++) { | ||
1160 | mdp_disk_t *desc; | ||
1161 | |||
1162 | desc = sb->disks + i; | ||
1163 | if (desc->number || desc->major || desc->minor || | ||
1164 | desc->raid_disk || (desc->state && (desc->state != 4))) { | ||
1165 | printk(" D %2d: ", i); | ||
1166 | print_desc(desc); | ||
1167 | } | ||
1168 | } | ||
1169 | printk(KERN_INFO "md: THIS: "); | ||
1170 | print_desc(&sb->this_disk); | ||
1171 | |||
1172 | } | ||
1173 | |||
1174 | static void print_rdev(mdk_rdev_t *rdev) | ||
1175 | { | ||
1176 | char b[BDEVNAME_SIZE]; | ||
1177 | printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", | ||
1178 | bdevname(rdev->bdev,b), (unsigned long long)rdev->size, | ||
1179 | rdev->faulty, rdev->in_sync, rdev->desc_nr); | ||
1180 | if (rdev->sb_loaded) { | ||
1181 | printk(KERN_INFO "md: rdev superblock:\n"); | ||
1182 | print_sb((mdp_super_t*)page_address(rdev->sb_page)); | ||
1183 | } else | ||
1184 | printk(KERN_INFO "md: no rdev superblock!\n"); | ||
1185 | } | ||
1186 | |||
1187 | void md_print_devices(void) | ||
1188 | { | ||
1189 | struct list_head *tmp, *tmp2; | ||
1190 | mdk_rdev_t *rdev; | ||
1191 | mddev_t *mddev; | ||
1192 | char b[BDEVNAME_SIZE]; | ||
1193 | |||
1194 | printk("\n"); | ||
1195 | printk("md: **********************************\n"); | ||
1196 | printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); | ||
1197 | printk("md: **********************************\n"); | ||
1198 | ITERATE_MDDEV(mddev,tmp) { | ||
1199 | printk("%s: ", mdname(mddev)); | ||
1200 | |||
1201 | ITERATE_RDEV(mddev,rdev,tmp2) | ||
1202 | printk("<%s>", bdevname(rdev->bdev,b)); | ||
1203 | printk("\n"); | ||
1204 | |||
1205 | ITERATE_RDEV(mddev,rdev,tmp2) | ||
1206 | print_rdev(rdev); | ||
1207 | } | ||
1208 | printk("md: **********************************\n"); | ||
1209 | printk("\n"); | ||
1210 | } | ||
1211 | |||
1212 | |||
1213 | static int write_disk_sb(mdk_rdev_t * rdev) | ||
1214 | { | ||
1215 | char b[BDEVNAME_SIZE]; | ||
1216 | if (!rdev->sb_loaded) { | ||
1217 | MD_BUG(); | ||
1218 | return 1; | ||
1219 | } | ||
1220 | if (rdev->faulty) { | ||
1221 | MD_BUG(); | ||
1222 | return 1; | ||
1223 | } | ||
1224 | |||
1225 | dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", | ||
1226 | bdevname(rdev->bdev,b), | ||
1227 | (unsigned long long)rdev->sb_offset); | ||
1228 | |||
1229 | if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) | ||
1230 | return 0; | ||
1231 | |||
1232 | printk("md: write_disk_sb failed for device %s\n", | ||
1233 | bdevname(rdev->bdev,b)); | ||
1234 | return 1; | ||
1235 | } | ||
1236 | |||
1237 | static void sync_sbs(mddev_t * mddev) | ||
1238 | { | ||
1239 | mdk_rdev_t *rdev; | ||
1240 | struct list_head *tmp; | ||
1241 | |||
1242 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
1243 | super_types[mddev->major_version]. | ||
1244 | sync_super(mddev, rdev); | ||
1245 | rdev->sb_loaded = 1; | ||
1246 | } | ||
1247 | } | ||
1248 | |||
1249 | static void md_update_sb(mddev_t * mddev) | ||
1250 | { | ||
1251 | int err, count = 100; | ||
1252 | struct list_head *tmp; | ||
1253 | mdk_rdev_t *rdev; | ||
1254 | |||
1255 | mddev->sb_dirty = 0; | ||
1256 | repeat: | ||
1257 | mddev->utime = get_seconds(); | ||
1258 | mddev->events ++; | ||
1259 | |||
1260 | if (!mddev->events) { | ||
1261 | /* | ||
1262 | * oops, this 64-bit counter should never wrap. | ||
1263 | * Either we are in around ~1 trillion A.C., assuming | ||
1264 | * 1 reboot per second, or we have a bug: | ||
1265 | */ | ||
1266 | MD_BUG(); | ||
1267 | mddev->events --; | ||
1268 | } | ||
1269 | sync_sbs(mddev); | ||
1270 | |||
1271 | /* | ||
1272 | * do not write anything to disk if using | ||
1273 | * nonpersistent superblocks | ||
1274 | */ | ||
1275 | if (!mddev->persistent) | ||
1276 | return; | ||
1277 | |||
1278 | dprintk(KERN_INFO | ||
1279 | "md: updating %s RAID superblock on device (in sync %d)\n", | ||
1280 | mdname(mddev),mddev->in_sync); | ||
1281 | |||
1282 | err = 0; | ||
1283 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
1284 | char b[BDEVNAME_SIZE]; | ||
1285 | dprintk(KERN_INFO "md: "); | ||
1286 | if (rdev->faulty) | ||
1287 | dprintk("(skipping faulty "); | ||
1288 | |||
1289 | dprintk("%s ", bdevname(rdev->bdev,b)); | ||
1290 | if (!rdev->faulty) { | ||
1291 | err += write_disk_sb(rdev); | ||
1292 | } else | ||
1293 | dprintk(")\n"); | ||
1294 | if (!err && mddev->level == LEVEL_MULTIPATH) | ||
1295 | /* only need to write one superblock... */ | ||
1296 | break; | ||
1297 | } | ||
1298 | if (err) { | ||
1299 | if (--count) { | ||
1300 | printk(KERN_ERR "md: errors occurred during superblock" | ||
1301 | " update, repeating\n"); | ||
1302 | goto repeat; | ||
1303 | } | ||
1304 | printk(KERN_ERR \ | ||
1305 | "md: excessive errors occurred during superblock update, exiting\n"); | ||
1306 | } | ||
1307 | } | ||
1308 | |||
1309 | /* | ||
1310 | * Import a device. If 'super_format' >= 0, then sanity check the superblock | ||
1311 | * | ||
1312 | * mark the device faulty if: | ||
1313 | * | ||
1314 | * - the device is nonexistent (zero size) | ||
1315 | * - the device has no valid superblock | ||
1316 | * | ||
1317 | * a faulty rdev _never_ has rdev->sb set. | ||
1318 | */ | ||
1319 | static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) | ||
1320 | { | ||
1321 | char b[BDEVNAME_SIZE]; | ||
1322 | int err; | ||
1323 | mdk_rdev_t *rdev; | ||
1324 | sector_t size; | ||
1325 | |||
1326 | rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); | ||
1327 | if (!rdev) { | ||
1328 | printk(KERN_ERR "md: could not alloc mem for new device!\n"); | ||
1329 | return ERR_PTR(-ENOMEM); | ||
1330 | } | ||
1331 | memset(rdev, 0, sizeof(*rdev)); | ||
1332 | |||
1333 | if ((err = alloc_disk_sb(rdev))) | ||
1334 | goto abort_free; | ||
1335 | |||
1336 | err = lock_rdev(rdev, newdev); | ||
1337 | if (err) | ||
1338 | goto abort_free; | ||
1339 | |||
1340 | rdev->desc_nr = -1; | ||
1341 | rdev->faulty = 0; | ||
1342 | rdev->in_sync = 0; | ||
1343 | rdev->data_offset = 0; | ||
1344 | atomic_set(&rdev->nr_pending, 0); | ||
1345 | |||
1346 | size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; | ||
1347 | if (!size) { | ||
1348 | printk(KERN_WARNING | ||
1349 | "md: %s has zero or unknown size, marking faulty!\n", | ||
1350 | bdevname(rdev->bdev,b)); | ||
1351 | err = -EINVAL; | ||
1352 | goto abort_free; | ||
1353 | } | ||
1354 | |||
1355 | if (super_format >= 0) { | ||
1356 | err = super_types[super_format]. | ||
1357 | load_super(rdev, NULL, super_minor); | ||
1358 | if (err == -EINVAL) { | ||
1359 | printk(KERN_WARNING | ||
1360 | "md: %s has invalid sb, not importing!\n", | ||
1361 | bdevname(rdev->bdev,b)); | ||
1362 | goto abort_free; | ||
1363 | } | ||
1364 | if (err < 0) { | ||
1365 | printk(KERN_WARNING | ||
1366 | "md: could not read %s's sb, not importing!\n", | ||
1367 | bdevname(rdev->bdev,b)); | ||
1368 | goto abort_free; | ||
1369 | } | ||
1370 | } | ||
1371 | INIT_LIST_HEAD(&rdev->same_set); | ||
1372 | |||
1373 | return rdev; | ||
1374 | |||
1375 | abort_free: | ||
1376 | if (rdev->sb_page) { | ||
1377 | if (rdev->bdev) | ||
1378 | unlock_rdev(rdev); | ||
1379 | free_disk_sb(rdev); | ||
1380 | } | ||
1381 | kfree(rdev); | ||
1382 | return ERR_PTR(err); | ||
1383 | } | ||
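/*
 * Hedged sketch (not part of md.c): the usual import path.  A device is
 * turned into an rdev with its superblock read and sanity-checked by
 * md_import_device() above, then attached to an array; the helper name
 * and the 0.90 arguments are illustrative only.
 */
#if 0
static int example_attach(mddev_t *mddev, dev_t dev)
{
	mdk_rdev_t *rdev = md_import_device(dev, 0 /* v0.90 */, 0);

	if (IS_ERR(rdev))
		return PTR_ERR(rdev);
	return bind_rdev_to_array(rdev, mddev);	/* may return -EBUSY */
}
#endif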
1384 | |||
1385 | /* | ||
1386 | * Check a full RAID array for plausibility | ||
1387 | */ | ||
1388 | |||
1389 | |||
1390 | static int analyze_sbs(mddev_t * mddev) | ||
1391 | { | ||
1392 | int i; | ||
1393 | struct list_head *tmp; | ||
1394 | mdk_rdev_t *rdev, *freshest; | ||
1395 | char b[BDEVNAME_SIZE]; | ||
1396 | |||
1397 | freshest = NULL; | ||
1398 | ITERATE_RDEV(mddev,rdev,tmp) | ||
1399 | switch (super_types[mddev->major_version]. | ||
1400 | load_super(rdev, freshest, mddev->minor_version)) { | ||
1401 | case 1: | ||
1402 | freshest = rdev; | ||
1403 | break; | ||
1404 | case 0: | ||
1405 | break; | ||
1406 | default: | ||
1407 | printk( KERN_ERR \ | ||
1408 | "md: fatal superblock inconsistency in %s" | ||
1409 | " -- removing from array\n", | ||
1410 | bdevname(rdev->bdev,b)); | ||
1411 | kick_rdev_from_array(rdev); | ||
1412 | } | ||
1413 | |||
1414 | |||
1415 | super_types[mddev->major_version]. | ||
1416 | validate_super(mddev, freshest); | ||
1417 | |||
1418 | i = 0; | ||
1419 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
1420 | if (rdev != freshest) | ||
1421 | if (super_types[mddev->major_version]. | ||
1422 | validate_super(mddev, rdev)) { | ||
1423 | printk(KERN_WARNING "md: kicking non-fresh %s" | ||
1424 | " from array!\n", | ||
1425 | bdevname(rdev->bdev,b)); | ||
1426 | kick_rdev_from_array(rdev); | ||
1427 | continue; | ||
1428 | } | ||
1429 | if (mddev->level == LEVEL_MULTIPATH) { | ||
1430 | rdev->desc_nr = i++; | ||
1431 | rdev->raid_disk = rdev->desc_nr; | ||
1432 | rdev->in_sync = 1; | ||
1433 | } | ||
1434 | } | ||
1435 | |||
1436 | |||
1437 | |||
1438 | if (mddev->recovery_cp != MaxSector && | ||
1439 | mddev->level >= 1) | ||
1440 | printk(KERN_ERR "md: %s: raid array is not clean" | ||
1441 | " -- starting background reconstruction\n", | ||
1442 | mdname(mddev)); | ||
1443 | |||
1444 | return 0; | ||
1445 | } | ||
1446 | |||
1447 | int mdp_major = 0; | ||
1448 | |||
1449 | static struct kobject *md_probe(dev_t dev, int *part, void *data) | ||
1450 | { | ||
1451 | static DECLARE_MUTEX(disks_sem); | ||
1452 | mddev_t *mddev = mddev_find(dev); | ||
1453 | struct gendisk *disk; | ||
1454 | int partitioned = (MAJOR(dev) != MD_MAJOR); | ||
1455 | int shift = partitioned ? MdpMinorShift : 0; | ||
1456 | int unit = MINOR(dev) >> shift; | ||
1457 | |||
1458 | if (!mddev) | ||
1459 | return NULL; | ||
1460 | |||
1461 | down(&disks_sem); | ||
1462 | if (mddev->gendisk) { | ||
1463 | up(&disks_sem); | ||
1464 | mddev_put(mddev); | ||
1465 | return NULL; | ||
1466 | } | ||
1467 | disk = alloc_disk(1 << shift); | ||
1468 | if (!disk) { | ||
1469 | up(&disks_sem); | ||
1470 | mddev_put(mddev); | ||
1471 | return NULL; | ||
1472 | } | ||
1473 | disk->major = MAJOR(dev); | ||
1474 | disk->first_minor = unit << shift; | ||
1475 | if (partitioned) { | ||
1476 | sprintf(disk->disk_name, "md_d%d", unit); | ||
1477 | sprintf(disk->devfs_name, "md/d%d", unit); | ||
1478 | } else { | ||
1479 | sprintf(disk->disk_name, "md%d", unit); | ||
1480 | sprintf(disk->devfs_name, "md/%d", unit); | ||
1481 | } | ||
1482 | disk->fops = &md_fops; | ||
1483 | disk->private_data = mddev; | ||
1484 | disk->queue = mddev->queue; | ||
1485 | add_disk(disk); | ||
1486 | mddev->gendisk = disk; | ||
1487 | up(&disks_sem); | ||
1488 | return NULL; | ||
1489 | } | ||
1490 | |||
1491 | void md_wakeup_thread(mdk_thread_t *thread); | ||
1492 | |||
1493 | static void md_safemode_timeout(unsigned long data) | ||
1494 | { | ||
1495 | mddev_t *mddev = (mddev_t *) data; | ||
1496 | |||
1497 | mddev->safemode = 1; | ||
1498 | md_wakeup_thread(mddev->thread); | ||
1499 | } | ||
1500 | |||
1501 | |||
1502 | static int do_md_run(mddev_t * mddev) | ||
1503 | { | ||
1504 | int pnum, err; | ||
1505 | int chunk_size; | ||
1506 | struct list_head *tmp; | ||
1507 | mdk_rdev_t *rdev; | ||
1508 | struct gendisk *disk; | ||
1509 | char b[BDEVNAME_SIZE]; | ||
1510 | |||
1511 | if (list_empty(&mddev->disks)) { | ||
1512 | MD_BUG(); | ||
1513 | return -EINVAL; | ||
1514 | } | ||
1515 | |||
1516 | if (mddev->pers) | ||
1517 | return -EBUSY; | ||
1518 | |||
1519 | /* | ||
1520 | * Analyze all RAID superblock(s) | ||
1521 | */ | ||
1522 | if (!mddev->raid_disks && analyze_sbs(mddev)) { | ||
1523 | MD_BUG(); | ||
1524 | return -EINVAL; | ||
1525 | } | ||
1526 | |||
1527 | chunk_size = mddev->chunk_size; | ||
1528 | pnum = level_to_pers(mddev->level); | ||
1529 | |||
1530 | if ((pnum != MULTIPATH) && (pnum != RAID1)) { | ||
1531 | if (!chunk_size) { | ||
1532 | /* | ||
1533 | * 'default chunksize' in the old md code used to | ||
1534 | * be PAGE_SIZE, baaad. | ||
1535 | * we abort here to be on the safe side. We don't | ||
1536 | * want to continue the bad practice. | ||
1537 | */ | ||
1538 | printk(KERN_ERR | ||
1539 | "no chunksize specified, see 'man raidtab'\n"); | ||
1540 | return -EINVAL; | ||
1541 | } | ||
1542 | if (chunk_size > MAX_CHUNK_SIZE) { | ||
1543 | printk(KERN_ERR "too big chunk_size: %d > %d\n", | ||
1544 | chunk_size, MAX_CHUNK_SIZE); | ||
1545 | return -EINVAL; | ||
1546 | } | ||
1547 | /* | ||
1548 | * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE | ||
1549 | */ | ||
1550 | if ( (1 << ffz(~chunk_size)) != chunk_size) { | ||
1551 | MD_BUG(); | ||
1552 | return -EINVAL; | ||
1553 | } | ||
1554 | if (chunk_size < PAGE_SIZE) { | ||
1555 | printk(KERN_ERR "too small chunk_size: %d < %ld\n", | ||
1556 | chunk_size, PAGE_SIZE); | ||
1557 | return -EINVAL; | ||
1558 | } | ||
1559 | |||
1560 | /* devices must have minimum size of one chunk */ | ||
1561 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
1562 | if (rdev->faulty) | ||
1563 | continue; | ||
1564 | if (rdev->size < chunk_size / 1024) { | ||
1565 | printk(KERN_WARNING | ||
1566 | "md: Dev %s smaller than chunk_size:" | ||
1567 | " %lluk < %dk\n", | ||
1568 | bdevname(rdev->bdev,b), | ||
1569 | (unsigned long long)rdev->size, | ||
1570 | chunk_size / 1024); | ||
1571 | return -EINVAL; | ||
1572 | } | ||
1573 | } | ||
1574 | } | ||
1575 | |||
1576 | if (pnum >= MAX_PERSONALITY) { | ||
1577 | MD_BUG(); | ||
1578 | return -EINVAL; | ||
1579 | } | ||
1580 | |||
1581 | #ifdef CONFIG_KMOD | ||
1582 | if (!pers[pnum]) | ||
1583 | { | ||
1584 | request_module("md-personality-%d", pnum); | ||
1585 | } | ||
1586 | #endif | ||
1587 | |||
1588 | /* | ||
1589 | * Drop all container device buffers, from now on | ||
1590 | * the only valid external interface is through the md | ||
1591 | * device. | ||
1592 | * Also find largest hardsector size | ||
1593 | */ | ||
1594 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
1595 | if (rdev->faulty) | ||
1596 | continue; | ||
1597 | sync_blockdev(rdev->bdev); | ||
1598 | invalidate_bdev(rdev->bdev, 0); | ||
1599 | } | ||
1600 | |||
1601 | md_probe(mddev->unit, NULL, NULL); | ||
1602 | disk = mddev->gendisk; | ||
1603 | if (!disk) | ||
1604 | return -ENOMEM; | ||
1605 | |||
1606 | spin_lock(&pers_lock); | ||
1607 | if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) { | ||
1608 | spin_unlock(&pers_lock); | ||
1609 | printk(KERN_WARNING "md: personality %d is not loaded!\n", | ||
1610 | pnum); | ||
1611 | return -EINVAL; | ||
1612 | } | ||
1613 | |||
1614 | mddev->pers = pers[pnum]; | ||
1615 | spin_unlock(&pers_lock); | ||
1616 | |||
1617 | mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ | ||
1618 | |||
1619 | err = mddev->pers->run(mddev); | ||
1620 | if (err) { | ||
1621 | printk(KERN_ERR "md: pers->run() failed ...\n"); | ||
1622 | module_put(mddev->pers->owner); | ||
1623 | mddev->pers = NULL; | ||
1624 | return -EINVAL; | ||
1625 | } | ||
1626 | atomic_set(&mddev->writes_pending,0); | ||
1627 | mddev->safemode = 0; | ||
1628 | mddev->safemode_timer.function = md_safemode_timeout; | ||
1629 | mddev->safemode_timer.data = (unsigned long) mddev; | ||
1630 | mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ | ||
1631 | mddev->in_sync = 1; | ||
1632 | |||
1633 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
1634 | |||
1635 | if (mddev->sb_dirty) | ||
1636 | md_update_sb(mddev); | ||
1637 | |||
1638 | set_capacity(disk, mddev->array_size<<1); | ||
1639 | |||
1640 | /* If we call blk_queue_make_request here, it will | ||
1641 | * re-initialise max_sectors etc which may have been | ||
1642 | * refined inside ->run(). So just set the bits we need to set. | ||
1643 | * Most initialisation happened when we called | ||
1644 | * blk_queue_make_request(..., md_fail_request) | ||
1645 | * earlier. | ||
1646 | */ | ||
1647 | mddev->queue->queuedata = mddev; | ||
1648 | mddev->queue->make_request_fn = mddev->pers->make_request; | ||
1649 | |||
1650 | mddev->changed = 1; | ||
1651 | return 0; | ||
1652 | } | ||
1653 | |||
1654 | static int restart_array(mddev_t *mddev) | ||
1655 | { | ||
1656 | struct gendisk *disk = mddev->gendisk; | ||
1657 | int err; | ||
1658 | |||
1659 | /* | ||
1660 | * Complain if it has no devices | ||
1661 | */ | ||
1662 | err = -ENXIO; | ||
1663 | if (list_empty(&mddev->disks)) | ||
1664 | goto out; | ||
1665 | |||
1666 | if (mddev->pers) { | ||
1667 | err = -EBUSY; | ||
1668 | if (!mddev->ro) | ||
1669 | goto out; | ||
1670 | |||
1671 | mddev->safemode = 0; | ||
1672 | mddev->ro = 0; | ||
1673 | set_disk_ro(disk, 0); | ||
1674 | |||
1675 | printk(KERN_INFO "md: %s switched to read-write mode.\n", | ||
1676 | mdname(mddev)); | ||
1677 | /* | ||
1678 | * Kick recovery or resync if necessary | ||
1679 | */ | ||
1680 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
1681 | md_wakeup_thread(mddev->thread); | ||
1682 | err = 0; | ||
1683 | } else { | ||
1684 | printk(KERN_ERR "md: %s has no personality assigned.\n", | ||
1685 | mdname(mddev)); | ||
1686 | err = -EINVAL; | ||
1687 | } | ||
1688 | |||
1689 | out: | ||
1690 | return err; | ||
1691 | } | ||
1692 | |||
1693 | static int do_md_stop(mddev_t * mddev, int ro) | ||
1694 | { | ||
1695 | int err = 0; | ||
1696 | struct gendisk *disk = mddev->gendisk; | ||
1697 | |||
1698 | if (mddev->pers) { | ||
1699 | if (atomic_read(&mddev->active)>2) { | ||
1700 | printk("md: %s still in use.\n",mdname(mddev)); | ||
1701 | return -EBUSY; | ||
1702 | } | ||
1703 | |||
1704 | if (mddev->sync_thread) { | ||
1705 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
1706 | md_unregister_thread(mddev->sync_thread); | ||
1707 | mddev->sync_thread = NULL; | ||
1708 | } | ||
1709 | |||
1710 | del_timer_sync(&mddev->safemode_timer); | ||
1711 | |||
1712 | invalidate_partition(disk, 0); | ||
1713 | |||
1714 | if (ro) { | ||
1715 | err = -ENXIO; | ||
1716 | if (mddev->ro) | ||
1717 | goto out; | ||
1718 | mddev->ro = 1; | ||
1719 | } else { | ||
1720 | if (mddev->ro) | ||
1721 | set_disk_ro(disk, 0); | ||
1722 | blk_queue_make_request(mddev->queue, md_fail_request); | ||
1723 | mddev->pers->stop(mddev); | ||
1724 | module_put(mddev->pers->owner); | ||
1725 | mddev->pers = NULL; | ||
1726 | if (mddev->ro) | ||
1727 | mddev->ro = 0; | ||
1728 | } | ||
1729 | if (!mddev->in_sync) { | ||
1730 | /* mark array as shutdown cleanly */ | ||
1731 | mddev->in_sync = 1; | ||
1732 | md_update_sb(mddev); | ||
1733 | } | ||
1734 | if (ro) | ||
1735 | set_disk_ro(disk, 1); | ||
1736 | } | ||
1737 | /* | ||
1738 | * Free resources if final stop | ||
1739 | */ | ||
1740 | if (!ro) { | ||
1741 | struct gendisk *disk; | ||
1742 | printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); | ||
1743 | |||
1744 | export_array(mddev); | ||
1745 | |||
1746 | mddev->array_size = 0; | ||
1747 | disk = mddev->gendisk; | ||
1748 | if (disk) | ||
1749 | set_capacity(disk, 0); | ||
1750 | mddev->changed = 1; | ||
1751 | } else | ||
1752 | printk(KERN_INFO "md: %s switched to read-only mode.\n", | ||
1753 | mdname(mddev)); | ||
1754 | err = 0; | ||
1755 | out: | ||
1756 | return err; | ||
1757 | } | ||
1758 | |||
1759 | static void autorun_array(mddev_t *mddev) | ||
1760 | { | ||
1761 | mdk_rdev_t *rdev; | ||
1762 | struct list_head *tmp; | ||
1763 | int err; | ||
1764 | |||
1765 | if (list_empty(&mddev->disks)) { | ||
1766 | MD_BUG(); | ||
1767 | return; | ||
1768 | } | ||
1769 | |||
1770 | printk(KERN_INFO "md: running: "); | ||
1771 | |||
1772 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
1773 | char b[BDEVNAME_SIZE]; | ||
1774 | printk("<%s>", bdevname(rdev->bdev,b)); | ||
1775 | } | ||
1776 | printk("\n"); | ||
1777 | |||
1778 | err = do_md_run (mddev); | ||
1779 | if (err) { | ||
1780 | printk(KERN_WARNING "md: do_md_run() returned %d\n", err); | ||
1781 | do_md_stop (mddev, 0); | ||
1782 | } | ||
1783 | } | ||
1784 | |||
1785 | /* | ||
1786 | * let's try to run arrays based on all disks that have arrived | ||
1787 | * until now. (Those are in pending_raid_disks.) | ||
1788 | * | ||
1789 | * the method: pick the first pending disk, collect all disks with | ||
1790 | * the same UUID, remove all from the pending list and put them into | ||
1791 | * the 'same_array' list. Then order this list based on superblock | ||
1792 | * update time (freshest comes first), kick out 'old' disks and | ||
1793 | * compare superblocks. If everything's fine then run it. | ||
1794 | * | ||
1795 | * If "unit" is allocated, then bump its reference count | ||
1796 | */ | ||
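| /* | ||
|  * Example: members whose 0.90 superblocks describe the same array and | ||
|  * record preferred_minor 0 are gathered into 'candidates', bound to | ||
|  * md0 (MKDEV(MD_MAJOR, 0)) and started by autorun_array(). | ||
|  */ | ||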
1797 | static void autorun_devices(int part) | ||
1798 | { | ||
1799 | struct list_head candidates; | ||
1800 | struct list_head *tmp; | ||
1801 | mdk_rdev_t *rdev0, *rdev; | ||
1802 | mddev_t *mddev; | ||
1803 | char b[BDEVNAME_SIZE]; | ||
1804 | |||
1805 | printk(KERN_INFO "md: autorun ...\n"); | ||
1806 | while (!list_empty(&pending_raid_disks)) { | ||
1807 | dev_t dev; | ||
1808 | rdev0 = list_entry(pending_raid_disks.next, | ||
1809 | mdk_rdev_t, same_set); | ||
1810 | |||
1811 | printk(KERN_INFO "md: considering %s ...\n", | ||
1812 | bdevname(rdev0->bdev,b)); | ||
1813 | INIT_LIST_HEAD(&candidates); | ||
1814 | ITERATE_RDEV_PENDING(rdev,tmp) | ||
1815 | if (super_90_load(rdev, rdev0, 0) >= 0) { | ||
1816 | printk(KERN_INFO "md: adding %s ...\n", | ||
1817 | bdevname(rdev->bdev,b)); | ||
1818 | list_move(&rdev->same_set, &candidates); | ||
1819 | } | ||
1820 | /* | ||
1821 | * now we have a set of devices, with all of them having | ||
1822 | * mostly sane superblocks. It's time to allocate the | ||
1823 | * mddev. | ||
1824 | */ | ||
1825 | if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) { | ||
1826 | printk(KERN_INFO "md: unit number in %s is bad: %d\n", | ||
1827 | bdevname(rdev0->bdev, b), rdev0->preferred_minor); | ||
1828 | break; | ||
1829 | } | ||
1830 | if (part) | ||
1831 | dev = MKDEV(mdp_major, | ||
1832 | rdev0->preferred_minor << MdpMinorShift); | ||
1833 | else | ||
1834 | dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); | ||
1835 | |||
1836 | md_probe(dev, NULL, NULL); | ||
1837 | mddev = mddev_find(dev); | ||
1838 | if (!mddev) { | ||
1839 | printk(KERN_ERR | ||
1840 | "md: cannot allocate memory for md drive.\n"); | ||
1841 | break; | ||
1842 | } | ||
1843 | if (mddev_lock(mddev)) | ||
1844 | printk(KERN_WARNING "md: %s locked, cannot run\n", | ||
1845 | mdname(mddev)); | ||
1846 | else if (mddev->raid_disks || mddev->major_version | ||
1847 | || !list_empty(&mddev->disks)) { | ||
1848 | printk(KERN_WARNING | ||
1849 | "md: %s already running, cannot run %s\n", | ||
1850 | mdname(mddev), bdevname(rdev0->bdev,b)); | ||
1851 | mddev_unlock(mddev); | ||
1852 | } else { | ||
1853 | printk(KERN_INFO "md: created %s\n", mdname(mddev)); | ||
1854 | ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { | ||
1855 | list_del_init(&rdev->same_set); | ||
1856 | if (bind_rdev_to_array(rdev, mddev)) | ||
1857 | export_rdev(rdev); | ||
1858 | } | ||
1859 | autorun_array(mddev); | ||
1860 | mddev_unlock(mddev); | ||
1861 | } | ||
1862 | /* on success, the candidates list will be empty; on error | ||
1863 | * it won't be... | ||
1864 | */ | ||
1865 | ITERATE_RDEV_GENERIC(candidates,rdev,tmp) | ||
1866 | export_rdev(rdev); | ||
1867 | mddev_put(mddev); | ||
1868 | } | ||
1869 | printk(KERN_INFO "md: ... autorun DONE.\n"); | ||
1870 | } | ||
1871 | |||
1872 | /* | ||
1873 | * import RAID devices based on one partition | ||
1874 | * if possible, the array gets run as well. | ||
1875 | */ | ||
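| /* | ||
|  * This backs the (deprecated) START_ARRAY ioctl below: userspace passes | ||
|  * one member device and the remaining members are located through the | ||
|  * major/minor numbers recorded in its 0.90 superblock (sb->disks[]). | ||
|  */ | ||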
1876 | |||
1877 | static int autostart_array(dev_t startdev) | ||
1878 | { | ||
1879 | char b[BDEVNAME_SIZE]; | ||
1880 | int err = -EINVAL, i; | ||
1881 | mdp_super_t *sb = NULL; | ||
1882 | mdk_rdev_t *start_rdev = NULL, *rdev; | ||
1883 | |||
1884 | start_rdev = md_import_device(startdev, 0, 0); | ||
1885 | if (IS_ERR(start_rdev)) | ||
1886 | return err; | ||
1887 | |||
1888 | |||
1889 | /* NOTE: this can only work for 0.90.0 superblocks */ | ||
1890 | sb = (mdp_super_t*)page_address(start_rdev->sb_page); | ||
1891 | if (sb->major_version != 0 || | ||
1892 | sb->minor_version != 90 ) { | ||
1893 | printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n"); | ||
1894 | export_rdev(start_rdev); | ||
1895 | return err; | ||
1896 | } | ||
1897 | |||
1898 | if (start_rdev->faulty) { | ||
1899 | printk(KERN_WARNING | ||
1900 | "md: can not autostart based on faulty %s!\n", | ||
1901 | bdevname(start_rdev->bdev,b)); | ||
1902 | export_rdev(start_rdev); | ||
1903 | return err; | ||
1904 | } | ||
1905 | list_add(&start_rdev->same_set, &pending_raid_disks); | ||
1906 | |||
1907 | for (i = 0; i < MD_SB_DISKS; i++) { | ||
1908 | mdp_disk_t *desc = sb->disks + i; | ||
1909 | dev_t dev = MKDEV(desc->major, desc->minor); | ||
1910 | |||
1911 | if (!dev) | ||
1912 | continue; | ||
1913 | if (dev == startdev) | ||
1914 | continue; | ||
1915 | if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor) | ||
1916 | continue; | ||
1917 | rdev = md_import_device(dev, 0, 0); | ||
1918 | if (IS_ERR(rdev)) | ||
1919 | continue; | ||
1920 | |||
1921 | list_add(&rdev->same_set, &pending_raid_disks); | ||
1922 | } | ||
1923 | |||
1924 | /* | ||
1925 | * autorun_devices() reports failures via printk; no error code to return here | ||
1926 | */ | ||
1927 | autorun_devices(0); | ||
1928 | return 0; | ||
1929 | |||
1930 | } | ||
1931 | |||
1932 | |||
1933 | static int get_version(void __user * arg) | ||
1934 | { | ||
1935 | mdu_version_t ver; | ||
1936 | |||
1937 | ver.major = MD_MAJOR_VERSION; | ||
1938 | ver.minor = MD_MINOR_VERSION; | ||
1939 | ver.patchlevel = MD_PATCHLEVEL_VERSION; | ||
1940 | |||
1941 | if (copy_to_user(arg, &ver, sizeof(ver))) | ||
1942 | return -EFAULT; | ||
1943 | |||
1944 | return 0; | ||
1945 | } | ||
1946 | |||
1947 | static int get_array_info(mddev_t * mddev, void __user * arg) | ||
1948 | { | ||
1949 | mdu_array_info_t info; | ||
1950 | int nr,working,active,failed,spare; | ||
1951 | mdk_rdev_t *rdev; | ||
1952 | struct list_head *tmp; | ||
1953 | |||
1954 | nr=working=active=failed=spare=0; | ||
1955 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
1956 | nr++; | ||
1957 | if (rdev->faulty) | ||
1958 | failed++; | ||
1959 | else { | ||
1960 | working++; | ||
1961 | if (rdev->in_sync) | ||
1962 | active++; | ||
1963 | else | ||
1964 | spare++; | ||
1965 | } | ||
1966 | } | ||
1967 | |||
1968 | info.major_version = mddev->major_version; | ||
1969 | info.minor_version = mddev->minor_version; | ||
1970 | info.patch_version = MD_PATCHLEVEL_VERSION; | ||
1971 | info.ctime = mddev->ctime; | ||
1972 | info.level = mddev->level; | ||
1973 | info.size = mddev->size; | ||
1974 | info.nr_disks = nr; | ||
1975 | info.raid_disks = mddev->raid_disks; | ||
1976 | info.md_minor = mddev->md_minor; | ||
1977 | info.not_persistent= !mddev->persistent; | ||
1978 | |||
1979 | info.utime = mddev->utime; | ||
1980 | info.state = 0; | ||
1981 | if (mddev->in_sync) | ||
1982 | info.state = (1<<MD_SB_CLEAN); | ||
1983 | info.active_disks = active; | ||
1984 | info.working_disks = working; | ||
1985 | info.failed_disks = failed; | ||
1986 | info.spare_disks = spare; | ||
1987 | |||
1988 | info.layout = mddev->layout; | ||
1989 | info.chunk_size = mddev->chunk_size; | ||
1990 | |||
1991 | if (copy_to_user(arg, &info, sizeof(info))) | ||
1992 | return -EFAULT; | ||
1993 | |||
1994 | return 0; | ||
1995 | } | ||
1996 | |||
1997 | static int get_disk_info(mddev_t * mddev, void __user * arg) | ||
1998 | { | ||
1999 | mdu_disk_info_t info; | ||
2000 | unsigned int nr; | ||
2001 | mdk_rdev_t *rdev; | ||
2002 | |||
2003 | if (copy_from_user(&info, arg, sizeof(info))) | ||
2004 | return -EFAULT; | ||
2005 | |||
2006 | nr = info.number; | ||
2007 | |||
2008 | rdev = find_rdev_nr(mddev, nr); | ||
2009 | if (rdev) { | ||
2010 | info.major = MAJOR(rdev->bdev->bd_dev); | ||
2011 | info.minor = MINOR(rdev->bdev->bd_dev); | ||
2012 | info.raid_disk = rdev->raid_disk; | ||
2013 | info.state = 0; | ||
2014 | if (rdev->faulty) | ||
2015 | info.state |= (1<<MD_DISK_FAULTY); | ||
2016 | else if (rdev->in_sync) { | ||
2017 | info.state |= (1<<MD_DISK_ACTIVE); | ||
2018 | info.state |= (1<<MD_DISK_SYNC); | ||
2019 | } | ||
2020 | } else { | ||
2021 | info.major = info.minor = 0; | ||
2022 | info.raid_disk = -1; | ||
2023 | info.state = (1<<MD_DISK_REMOVED); | ||
2024 | } | ||
2025 | |||
2026 | if (copy_to_user(arg, &info, sizeof(info))) | ||
2027 | return -EFAULT; | ||
2028 | |||
2029 | return 0; | ||
2030 | } | ||
2031 | |||
2032 | static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | ||
2033 | { | ||
2034 | char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; | ||
2035 | mdk_rdev_t *rdev; | ||
2036 | dev_t dev = MKDEV(info->major,info->minor); | ||
2037 | |||
2038 | if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) | ||
2039 | return -EOVERFLOW; | ||
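| /* The MAJOR()/MINOR() round trip above catches major/minor values that | ||
|  * MKDEV() would silently truncate because they do not fit in a dev_t. */ | ||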
2040 | |||
2041 | if (!mddev->raid_disks) { | ||
2042 | int err; | ||
2043 | /* expecting a device which has a superblock */ | ||
2044 | rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); | ||
2045 | if (IS_ERR(rdev)) { | ||
2046 | printk(KERN_WARNING | ||
2047 | "md: md_import_device returned %ld\n", | ||
2048 | PTR_ERR(rdev)); | ||
2049 | return PTR_ERR(rdev); | ||
2050 | } | ||
2051 | if (!list_empty(&mddev->disks)) { | ||
2052 | mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, | ||
2053 | mdk_rdev_t, same_set); | ||
2054 | int err = super_types[mddev->major_version] | ||
2055 | .load_super(rdev, rdev0, mddev->minor_version); | ||
2056 | if (err < 0) { | ||
2057 | printk(KERN_WARNING | ||
2058 | "md: %s has different UUID to %s\n", | ||
2059 | bdevname(rdev->bdev,b), | ||
2060 | bdevname(rdev0->bdev,b2)); | ||
2061 | export_rdev(rdev); | ||
2062 | return -EINVAL; | ||
2063 | } | ||
2064 | } | ||
2065 | err = bind_rdev_to_array(rdev, mddev); | ||
2066 | if (err) | ||
2067 | export_rdev(rdev); | ||
2068 | return err; | ||
2069 | } | ||
2070 | |||
2071 | /* | ||
2072 | * add_new_disk can be used once the array is assembled | ||
2073 | * to add "hot spares". They must already have a superblock | ||
2074 | * written | ||
2075 | */ | ||
2076 | if (mddev->pers) { | ||
2077 | int err; | ||
2078 | if (!mddev->pers->hot_add_disk) { | ||
2079 | printk(KERN_WARNING | ||
2080 | "%s: personality does not support diskops!\n", | ||
2081 | mdname(mddev)); | ||
2082 | return -EINVAL; | ||
2083 | } | ||
2084 | rdev = md_import_device(dev, mddev->major_version, | ||
2085 | mddev->minor_version); | ||
2086 | if (IS_ERR(rdev)) { | ||
2087 | printk(KERN_WARNING | ||
2088 | "md: md_import_device returned %ld\n", | ||
2089 | PTR_ERR(rdev)); | ||
2090 | return PTR_ERR(rdev); | ||
2091 | } | ||
2092 | rdev->in_sync = 0; /* just to be sure */ | ||
2093 | rdev->raid_disk = -1; | ||
2094 | err = bind_rdev_to_array(rdev, mddev); | ||
2095 | if (err) | ||
2096 | export_rdev(rdev); | ||
2097 | if (mddev->thread) | ||
2098 | md_wakeup_thread(mddev->thread); | ||
2099 | return err; | ||
2100 | } | ||
2101 | |||
2102 | /* otherwise, add_new_disk is only allowed | ||
2103 | * for major_version==0 superblocks | ||
2104 | */ | ||
2105 | if (mddev->major_version != 0) { | ||
2106 | printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", | ||
2107 | mdname(mddev)); | ||
2108 | return -EINVAL; | ||
2109 | } | ||
2110 | |||
2111 | if (!(info->state & (1<<MD_DISK_FAULTY))) { | ||
2112 | int err; | ||
2113 | rdev = md_import_device (dev, -1, 0); | ||
2114 | if (IS_ERR(rdev)) { | ||
2115 | printk(KERN_WARNING | ||
2116 | "md: error, md_import_device() returned %ld\n", | ||
2117 | PTR_ERR(rdev)); | ||
2118 | return PTR_ERR(rdev); | ||
2119 | } | ||
2120 | rdev->desc_nr = info->number; | ||
2121 | if (info->raid_disk < mddev->raid_disks) | ||
2122 | rdev->raid_disk = info->raid_disk; | ||
2123 | else | ||
2124 | rdev->raid_disk = -1; | ||
2125 | |||
2126 | rdev->faulty = 0; | ||
2127 | if (rdev->raid_disk < mddev->raid_disks) | ||
2128 | rdev->in_sync = (info->state & (1<<MD_DISK_SYNC)); | ||
2129 | else | ||
2130 | rdev->in_sync = 0; | ||
2131 | |||
2132 | err = bind_rdev_to_array(rdev, mddev); | ||
2133 | if (err) { | ||
2134 | export_rdev(rdev); | ||
2135 | return err; | ||
2136 | } | ||
2137 | |||
2138 | if (!mddev->persistent) { | ||
2139 | printk(KERN_INFO "md: nonpersistent superblock ...\n"); | ||
2140 | rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; | ||
2141 | } else | ||
2142 | rdev->sb_offset = calc_dev_sboffset(rdev->bdev); | ||
2143 | rdev->size = calc_dev_size(rdev, mddev->chunk_size); | ||
2144 | |||
2145 | if (!mddev->size || (mddev->size > rdev->size)) | ||
2146 | mddev->size = rdev->size; | ||
2147 | } | ||
2148 | |||
2149 | return 0; | ||
2150 | } | ||
2151 | |||
2152 | static int hot_remove_disk(mddev_t * mddev, dev_t dev) | ||
2153 | { | ||
2154 | char b[BDEVNAME_SIZE]; | ||
2155 | mdk_rdev_t *rdev; | ||
2156 | |||
2157 | if (!mddev->pers) | ||
2158 | return -ENODEV; | ||
2159 | |||
2160 | rdev = find_rdev(mddev, dev); | ||
2161 | if (!rdev) | ||
2162 | return -ENXIO; | ||
2163 | |||
2164 | if (rdev->raid_disk >= 0) | ||
2165 | goto busy; | ||
2166 | |||
2167 | kick_rdev_from_array(rdev); | ||
2168 | md_update_sb(mddev); | ||
2169 | |||
2170 | return 0; | ||
2171 | busy: | ||
2172 | printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n", | ||
2173 | bdevname(rdev->bdev,b), mdname(mddev)); | ||
2174 | return -EBUSY; | ||
2175 | } | ||
2176 | |||
2177 | static int hot_add_disk(mddev_t * mddev, dev_t dev) | ||
2178 | { | ||
2179 | char b[BDEVNAME_SIZE]; | ||
2180 | int err; | ||
2181 | unsigned int size; | ||
2182 | mdk_rdev_t *rdev; | ||
2183 | |||
2184 | if (!mddev->pers) | ||
2185 | return -ENODEV; | ||
2186 | |||
2187 | if (mddev->major_version != 0) { | ||
2188 | printk(KERN_WARNING "%s: HOT_ADD may only be used with" | ||
2189 | " version-0 superblocks.\n", | ||
2190 | mdname(mddev)); | ||
2191 | return -EINVAL; | ||
2192 | } | ||
2193 | if (!mddev->pers->hot_add_disk) { | ||
2194 | printk(KERN_WARNING | ||
2195 | "%s: personality does not support diskops!\n", | ||
2196 | mdname(mddev)); | ||
2197 | return -EINVAL; | ||
2198 | } | ||
2199 | |||
2200 | rdev = md_import_device (dev, -1, 0); | ||
2201 | if (IS_ERR(rdev)) { | ||
2202 | printk(KERN_WARNING | ||
2203 | "md: error, md_import_device() returned %ld\n", | ||
2204 | PTR_ERR(rdev)); | ||
2205 | return -EINVAL; | ||
2206 | } | ||
2207 | |||
2208 | if (mddev->persistent) | ||
2209 | rdev->sb_offset = calc_dev_sboffset(rdev->bdev); | ||
2210 | else | ||
2211 | rdev->sb_offset = | ||
2212 | rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; | ||
2213 | |||
2214 | size = calc_dev_size(rdev, mddev->chunk_size); | ||
2215 | rdev->size = size; | ||
2216 | |||
2217 | if (size < mddev->size) { | ||
2218 | printk(KERN_WARNING | ||
2219 | "%s: disk size %llu blocks < array size %llu\n", | ||
2220 | mdname(mddev), (unsigned long long)size, | ||
2221 | (unsigned long long)mddev->size); | ||
2222 | err = -ENOSPC; | ||
2223 | goto abort_export; | ||
2224 | } | ||
2225 | |||
2226 | if (rdev->faulty) { | ||
2227 | printk(KERN_WARNING | ||
2228 | "md: can not hot-add faulty %s disk to %s!\n", | ||
2229 | bdevname(rdev->bdev,b), mdname(mddev)); | ||
2230 | err = -EINVAL; | ||
2231 | goto abort_export; | ||
2232 | } | ||
2233 | rdev->in_sync = 0; | ||
2234 | rdev->desc_nr = -1; | ||
2235 | bind_rdev_to_array(rdev, mddev); | ||
2236 | |||
2237 | /* | ||
2238 | * The rest should better be atomic, we can have disk failures | ||
2239 | * noticed in interrupt contexts ... | ||
2240 | */ | ||
2241 | |||
2242 | if (rdev->desc_nr == mddev->max_disks) { | ||
2243 | printk(KERN_WARNING "%s: can not hot-add to full array!\n", | ||
2244 | mdname(mddev)); | ||
2245 | err = -EBUSY; | ||
2246 | goto abort_unbind_export; | ||
2247 | } | ||
2248 | |||
2249 | rdev->raid_disk = -1; | ||
2250 | |||
2251 | md_update_sb(mddev); | ||
2252 | |||
2253 | /* | ||
2254 | * Kick recovery, maybe this spare has to be added to the | ||
2255 | * array immediately. | ||
2256 | */ | ||
2257 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
2258 | md_wakeup_thread(mddev->thread); | ||
2259 | |||
2260 | return 0; | ||
2261 | |||
2262 | abort_unbind_export: | ||
2263 | unbind_rdev_from_array(rdev); | ||
2264 | |||
2265 | abort_export: | ||
2266 | export_rdev(rdev); | ||
2267 | return err; | ||
2268 | } | ||
2269 | |||
2270 | /* | ||
2271 | * set_array_info is used in two different ways. | ||
2272 | * The original usage is when creating a new array. | ||
2273 | * In this usage, raid_disks is > 0 and it, together with | ||
2274 | *  level, size, not_persistent, layout and chunksize, determines the | ||
2275 | * shape of the array. | ||
2276 | * This will always create an array with a type-0.90.0 superblock. | ||
2277 | * The newer usage is when assembling an array. | ||
2278 | * In this case raid_disks will be 0, and the major_version field is | ||
2279 | * used to determine which style super-blocks are to be found on the devices. | ||
2280 | * The minor and patch _version numbers are also kept in case the | ||
2281 | * super_block handler wishes to interpret them. | ||
2282 | */ | ||
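| /* | ||
|  * For instance, an assembly tool (e.g. mdadm) would issue SET_ARRAY_INFO | ||
|  * with raid_disks == 0 and major_version == 0 to select the 0.90 | ||
|  * superblock handler, then ADD_NEW_DISK for each member, then RUN_ARRAY. | ||
|  */ | ||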
2283 | static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) | ||
2284 | { | ||
2285 | |||
2286 | if (info->raid_disks == 0) { | ||
2287 | /* just setting version number for superblock loading */ | ||
2288 | if (info->major_version < 0 || | ||
2289 | info->major_version >= sizeof(super_types)/sizeof(super_types[0]) || | ||
2290 | super_types[info->major_version].name == NULL) { | ||
2291 | /* maybe try to auto-load a module? */ | ||
2292 | printk(KERN_INFO | ||
2293 | "md: superblock version %d not known\n", | ||
2294 | info->major_version); | ||
2295 | return -EINVAL; | ||
2296 | } | ||
2297 | mddev->major_version = info->major_version; | ||
2298 | mddev->minor_version = info->minor_version; | ||
2299 | mddev->patch_version = info->patch_version; | ||
2300 | return 0; | ||
2301 | } | ||
2302 | mddev->major_version = MD_MAJOR_VERSION; | ||
2303 | mddev->minor_version = MD_MINOR_VERSION; | ||
2304 | mddev->patch_version = MD_PATCHLEVEL_VERSION; | ||
2305 | mddev->ctime = get_seconds(); | ||
2306 | |||
2307 | mddev->level = info->level; | ||
2308 | mddev->size = info->size; | ||
2309 | mddev->raid_disks = info->raid_disks; | ||
2310 | /* don't set md_minor, it is determined by which /dev/md* was | ||
2311 | * opened | ||
2312 | */ | ||
2313 | if (info->state & (1<<MD_SB_CLEAN)) | ||
2314 | mddev->recovery_cp = MaxSector; | ||
2315 | else | ||
2316 | mddev->recovery_cp = 0; | ||
2317 | mddev->persistent = ! info->not_persistent; | ||
2318 | |||
2319 | mddev->layout = info->layout; | ||
2320 | mddev->chunk_size = info->chunk_size; | ||
2321 | |||
2322 | mddev->max_disks = MD_SB_DISKS; | ||
2323 | |||
2324 | mddev->sb_dirty = 1; | ||
2325 | |||
2326 | /* | ||
2327 | * Generate a 128 bit UUID | ||
2328 | */ | ||
2329 | get_random_bytes(mddev->uuid, 16); | ||
2330 | |||
2331 | return 0; | ||
2332 | } | ||
2333 | |||
2334 | /* | ||
2335 | * update_array_info is used to change the configuration of an | ||
2336 | * on-line array. | ||
2337 | * The version, ctime, level, size, raid_disks, not_persistent, layout, chunk_size | ||
2338 | * fields in the info are checked against the array. | ||
2339 | * Any differences that cannot be handled will cause an error. | ||
2340 | * Normally, only one change can be managed at a time. | ||
2341 | */ | ||
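| /* | ||
|  * e.g. a single call that changes both size and raid_disks is rejected | ||
|  * (cnt > 1 below); such changes must be applied one ioctl at a time. | ||
|  */ | ||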
2342 | static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | ||
2343 | { | ||
2344 | int rv = 0; | ||
2345 | int cnt = 0; | ||
2346 | |||
2347 | if (mddev->major_version != info->major_version || | ||
2348 | mddev->minor_version != info->minor_version || | ||
2349 | /* mddev->patch_version != info->patch_version || */ | ||
2350 | mddev->ctime != info->ctime || | ||
2351 | mddev->level != info->level || | ||
2352 | /* mddev->layout != info->layout || */ | ||
2353 | !mddev->persistent != info->not_persistent|| | ||
2354 | mddev->chunk_size != info->chunk_size ) | ||
2355 | return -EINVAL; | ||
2356 | /* Check there is only one change */ | ||
2357 | if (mddev->size != info->size) cnt++; | ||
2358 | if (mddev->raid_disks != info->raid_disks) cnt++; | ||
2359 | if (mddev->layout != info->layout) cnt++; | ||
2360 | if (cnt == 0) return 0; | ||
2361 | if (cnt > 1) return -EINVAL; | ||
2362 | |||
2363 | if (mddev->layout != info->layout) { | ||
2364 | /* Change layout | ||
2365 | * we don't need to do anything at the md level, the | ||
2366 | * personality will take care of it all. | ||
2367 | */ | ||
2368 | if (mddev->pers->reconfig == NULL) | ||
2369 | return -EINVAL; | ||
2370 | else | ||
2371 | return mddev->pers->reconfig(mddev, info->layout, -1); | ||
2372 | } | ||
2373 | if (mddev->size != info->size) { | ||
2374 | mdk_rdev_t * rdev; | ||
2375 | struct list_head *tmp; | ||
2376 | if (mddev->pers->resize == NULL) | ||
2377 | return -EINVAL; | ||
2378 | /* The "size" is the amount of each device that is used. | ||
2379 | * This can only make sense for arrays with redundancy. | ||
2380 | * linear and raid0 always use whatever space is available. | ||
2381 | * We can only consider changing the size if no resync | ||
2382 | * or reconstruction is happening, and if the new size | ||
2383 | * is acceptable. It must fit before the sb_offset or, | ||
2384 | * if that is <data_offset, it must fit before the | ||
2385 | * size of each device. | ||
2386 | * If size is zero, we find the largest size that fits. | ||
2387 | */ | ||
2388 | if (mddev->sync_thread) | ||
2389 | return -EBUSY; | ||
2390 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
2391 | sector_t avail; | ||
2392 | int fit = (info->size == 0); | ||
2393 | if (rdev->sb_offset > rdev->data_offset) | ||
2394 | avail = (rdev->sb_offset*2) - rdev->data_offset; | ||
2395 | else | ||
2396 | avail = get_capacity(rdev->bdev->bd_disk) | ||
2397 | - rdev->data_offset; | ||
2398 | if (fit && (info->size == 0 || info->size > avail/2)) | ||
2399 | info->size = avail/2; | ||
2400 | if (avail < ((sector_t)info->size << 1)) | ||
2401 | return -ENOSPC; | ||
2402 | } | ||
2403 | rv = mddev->pers->resize(mddev, (sector_t)info->size *2); | ||
2404 | if (!rv) { | ||
2405 | struct block_device *bdev; | ||
2406 | |||
2407 | bdev = bdget_disk(mddev->gendisk, 0); | ||
2408 | if (bdev) { | ||
2409 | down(&bdev->bd_inode->i_sem); | ||
2410 | i_size_write(bdev->bd_inode, mddev->array_size << 10); | ||
2411 | up(&bdev->bd_inode->i_sem); | ||
2412 | bdput(bdev); | ||
2413 | } | ||
2414 | } | ||
2415 | } | ||
2416 | if (mddev->raid_disks != info->raid_disks) { | ||
2417 | /* change the number of raid disks */ | ||
2418 | if (mddev->pers->reshape == NULL) | ||
2419 | return -EINVAL; | ||
2420 | if (info->raid_disks <= 0 || | ||
2421 | info->raid_disks >= mddev->max_disks) | ||
2422 | return -EINVAL; | ||
2423 | if (mddev->sync_thread) | ||
2424 | return -EBUSY; | ||
2425 | rv = mddev->pers->reshape(mddev, info->raid_disks); | ||
2426 | if (!rv) { | ||
2427 | struct block_device *bdev; | ||
2428 | |||
2429 | bdev = bdget_disk(mddev->gendisk, 0); | ||
2430 | if (bdev) { | ||
2431 | down(&bdev->bd_inode->i_sem); | ||
2432 | i_size_write(bdev->bd_inode, mddev->array_size << 10); | ||
2433 | up(&bdev->bd_inode->i_sem); | ||
2434 | bdput(bdev); | ||
2435 | } | ||
2436 | } | ||
2437 | } | ||
2438 | md_update_sb(mddev); | ||
2439 | return rv; | ||
2440 | } | ||
2441 | |||
2442 | static int set_disk_faulty(mddev_t *mddev, dev_t dev) | ||
2443 | { | ||
2444 | mdk_rdev_t *rdev; | ||
2445 | |||
2446 | if (mddev->pers == NULL) | ||
2447 | return -ENODEV; | ||
2448 | |||
2449 | rdev = find_rdev(mddev, dev); | ||
2450 | if (!rdev) | ||
2451 | return -ENODEV; | ||
2452 | |||
2453 | md_error(mddev, rdev); | ||
2454 | return 0; | ||
2455 | } | ||
2456 | |||
2457 | static int md_ioctl(struct inode *inode, struct file *file, | ||
2458 | unsigned int cmd, unsigned long arg) | ||
2459 | { | ||
2460 | int err = 0; | ||
2461 | void __user *argp = (void __user *)arg; | ||
2462 | struct hd_geometry __user *loc = argp; | ||
2463 | mddev_t *mddev = NULL; | ||
2464 | |||
2465 | if (!capable(CAP_SYS_ADMIN)) | ||
2466 | return -EACCES; | ||
2467 | |||
2468 | /* | ||
2469 | * Commands dealing with the RAID driver but not any | ||
2470 | * particular array: | ||
2471 | */ | ||
2472 | switch (cmd) | ||
2473 | { | ||
2474 | case RAID_VERSION: | ||
2475 | err = get_version(argp); | ||
2476 | goto done; | ||
2477 | |||
2478 | case PRINT_RAID_DEBUG: | ||
2479 | err = 0; | ||
2480 | md_print_devices(); | ||
2481 | goto done; | ||
2482 | |||
2483 | #ifndef MODULE | ||
2484 | case RAID_AUTORUN: | ||
2485 | err = 0; | ||
2486 | autostart_arrays(arg); | ||
2487 | goto done; | ||
2488 | #endif | ||
2489 | default:; | ||
2490 | } | ||
2491 | |||
2492 | /* | ||
2493 | * Commands creating/starting a new array: | ||
2494 | */ | ||
2495 | |||
2496 | mddev = inode->i_bdev->bd_disk->private_data; | ||
2497 | |||
2498 | if (!mddev) { | ||
2499 | BUG(); | ||
2500 | goto abort; | ||
2501 | } | ||
2502 | |||
2503 | |||
2504 | if (cmd == START_ARRAY) { | ||
2505 | /* START_ARRAY doesn't need to lock the array as autostart_array | ||
2506 | * does the locking, and it could even be a different array | ||
2507 | */ | ||
2508 | static int cnt = 3; | ||
2509 | if (cnt > 0 ) { | ||
2510 | printk(KERN_WARNING | ||
2511 | "md: %s(pid %d) used deprecated START_ARRAY ioctl. " | ||
2512 | "This will not be supported beyond 2.6\n", | ||
2513 | current->comm, current->pid); | ||
2514 | cnt--; | ||
2515 | } | ||
2516 | err = autostart_array(new_decode_dev(arg)); | ||
2517 | if (err) { | ||
2518 | printk(KERN_WARNING "md: autostart failed!\n"); | ||
2519 | goto abort; | ||
2520 | } | ||
2521 | goto done; | ||
2522 | } | ||
2523 | |||
2524 | err = mddev_lock(mddev); | ||
2525 | if (err) { | ||
2526 | printk(KERN_INFO | ||
2527 | "md: ioctl lock interrupted, reason %d, cmd %d\n", | ||
2528 | err, cmd); | ||
2529 | goto abort; | ||
2530 | } | ||
2531 | |||
2532 | switch (cmd) | ||
2533 | { | ||
2534 | case SET_ARRAY_INFO: | ||
2535 | { | ||
2536 | mdu_array_info_t info; | ||
2537 | if (!arg) | ||
2538 | memset(&info, 0, sizeof(info)); | ||
2539 | else if (copy_from_user(&info, argp, sizeof(info))) { | ||
2540 | err = -EFAULT; | ||
2541 | goto abort_unlock; | ||
2542 | } | ||
2543 | if (mddev->pers) { | ||
2544 | err = update_array_info(mddev, &info); | ||
2545 | if (err) { | ||
2546 | printk(KERN_WARNING "md: couldn't update" | ||
2547 | " array info. %d\n", err); | ||
2548 | goto abort_unlock; | ||
2549 | } | ||
2550 | goto done_unlock; | ||
2551 | } | ||
2552 | if (!list_empty(&mddev->disks)) { | ||
2553 | printk(KERN_WARNING | ||
2554 | "md: array %s already has disks!\n", | ||
2555 | mdname(mddev)); | ||
2556 | err = -EBUSY; | ||
2557 | goto abort_unlock; | ||
2558 | } | ||
2559 | if (mddev->raid_disks) { | ||
2560 | printk(KERN_WARNING | ||
2561 | "md: array %s already initialised!\n", | ||
2562 | mdname(mddev)); | ||
2563 | err = -EBUSY; | ||
2564 | goto abort_unlock; | ||
2565 | } | ||
2566 | err = set_array_info(mddev, &info); | ||
2567 | if (err) { | ||
2568 | printk(KERN_WARNING "md: couldn't set" | ||
2569 | " array info. %d\n", err); | ||
2570 | goto abort_unlock; | ||
2571 | } | ||
2572 | } | ||
2573 | goto done_unlock; | ||
2574 | |||
2575 | default:; | ||
2576 | } | ||
2577 | |||
2578 | /* | ||
2579 | * Commands querying/configuring an existing array: | ||
2580 | */ | ||
2581 | /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY or RUN_ARRAY is allowed */ | ||
2582 | if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { | ||
2583 | err = -ENODEV; | ||
2584 | goto abort_unlock; | ||
2585 | } | ||
2586 | |||
2587 | /* | ||
2588 | * Commands even a read-only array can execute: | ||
2589 | */ | ||
2590 | switch (cmd) | ||
2591 | { | ||
2592 | case GET_ARRAY_INFO: | ||
2593 | err = get_array_info(mddev, argp); | ||
2594 | goto done_unlock; | ||
2595 | |||
2596 | case GET_DISK_INFO: | ||
2597 | err = get_disk_info(mddev, argp); | ||
2598 | goto done_unlock; | ||
2599 | |||
2600 | case RESTART_ARRAY_RW: | ||
2601 | err = restart_array(mddev); | ||
2602 | goto done_unlock; | ||
2603 | |||
2604 | case STOP_ARRAY: | ||
2605 | err = do_md_stop (mddev, 0); | ||
2606 | goto done_unlock; | ||
2607 | |||
2608 | case STOP_ARRAY_RO: | ||
2609 | err = do_md_stop (mddev, 1); | ||
2610 | goto done_unlock; | ||
2611 | |||
2612 | /* | ||
2613 | * We have a problem here: there is no easy way to give a CHS | ||
2614 | * virtual geometry. We currently pretend that we have 2 heads and | ||
2615 | * 4 sectors (with a BIG number of cylinders...). This drives | ||
2616 | * dosfs just mad... ;-) | ||
2617 | */ | ||
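| /* cylinders = capacity / (heads * sectors) = capacity / 8, with | ||
|  * capacity counted in 512-byte sectors by get_capacity(). */ | ||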
2618 | case HDIO_GETGEO: | ||
2619 | if (!loc) { | ||
2620 | err = -EINVAL; | ||
2621 | goto abort_unlock; | ||
2622 | } | ||
2623 | err = put_user (2, (char __user *) &loc->heads); | ||
2624 | if (err) | ||
2625 | goto abort_unlock; | ||
2626 | err = put_user (4, (char __user *) &loc->sectors); | ||
2627 | if (err) | ||
2628 | goto abort_unlock; | ||
2629 | err = put_user(get_capacity(mddev->gendisk)/8, | ||
2630 | (short __user *) &loc->cylinders); | ||
2631 | if (err) | ||
2632 | goto abort_unlock; | ||
2633 | err = put_user (get_start_sect(inode->i_bdev), | ||
2634 | (long __user *) &loc->start); | ||
2635 | goto done_unlock; | ||
2636 | } | ||
2637 | |||
2638 | /* | ||
2639 | * The remaining ioctls are changing the state of the | ||
2640 | * superblock, so we do not allow read-only arrays | ||
2641 | * here: | ||
2642 | */ | ||
2643 | if (mddev->ro) { | ||
2644 | err = -EROFS; | ||
2645 | goto abort_unlock; | ||
2646 | } | ||
2647 | |||
2648 | switch (cmd) | ||
2649 | { | ||
2650 | case ADD_NEW_DISK: | ||
2651 | { | ||
2652 | mdu_disk_info_t info; | ||
2653 | if (copy_from_user(&info, argp, sizeof(info))) | ||
2654 | err = -EFAULT; | ||
2655 | else | ||
2656 | err = add_new_disk(mddev, &info); | ||
2657 | goto done_unlock; | ||
2658 | } | ||
2659 | |||
2660 | case HOT_REMOVE_DISK: | ||
2661 | err = hot_remove_disk(mddev, new_decode_dev(arg)); | ||
2662 | goto done_unlock; | ||
2663 | |||
2664 | case HOT_ADD_DISK: | ||
2665 | err = hot_add_disk(mddev, new_decode_dev(arg)); | ||
2666 | goto done_unlock; | ||
2667 | |||
2668 | case SET_DISK_FAULTY: | ||
2669 | err = set_disk_faulty(mddev, new_decode_dev(arg)); | ||
2670 | goto done_unlock; | ||
2671 | |||
2672 | case RUN_ARRAY: | ||
2673 | err = do_md_run (mddev); | ||
2674 | goto done_unlock; | ||
2675 | |||
2676 | default: | ||
2677 | if (_IOC_TYPE(cmd) == MD_MAJOR) | ||
2678 | printk(KERN_WARNING "md: %s(pid %d) used" | ||
2679 | " obsolete MD ioctl, upgrade your" | ||
2680 | " software to use new ioctls.\n", | ||
2681 | current->comm, current->pid); | ||
2682 | err = -EINVAL; | ||
2683 | goto abort_unlock; | ||
2684 | } | ||
2685 | |||
2686 | done_unlock: | ||
2687 | abort_unlock: | ||
2688 | mddev_unlock(mddev); | ||
2689 | |||
2690 | return err; | ||
2691 | done: | ||
2692 | if (err) | ||
2693 | MD_BUG(); | ||
2694 | abort: | ||
2695 | return err; | ||
2696 | } | ||
2697 | |||
2698 | static int md_open(struct inode *inode, struct file *file) | ||
2699 | { | ||
2700 | /* | ||
2701 | * Succeed if we can lock the mddev, which confirms that | ||
2702 | * it isn't being stopped right now. | ||
2703 | */ | ||
2704 | mddev_t *mddev = inode->i_bdev->bd_disk->private_data; | ||
2705 | int err; | ||
2706 | |||
2707 | if ((err = mddev_lock(mddev))) | ||
2708 | goto out; | ||
2709 | |||
2710 | err = 0; | ||
2711 | mddev_get(mddev); | ||
2712 | mddev_unlock(mddev); | ||
2713 | |||
2714 | check_disk_change(inode->i_bdev); | ||
2715 | out: | ||
2716 | return err; | ||
2717 | } | ||
2718 | |||
2719 | static int md_release(struct inode *inode, struct file * file) | ||
2720 | { | ||
2721 | mddev_t *mddev = inode->i_bdev->bd_disk->private_data; | ||
2722 | |||
2723 | if (!mddev) | ||
2724 | BUG(); | ||
2725 | mddev_put(mddev); | ||
2726 | |||
2727 | return 0; | ||
2728 | } | ||
2729 | |||
2730 | static int md_media_changed(struct gendisk *disk) | ||
2731 | { | ||
2732 | mddev_t *mddev = disk->private_data; | ||
2733 | |||
2734 | return mddev->changed; | ||
2735 | } | ||
2736 | |||
2737 | static int md_revalidate(struct gendisk *disk) | ||
2738 | { | ||
2739 | mddev_t *mddev = disk->private_data; | ||
2740 | |||
2741 | mddev->changed = 0; | ||
2742 | return 0; | ||
2743 | } | ||
2744 | static struct block_device_operations md_fops = | ||
2745 | { | ||
2746 | .owner = THIS_MODULE, | ||
2747 | .open = md_open, | ||
2748 | .release = md_release, | ||
2749 | .ioctl = md_ioctl, | ||
2750 | .media_changed = md_media_changed, | ||
2751 | .revalidate_disk= md_revalidate, | ||
2752 | }; | ||
2753 | |||
2754 | int md_thread(void * arg) | ||
2755 | { | ||
2756 | mdk_thread_t *thread = arg; | ||
2757 | |||
2758 | lock_kernel(); | ||
2759 | |||
2760 | /* | ||
2761 | * Detach thread | ||
2762 | */ | ||
2763 | |||
2764 | daemonize(thread->name, mdname(thread->mddev)); | ||
2765 | |||
2766 | current->exit_signal = SIGCHLD; | ||
2767 | allow_signal(SIGKILL); | ||
2768 | thread->tsk = current; | ||
2769 | |||
2770 | /* | ||
2771 | * md_thread is a 'system-thread', its priority should be very | ||
2772 | * high. We avoid resource deadlocks individually in each | ||
2773 | * raid personality. (RAID5 does preallocation) We also use RR and | ||
2774 | * the very same RT priority as kswapd, thus we will never get | ||
2775 | * into a priority inversion deadlock. | ||
2776 | * | ||
2777 | * we definitely have to have equal or higher priority than | ||
2778 | * bdflush, otherwise bdflush will deadlock if there are too | ||
2779 | * many dirty RAID5 blocks. | ||
2780 | */ | ||
2781 | unlock_kernel(); | ||
2782 | |||
2783 | complete(thread->event); | ||
2784 | while (thread->run) { | ||
2785 | void (*run)(mddev_t *); | ||
2786 | |||
2787 | wait_event_interruptible(thread->wqueue, | ||
2788 | test_bit(THREAD_WAKEUP, &thread->flags)); | ||
2789 | if (current->flags & PF_FREEZE) | ||
2790 | refrigerator(PF_FREEZE); | ||
2791 | |||
2792 | clear_bit(THREAD_WAKEUP, &thread->flags); | ||
2793 | |||
2794 | run = thread->run; | ||
2795 | if (run) | ||
2796 | run(thread->mddev); | ||
2797 | |||
2798 | if (signal_pending(current)) | ||
2799 | flush_signals(current); | ||
2800 | } | ||
2801 | complete(thread->event); | ||
2802 | return 0; | ||
2803 | } | ||
2804 | |||
2805 | void md_wakeup_thread(mdk_thread_t *thread) | ||
2806 | { | ||
2807 | if (thread) { | ||
2808 | dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); | ||
2809 | set_bit(THREAD_WAKEUP, &thread->flags); | ||
2810 | wake_up(&thread->wqueue); | ||
2811 | } | ||
2812 | } | ||
2813 | |||
2814 | mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, | ||
2815 | const char *name) | ||
2816 | { | ||
2817 | mdk_thread_t *thread; | ||
2818 | int ret; | ||
2819 | struct completion event; | ||
2820 | |||
2821 | thread = (mdk_thread_t *) kmalloc | ||
2822 | (sizeof(mdk_thread_t), GFP_KERNEL); | ||
2823 | if (!thread) | ||
2824 | return NULL; | ||
2825 | |||
2826 | memset(thread, 0, sizeof(mdk_thread_t)); | ||
2827 | init_waitqueue_head(&thread->wqueue); | ||
2828 | |||
2829 | init_completion(&event); | ||
2830 | thread->event = &event; | ||
2831 | thread->run = run; | ||
2832 | thread->mddev = mddev; | ||
2833 | thread->name = name; | ||
2834 | ret = kernel_thread(md_thread, thread, 0); | ||
2835 | if (ret < 0) { | ||
2836 | kfree(thread); | ||
2837 | return NULL; | ||
2838 | } | ||
2839 | wait_for_completion(&event); | ||
2840 | return thread; | ||
2841 | } | ||
2842 | |||
2843 | static void md_interrupt_thread(mdk_thread_t *thread) | ||
2844 | { | ||
2845 | if (!thread->tsk) { | ||
2846 | MD_BUG(); | ||
2847 | return; | ||
2848 | } | ||
2849 | dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); | ||
2850 | send_sig(SIGKILL, thread->tsk, 1); | ||
2851 | } | ||
2852 | |||
2853 | void md_unregister_thread(mdk_thread_t *thread) | ||
2854 | { | ||
2855 | struct completion event; | ||
2856 | |||
2857 | init_completion(&event); | ||
2858 | |||
2859 | thread->event = &event; | ||
2860 | thread->run = NULL; | ||
2861 | thread->name = NULL; | ||
2862 | md_interrupt_thread(thread); | ||
2863 | wait_for_completion(&event); | ||
2864 | kfree(thread); | ||
2865 | } | ||
2866 | |||
2867 | void md_error(mddev_t *mddev, mdk_rdev_t *rdev) | ||
2868 | { | ||
2869 | if (!mddev) { | ||
2870 | MD_BUG(); | ||
2871 | return; | ||
2872 | } | ||
2873 | |||
2874 | if (!rdev || rdev->faulty) | ||
2875 | return; | ||
2876 | |||
2877 | dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", | ||
2878 | mdname(mddev), | ||
2879 | MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), | ||
2880 | __builtin_return_address(0),__builtin_return_address(1), | ||
2881 | __builtin_return_address(2),__builtin_return_address(3)); | ||
2882 | |||
2883 | if (!mddev->pers->error_handler) | ||
2884 | return; | ||
2885 | mddev->pers->error_handler(mddev,rdev); | ||
2886 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
2887 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
2888 | md_wakeup_thread(mddev->thread); | ||
2889 | } | ||
2890 | |||
2891 | /* seq_file implementation /proc/mdstat */ | ||
2892 | |||
2893 | static void status_unused(struct seq_file *seq) | ||
2894 | { | ||
2895 | int i = 0; | ||
2896 | mdk_rdev_t *rdev; | ||
2897 | struct list_head *tmp; | ||
2898 | |||
2899 | seq_printf(seq, "unused devices: "); | ||
2900 | |||
2901 | ITERATE_RDEV_PENDING(rdev,tmp) { | ||
2902 | char b[BDEVNAME_SIZE]; | ||
2903 | i++; | ||
2904 | seq_printf(seq, "%s ", | ||
2905 | bdevname(rdev->bdev,b)); | ||
2906 | } | ||
2907 | if (!i) | ||
2908 | seq_printf(seq, "<none>"); | ||
2909 | |||
2910 | seq_printf(seq, "\n"); | ||
2911 | } | ||
2912 | |||
2913 | |||
2914 | static void status_resync(struct seq_file *seq, mddev_t * mddev) | ||
2915 | { | ||
2916 | unsigned long max_blocks, resync, res, dt, db, rt; | ||
2917 | |||
2918 | resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; | ||
2919 | |||
2920 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | ||
2921 | max_blocks = mddev->resync_max_sectors >> 1; | ||
2922 | else | ||
2923 | max_blocks = mddev->size; | ||
2924 | |||
2925 | /* | ||
2926 | * Should not happen. | ||
2927 | */ | ||
2928 | if (!max_blocks) { | ||
2929 | MD_BUG(); | ||
2930 | return; | ||
2931 | } | ||
2932 | res = (resync/1024)*1000/(max_blocks/1024 + 1); | ||
2933 | { | ||
2934 | int i, x = res/50, y = 20-x; | ||
2935 | seq_printf(seq, "["); | ||
2936 | for (i = 0; i < x; i++) | ||
2937 | seq_printf(seq, "="); | ||
2938 | seq_printf(seq, ">"); | ||
2939 | for (i = 0; i < y; i++) | ||
2940 | seq_printf(seq, "."); | ||
2941 | seq_printf(seq, "] "); | ||
2942 | } | ||
2943 | seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)", | ||
2944 | (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? | ||
2945 | "resync" : "recovery"), | ||
2946 | res/10, res % 10, resync, max_blocks); | ||
2947 | |||
2948 | /* | ||
2949 | * We do not want to overflow, so the order of operands and | ||
2950 | * the * 100 / 100 trick are important. We do a +1 to be | ||
2951 | * safe against division by zero. We only estimate anyway. | ||
2952 | * | ||
2953 | * dt: time from mark until now | ||
2954 | * db: blocks written from mark until now | ||
2955 | * rt: remaining time | ||
2956 | */ | ||
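| /* e.g. db = 3000 1K-blocks over dt = 6s prints speed=500K/sec and, | ||
|  * with ~600000 blocks still to go, a finish time of just under 20min. */ | ||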
2957 | dt = ((jiffies - mddev->resync_mark) / HZ); | ||
2958 | if (!dt) dt++; | ||
2959 | db = resync - (mddev->resync_mark_cnt/2); | ||
2960 | rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; | ||
2961 | |||
2962 | seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); | ||
2963 | |||
2964 | seq_printf(seq, " speed=%ldK/sec", db/dt); | ||
2965 | } | ||
2966 | |||
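| /* | ||
|  * The iterator uses (void*)1 and (void*)2 as sentinels for the header | ||
|  * ("Personalities : ...") and the trailer ("unused devices: ..."); every | ||
|  * other cursor value is an mddev pointer with a reference held on it. | ||
|  */ | ||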
2967 | static void *md_seq_start(struct seq_file *seq, loff_t *pos) | ||
2968 | { | ||
2969 | struct list_head *tmp; | ||
2970 | loff_t l = *pos; | ||
2971 | mddev_t *mddev; | ||
2972 | |||
2973 | if (l >= 0x10000) | ||
2974 | return NULL; | ||
2975 | if (!l--) | ||
2976 | /* header */ | ||
2977 | return (void*)1; | ||
2978 | |||
2979 | spin_lock(&all_mddevs_lock); | ||
2980 | list_for_each(tmp,&all_mddevs) | ||
2981 | if (!l--) { | ||
2982 | mddev = list_entry(tmp, mddev_t, all_mddevs); | ||
2983 | mddev_get(mddev); | ||
2984 | spin_unlock(&all_mddevs_lock); | ||
2985 | return mddev; | ||
2986 | } | ||
2987 | spin_unlock(&all_mddevs_lock); | ||
2988 | if (!l--) | ||
2989 | return (void*)2;/* tail */ | ||
2990 | return NULL; | ||
2991 | } | ||
2992 | |||
2993 | static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) | ||
2994 | { | ||
2995 | struct list_head *tmp; | ||
2996 | mddev_t *next_mddev, *mddev = v; | ||
2997 | |||
2998 | ++*pos; | ||
2999 | if (v == (void*)2) | ||
3000 | return NULL; | ||
3001 | |||
3002 | spin_lock(&all_mddevs_lock); | ||
3003 | if (v == (void*)1) | ||
3004 | tmp = all_mddevs.next; | ||
3005 | else | ||
3006 | tmp = mddev->all_mddevs.next; | ||
3007 | if (tmp != &all_mddevs) | ||
3008 | next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); | ||
3009 | else { | ||
3010 | next_mddev = (void*)2; | ||
3011 | *pos = 0x10000; | ||
3012 | } | ||
3013 | spin_unlock(&all_mddevs_lock); | ||
3014 | |||
3015 | if (v != (void*)1) | ||
3016 | mddev_put(mddev); | ||
3017 | return next_mddev; | ||
3018 | |||
3019 | } | ||
3020 | |||
3021 | static void md_seq_stop(struct seq_file *seq, void *v) | ||
3022 | { | ||
3023 | mddev_t *mddev = v; | ||
3024 | |||
3025 | if (mddev && v != (void*)1 && v != (void*)2) | ||
3026 | mddev_put(mddev); | ||
3027 | } | ||
3028 | |||
3029 | static int md_seq_show(struct seq_file *seq, void *v) | ||
3030 | { | ||
3031 | mddev_t *mddev = v; | ||
3032 | sector_t size; | ||
3033 | struct list_head *tmp2; | ||
3034 | mdk_rdev_t *rdev; | ||
3035 | int i; | ||
3036 | |||
3037 | if (v == (void*)1) { | ||
3038 | seq_printf(seq, "Personalities : "); | ||
3039 | spin_lock(&pers_lock); | ||
3040 | for (i = 0; i < MAX_PERSONALITY; i++) | ||
3041 | if (pers[i]) | ||
3042 | seq_printf(seq, "[%s] ", pers[i]->name); | ||
3043 | |||
3044 | spin_unlock(&pers_lock); | ||
3045 | seq_printf(seq, "\n"); | ||
3046 | return 0; | ||
3047 | } | ||
3048 | if (v == (void*)2) { | ||
3049 | status_unused(seq); | ||
3050 | return 0; | ||
3051 | } | ||
3052 | |||
3053 | if (mddev_lock(mddev)!=0) | ||
3054 | return -EINTR; | ||
3055 | if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { | ||
3056 | seq_printf(seq, "%s : %sactive", mdname(mddev), | ||
3057 | mddev->pers ? "" : "in"); | ||
3058 | if (mddev->pers) { | ||
3059 | if (mddev->ro) | ||
3060 | seq_printf(seq, " (read-only)"); | ||
3061 | seq_printf(seq, " %s", mddev->pers->name); | ||
3062 | } | ||
3063 | |||
3064 | size = 0; | ||
3065 | ITERATE_RDEV(mddev,rdev,tmp2) { | ||
3066 | char b[BDEVNAME_SIZE]; | ||
3067 | seq_printf(seq, " %s[%d]", | ||
3068 | bdevname(rdev->bdev,b), rdev->desc_nr); | ||
3069 | if (rdev->faulty) { | ||
3070 | seq_printf(seq, "(F)"); | ||
3071 | continue; | ||
3072 | } | ||
3073 | size += rdev->size; | ||
3074 | } | ||
3075 | |||
3076 | if (!list_empty(&mddev->disks)) { | ||
3077 | if (mddev->pers) | ||
3078 | seq_printf(seq, "\n %llu blocks", | ||
3079 | (unsigned long long)mddev->array_size); | ||
3080 | else | ||
3081 | seq_printf(seq, "\n %llu blocks", | ||
3082 | (unsigned long long)size); | ||
3083 | } | ||
3084 | |||
3085 | if (mddev->pers) { | ||
3086 | mddev->pers->status (seq, mddev); | ||
3087 | seq_printf(seq, "\n "); | ||
3088 | if (mddev->curr_resync > 2) | ||
3089 | status_resync (seq, mddev); | ||
3090 | else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) | ||
3091 | seq_printf(seq, " resync=DELAYED"); | ||
3092 | } | ||
3093 | |||
3094 | seq_printf(seq, "\n"); | ||
3095 | } | ||
3096 | mddev_unlock(mddev); | ||
3097 | |||
3098 | return 0; | ||
3099 | } | ||
3100 | |||
3101 | static struct seq_operations md_seq_ops = { | ||
3102 | .start = md_seq_start, | ||
3103 | .next = md_seq_next, | ||
3104 | .stop = md_seq_stop, | ||
3105 | .show = md_seq_show, | ||
3106 | }; | ||
3107 | |||
3108 | static int md_seq_open(struct inode *inode, struct file *file) | ||
3109 | { | ||
3110 | int error; | ||
3111 | |||
3112 | error = seq_open(file, &md_seq_ops); | ||
3113 | return error; | ||
3114 | } | ||
3115 | |||
3116 | static struct file_operations md_seq_fops = { | ||
3117 | .open = md_seq_open, | ||
3118 | .read = seq_read, | ||
3119 | .llseek = seq_lseek, | ||
3120 | .release = seq_release, | ||
3121 | }; | ||
3122 | |||
3123 | int register_md_personality(int pnum, mdk_personality_t *p) | ||
3124 | { | ||
3125 | if (pnum >= MAX_PERSONALITY) { | ||
3126 | printk(KERN_ERR | ||
3127 | "md: tried to install personality %s as nr %d, but max is %lu\n", | ||
3128 | p->name, pnum, MAX_PERSONALITY-1); | ||
3129 | return -EINVAL; | ||
3130 | } | ||
3131 | |||
3132 | spin_lock(&pers_lock); | ||
3133 | if (pers[pnum]) { | ||
3134 | spin_unlock(&pers_lock); | ||
3135 | MD_BUG(); | ||
3136 | return -EBUSY; | ||
3137 | } | ||
3138 | |||
3139 | pers[pnum] = p; | ||
3140 | printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); | ||
3141 | spin_unlock(&pers_lock); | ||
3142 | return 0; | ||
3143 | } | ||
3144 | |||
3145 | int unregister_md_personality(int pnum) | ||
3146 | { | ||
3147 | if (pnum >= MAX_PERSONALITY) { | ||
3148 | MD_BUG(); | ||
3149 | return -EINVAL; | ||
3150 | } | ||
3151 | |||
3152 | printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); | ||
3153 | spin_lock(&pers_lock); | ||
3154 | pers[pnum] = NULL; | ||
3155 | spin_unlock(&pers_lock); | ||
3156 | return 0; | ||
3157 | } | ||
3158 | |||
3159 | static int is_mddev_idle(mddev_t *mddev) | ||
3160 | { | ||
3161 | mdk_rdev_t * rdev; | ||
3162 | struct list_head *tmp; | ||
3163 | int idle; | ||
3164 | unsigned long curr_events; | ||
3165 | |||
3166 | idle = 1; | ||
3167 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
3168 | struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; | ||
3169 | curr_events = disk_stat_read(disk, read_sectors) + | ||
3170 | disk_stat_read(disk, write_sectors) - | ||
3171 | atomic_read(&disk->sync_io); | ||
3172 | /* Allow some slack between the values of curr_events and last_events, | ||
3173 | * as there are some uninteresting races. | ||
3174 | * Note: the following is an unsigned comparison. | ||
3175 | */ | ||
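| /* i.e. the device only counts as busy when curr_events has moved | ||
|  * more than 32 sectors away from last_events in either direction; | ||
|  * the unsigned compare folds both directions into one test. */ | ||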
3176 | if ((curr_events - rdev->last_events + 32) > 64) { | ||
3177 | rdev->last_events = curr_events; | ||
3178 | idle = 0; | ||
3179 | } | ||
3180 | } | ||
3181 | return idle; | ||
3182 | } | ||
3183 | |||
3184 | void md_done_sync(mddev_t *mddev, int blocks, int ok) | ||
3185 | { | ||
3186 | /* another "blocks" (512-byte) blocks have been synced */ | ||
3187 | atomic_sub(blocks, &mddev->recovery_active); | ||
3188 | wake_up(&mddev->recovery_wait); | ||
3189 | if (!ok) { | ||
3190 | set_bit(MD_RECOVERY_ERR, &mddev->recovery); | ||
3191 | md_wakeup_thread(mddev->thread); | ||
3192 | /* stop recovery, signal do_sync ... */ | ||
3193 | } | ||
3194 | } | ||
3195 | |||
3196 | |||
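| /* | ||
|  * md_write_start()/md_write_end() bracket array writes: the first pending | ||
|  * write clears in_sync and rewrites the superblock (array marked dirty); | ||
|  * once the count drops to zero, the safemode timer arranges for the array | ||
|  * to be marked clean again after safemode_delay (see md_enter_safemode). | ||
|  */ | ||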
3197 | void md_write_start(mddev_t *mddev) | ||
3198 | { | ||
3199 | if (!atomic_read(&mddev->writes_pending)) { | ||
3200 | mddev_lock_uninterruptible(mddev); | ||
3201 | if (mddev->in_sync) { | ||
3202 | mddev->in_sync = 0; | ||
3203 | del_timer(&mddev->safemode_timer); | ||
3204 | md_update_sb(mddev); | ||
3205 | } | ||
3206 | atomic_inc(&mddev->writes_pending); | ||
3207 | mddev_unlock(mddev); | ||
3208 | } else | ||
3209 | atomic_inc(&mddev->writes_pending); | ||
3210 | } | ||
3211 | |||
3212 | void md_write_end(mddev_t *mddev) | ||
3213 | { | ||
3214 | if (atomic_dec_and_test(&mddev->writes_pending)) { | ||
3215 | if (mddev->safemode == 2) | ||
3216 | md_wakeup_thread(mddev->thread); | ||
3217 | else | ||
3218 | mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); | ||
3219 | } | ||
3220 | } | ||
3221 | |||
3222 | static inline void md_enter_safemode(mddev_t *mddev) | ||
3223 | { | ||
3224 | if (!mddev->safemode) return; | ||
3225 | if (mddev->safemode == 2 && | ||
3226 | (atomic_read(&mddev->writes_pending) || mddev->in_sync || | ||
3227 | mddev->recovery_cp != MaxSector)) | ||
3228 | return; /* avoid the lock */ | ||
3229 | mddev_lock_uninterruptible(mddev); | ||
3230 | if (mddev->safemode && !atomic_read(&mddev->writes_pending) && | ||
3231 | !mddev->in_sync && mddev->recovery_cp == MaxSector) { | ||
3232 | mddev->in_sync = 1; | ||
3233 | md_update_sb(mddev); | ||
3234 | } | ||
3235 | mddev_unlock(mddev); | ||
3236 | |||
3237 | if (mddev->safemode == 1) | ||
3238 | mddev->safemode = 0; | ||
3239 | } | ||
3240 | |||
3241 | void md_handle_safemode(mddev_t *mddev) | ||
3242 | { | ||
3243 | if (signal_pending(current)) { | ||
3244 | printk(KERN_INFO "md: %s in immediate safe mode\n", | ||
3245 | mdname(mddev)); | ||
3246 | mddev->safemode = 2; | ||
3247 | flush_signals(current); | ||
3248 | } | ||
3249 | md_enter_safemode(mddev); | ||
3250 | } | ||
3251 | |||
3252 | |||
3253 | DECLARE_WAIT_QUEUE_HEAD(resync_wait); | ||
3254 | |||
3255 | #define SYNC_MARKS 10 | ||
3256 | #define SYNC_MARK_STEP (3*HZ) | ||
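| /* 10 marks taken 3 seconds apart: resync speed and the ETA shown in | ||
|  * /proc/mdstat are averaged over a sliding window of roughly 30 seconds. */ | ||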
3257 | static void md_do_sync(mddev_t *mddev) | ||
3258 | { | ||
3259 | mddev_t *mddev2; | ||
3260 | unsigned int currspeed = 0, | ||
3261 | window; | ||
3262 | sector_t max_sectors,j; | ||
3263 | unsigned long mark[SYNC_MARKS]; | ||
3264 | sector_t mark_cnt[SYNC_MARKS]; | ||
3265 | int last_mark,m; | ||
3266 | struct list_head *tmp; | ||
3267 | sector_t last_check; | ||
3268 | |||
3269 | /* just in case the thread restarts... */ | ||
3270 | if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) | ||
3271 | return; | ||
3272 | |||
3273 | /* we overload curr_resync somewhat here. | ||
3274 | * 0 == not engaged in resync at all | ||
3275 | * 2 == checking that there is no conflict with another sync | ||
3276 | * 1 == like 2, but have yielded to allow conflicting resync to | ||
3277 | * commence | ||
3278 | * other == active in resync - this many blocks | ||
3279 | * | ||
3280 | * Before starting a resync we must have set curr_resync to | ||
3281 | * 2, and then checked that every "conflicting" array has curr_resync | ||
3282 | * less than ours. When we find one that is the same or higher | ||
3283 | * we wait on resync_wait. To avoid deadlock, we reduce curr_resync | ||
3284 | * to 1 if we choose to yield (based arbitrarily on address of mddev structure). | ||
3285 | * This will mean we have to start checking from the beginning again. | ||
3286 | * | ||
3287 | */ | ||
3288 | |||
3289 | do { | ||
3290 | mddev->curr_resync = 2; | ||
3291 | |||
3292 | try_again: | ||
3293 | if (signal_pending(current)) { | ||
3294 | flush_signals(current); | ||
3295 | goto skip; | ||
3296 | } | ||
3297 | ITERATE_MDDEV(mddev2,tmp) { | ||
3298 | printk("."); | ||
3299 | if (mddev2 == mddev) | ||
3300 | continue; | ||
3301 | if (mddev2->curr_resync && | ||
3302 | match_mddev_units(mddev,mddev2)) { | ||
3303 | DEFINE_WAIT(wq); | ||
3304 | if (mddev < mddev2 && mddev->curr_resync == 2) { | ||
3305 | /* arbitrarily yield */ | ||
3306 | mddev->curr_resync = 1; | ||
3307 | wake_up(&resync_wait); | ||
3308 | } | ||
3309 | if (mddev > mddev2 && mddev->curr_resync == 1) | ||
3310 | /* no need to wait here, we can wait the next | ||
3311 | * time 'round when curr_resync == 2 | ||
3312 | */ | ||
3313 | continue; | ||
3314 | prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); | ||
3315 | if (!signal_pending(current) | ||
3316 | && mddev2->curr_resync >= mddev->curr_resync) { | ||
3317 | printk(KERN_INFO "md: delaying resync of %s" | ||
3318 | " until %s has finished resync (they" | ||
3319 | " share one or more physical units)\n", | ||
3320 | mdname(mddev), mdname(mddev2)); | ||
3321 | mddev_put(mddev2); | ||
3322 | schedule(); | ||
3323 | finish_wait(&resync_wait, &wq); | ||
3324 | goto try_again; | ||
3325 | } | ||
3326 | finish_wait(&resync_wait, &wq); | ||
3327 | } | ||
3328 | } | ||
3329 | } while (mddev->curr_resync < 2); | ||
3330 | |||
3331 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | ||
3332 | /* resync follows the size requested by the personality, | ||
3333 | * which defaults to the physical size, but can be a virtual size | ||
3334 | */ | ||
3335 | max_sectors = mddev->resync_max_sectors; | ||
3336 | else | ||
3337 | /* recovery follows the physical size of devices */ | ||
3338 | max_sectors = mddev->size << 1; | ||
3339 | |||
3340 | printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); | ||
3341 | printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" | ||
3342 | " %d KB/sec/disc.\n", sysctl_speed_limit_min); | ||
3343 | printk(KERN_INFO "md: using maximum available idle IO bandwidth " | ||
3344 | "(but not more than %d KB/sec) for reconstruction.\n", | ||
3345 | sysctl_speed_limit_max); | ||
3346 | |||
3347 | is_mddev_idle(mddev); /* this also initializes IO event counters */ | ||
3348 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | ||
3349 | j = mddev->recovery_cp; | ||
3350 | else | ||
3351 | j = 0; | ||
3352 | for (m = 0; m < SYNC_MARKS; m++) { | ||
3353 | mark[m] = jiffies; | ||
3354 | mark_cnt[m] = j; | ||
3355 | } | ||
3356 | last_mark = 0; | ||
3357 | mddev->resync_mark = mark[last_mark]; | ||
3358 | mddev->resync_mark_cnt = mark_cnt[last_mark]; | ||
3359 | |||
3360 | /* | ||
3361 | * Tune reconstruction: | ||
3362 | */ | ||
3363 | window = 32*(PAGE_SIZE/512); | ||
3364 | printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", | ||
3365 | window/2,(unsigned long long) max_sectors/2); | ||
3366 | |||
3367 | atomic_set(&mddev->recovery_active, 0); | ||
3368 | init_waitqueue_head(&mddev->recovery_wait); | ||
3369 | last_check = 0; | ||
3370 | |||
3371 | if (j>2) { | ||
3372 | printk(KERN_INFO | ||
3373 | "md: resuming recovery of %s from checkpoint.\n", | ||
3374 | mdname(mddev)); | ||
3375 | mddev->curr_resync = j; | ||
3376 | } | ||
3377 | |||
3378 | while (j < max_sectors) { | ||
3379 | int sectors; | ||
3380 | |||
3381 | sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min); | ||
3382 | if (sectors < 0) { | ||
3383 | set_bit(MD_RECOVERY_ERR, &mddev->recovery); | ||
3384 | goto out; | ||
3385 | } | ||
3386 | atomic_add(sectors, &mddev->recovery_active); | ||
3387 | j += sectors; | ||
3388 | if (j>1) mddev->curr_resync = j; | ||
3389 | |||
3390 | if (last_check + window > j || j == max_sectors) | ||
3391 | continue; | ||
3392 | |||
3393 | last_check = j; | ||
3394 | |||
3395 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || | ||
3396 | test_bit(MD_RECOVERY_ERR, &mddev->recovery)) | ||
3397 | break; | ||
3398 | |||
3399 | repeat: | ||
3400 | if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { | ||
3401 | /* step marks */ | ||
3402 | int next = (last_mark+1) % SYNC_MARKS; | ||
3403 | |||
3404 | mddev->resync_mark = mark[next]; | ||
3405 | mddev->resync_mark_cnt = mark_cnt[next]; | ||
3406 | mark[next] = jiffies; | ||
3407 | mark_cnt[next] = j - atomic_read(&mddev->recovery_active); | ||
3408 | last_mark = next; | ||
3409 | } | ||
3410 | |||
3411 | |||
3412 | if (signal_pending(current)) { | ||
3413 | /* | ||
3414 | * got a signal, exit. | ||
3415 | */ | ||
3416 | printk(KERN_INFO | ||
3417 | "md: md_do_sync() got signal ... exiting\n"); | ||
3418 | flush_signals(current); | ||
3419 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
3420 | goto out; | ||
3421 | } | ||
3422 | |||
3423 | /* | ||
3424 | * this loop exits only when we are slower than | ||
3425 | * the 'hard' speed limit, or the system was IO-idle for | ||
3426 | * a jiffy. | ||
3427 | * the system might be non-idle CPU-wise, but we only care | ||
3428 | * about not overloading the IO subsystem. (things like an | ||
3429 | * e2fsck being done on the RAID array should execute fast) | ||
3430 | */ | ||
3431 | mddev->queue->unplug_fn(mddev->queue); | ||
3432 | cond_resched(); | ||
3433 | |||
3434 | currspeed = ((unsigned long)(j-mddev->resync_mark_cnt))/2/((jiffies-mddev->resync_mark)/HZ +1) +1; | ||
3435 | |||
3436 | if (currspeed > sysctl_speed_limit_min) { | ||
3437 | if ((currspeed > sysctl_speed_limit_max) || | ||
3438 | !is_mddev_idle(mddev)) { | ||
3439 | msleep_interruptible(250); | ||
3440 | goto repeat; | ||
3441 | } | ||
3442 | } | ||
3443 | } | ||
3444 | printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev)); | ||
3445 | /* | ||
3446 | * this also signals 'finished resyncing' to md_stop | ||
3447 | */ | ||
3448 | out: | ||
3449 | mddev->queue->unplug_fn(mddev->queue); | ||
3450 | |||
3451 | wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); | ||
3452 | |||
3453 | /* tell personality that we are finished */ | ||
3454 | mddev->pers->sync_request(mddev, max_sectors, 1); | ||
3455 | |||
3456 | if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && | ||
3457 | mddev->curr_resync > 2 && | ||
3458 | mddev->curr_resync >= mddev->recovery_cp) { | ||
3459 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | ||
3460 | printk(KERN_INFO | ||
3461 | "md: checkpointing recovery of %s.\n", | ||
3462 | mdname(mddev)); | ||
3463 | mddev->recovery_cp = mddev->curr_resync; | ||
3464 | } else | ||
3465 | mddev->recovery_cp = MaxSector; | ||
3466 | } | ||
3467 | |||
3468 | md_enter_safemode(mddev); | ||
3469 | skip: | ||
3470 | mddev->curr_resync = 0; | ||
3471 | wake_up(&resync_wait); | ||
3472 | set_bit(MD_RECOVERY_DONE, &mddev->recovery); | ||
3473 | md_wakeup_thread(mddev->thread); | ||
3474 | } | ||
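/*
 * Editor's note (not part of the original source), working through the
 * currspeed arithmetic above with hypothetical numbers: if 60000 sectors
 * have completed since the oldest rotating mark (30000 KiB) and roughly
 * three seconds of jiffies have elapsed, then with the "+1" terms that
 * guard against division by zero the estimate is 30000 / (3 + 1) + 1 =
 * 7501 KB/sec, which is then compared against sysctl_speed_limit_min and
 * sysctl_speed_limit_max to decide whether to throttle with
 * msleep_interruptible(250).
 */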
3475 | |||
3476 | |||
3477 | /* | ||
3478 | * This routine is regularly called by all per-raid-array threads to | ||
3479 | * deal with generic issues like resync and super-block update. | ||
3480 | * Raid personalities that don't have a thread (linear/raid0) do not | ||
3481 | * need this as they never do any recovery or update the superblock. | ||
3482 | * | ||
3483 | * It does not do any resync itself, but rather "forks" off other threads | ||
3484 | * to do that as needed. | ||
3485 | * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in | ||
3486 | * "->recovery" and create a thread at ->sync_thread. | ||
3487 | * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR) | ||
3488 | * and wakes up this thread, which will reap the thread and finish up. | ||
3489 | * This thread also removes any faulty devices (with nr_pending == 0). | ||
3490 | * | ||
3491 | * The overall approach is: | ||
3492 | * 1/ if the superblock needs updating, update it. | ||
3493 | * 2/ If a recovery thread is running, don't do anything else. | ||
3494 | * 3/ If recovery has finished, clean up, possibly marking spares active. | ||
3495 | * 4/ If there are any faulty devices, remove them. | ||
3496 | * 5/ If the array is degraded, try to add spare devices | ||
3497 | * 6/ If array has spares or is not in-sync, start a resync thread. | ||
3498 | */ | ||
3499 | void md_check_recovery(mddev_t *mddev) | ||
3500 | { | ||
3501 | mdk_rdev_t *rdev; | ||
3502 | struct list_head *rtmp; | ||
3503 | |||
3504 | |||
3505 | dprintk(KERN_INFO "md: recovery thread got woken up ...\n"); | ||
3506 | |||
3507 | if (mddev->ro) | ||
3508 | return; | ||
3509 | if ( ! ( | ||
3510 | mddev->sb_dirty || | ||
3511 | test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || | ||
3512 | test_bit(MD_RECOVERY_DONE, &mddev->recovery) | ||
3513 | )) | ||
3514 | return; | ||
3515 | if (mddev_trylock(mddev)==0) { | ||
3516 | int spares =0; | ||
3517 | if (mddev->sb_dirty) | ||
3518 | md_update_sb(mddev); | ||
3519 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && | ||
3520 | !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { | ||
3521 | /* resync/recovery still happening */ | ||
3522 | clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
3523 | goto unlock; | ||
3524 | } | ||
3525 | if (mddev->sync_thread) { | ||
3526 | /* resync has finished, collect result */ | ||
3527 | md_unregister_thread(mddev->sync_thread); | ||
3528 | mddev->sync_thread = NULL; | ||
3529 | if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && | ||
3530 | !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | ||
3531 | /* success...*/ | ||
3532 | /* activate any spares */ | ||
3533 | mddev->pers->spare_active(mddev); | ||
3534 | } | ||
3535 | md_update_sb(mddev); | ||
3536 | mddev->recovery = 0; | ||
3537 | /* flag recovery needed just to double check */ | ||
3538 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
3539 | goto unlock; | ||
3540 | } | ||
3541 | if (mddev->recovery) | ||
3542 | /* probably just the RECOVERY_NEEDED flag */ | ||
3543 | mddev->recovery = 0; | ||
3544 | |||
3545 | /* no recovery is running. | ||
3546 | * remove any failed drives, then | ||
3547 | * add spares if possible. | ||
3548 | * Spares are also removed and re-added, to allow | ||
3549 | * the personality to fail the re-add. | ||
3550 | */ | ||
3551 | ITERATE_RDEV(mddev,rdev,rtmp) | ||
3552 | if (rdev->raid_disk >= 0 && | ||
3553 | (rdev->faulty || ! rdev->in_sync) && | ||
3554 | atomic_read(&rdev->nr_pending)==0) { | ||
3555 | if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) | ||
3556 | rdev->raid_disk = -1; | ||
3557 | } | ||
3558 | |||
3559 | if (mddev->degraded) { | ||
3560 | ITERATE_RDEV(mddev,rdev,rtmp) | ||
3561 | if (rdev->raid_disk < 0 | ||
3562 | && !rdev->faulty) { | ||
3563 | if (mddev->pers->hot_add_disk(mddev,rdev)) | ||
3564 | spares++; | ||
3565 | else | ||
3566 | break; | ||
3567 | } | ||
3568 | } | ||
3569 | |||
3570 | if (!spares && (mddev->recovery_cp == MaxSector )) { | ||
3571 | /* nothing we can do ... */ | ||
3572 | goto unlock; | ||
3573 | } | ||
3574 | if (mddev->pers->sync_request) { | ||
3575 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | ||
3576 | if (!spares) | ||
3577 | set_bit(MD_RECOVERY_SYNC, &mddev->recovery); | ||
3578 | mddev->sync_thread = md_register_thread(md_do_sync, | ||
3579 | mddev, | ||
3580 | "%s_resync"); | ||
3581 | if (!mddev->sync_thread) { | ||
3582 | printk(KERN_ERR "%s: could not start resync" | ||
3583 | " thread...\n", | ||
3584 | mdname(mddev)); | ||
3585 | /* leave the spares where they are, it shouldn't hurt */ | ||
3586 | mddev->recovery = 0; | ||
3587 | } else { | ||
3588 | md_wakeup_thread(mddev->sync_thread); | ||
3589 | } | ||
3590 | } | ||
3591 | unlock: | ||
3592 | mddev_unlock(mddev); | ||
3593 | } | ||
3594 | } | ||
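/*
 * Editor's sketch (not part of the original source): the usual way the rest
 * of md hands work to md_check_recovery() is simply to flag the array and
 * poke its per-array thread; the personality thread then calls
 * md_check_recovery() on its next pass (multipathd() below does exactly
 * that).  A hypothetical helper spelling out the pattern:
 */
static inline void md_request_recovery(mddev_t *mddev)
{
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
}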
3595 | |||
3596 | int md_notify_reboot(struct notifier_block *this, | ||
3597 | unsigned long code, void *x) | ||
3598 | { | ||
3599 | struct list_head *tmp; | ||
3600 | mddev_t *mddev; | ||
3601 | |||
3602 | if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { | ||
3603 | |||
3604 | printk(KERN_INFO "md: stopping all md devices.\n"); | ||
3605 | |||
3606 | ITERATE_MDDEV(mddev,tmp) | ||
3607 | if (mddev_trylock(mddev)==0) | ||
3608 | do_md_stop (mddev, 1); | ||
3609 | /* | ||
3610 | * certain more exotic SCSI devices are known to be | ||
3611 | * volatile with respect to too-early system reboots. While the | ||
3612 | * right place to handle this issue is the given | ||
3613 | * driver, we do want to have a safe RAID driver ... | ||
3614 | */ | ||
3615 | mdelay(1000*1); | ||
3616 | } | ||
3617 | return NOTIFY_DONE; | ||
3618 | } | ||
3619 | |||
3620 | struct notifier_block md_notifier = { | ||
3621 | .notifier_call = md_notify_reboot, | ||
3622 | .next = NULL, | ||
3623 | .priority = INT_MAX, /* before any real devices */ | ||
3624 | }; | ||
3625 | |||
3626 | static void md_geninit(void) | ||
3627 | { | ||
3628 | struct proc_dir_entry *p; | ||
3629 | |||
3630 | dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); | ||
3631 | |||
3632 | p = create_proc_entry("mdstat", S_IRUGO, NULL); | ||
3633 | if (p) | ||
3634 | p->proc_fops = &md_seq_fops; | ||
3635 | } | ||
3636 | |||
3637 | int __init md_init(void) | ||
3638 | { | ||
3639 | int minor; | ||
3640 | |||
3641 | printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," | ||
3642 | " MD_SB_DISKS=%d\n", | ||
3643 | MD_MAJOR_VERSION, MD_MINOR_VERSION, | ||
3644 | MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); | ||
3645 | |||
3646 | if (register_blkdev(MAJOR_NR, "md")) | ||
3647 | return -1; | ||
3648 | if ((mdp_major=register_blkdev(0, "mdp"))<=0) { | ||
3649 | unregister_blkdev(MAJOR_NR, "md"); | ||
3650 | return -1; | ||
3651 | } | ||
3652 | devfs_mk_dir("md"); | ||
3653 | blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, | ||
3654 | md_probe, NULL, NULL); | ||
3655 | blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE, | ||
3656 | md_probe, NULL, NULL); | ||
3657 | |||
3658 | for (minor=0; minor < MAX_MD_DEVS; ++minor) | ||
3659 | devfs_mk_bdev(MKDEV(MAJOR_NR, minor), | ||
3660 | S_IFBLK|S_IRUSR|S_IWUSR, | ||
3661 | "md/%d", minor); | ||
3662 | |||
3663 | for (minor=0; minor < MAX_MD_DEVS; ++minor) | ||
3664 | devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift), | ||
3665 | S_IFBLK|S_IRUSR|S_IWUSR, | ||
3666 | "md/mdp%d", minor); | ||
3667 | |||
3668 | |||
3669 | register_reboot_notifier(&md_notifier); | ||
3670 | raid_table_header = register_sysctl_table(raid_root_table, 1); | ||
3671 | |||
3672 | md_geninit(); | ||
3673 | return (0); | ||
3674 | } | ||
3675 | |||
3676 | |||
3677 | #ifndef MODULE | ||
3678 | |||
3679 | /* | ||
3680 | * Searches all registered partitions for autorun RAID arrays | ||
3681 | * at boot time. | ||
3682 | */ | ||
3683 | static dev_t detected_devices[128]; | ||
3684 | static int dev_cnt; | ||
3685 | |||
3686 | void md_autodetect_dev(dev_t dev) | ||
3687 | { | ||
3688 | if (dev_cnt >= 0 && dev_cnt < 127) | ||
3689 | detected_devices[dev_cnt++] = dev; | ||
3690 | } | ||
3691 | |||
3692 | |||
3693 | static void autostart_arrays(int part) | ||
3694 | { | ||
3695 | mdk_rdev_t *rdev; | ||
3696 | int i; | ||
3697 | |||
3698 | printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); | ||
3699 | |||
3700 | for (i = 0; i < dev_cnt; i++) { | ||
3701 | dev_t dev = detected_devices[i]; | ||
3702 | |||
3703 | rdev = md_import_device(dev,0, 0); | ||
3704 | if (IS_ERR(rdev)) | ||
3705 | continue; | ||
3706 | |||
3707 | if (rdev->faulty) { | ||
3708 | MD_BUG(); | ||
3709 | continue; | ||
3710 | } | ||
3711 | list_add(&rdev->same_set, &pending_raid_disks); | ||
3712 | } | ||
3713 | dev_cnt = 0; | ||
3714 | |||
3715 | autorun_devices(part); | ||
3716 | } | ||
3717 | |||
3718 | #endif | ||
3719 | |||
3720 | static __exit void md_exit(void) | ||
3721 | { | ||
3722 | mddev_t *mddev; | ||
3723 | struct list_head *tmp; | ||
3724 | int i; | ||
3725 | blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); | ||
3726 | blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift); | ||
3727 | for (i=0; i < MAX_MD_DEVS; i++) | ||
3728 | devfs_remove("md/%d", i); | ||
3729 | for (i=0; i < MAX_MD_DEVS; i++) | ||
3730 | devfs_remove("md/d%d", i); | ||
3731 | |||
3732 | devfs_remove("md"); | ||
3733 | |||
3734 | unregister_blkdev(MAJOR_NR,"md"); | ||
3735 | unregister_blkdev(mdp_major, "mdp"); | ||
3736 | unregister_reboot_notifier(&md_notifier); | ||
3737 | unregister_sysctl_table(raid_table_header); | ||
3738 | remove_proc_entry("mdstat", NULL); | ||
3739 | ITERATE_MDDEV(mddev,tmp) { | ||
3740 | struct gendisk *disk = mddev->gendisk; | ||
3741 | if (!disk) | ||
3742 | continue; | ||
3743 | export_array(mddev); | ||
3744 | del_gendisk(disk); | ||
3745 | put_disk(disk); | ||
3746 | mddev->gendisk = NULL; | ||
3747 | mddev_put(mddev); | ||
3748 | } | ||
3749 | } | ||
3750 | |||
3751 | module_init(md_init) | ||
3752 | module_exit(md_exit) | ||
3753 | |||
3754 | EXPORT_SYMBOL(register_md_personality); | ||
3755 | EXPORT_SYMBOL(unregister_md_personality); | ||
3756 | EXPORT_SYMBOL(md_error); | ||
3757 | EXPORT_SYMBOL(md_done_sync); | ||
3758 | EXPORT_SYMBOL(md_write_start); | ||
3759 | EXPORT_SYMBOL(md_write_end); | ||
3760 | EXPORT_SYMBOL(md_handle_safemode); | ||
3761 | EXPORT_SYMBOL(md_register_thread); | ||
3762 | EXPORT_SYMBOL(md_unregister_thread); | ||
3763 | EXPORT_SYMBOL(md_wakeup_thread); | ||
3764 | EXPORT_SYMBOL(md_print_devices); | ||
3765 | EXPORT_SYMBOL(md_check_recovery); | ||
3766 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/mktables.c b/drivers/md/mktables.c new file mode 100644 index 000000000000..adef299908cf --- /dev/null +++ b/drivers/md/mktables.c | |||
@@ -0,0 +1,125 @@ | |||
1 | #ident "$Id: mktables.c,v 1.2 2002/12/12 22:41:27 hpa Exp $" | ||
2 | /* ----------------------------------------------------------------------- * | ||
3 | * | ||
4 | * Copyright 2002 H. Peter Anvin - All Rights Reserved | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation, Inc., 59 Temple Place Ste 330, | ||
9 | * Boston MA 02111-1307, USA; either version 2 of the License, or | ||
10 | * (at your option) any later version; incorporated herein by reference. | ||
11 | * | ||
12 | * ----------------------------------------------------------------------- */ | ||
13 | |||
14 | /* | ||
15 | * mktables.c | ||
16 | * | ||
17 | * Make RAID-6 tables. This is a host user space program to be run at | ||
18 | * compile time. | ||
19 | */ | ||
20 | |||
21 | #include <stdio.h> | ||
22 | #include <string.h> | ||
23 | #include <inttypes.h> | ||
24 | #include <stdlib.h> | ||
25 | #include <time.h> | ||
26 | |||
27 | static uint8_t gfmul(uint8_t a, uint8_t b) | ||
28 | { | ||
29 | uint8_t v = 0; | ||
30 | |||
31 | while ( b ) { | ||
32 | if ( b & 1 ) v ^= a; | ||
33 | a = (a << 1) ^ (a & 0x80 ? 0x1d : 0); | ||
34 | b >>= 1; | ||
35 | } | ||
36 | return v; | ||
37 | } | ||
38 | |||
39 | static uint8_t gfpow(uint8_t a, int b) | ||
40 | { | ||
41 | uint8_t v = 1; | ||
42 | |||
43 | b %= 255; | ||
44 | if ( b < 0 ) | ||
45 | b += 255; | ||
46 | |||
47 | while ( b ) { | ||
48 | if ( b & 1 ) v = gfmul(v,a); | ||
49 | a = gfmul(a,a); | ||
50 | b >>= 1; | ||
51 | } | ||
52 | return v; | ||
53 | } | ||
54 | |||
55 | int main(int argc, char *argv[]) | ||
56 | { | ||
57 | int i, j, k; | ||
58 | uint8_t v; | ||
59 | uint8_t exptbl[256], invtbl[256]; | ||
60 | |||
61 | printf("#include \"raid6.h\"\n"); | ||
62 | |||
63 | /* Compute multiplication table */ | ||
64 | printf("\nconst u8 __attribute__((aligned(256)))\n" | ||
65 | "raid6_gfmul[256][256] =\n" | ||
66 | "{\n"); | ||
67 | for ( i = 0 ; i < 256 ; i++ ) { | ||
68 | printf("\t{\n"); | ||
69 | for ( j = 0 ; j < 256 ; j += 8 ) { | ||
70 | printf("\t\t"); | ||
71 | for ( k = 0 ; k < 8 ; k++ ) { | ||
72 | printf("0x%02x, ", gfmul(i,j+k)); | ||
73 | } | ||
74 | printf("\n"); | ||
75 | } | ||
76 | printf("\t},\n"); | ||
77 | } | ||
78 | printf("};\n"); | ||
79 | |||
80 | /* Compute power-of-2 table (exponent) */ | ||
81 | v = 1; | ||
82 | printf("\nconst u8 __attribute__((aligned(256)))\n" | ||
83 | "raid6_gfexp[256] =\n" | ||
84 | "{\n"); | ||
85 | for ( i = 0 ; i < 256 ; i += 8 ) { | ||
86 | printf("\t"); | ||
87 | for ( j = 0 ; j < 8 ; j++ ) { | ||
88 | exptbl[i+j] = v; | ||
89 | printf("0x%02x, ", v); | ||
90 | v = gfmul(v,2); | ||
91 | if ( v == 1 ) v = 0; /* For entry 255, not a real entry */ | ||
92 | } | ||
93 | printf("\n"); | ||
94 | } | ||
95 | printf("};\n"); | ||
96 | |||
97 | /* Compute inverse table x^-1 == x^254 */ | ||
98 | printf("\nconst u8 __attribute__((aligned(256)))\n" | ||
99 | "raid6_gfinv[256] =\n" | ||
100 | "{\n"); | ||
101 | for ( i = 0 ; i < 256 ; i += 8 ) { | ||
102 | printf("\t"); | ||
103 | for ( j = 0 ; j < 8 ; j++ ) { | ||
104 | invtbl[i+j] = v = gfpow(i+j,254); | ||
105 | printf("0x%02x, ", v); | ||
106 | } | ||
107 | printf("\n"); | ||
108 | } | ||
109 | printf("};\n"); | ||
110 | |||
111 | /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ | ||
112 | printf("\nconst u8 __attribute__((aligned(256)))\n" | ||
113 | "raid6_gfexi[256] =\n" | ||
114 | "{\n"); | ||
115 | for ( i = 0 ; i < 256 ; i += 8 ) { | ||
116 | printf("\t"); | ||
117 | for ( j = 0 ; j < 8 ; j++ ) { | ||
118 | printf("0x%02x, ", invtbl[exptbl[i+j]^1]); | ||
119 | } | ||
120 | printf("\n"); | ||
121 | } | ||
122 | printf("};\n\n"); | ||
123 | |||
124 | return 0; | ||
125 | } | ||
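/*
 * Editor's sketch (not part of the original source): a standalone user-space
 * check of the property the raid6_gfinv[] table above relies on, namely that
 * x^254 is the multiplicative inverse of x in GF(2^8) with the same 0x11d
 * generator polynomial.  Build it on its own, e.g. "cc -o gfcheck gfcheck.c".
 */
#include <stdio.h>
#include <inttypes.h>

static uint8_t gfmul_check(uint8_t a, uint8_t b)
{
	uint8_t v = 0;

	while (b) {
		if (b & 1)
			v ^= a;
		a = (a << 1) ^ (a & 0x80 ? 0x1d : 0);
		b >>= 1;
	}
	return v;
}

int main(void)
{
	int i, j;

	for (i = 1; i < 256; i++) {
		uint8_t inv = 1;

		/* inv = i^254; the multiplicative group has order 255 */
		for (j = 0; j < 254; j++)
			inv = gfmul_check(inv, i);
		if (gfmul_check(i, inv) != 1) {
			printf("inverse check failed for 0x%02x\n", i);
			return 1;
		}
	}
	printf("all 255 non-zero elements verified\n");
	return 0;
}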
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c new file mode 100644 index 000000000000..c9b134cd1532 --- /dev/null +++ b/drivers/md/multipath.c | |||
@@ -0,0 +1,584 @@ | |||
1 | /* | ||
2 | * multipath.c : Multiple Devices driver for Linux | ||
3 | * | ||
4 | * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat | ||
5 | * | ||
6 | * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman | ||
7 | * | ||
8 | * MULTIPATH management functions. | ||
9 | * | ||
10 | * derived from raid1.c. | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or modify | ||
13 | * it under the terms of the GNU General Public License as published by | ||
14 | * the Free Software Foundation; either version 2, or (at your option) | ||
15 | * any later version. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * (for example /usr/src/linux/COPYING); if not, write to the Free | ||
19 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
20 | */ | ||
21 | |||
22 | #include <linux/module.h> | ||
23 | #include <linux/slab.h> | ||
24 | #include <linux/spinlock.h> | ||
25 | #include <linux/raid/multipath.h> | ||
26 | #include <linux/buffer_head.h> | ||
27 | #include <asm/atomic.h> | ||
28 | |||
29 | #define MAJOR_NR MD_MAJOR | ||
30 | #define MD_DRIVER | ||
31 | #define MD_PERSONALITY | ||
32 | |||
33 | #define MAX_WORK_PER_DISK 128 | ||
34 | |||
35 | #define NR_RESERVED_BUFS 32 | ||
36 | |||
37 | |||
38 | static mdk_personality_t multipath_personality; | ||
39 | |||
40 | |||
41 | static void *mp_pool_alloc(unsigned int __nocast gfp_flags, void *data) | ||
42 | { | ||
43 | struct multipath_bh *mpb; | ||
44 | mpb = kmalloc(sizeof(*mpb), gfp_flags); | ||
45 | if (mpb) | ||
46 | memset(mpb, 0, sizeof(*mpb)); | ||
47 | return mpb; | ||
48 | } | ||
49 | |||
50 | static void mp_pool_free(void *mpb, void *data) | ||
51 | { | ||
52 | kfree(mpb); | ||
53 | } | ||
54 | |||
55 | static int multipath_map (multipath_conf_t *conf) | ||
56 | { | ||
57 | int i, disks = conf->raid_disks; | ||
58 | |||
59 | /* | ||
60 | * Later we do read balancing on the read side | ||
61 | * now we use the first available disk. | ||
62 | */ | ||
63 | |||
64 | rcu_read_lock(); | ||
65 | for (i = 0; i < disks; i++) { | ||
66 | mdk_rdev_t *rdev = conf->multipaths[i].rdev; | ||
67 | if (rdev && rdev->in_sync) { | ||
68 | atomic_inc(&rdev->nr_pending); | ||
69 | rcu_read_unlock(); | ||
70 | return i; | ||
71 | } | ||
72 | } | ||
73 | rcu_read_unlock(); | ||
74 | |||
75 | printk(KERN_ERR "multipath_map(): no more operational IO paths?\n"); | ||
76 | return (-1); | ||
77 | } | ||
78 | |||
79 | static void multipath_reschedule_retry (struct multipath_bh *mp_bh) | ||
80 | { | ||
81 | unsigned long flags; | ||
82 | mddev_t *mddev = mp_bh->mddev; | ||
83 | multipath_conf_t *conf = mddev_to_conf(mddev); | ||
84 | |||
85 | spin_lock_irqsave(&conf->device_lock, flags); | ||
86 | list_add(&mp_bh->retry_list, &conf->retry_list); | ||
87 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
88 | md_wakeup_thread(mddev->thread); | ||
89 | } | ||
90 | |||
91 | |||
92 | /* | ||
93 | * multipath_end_bh_io() is called when we have finished servicing a multipathed | ||
94 | * operation and are ready to return a success/failure code to the buffer | ||
95 | * cache layer. | ||
96 | */ | ||
97 | static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) | ||
98 | { | ||
99 | struct bio *bio = mp_bh->master_bio; | ||
100 | multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); | ||
101 | |||
102 | bio_endio(bio, bio->bi_size, err); | ||
103 | mempool_free(mp_bh, conf->pool); | ||
104 | } | ||
105 | |||
106 | int multipath_end_request(struct bio *bio, unsigned int bytes_done, int error) | ||
107 | { | ||
108 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
109 | struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); | ||
110 | multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); | ||
111 | mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; | ||
112 | |||
113 | if (bio->bi_size) | ||
114 | return 1; | ||
115 | |||
116 | if (uptodate) | ||
117 | multipath_end_bh_io(mp_bh, 0); | ||
118 | else if (!bio_rw_ahead(bio)) { | ||
119 | /* | ||
120 | * oops, IO error: | ||
121 | */ | ||
122 | char b[BDEVNAME_SIZE]; | ||
123 | md_error (mp_bh->mddev, rdev); | ||
124 | printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n", | ||
125 | bdevname(rdev->bdev,b), | ||
126 | (unsigned long long)bio->bi_sector); | ||
127 | multipath_reschedule_retry(mp_bh); | ||
128 | } else | ||
129 | multipath_end_bh_io(mp_bh, error); | ||
130 | rdev_dec_pending(rdev, conf->mddev); | ||
131 | return 0; | ||
132 | } | ||
133 | |||
134 | static void unplug_slaves(mddev_t *mddev) | ||
135 | { | ||
136 | multipath_conf_t *conf = mddev_to_conf(mddev); | ||
137 | int i; | ||
138 | |||
139 | rcu_read_lock(); | ||
140 | for (i=0; i<mddev->raid_disks; i++) { | ||
141 | mdk_rdev_t *rdev = conf->multipaths[i].rdev; | ||
142 | if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { | ||
143 | request_queue_t *r_queue = bdev_get_queue(rdev->bdev); | ||
144 | |||
145 | atomic_inc(&rdev->nr_pending); | ||
146 | rcu_read_unlock(); | ||
147 | |||
148 | if (r_queue->unplug_fn) | ||
149 | r_queue->unplug_fn(r_queue); | ||
150 | |||
151 | rdev_dec_pending(rdev, mddev); | ||
152 | rcu_read_lock(); | ||
153 | } | ||
154 | } | ||
155 | rcu_read_unlock(); | ||
156 | } | ||
157 | |||
158 | static void multipath_unplug(request_queue_t *q) | ||
159 | { | ||
160 | unplug_slaves(q->queuedata); | ||
161 | } | ||
162 | |||
163 | |||
164 | static int multipath_make_request (request_queue_t *q, struct bio * bio) | ||
165 | { | ||
166 | mddev_t *mddev = q->queuedata; | ||
167 | multipath_conf_t *conf = mddev_to_conf(mddev); | ||
168 | struct multipath_bh * mp_bh; | ||
169 | struct multipath_info *multipath; | ||
170 | |||
171 | mp_bh = mempool_alloc(conf->pool, GFP_NOIO); | ||
172 | |||
173 | mp_bh->master_bio = bio; | ||
174 | mp_bh->mddev = mddev; | ||
175 | |||
176 | if (bio_data_dir(bio)==WRITE) { | ||
177 | disk_stat_inc(mddev->gendisk, writes); | ||
178 | disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); | ||
179 | } else { | ||
180 | disk_stat_inc(mddev->gendisk, reads); | ||
181 | disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio)); | ||
182 | } | ||
183 | |||
184 | mp_bh->path = multipath_map(conf); | ||
185 | if (mp_bh->path < 0) { | ||
186 | bio_endio(bio, bio->bi_size, -EIO); | ||
187 | mempool_free(mp_bh, conf->pool); | ||
188 | return 0; | ||
189 | } | ||
190 | multipath = conf->multipaths + mp_bh->path; | ||
191 | |||
192 | mp_bh->bio = *bio; | ||
193 | mp_bh->bio.bi_sector += multipath->rdev->data_offset; | ||
194 | mp_bh->bio.bi_bdev = multipath->rdev->bdev; | ||
195 | mp_bh->bio.bi_rw |= (1 << BIO_RW_FAILFAST); | ||
196 | mp_bh->bio.bi_end_io = multipath_end_request; | ||
197 | mp_bh->bio.bi_private = mp_bh; | ||
198 | generic_make_request(&mp_bh->bio); | ||
199 | return 0; | ||
200 | } | ||
201 | |||
202 | static void multipath_status (struct seq_file *seq, mddev_t *mddev) | ||
203 | { | ||
204 | multipath_conf_t *conf = mddev_to_conf(mddev); | ||
205 | int i; | ||
206 | |||
207 | seq_printf (seq, " [%d/%d] [", conf->raid_disks, | ||
208 | conf->working_disks); | ||
209 | for (i = 0; i < conf->raid_disks; i++) | ||
210 | seq_printf (seq, "%s", | ||
211 | conf->multipaths[i].rdev && | ||
212 | conf->multipaths[i].rdev->in_sync ? "U" : "_"); | ||
213 | seq_printf (seq, "]"); | ||
214 | } | ||
215 | |||
216 | static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk, | ||
217 | sector_t *error_sector) | ||
218 | { | ||
219 | mddev_t *mddev = q->queuedata; | ||
220 | multipath_conf_t *conf = mddev_to_conf(mddev); | ||
221 | int i, ret = 0; | ||
222 | |||
223 | rcu_read_lock(); | ||
224 | for (i=0; i<mddev->raid_disks && ret == 0; i++) { | ||
225 | mdk_rdev_t *rdev = conf->multipaths[i].rdev; | ||
226 | if (rdev && !rdev->faulty) { | ||
227 | struct block_device *bdev = rdev->bdev; | ||
228 | request_queue_t *r_queue = bdev_get_queue(bdev); | ||
229 | |||
230 | if (!r_queue->issue_flush_fn) | ||
231 | ret = -EOPNOTSUPP; | ||
232 | else { | ||
233 | atomic_inc(&rdev->nr_pending); | ||
234 | rcu_read_unlock(); | ||
235 | ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, | ||
236 | error_sector); | ||
237 | rdev_dec_pending(rdev, mddev); | ||
238 | rcu_read_lock(); | ||
239 | } | ||
240 | } | ||
241 | } | ||
242 | rcu_read_unlock(); | ||
243 | return ret; | ||
244 | } | ||
245 | |||
246 | /* | ||
247 | * Careful, this can execute in IRQ contexts as well! | ||
248 | */ | ||
249 | static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) | ||
250 | { | ||
251 | multipath_conf_t *conf = mddev_to_conf(mddev); | ||
252 | |||
253 | if (conf->working_disks <= 1) { | ||
254 | /* | ||
255 | * Uh oh, we can do nothing if this is our last path, but | ||
256 | * first check if this is a queued request for a device | ||
257 | * which has just failed. | ||
258 | */ | ||
259 | printk(KERN_ALERT | ||
260 | "multipath: only one IO path left and IO error.\n"); | ||
261 | /* leave it active... it's all we have */ | ||
262 | } else { | ||
263 | /* | ||
264 | * Mark disk as unusable | ||
265 | */ | ||
266 | if (!rdev->faulty) { | ||
267 | char b[BDEVNAME_SIZE]; | ||
268 | rdev->in_sync = 0; | ||
269 | rdev->faulty = 1; | ||
270 | mddev->sb_dirty = 1; | ||
271 | conf->working_disks--; | ||
272 | printk(KERN_ALERT "multipath: IO failure on %s," | ||
273 | " disabling IO path. \n Operation continuing" | ||
274 | " on %d IO paths.\n", | ||
275 | bdevname (rdev->bdev,b), | ||
276 | conf->working_disks); | ||
277 | } | ||
278 | } | ||
279 | } | ||
280 | |||
281 | static void print_multipath_conf (multipath_conf_t *conf) | ||
282 | { | ||
283 | int i; | ||
284 | struct multipath_info *tmp; | ||
285 | |||
286 | printk("MULTIPATH conf printout:\n"); | ||
287 | if (!conf) { | ||
288 | printk("(conf==NULL)\n"); | ||
289 | return; | ||
290 | } | ||
291 | printk(" --- wd:%d rd:%d\n", conf->working_disks, | ||
292 | conf->raid_disks); | ||
293 | |||
294 | for (i = 0; i < conf->raid_disks; i++) { | ||
295 | char b[BDEVNAME_SIZE]; | ||
296 | tmp = conf->multipaths + i; | ||
297 | if (tmp->rdev) | ||
298 | printk(" disk%d, o:%d, dev:%s\n", | ||
299 | i,!tmp->rdev->faulty, | ||
300 | bdevname(tmp->rdev->bdev,b)); | ||
301 | } | ||
302 | } | ||
303 | |||
304 | |||
305 | static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | ||
306 | { | ||
307 | multipath_conf_t *conf = mddev->private; | ||
308 | int found = 0; | ||
309 | int path; | ||
310 | struct multipath_info *p; | ||
311 | |||
312 | print_multipath_conf(conf); | ||
313 | |||
314 | for (path=0; path<mddev->raid_disks; path++) | ||
315 | if ((p=conf->multipaths+path)->rdev == NULL) { | ||
316 | blk_queue_stack_limits(mddev->queue, | ||
317 | rdev->bdev->bd_disk->queue); | ||
318 | |||
319 | /* as we don't honour merge_bvec_fn, we must never risk | ||
320 | * violating it, so limit ->max_sectors to one PAGE, as | ||
321 | * a one page request is never in violation. | ||
322 | * (Note: it is very unlikely that a device with | ||
323 | * merge_bvec_fn will be involved in multipath.) | ||
324 | */ | ||
325 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | ||
326 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | ||
327 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | ||
328 | |||
329 | conf->working_disks++; | ||
330 | rdev->raid_disk = path; | ||
331 | rdev->in_sync = 1; | ||
332 | p->rdev = rdev; | ||
333 | found = 1; | ||
334 | } | ||
335 | |||
336 | print_multipath_conf(conf); | ||
337 | return found; | ||
338 | } | ||
339 | |||
340 | static int multipath_remove_disk(mddev_t *mddev, int number) | ||
341 | { | ||
342 | multipath_conf_t *conf = mddev->private; | ||
343 | int err = 0; | ||
344 | mdk_rdev_t *rdev; | ||
345 | struct multipath_info *p = conf->multipaths + number; | ||
346 | |||
347 | print_multipath_conf(conf); | ||
348 | |||
349 | rdev = p->rdev; | ||
350 | if (rdev) { | ||
351 | if (rdev->in_sync || | ||
352 | atomic_read(&rdev->nr_pending)) { | ||
353 | printk(KERN_ERR "hot-remove-disk, slot %d is identified" " but is still operational!\n", number); | ||
354 | err = -EBUSY; | ||
355 | goto abort; | ||
356 | } | ||
357 | p->rdev = NULL; | ||
358 | synchronize_kernel(); | ||
359 | if (atomic_read(&rdev->nr_pending)) { | ||
360 | /* lost the race, try later */ | ||
361 | err = -EBUSY; | ||
362 | p->rdev = rdev; | ||
363 | } | ||
364 | } | ||
365 | abort: | ||
366 | |||
367 | print_multipath_conf(conf); | ||
368 | return err; | ||
369 | } | ||
370 | |||
371 | |||
372 | |||
373 | /* | ||
374 | * This is a kernel thread which: | ||
375 | * | ||
376 | * 1. Retries failed read operations on working multipaths. | ||
377 | * 2. Updates the raid superblock when problems are encountered. | ||
378 | * 3. Performs writes following reads for array synchronising. | ||
379 | */ | ||
380 | |||
381 | static void multipathd (mddev_t *mddev) | ||
382 | { | ||
383 | struct multipath_bh *mp_bh; | ||
384 | struct bio *bio; | ||
385 | unsigned long flags; | ||
386 | multipath_conf_t *conf = mddev_to_conf(mddev); | ||
387 | struct list_head *head = &conf->retry_list; | ||
388 | |||
389 | md_check_recovery(mddev); | ||
390 | for (;;) { | ||
391 | char b[BDEVNAME_SIZE]; | ||
392 | spin_lock_irqsave(&conf->device_lock, flags); | ||
393 | if (list_empty(head)) | ||
394 | break; | ||
395 | mp_bh = list_entry(head->prev, struct multipath_bh, retry_list); | ||
396 | list_del(head->prev); | ||
397 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
398 | |||
399 | bio = &mp_bh->bio; | ||
400 | bio->bi_sector = mp_bh->master_bio->bi_sector; | ||
401 | |||
402 | if ((mp_bh->path = multipath_map (conf))<0) { | ||
403 | printk(KERN_ALERT "multipath: %s: unrecoverable IO read" | ||
404 | " error for block %llu\n", | ||
405 | bdevname(bio->bi_bdev,b), | ||
406 | (unsigned long long)bio->bi_sector); | ||
407 | multipath_end_bh_io(mp_bh, -EIO); | ||
408 | } else { | ||
409 | printk(KERN_ERR "multipath: %s: redirecting sector %llu" | ||
410 | " to another IO path\n", | ||
411 | bdevname(bio->bi_bdev,b), | ||
412 | (unsigned long long)bio->bi_sector); | ||
413 | *bio = *(mp_bh->master_bio); | ||
414 | bio->bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset; | ||
415 | bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev; | ||
416 | bio->bi_rw |= (1 << BIO_RW_FAILFAST); | ||
417 | bio->bi_end_io = multipath_end_request; | ||
418 | bio->bi_private = mp_bh; | ||
419 | generic_make_request(bio); | ||
420 | } | ||
421 | } | ||
422 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
423 | } | ||
424 | |||
425 | static int multipath_run (mddev_t *mddev) | ||
426 | { | ||
427 | multipath_conf_t *conf; | ||
428 | int disk_idx; | ||
429 | struct multipath_info *disk; | ||
430 | mdk_rdev_t *rdev; | ||
431 | struct list_head *tmp; | ||
432 | |||
433 | if (mddev->level != LEVEL_MULTIPATH) { | ||
434 | printk("multipath: %s: raid level not set to multipath IO (%d)\n", | ||
435 | mdname(mddev), mddev->level); | ||
436 | goto out; | ||
437 | } | ||
438 | /* | ||
439 | * copy the already verified devices into our private MULTIPATH | ||
440 | * bookkeeping area. [whatever we allocate in multipath_run(), | ||
441 | * should be freed in multipath_stop()] | ||
442 | */ | ||
443 | |||
444 | conf = kmalloc(sizeof(multipath_conf_t), GFP_KERNEL); | ||
445 | mddev->private = conf; | ||
446 | if (!conf) { | ||
447 | printk(KERN_ERR | ||
448 | "multipath: couldn't allocate memory for %s\n", | ||
449 | mdname(mddev)); | ||
450 | goto out; | ||
451 | } | ||
452 | memset(conf, 0, sizeof(*conf)); | ||
453 | |||
454 | conf->multipaths = kmalloc(sizeof(struct multipath_info)*mddev->raid_disks, | ||
455 | GFP_KERNEL); | ||
456 | if (!conf->multipaths) { | ||
457 | printk(KERN_ERR | ||
458 | "multipath: couldn't allocate memory for %s\n", | ||
459 | mdname(mddev)); | ||
460 | goto out_free_conf; | ||
461 | } | ||
462 | memset(conf->multipaths, 0, sizeof(struct multipath_info)*mddev->raid_disks); | ||
463 | |||
464 | mddev->queue->unplug_fn = multipath_unplug; | ||
465 | |||
466 | mddev->queue->issue_flush_fn = multipath_issue_flush; | ||
467 | |||
468 | conf->working_disks = 0; | ||
469 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
470 | disk_idx = rdev->raid_disk; | ||
471 | if (disk_idx < 0 || | ||
472 | disk_idx >= mddev->raid_disks) | ||
473 | continue; | ||
474 | |||
475 | disk = conf->multipaths + disk_idx; | ||
476 | disk->rdev = rdev; | ||
477 | |||
478 | blk_queue_stack_limits(mddev->queue, | ||
479 | rdev->bdev->bd_disk->queue); | ||
480 | /* as we don't honour merge_bvec_fn, we must never risk | ||
481 | * violating it, not that we ever expect a device with | ||
482 | * a merge_bvec_fn to be involved in multipath */ | ||
483 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | ||
484 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | ||
485 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | ||
486 | |||
487 | if (!rdev->faulty) | ||
488 | conf->working_disks++; | ||
489 | } | ||
490 | |||
491 | conf->raid_disks = mddev->raid_disks; | ||
492 | mddev->sb_dirty = 1; | ||
493 | conf->mddev = mddev; | ||
494 | spin_lock_init(&conf->device_lock); | ||
495 | INIT_LIST_HEAD(&conf->retry_list); | ||
496 | |||
497 | if (!conf->working_disks) { | ||
498 | printk(KERN_ERR "multipath: no operational IO paths for %s\n", | ||
499 | mdname(mddev)); | ||
500 | goto out_free_conf; | ||
501 | } | ||
502 | mddev->degraded = conf->raid_disks = conf->working_disks; | ||
503 | |||
504 | conf->pool = mempool_create(NR_RESERVED_BUFS, | ||
505 | mp_pool_alloc, mp_pool_free, | ||
506 | NULL); | ||
507 | if (conf->pool == NULL) { | ||
508 | printk(KERN_ERR | ||
509 | "multipath: couldn't allocate memory for %s\n", | ||
510 | mdname(mddev)); | ||
511 | goto out_free_conf; | ||
512 | } | ||
513 | |||
514 | { | ||
515 | mddev->thread = md_register_thread(multipathd, mddev, "%s_multipath"); | ||
516 | if (!mddev->thread) { | ||
517 | printk(KERN_ERR "multipath: couldn't allocate thread" | ||
518 | " for %s\n", mdname(mddev)); | ||
519 | goto out_free_conf; | ||
520 | } | ||
521 | } | ||
522 | |||
523 | printk(KERN_INFO | ||
524 | "multipath: array %s active with %d out of %d IO paths\n", | ||
525 | mdname(mddev), conf->working_disks, mddev->raid_disks); | ||
526 | /* | ||
527 | * Ok, everything is just fine now | ||
528 | */ | ||
529 | mddev->array_size = mddev->size; | ||
530 | return 0; | ||
531 | |||
532 | out_free_conf: | ||
533 | if (conf->pool) | ||
534 | mempool_destroy(conf->pool); | ||
535 | if (conf->multipaths) | ||
536 | kfree(conf->multipaths); | ||
537 | kfree(conf); | ||
538 | mddev->private = NULL; | ||
539 | out: | ||
540 | return -EIO; | ||
541 | } | ||
542 | |||
543 | |||
544 | static int multipath_stop (mddev_t *mddev) | ||
545 | { | ||
546 | multipath_conf_t *conf = mddev_to_conf(mddev); | ||
547 | |||
548 | md_unregister_thread(mddev->thread); | ||
549 | mddev->thread = NULL; | ||
550 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | ||
551 | mempool_destroy(conf->pool); | ||
552 | kfree(conf->multipaths); | ||
553 | kfree(conf); | ||
554 | mddev->private = NULL; | ||
555 | return 0; | ||
556 | } | ||
557 | |||
558 | static mdk_personality_t multipath_personality= | ||
559 | { | ||
560 | .name = "multipath", | ||
561 | .owner = THIS_MODULE, | ||
562 | .make_request = multipath_make_request, | ||
563 | .run = multipath_run, | ||
564 | .stop = multipath_stop, | ||
565 | .status = multipath_status, | ||
566 | .error_handler = multipath_error, | ||
567 | .hot_add_disk = multipath_add_disk, | ||
568 | .hot_remove_disk= multipath_remove_disk, | ||
569 | }; | ||
570 | |||
571 | static int __init multipath_init (void) | ||
572 | { | ||
573 | return register_md_personality (MULTIPATH, &multipath_personality); | ||
574 | } | ||
575 | |||
576 | static void __exit multipath_exit (void) | ||
577 | { | ||
578 | unregister_md_personality (MULTIPATH); | ||
579 | } | ||
580 | |||
581 | module_init(multipath_init); | ||
582 | module_exit(multipath_exit); | ||
583 | MODULE_LICENSE("GPL"); | ||
584 | MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ | ||
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c new file mode 100644 index 000000000000..e7d934eca06f --- /dev/null +++ b/drivers/md/raid0.c | |||
@@ -0,0 +1,539 @@ | |||
1 | /* | ||
2 | raid0.c : Multiple Devices driver for Linux | ||
3 | Copyright (C) 1994-96 Marc ZYNGIER | ||
4 | <zyngier@ufr-info-p7.ibp.fr> or | ||
5 | <maz@gloups.fdn.fr> | ||
6 | Copyright (C) 1999, 2000 Ingo Molnar, Red Hat | ||
7 | |||
8 | |||
9 | RAID-0 management functions. | ||
10 | |||
11 | This program is free software; you can redistribute it and/or modify | ||
12 | it under the terms of the GNU General Public License as published by | ||
13 | the Free Software Foundation; either version 2, or (at your option) | ||
14 | any later version. | ||
15 | |||
16 | You should have received a copy of the GNU General Public License | ||
17 | (for example /usr/src/linux/COPYING); if not, write to the Free | ||
18 | Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
19 | */ | ||
20 | |||
21 | #include <linux/module.h> | ||
22 | #include <linux/raid/raid0.h> | ||
23 | |||
24 | #define MAJOR_NR MD_MAJOR | ||
25 | #define MD_DRIVER | ||
26 | #define MD_PERSONALITY | ||
27 | |||
28 | static void raid0_unplug(request_queue_t *q) | ||
29 | { | ||
30 | mddev_t *mddev = q->queuedata; | ||
31 | raid0_conf_t *conf = mddev_to_conf(mddev); | ||
32 | mdk_rdev_t **devlist = conf->strip_zone[0].dev; | ||
33 | int i; | ||
34 | |||
35 | for (i=0; i<mddev->raid_disks; i++) { | ||
36 | request_queue_t *r_queue = bdev_get_queue(devlist[i]->bdev); | ||
37 | |||
38 | if (r_queue->unplug_fn) | ||
39 | r_queue->unplug_fn(r_queue); | ||
40 | } | ||
41 | } | ||
42 | |||
43 | static int raid0_issue_flush(request_queue_t *q, struct gendisk *disk, | ||
44 | sector_t *error_sector) | ||
45 | { | ||
46 | mddev_t *mddev = q->queuedata; | ||
47 | raid0_conf_t *conf = mddev_to_conf(mddev); | ||
48 | mdk_rdev_t **devlist = conf->strip_zone[0].dev; | ||
49 | int i, ret = 0; | ||
50 | |||
51 | for (i=0; i<mddev->raid_disks && ret == 0; i++) { | ||
52 | struct block_device *bdev = devlist[i]->bdev; | ||
53 | request_queue_t *r_queue = bdev_get_queue(bdev); | ||
54 | |||
55 | if (!r_queue->issue_flush_fn) | ||
56 | ret = -EOPNOTSUPP; | ||
57 | else | ||
58 | ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); | ||
59 | } | ||
60 | return ret; | ||
61 | } | ||
62 | |||
63 | |||
64 | static int create_strip_zones (mddev_t *mddev) | ||
65 | { | ||
66 | int i, c, j; | ||
67 | sector_t current_offset, curr_zone_offset; | ||
68 | sector_t min_spacing; | ||
69 | raid0_conf_t *conf = mddev_to_conf(mddev); | ||
70 | mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev; | ||
71 | struct list_head *tmp1, *tmp2; | ||
72 | struct strip_zone *zone; | ||
73 | int cnt; | ||
74 | char b[BDEVNAME_SIZE]; | ||
75 | |||
76 | /* | ||
77 | * The number of 'same size groups' | ||
78 | */ | ||
79 | conf->nr_strip_zones = 0; | ||
80 | |||
81 | ITERATE_RDEV(mddev,rdev1,tmp1) { | ||
82 | printk("raid0: looking at %s\n", | ||
83 | bdevname(rdev1->bdev,b)); | ||
84 | c = 0; | ||
85 | ITERATE_RDEV(mddev,rdev2,tmp2) { | ||
86 | printk("raid0: comparing %s(%llu)", | ||
87 | bdevname(rdev1->bdev,b), | ||
88 | (unsigned long long)rdev1->size); | ||
89 | printk(" with %s(%llu)\n", | ||
90 | bdevname(rdev2->bdev,b), | ||
91 | (unsigned long long)rdev2->size); | ||
92 | if (rdev2 == rdev1) { | ||
93 | printk("raid0: END\n"); | ||
94 | break; | ||
95 | } | ||
96 | if (rdev2->size == rdev1->size) | ||
97 | { | ||
98 | /* | ||
99 | * Not unique, don't count it as a new | ||
100 | * group | ||
101 | */ | ||
102 | printk("raid0: EQUAL\n"); | ||
103 | c = 1; | ||
104 | break; | ||
105 | } | ||
106 | printk("raid0: NOT EQUAL\n"); | ||
107 | } | ||
108 | if (!c) { | ||
109 | printk("raid0: ==> UNIQUE\n"); | ||
110 | conf->nr_strip_zones++; | ||
111 | printk("raid0: %d zones\n", conf->nr_strip_zones); | ||
112 | } | ||
113 | } | ||
114 | printk("raid0: FINAL %d zones\n", conf->nr_strip_zones); | ||
115 | |||
116 | conf->strip_zone = kmalloc(sizeof(struct strip_zone)* | ||
117 | conf->nr_strip_zones, GFP_KERNEL); | ||
118 | if (!conf->strip_zone) | ||
119 | return 1; | ||
120 | conf->devlist = kmalloc(sizeof(mdk_rdev_t*)* | ||
121 | conf->nr_strip_zones*mddev->raid_disks, | ||
122 | GFP_KERNEL); | ||
123 | if (!conf->devlist) | ||
124 | return 1; | ||
125 | |||
126 | memset(conf->strip_zone, 0,sizeof(struct strip_zone)* | ||
127 | conf->nr_strip_zones); | ||
128 | memset(conf->devlist, 0, | ||
129 | sizeof(mdk_rdev_t*) * conf->nr_strip_zones * mddev->raid_disks); | ||
130 | |||
131 | /* The first zone must contain all devices, so here we check that | ||
132 | * there is a proper alignment of slots to devices and find them all | ||
133 | */ | ||
134 | zone = &conf->strip_zone[0]; | ||
135 | cnt = 0; | ||
136 | smallest = NULL; | ||
137 | zone->dev = conf->devlist; | ||
138 | ITERATE_RDEV(mddev, rdev1, tmp1) { | ||
139 | int j = rdev1->raid_disk; | ||
140 | |||
141 | if (j < 0 || j >= mddev->raid_disks) { | ||
142 | printk("raid0: bad disk number %d - aborting!\n", j); | ||
143 | goto abort; | ||
144 | } | ||
145 | if (zone->dev[j]) { | ||
146 | printk("raid0: multiple devices for %d - aborting!\n", | ||
147 | j); | ||
148 | goto abort; | ||
149 | } | ||
150 | zone->dev[j] = rdev1; | ||
151 | |||
152 | blk_queue_stack_limits(mddev->queue, | ||
153 | rdev1->bdev->bd_disk->queue); | ||
154 | /* as we don't honour merge_bvec_fn, we must never risk | ||
155 | * violating it, so limit ->max_sectors to one PAGE, as | ||
156 | * a one page request is never in violation. | ||
157 | */ | ||
158 | |||
159 | if (rdev1->bdev->bd_disk->queue->merge_bvec_fn && | ||
160 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | ||
161 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | ||
162 | |||
163 | if (!smallest || (rdev1->size <smallest->size)) | ||
164 | smallest = rdev1; | ||
165 | cnt++; | ||
166 | } | ||
167 | if (cnt != mddev->raid_disks) { | ||
168 | printk("raid0: too few disks (%d of %d) - aborting!\n", | ||
169 | cnt, mddev->raid_disks); | ||
170 | goto abort; | ||
171 | } | ||
172 | zone->nb_dev = cnt; | ||
173 | zone->size = smallest->size * cnt; | ||
174 | zone->zone_offset = 0; | ||
175 | |||
176 | current_offset = smallest->size; | ||
177 | curr_zone_offset = zone->size; | ||
178 | |||
179 | /* now do the other zones */ | ||
180 | for (i = 1; i < conf->nr_strip_zones; i++) | ||
181 | { | ||
182 | zone = conf->strip_zone + i; | ||
183 | zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks; | ||
184 | |||
185 | printk("raid0: zone %d\n", i); | ||
186 | zone->dev_offset = current_offset; | ||
187 | smallest = NULL; | ||
188 | c = 0; | ||
189 | |||
190 | for (j=0; j<cnt; j++) { | ||
191 | char b[BDEVNAME_SIZE]; | ||
192 | rdev = conf->strip_zone[0].dev[j]; | ||
193 | printk("raid0: checking %s ...", bdevname(rdev->bdev,b)); | ||
194 | if (rdev->size > current_offset) | ||
195 | { | ||
196 | printk(" contained as device %d\n", c); | ||
197 | zone->dev[c] = rdev; | ||
198 | c++; | ||
199 | if (!smallest || (rdev->size <smallest->size)) { | ||
200 | smallest = rdev; | ||
201 | printk(" (%llu) is smallest!\n", | ||
202 | (unsigned long long)rdev->size); | ||
203 | } | ||
204 | } else | ||
205 | printk(" nope.\n"); | ||
206 | } | ||
207 | |||
208 | zone->nb_dev = c; | ||
209 | zone->size = (smallest->size - current_offset) * c; | ||
210 | printk("raid0: zone->nb_dev: %d, size: %llu\n", | ||
211 | zone->nb_dev, (unsigned long long)zone->size); | ||
212 | |||
213 | zone->zone_offset = curr_zone_offset; | ||
214 | curr_zone_offset += zone->size; | ||
215 | |||
216 | current_offset = smallest->size; | ||
217 | printk("raid0: current zone offset: %llu\n", | ||
218 | (unsigned long long)current_offset); | ||
219 | } | ||
220 | |||
221 | /* Now find appropriate hash spacing. | ||
222 | * We want a number which causes most hash entries to cover | ||
223 | * at most two strips, but the hash table must be at most | ||
224 | * 1 PAGE. We choose the smallest strip, or contiguous collection | ||
226 | * of strips, that is big enough. We never consider the last | ||
227 | * strip though, as its size has no bearing on the efficacy of the hash | ||
227 | * table. | ||
228 | */ | ||
229 | conf->hash_spacing = curr_zone_offset; | ||
230 | min_spacing = curr_zone_offset; | ||
231 | sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*)); | ||
232 | for (i=0; i < conf->nr_strip_zones-1; i++) { | ||
233 | sector_t sz = 0; | ||
234 | for (j=i; j<conf->nr_strip_zones-1 && | ||
235 | sz < min_spacing ; j++) | ||
236 | sz += conf->strip_zone[j].size; | ||
237 | if (sz >= min_spacing && sz < conf->hash_spacing) | ||
238 | conf->hash_spacing = sz; | ||
239 | } | ||
240 | |||
241 | mddev->queue->unplug_fn = raid0_unplug; | ||
242 | |||
243 | mddev->queue->issue_flush_fn = raid0_issue_flush; | ||
244 | |||
245 | printk("raid0: done.\n"); | ||
246 | return 0; | ||
247 | abort: | ||
248 | return 1; | ||
249 | } | ||
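/*
 * Editor's note (not part of the original source), a worked layout for the
 * zone construction above with hypothetical sizes: two disks of 100 and 150
 * blocks give two "same size groups".  Zone 0 spans both devices with
 * size = 100 * 2 = 200 blocks starting at zone_offset 0; zone 1 contains
 * only the larger disk, dev_offset = 100, size = (150 - 100) * 1 = 50
 * blocks at zone_offset 200, so the array exports 250 blocks in total.
 */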
250 | |||
251 | /** | ||
252 | * raid0_mergeable_bvec -- tell the bio layer whether two requests can be merged | ||
253 | * @q: request queue | ||
254 | * @bio: the bio that has been built up so far | ||
255 | * @biovec: the bio_vec that could be merged to it. | ||
256 | * | ||
257 | * Return amount of bytes we can accept at this offset | ||
258 | */ | ||
259 | static int raid0_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *biovec) | ||
260 | { | ||
261 | mddev_t *mddev = q->queuedata; | ||
262 | sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); | ||
263 | int max; | ||
264 | unsigned int chunk_sectors = mddev->chunk_size >> 9; | ||
265 | unsigned int bio_sectors = bio->bi_size >> 9; | ||
266 | |||
267 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; | ||
268 | if (max < 0) max = 0; /* bio_add cannot handle a negative return */ | ||
269 | if (max <= biovec->bv_len && bio_sectors == 0) | ||
270 | return biovec->bv_len; | ||
271 | else | ||
272 | return max; | ||
273 | } | ||
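/*
 * Editor's note (not part of the original source), a worked example of the
 * boundary check above: with 64KiB chunks (chunk_sectors = 128) and a bio
 * that starts 100 sectors into a chunk and already carries 20 sectors,
 * max = (128 - (100 + 20)) << 9 = 4096 bytes, so one more page is accepted
 * before the request would straddle the chunk boundary.  An empty bio is
 * always allowed its first biovec even across the boundary; such a bio is
 * later split by raid0_make_request().
 */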
274 | |||
275 | static int raid0_run (mddev_t *mddev) | ||
276 | { | ||
277 | unsigned cur=0, i=0, nb_zone; | ||
278 | s64 size; | ||
279 | raid0_conf_t *conf; | ||
280 | mdk_rdev_t *rdev; | ||
281 | struct list_head *tmp; | ||
282 | |||
283 | printk("%s: setting max_sectors to %d, segment boundary to %d\n", | ||
284 | mdname(mddev), | ||
285 | mddev->chunk_size >> 9, | ||
286 | (mddev->chunk_size>>1)-1); | ||
287 | blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9); | ||
288 | blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1); | ||
289 | |||
290 | conf = kmalloc(sizeof (raid0_conf_t), GFP_KERNEL); | ||
291 | if (!conf) | ||
292 | goto out; | ||
293 | mddev->private = (void *)conf; | ||
294 | |||
295 | conf->strip_zone = NULL; | ||
296 | conf->devlist = NULL; | ||
297 | if (create_strip_zones (mddev)) | ||
298 | goto out_free_conf; | ||
299 | |||
300 | /* calculate array device size */ | ||
301 | mddev->array_size = 0; | ||
302 | ITERATE_RDEV(mddev,rdev,tmp) | ||
303 | mddev->array_size += rdev->size; | ||
304 | |||
305 | printk("raid0 : md_size is %llu blocks.\n", | ||
306 | (unsigned long long)mddev->array_size); | ||
307 | printk("raid0 : conf->hash_spacing is %llu blocks.\n", | ||
308 | (unsigned long long)conf->hash_spacing); | ||
309 | { | ||
310 | #if __GNUC__ < 3 | ||
311 | volatile | ||
312 | #endif | ||
313 | sector_t s = mddev->array_size; | ||
314 | sector_t space = conf->hash_spacing; | ||
315 | int round; | ||
316 | conf->preshift = 0; | ||
317 | if (sizeof(sector_t) > sizeof(unsigned long)) { | ||
318 | /*shift down space and s so that sector_div will work */ | ||
319 | while (space > (sector_t) (~(unsigned long)0)) { | ||
320 | s >>= 1; | ||
321 | space >>= 1; | ||
322 | s += 1; /* force round-up */ | ||
323 | conf->preshift++; | ||
324 | } | ||
325 | } | ||
326 | round = sector_div(s, (unsigned long)space) ? 1 : 0; | ||
327 | nb_zone = s + round; | ||
328 | } | ||
329 | printk("raid0 : nb_zone is %d.\n", nb_zone); | ||
330 | |||
331 | printk("raid0 : Allocating %Zd bytes for hash.\n", | ||
332 | nb_zone*sizeof(struct strip_zone*)); | ||
333 | conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL); | ||
334 | if (!conf->hash_table) | ||
335 | goto out_free_conf; | ||
336 | size = conf->strip_zone[cur].size; | ||
337 | |||
338 | for (i=0; i< nb_zone; i++) { | ||
339 | conf->hash_table[i] = conf->strip_zone + cur; | ||
340 | while (size <= conf->hash_spacing) { | ||
341 | cur++; | ||
342 | size += conf->strip_zone[cur].size; | ||
343 | } | ||
344 | size -= conf->hash_spacing; | ||
345 | } | ||
346 | if (conf->preshift) { | ||
347 | conf->hash_spacing >>= conf->preshift; | ||
348 | /* round hash_spacing up so when we divide by it, we | ||
349 | * err on the side of too-low, which is safest | ||
350 | */ | ||
351 | conf->hash_spacing++; | ||
352 | } | ||
353 | |||
354 | /* calculate the max read-ahead size. | ||
355 | * For read-ahead of large files to be effective, we need to | ||
356 | * read ahead at least twice a whole stripe, i.e. number of devices | ||
357 | * multiplied by chunk size times 2. | ||
358 | * If an individual device has an ra_pages greater than the | ||
359 | * chunk size, then we will not drive that device as hard as it | ||
360 | * wants. We consider this a configuration error: a larger | ||
361 | * chunksize should be used in that case. | ||
362 | */ | ||
363 | { | ||
364 | int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_CACHE_SIZE; | ||
365 | if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) | ||
366 | mddev->queue->backing_dev_info.ra_pages = 2* stripe; | ||
367 | } | ||
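	/*
	 * Editor's note (not part of the original source): with hypothetical
	 * values of 4 disks, a 64KiB chunk and a 4KiB PAGE_CACHE_SIZE,
	 * stripe = 4 * 65536 / 4096 = 64 pages, so ra_pages is raised to at
	 * least 128 pages (512KiB), i.e. two full stripes of read-ahead.
	 */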
368 | |||
369 | |||
370 | blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); | ||
371 | return 0; | ||
372 | |||
373 | out_free_conf: | ||
374 | if (conf->strip_zone) | ||
375 | kfree(conf->strip_zone); | ||
376 | if (conf->devlist) | ||
377 | kfree (conf->devlist); | ||
378 | kfree(conf); | ||
379 | mddev->private = NULL; | ||
380 | out: | ||
381 | return 1; | ||
382 | } | ||
383 | |||
384 | static int raid0_stop (mddev_t *mddev) | ||
385 | { | ||
386 | raid0_conf_t *conf = mddev_to_conf(mddev); | ||
387 | |||
388 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | ||
389 | kfree (conf->hash_table); | ||
390 | conf->hash_table = NULL; | ||
391 | kfree (conf->strip_zone); | ||
392 | conf->strip_zone = NULL; | ||
393 | kfree (conf); | ||
394 | mddev->private = NULL; | ||
395 | |||
396 | return 0; | ||
397 | } | ||
398 | |||
399 | static int raid0_make_request (request_queue_t *q, struct bio *bio) | ||
400 | { | ||
401 | mddev_t *mddev = q->queuedata; | ||
402 | unsigned int sect_in_chunk, chunksize_bits, chunk_size, chunk_sects; | ||
403 | raid0_conf_t *conf = mddev_to_conf(mddev); | ||
404 | struct strip_zone *zone; | ||
405 | mdk_rdev_t *tmp_dev; | ||
406 | unsigned long chunk; | ||
407 | sector_t block, rsect; | ||
408 | |||
409 | if (bio_data_dir(bio)==WRITE) { | ||
410 | disk_stat_inc(mddev->gendisk, writes); | ||
411 | disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); | ||
412 | } else { | ||
413 | disk_stat_inc(mddev->gendisk, reads); | ||
414 | disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio)); | ||
415 | } | ||
416 | |||
417 | chunk_size = mddev->chunk_size >> 10; | ||
418 | chunk_sects = mddev->chunk_size >> 9; | ||
419 | chunksize_bits = ffz(~chunk_size); | ||
420 | block = bio->bi_sector >> 1; | ||
421 | |||
422 | |||
423 | if (unlikely(chunk_sects < (bio->bi_sector & (chunk_sects - 1)) + (bio->bi_size >> 9))) { | ||
424 | struct bio_pair *bp; | ||
425 | /* Sanity check -- queue functions should prevent this happening */ | ||
426 | if (bio->bi_vcnt != 1 || | ||
427 | bio->bi_idx != 0) | ||
428 | goto bad_map; | ||
429 | /* This is a one page bio that upper layers | ||
430 | * refuse to split for us, so we need to split it. | ||
431 | */ | ||
432 | bp = bio_split(bio, bio_split_pool, chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); | ||
433 | if (raid0_make_request(q, &bp->bio1)) | ||
434 | generic_make_request(&bp->bio1); | ||
435 | if (raid0_make_request(q, &bp->bio2)) | ||
436 | generic_make_request(&bp->bio2); | ||
437 | |||
438 | bio_pair_release(bp); | ||
439 | return 0; | ||
440 | } | ||
441 | |||
442 | |||
443 | { | ||
444 | #if __GNUC__ < 3 | ||
445 | volatile | ||
446 | #endif | ||
447 | sector_t x = block >> conf->preshift; | ||
448 | sector_div(x, (unsigned long)conf->hash_spacing); | ||
449 | zone = conf->hash_table[x]; | ||
450 | } | ||
451 | |||
452 | while (block >= (zone->zone_offset + zone->size)) | ||
453 | zone++; | ||
454 | |||
455 | sect_in_chunk = bio->bi_sector & ((chunk_size<<1) -1); | ||
456 | |||
457 | |||
458 | { | ||
459 | sector_t x = (block - zone->zone_offset) >> chunksize_bits; | ||
460 | |||
461 | sector_div(x, zone->nb_dev); | ||
462 | chunk = x; | ||
463 | BUG_ON(x != (sector_t)chunk); | ||
464 | |||
465 | x = block >> chunksize_bits; | ||
466 | tmp_dev = zone->dev[sector_div(x, zone->nb_dev)]; | ||
467 | } | ||
468 | rsect = (((chunk << chunksize_bits) + zone->dev_offset)<<1) | ||
469 | + sect_in_chunk; | ||
470 | |||
471 | bio->bi_bdev = tmp_dev->bdev; | ||
472 | bio->bi_sector = rsect + tmp_dev->data_offset; | ||
473 | |||
474 | /* | ||
475 | * Let the main block layer submit the IO and resolve recursion: | ||
476 | */ | ||
477 | return 1; | ||
478 | |||
479 | bad_map: | ||
480 | printk("raid0_make_request bug: can't convert block across chunks" | ||
481 | " or bigger than %dk %llu %d\n", chunk_size, | ||
482 | (unsigned long long)bio->bi_sector, bio->bi_size >> 10); | ||
483 | |||
484 | bio_io_error(bio, bio->bi_size); | ||
485 | return 0; | ||
486 | } | ||
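/*
 * Editor's note (not part of the original source), tracing the mapping above
 * with hypothetical numbers: two equal disks, 64KiB chunks (chunk_size = 64,
 * chunksize_bits = 6, a single zone).  A request at sector 1000 gives
 * block = 500 and sect_in_chunk = 1000 & 127 = 104; x = 500 >> 6 = 7, so
 * chunk = 7 / 2 = 3 on device 7 % 2 = 1, and
 * rsect = ((3 << 6) << 1) + 104 = 488, i.e. sector 488 of the second disk
 * (plus its data_offset).
 */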
487 | |||
488 | static void raid0_status (struct seq_file *seq, mddev_t *mddev) | ||
489 | { | ||
490 | #undef MD_DEBUG | ||
491 | #ifdef MD_DEBUG | ||
492 | int j, k, h; | ||
493 | char b[BDEVNAME_SIZE]; | ||
494 | raid0_conf_t *conf = mddev_to_conf(mddev); | ||
495 | |||
496 | h = 0; | ||
497 | for (j = 0; j < conf->nr_strip_zones; j++) { | ||
498 | seq_printf(seq, " z%d", j); | ||
499 | if (conf->hash_table[h] == conf->strip_zone+j) | ||
500 | seq_printf(seq, "(h%d)", h++); | ||
501 | seq_printf(seq, "=["); | ||
502 | for (k = 0; k < conf->strip_zone[j].nb_dev; k++) | ||
503 | seq_printf (seq, "%s/", bdevname( | ||
504 | conf->strip_zone[j].dev[k]->bdev,b)); | ||
505 | |||
506 | seq_printf (seq, "] zo=%d do=%d s=%d\n", | ||
507 | conf->strip_zone[j].zone_offset, | ||
508 | conf->strip_zone[j].dev_offset, | ||
509 | conf->strip_zone[j].size); | ||
510 | } | ||
511 | #endif | ||
512 | seq_printf(seq, " %dk chunks", mddev->chunk_size/1024); | ||
513 | return; | ||
514 | } | ||
515 | |||
516 | static mdk_personality_t raid0_personality= | ||
517 | { | ||
518 | .name = "raid0", | ||
519 | .owner = THIS_MODULE, | ||
520 | .make_request = raid0_make_request, | ||
521 | .run = raid0_run, | ||
522 | .stop = raid0_stop, | ||
523 | .status = raid0_status, | ||
524 | }; | ||
525 | |||
526 | static int __init raid0_init (void) | ||
527 | { | ||
528 | return register_md_personality (RAID0, &raid0_personality); | ||
529 | } | ||
530 | |||
531 | static void raid0_exit (void) | ||
532 | { | ||
533 | unregister_md_personality (RAID0); | ||
534 | } | ||
535 | |||
536 | module_init(raid0_init); | ||
537 | module_exit(raid0_exit); | ||
538 | MODULE_LICENSE("GPL"); | ||
539 | MODULE_ALIAS("md-personality-2"); /* RAID0 */ | ||
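The address arithmetic in raid0_make_request() above reduces, once the right zone has been found, to a mask, a shift and a division. The stand-alone sketch below restates that mapping under simplifying assumptions: a single zone starting at array offset 0, all quantities in sectors, and a power-of-two chunk size as the driver requires. The names zone_sketch and raid0_map are invented for the illustration and do not come from the driver.

/* Minimal user-space sketch of the raid0 chunk mapping used above.
 * Assumes one zone beginning at sector 0; chunk_sects must be a power of two.
 */
#include <stdio.h>
#include <stdint.h>

struct zone_sketch {
	unsigned int nb_dev;       /* devices in this zone               */
	uint64_t dev_offset;       /* start of the zone on each device   */
	uint64_t zone_offset;      /* array sector where the zone begins */
};

/* Map an array sector to (device index, sector on that device). */
static void raid0_map(const struct zone_sketch *z, unsigned int chunk_sects,
		      uint64_t sector, unsigned int *dev, uint64_t *dev_sector)
{
	uint64_t sect_in_chunk = sector & (chunk_sects - 1);          /* offset inside the chunk */
	uint64_t chunk_no = (sector - z->zone_offset) / chunk_sects;  /* chunk index within zone */

	*dev = chunk_no % z->nb_dev;                    /* chunks round-robin over the devices */
	*dev_sector = z->dev_offset
		    + (chunk_no / z->nb_dev) * chunk_sects  /* full chunks already on this device */
		    + sect_in_chunk;
}

int main(void)
{
	struct zone_sketch z = { .nb_dev = 3, .dev_offset = 0, .zone_offset = 0 };
	unsigned int dev;
	uint64_t dsect;

	raid0_map(&z, 128, 1000, &dev, &dsect);   /* 64k chunks = 128 sectors */
	printf("array sector 1000 -> device %u, sector %llu\n",
	       dev, (unsigned long long)dsect);
	return 0;
}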
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c new file mode 100644 index 000000000000..a389394b52f6 --- /dev/null +++ b/drivers/md/raid1.c | |||
@@ -0,0 +1,1449 @@ | |||
1 | /* | ||
2 | * raid1.c : Multiple Devices driver for Linux | ||
3 | * | ||
4 | * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat | ||
5 | * | ||
6 | * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman | ||
7 | * | ||
8 | * RAID-1 management functions. | ||
9 | * | ||
10 | * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000 | ||
11 | * | ||
12 | * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk> | ||
13 | * Various fixes by Neil Brown <neilb@cse.unsw.edu.au> | ||
14 | * | ||
15 | * This program is free software; you can redistribute it and/or modify | ||
16 | * it under the terms of the GNU General Public License as published by | ||
17 | * the Free Software Foundation; either version 2, or (at your option) | ||
18 | * any later version. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public License | ||
21 | * (for example /usr/src/linux/COPYING); if not, write to the Free | ||
22 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | */ | ||
24 | |||
25 | #include <linux/raid/raid1.h> | ||
26 | |||
27 | /* | ||
28 | * Number of guaranteed r1bios in case of extreme VM load: | ||
29 | */ | ||
30 | #define NR_RAID1_BIOS 256 | ||
31 | |||
32 | static mdk_personality_t raid1_personality; | ||
33 | |||
34 | static void unplug_slaves(mddev_t *mddev); | ||
35 | |||
36 | |||
37 | static void * r1bio_pool_alloc(unsigned int __nocast gfp_flags, void *data) | ||
38 | { | ||
39 | struct pool_info *pi = data; | ||
40 | r1bio_t *r1_bio; | ||
41 | int size = offsetof(r1bio_t, bios[pi->raid_disks]); | ||
42 | |||
43 | /* allocate a r1bio with room for raid_disks entries in the bios array */ | ||
44 | r1_bio = kmalloc(size, gfp_flags); | ||
45 | if (r1_bio) | ||
46 | memset(r1_bio, 0, size); | ||
47 | else | ||
48 | unplug_slaves(pi->mddev); | ||
49 | |||
50 | return r1_bio; | ||
51 | } | ||
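One detail of r1bio_pool_alloc() above deserves a note: the allocation size is offsetof(r1bio_t, bios[pi->raid_disks]), i.e. the header plus exactly raid_disks trailing bio pointers, obtained in a single allocation. The user-space sketch below shows the same sizing idiom with invented names (struct item, item_alloc); like the driver it relies on the compiler accepting a runtime index inside offsetof(), which GCC permits.

/* Sizing a struct with a trailing array via offsetof, as r1bio_pool_alloc
 * does above.  Names are illustrative only; the zero-length array mirrors
 * the kernel style of the era.
 */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct item {
	int nr_slots;
	void *slots[0];        /* trailing array, sized at allocation time */
};

static struct item *item_alloc(int nr)
{
	size_t size = offsetof(struct item, slots[nr]);  /* header + nr pointer slots */
	struct item *it = malloc(size);

	if (it) {
		memset(it, 0, size);
		it->nr_slots = nr;
	}
	return it;
}

int main(void)
{
	struct item *it = item_alloc(4);

	if (!it)
		return 1;
	printf("allocated %zu bytes for %d slots\n",
	       offsetof(struct item, slots[it->nr_slots]), it->nr_slots);
	free(it);
	return 0;
}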
52 | |||
53 | static void r1bio_pool_free(void *r1_bio, void *data) | ||
54 | { | ||
55 | kfree(r1_bio); | ||
56 | } | ||
57 | |||
58 | #define RESYNC_BLOCK_SIZE (64*1024) | ||
59 | //#define RESYNC_BLOCK_SIZE PAGE_SIZE | ||
60 | #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) | ||
61 | #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) | ||
62 | #define RESYNC_WINDOW (2048*1024) | ||
63 | |||
64 | static void * r1buf_pool_alloc(unsigned int __nocast gfp_flags, void *data) | ||
65 | { | ||
66 | struct pool_info *pi = data; | ||
67 | struct page *page; | ||
68 | r1bio_t *r1_bio; | ||
69 | struct bio *bio; | ||
70 | int i, j; | ||
71 | |||
72 | r1_bio = r1bio_pool_alloc(gfp_flags, pi); | ||
73 | if (!r1_bio) { | ||
74 | unplug_slaves(pi->mddev); | ||
75 | return NULL; | ||
76 | } | ||
77 | |||
78 | /* | ||
79 | * Allocate bios : 1 for reading, n-1 for writing | ||
80 | */ | ||
81 | for (j = pi->raid_disks ; j-- ; ) { | ||
82 | bio = bio_alloc(gfp_flags, RESYNC_PAGES); | ||
83 | if (!bio) | ||
84 | goto out_free_bio; | ||
85 | r1_bio->bios[j] = bio; | ||
86 | } | ||
87 | /* | ||
88 | * Allocate RESYNC_PAGES data pages and attach them to | ||
89 | * the first bio; | ||
90 | */ | ||
91 | bio = r1_bio->bios[0]; | ||
92 | for (i = 0; i < RESYNC_PAGES; i++) { | ||
93 | page = alloc_page(gfp_flags); | ||
94 | if (unlikely(!page)) | ||
95 | goto out_free_pages; | ||
96 | |||
97 | bio->bi_io_vec[i].bv_page = page; | ||
98 | } | ||
99 | |||
100 | r1_bio->master_bio = NULL; | ||
101 | |||
102 | return r1_bio; | ||
103 | |||
104 | out_free_pages: | ||
105 | for ( ; i > 0 ; i--) | ||
106 | __free_page(bio->bi_io_vec[i-1].bv_page); | ||
107 | out_free_bio: | ||
108 | while ( ++j < pi->raid_disks ) | ||
109 | bio_put(r1_bio->bios[j]); | ||
110 | r1bio_pool_free(r1_bio, data); | ||
111 | return NULL; | ||
112 | } | ||
113 | |||
114 | static void r1buf_pool_free(void *__r1_bio, void *data) | ||
115 | { | ||
116 | struct pool_info *pi = data; | ||
117 | int i; | ||
118 | r1bio_t *r1bio = __r1_bio; | ||
119 | struct bio *bio = r1bio->bios[0]; | ||
120 | |||
121 | for (i = 0; i < RESYNC_PAGES; i++) { | ||
122 | __free_page(bio->bi_io_vec[i].bv_page); | ||
123 | bio->bi_io_vec[i].bv_page = NULL; | ||
124 | } | ||
125 | for (i=0 ; i < pi->raid_disks; i++) | ||
126 | bio_put(r1bio->bios[i]); | ||
127 | |||
128 | r1bio_pool_free(r1bio, data); | ||
129 | } | ||
130 | |||
131 | static void put_all_bios(conf_t *conf, r1bio_t *r1_bio) | ||
132 | { | ||
133 | int i; | ||
134 | |||
135 | for (i = 0; i < conf->raid_disks; i++) { | ||
136 | struct bio **bio = r1_bio->bios + i; | ||
137 | if (*bio) | ||
138 | bio_put(*bio); | ||
139 | *bio = NULL; | ||
140 | } | ||
141 | } | ||
142 | |||
143 | static inline void free_r1bio(r1bio_t *r1_bio) | ||
144 | { | ||
145 | unsigned long flags; | ||
146 | |||
147 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | ||
148 | |||
149 | /* | ||
150 | * Wake up any possible resync thread that waits for the device | ||
151 | * to go idle. | ||
152 | */ | ||
153 | spin_lock_irqsave(&conf->resync_lock, flags); | ||
154 | if (!--conf->nr_pending) { | ||
155 | wake_up(&conf->wait_idle); | ||
156 | wake_up(&conf->wait_resume); | ||
157 | } | ||
158 | spin_unlock_irqrestore(&conf->resync_lock, flags); | ||
159 | |||
160 | put_all_bios(conf, r1_bio); | ||
161 | mempool_free(r1_bio, conf->r1bio_pool); | ||
162 | } | ||
163 | |||
164 | static inline void put_buf(r1bio_t *r1_bio) | ||
165 | { | ||
166 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | ||
167 | unsigned long flags; | ||
168 | |||
169 | mempool_free(r1_bio, conf->r1buf_pool); | ||
170 | |||
171 | spin_lock_irqsave(&conf->resync_lock, flags); | ||
172 | if (!conf->barrier) | ||
173 | BUG(); | ||
174 | --conf->barrier; | ||
175 | wake_up(&conf->wait_resume); | ||
176 | wake_up(&conf->wait_idle); | ||
177 | |||
178 | if (!--conf->nr_pending) { | ||
179 | wake_up(&conf->wait_idle); | ||
180 | wake_up(&conf->wait_resume); | ||
181 | } | ||
182 | spin_unlock_irqrestore(&conf->resync_lock, flags); | ||
183 | } | ||
184 | |||
185 | static void reschedule_retry(r1bio_t *r1_bio) | ||
186 | { | ||
187 | unsigned long flags; | ||
188 | mddev_t *mddev = r1_bio->mddev; | ||
189 | conf_t *conf = mddev_to_conf(mddev); | ||
190 | |||
191 | spin_lock_irqsave(&conf->device_lock, flags); | ||
192 | list_add(&r1_bio->retry_list, &conf->retry_list); | ||
193 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
194 | |||
195 | md_wakeup_thread(mddev->thread); | ||
196 | } | ||
197 | |||
198 | /* | ||
199 | * raid_end_bio_io() is called when we have finished servicing a mirrored | ||
200 | * operation and are ready to return a success/failure code to the buffer | ||
201 | * cache layer. | ||
202 | */ | ||
203 | static void raid_end_bio_io(r1bio_t *r1_bio) | ||
204 | { | ||
205 | struct bio *bio = r1_bio->master_bio; | ||
206 | |||
207 | bio_endio(bio, bio->bi_size, | ||
208 | test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); | ||
209 | free_r1bio(r1_bio); | ||
210 | } | ||
211 | |||
212 | /* | ||
213 | * Update disk head position estimator based on IRQ completion info. | ||
214 | */ | ||
215 | static inline void update_head_pos(int disk, r1bio_t *r1_bio) | ||
216 | { | ||
217 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | ||
218 | |||
219 | conf->mirrors[disk].head_position = | ||
220 | r1_bio->sector + (r1_bio->sectors); | ||
221 | } | ||
222 | |||
223 | static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int error) | ||
224 | { | ||
225 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
226 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | ||
227 | int mirror; | ||
228 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | ||
229 | |||
230 | if (bio->bi_size) | ||
231 | return 1; | ||
232 | |||
233 | mirror = r1_bio->read_disk; | ||
234 | /* | ||
235 | * this branch is our 'one mirror IO has finished' event handler: | ||
236 | */ | ||
237 | if (!uptodate) | ||
238 | md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); | ||
239 | else | ||
240 | /* | ||
241 | * Set R1BIO_Uptodate in our master bio, so that | ||
242 | * we will return a good error code to the higher | ||
243 | * levels even if IO on some other mirrored buffer fails. | ||
244 | * | ||
245 | * The 'master' represents the composite IO operation to | ||
246 | * user-side. So if something waits for IO, then it will | ||
247 | * wait for the 'master' bio. | ||
248 | */ | ||
249 | set_bit(R1BIO_Uptodate, &r1_bio->state); | ||
250 | |||
251 | update_head_pos(mirror, r1_bio); | ||
252 | |||
253 | /* | ||
254 | * we have only one bio on the read side | ||
255 | */ | ||
256 | if (uptodate) | ||
257 | raid_end_bio_io(r1_bio); | ||
258 | else { | ||
259 | /* | ||
260 | * oops, read error: | ||
261 | */ | ||
262 | char b[BDEVNAME_SIZE]; | ||
263 | if (printk_ratelimit()) | ||
264 | printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", | ||
265 | bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); | ||
266 | reschedule_retry(r1_bio); | ||
267 | } | ||
268 | |||
269 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); | ||
270 | return 0; | ||
271 | } | ||
272 | |||
273 | static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int error) | ||
274 | { | ||
275 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
276 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | ||
277 | int mirror; | ||
278 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | ||
279 | |||
280 | if (bio->bi_size) | ||
281 | return 1; | ||
282 | |||
283 | for (mirror = 0; mirror < conf->raid_disks; mirror++) | ||
284 | if (r1_bio->bios[mirror] == bio) | ||
285 | break; | ||
286 | |||
287 | /* | ||
288 | * this branch is our 'one mirror IO has finished' event handler: | ||
289 | */ | ||
290 | if (!uptodate) | ||
291 | md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); | ||
292 | else | ||
293 | /* | ||
294 | * Set R1BIO_Uptodate in our master bio, so that | ||
295 | * we will return a good error code to the higher | ||
296 | * levels even if IO on some other mirrored buffer fails. | ||
297 | * | ||
298 | * The 'master' represents the composite IO operation to | ||
299 | * user-side. So if something waits for IO, then it will | ||
300 | * wait for the 'master' bio. | ||
301 | */ | ||
302 | set_bit(R1BIO_Uptodate, &r1_bio->state); | ||
303 | |||
304 | update_head_pos(mirror, r1_bio); | ||
305 | |||
306 | /* | ||
307 | * | ||
308 | * Let's see if all mirrored write operations have finished | ||
309 | * already. | ||
310 | */ | ||
311 | if (atomic_dec_and_test(&r1_bio->remaining)) { | ||
312 | md_write_end(r1_bio->mddev); | ||
313 | raid_end_bio_io(r1_bio); | ||
314 | } | ||
315 | |||
316 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); | ||
317 | return 0; | ||
318 | } | ||
319 | |||
320 | |||
321 | /* | ||
322 | * This routine returns the disk from which the requested read should | ||
323 | * be done. There is a per-array 'next expected sequential IO' sector | ||
324 | * number - if this matches on the next IO then we use the last disk. | ||
325 | * There is also a per-disk 'last known head position' sector that is | ||
326 | * maintained from IRQ contexts, both the normal and the resync IO | ||
327 | * completion handlers update this position correctly. If there is no | ||
328 | * perfect sequential match then we pick the disk whose head is closest. | ||
329 | * | ||
330 | * If there are 2 mirrors in the same 2 devices, performance degrades | ||
331 | * because the head position is tracked per mirror, not per device. | ||
332 | * | ||
333 | * The rdev for the device selected will have nr_pending incremented. | ||
334 | */ | ||
335 | static int read_balance(conf_t *conf, r1bio_t *r1_bio) | ||
336 | { | ||
337 | const unsigned long this_sector = r1_bio->sector; | ||
338 | int new_disk = conf->last_used, disk = new_disk; | ||
339 | const int sectors = r1_bio->sectors; | ||
340 | sector_t new_distance, current_distance; | ||
341 | mdk_rdev_t *new_rdev, *rdev; | ||
342 | |||
343 | rcu_read_lock(); | ||
344 | /* | ||
345 | * Check if we can balance. We can balance on the whole | ||
346 | * device if no resync is going on, or below the resync window. | ||
347 | * We take the first readable disk when above the resync window. | ||
348 | */ | ||
349 | retry: | ||
350 | if (conf->mddev->recovery_cp < MaxSector && | ||
351 | (this_sector + sectors >= conf->next_resync)) { | ||
352 | /* Choose the first operational device, for consistency */ | ||
353 | new_disk = 0; | ||
354 | |||
355 | while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || | ||
356 | !new_rdev->in_sync) { | ||
357 | new_disk++; | ||
358 | if (new_disk == conf->raid_disks) { | ||
359 | new_disk = -1; | ||
360 | break; | ||
361 | } | ||
362 | } | ||
363 | goto rb_out; | ||
364 | } | ||
365 | |||
366 | |||
367 | /* make sure the disk is operational */ | ||
368 | while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || | ||
369 | !new_rdev->in_sync) { | ||
370 | if (new_disk <= 0) | ||
371 | new_disk = conf->raid_disks; | ||
372 | new_disk--; | ||
373 | if (new_disk == disk) { | ||
374 | new_disk = -1; | ||
375 | goto rb_out; | ||
376 | } | ||
377 | } | ||
378 | disk = new_disk; | ||
379 | /* now disk == new_disk == starting point for search */ | ||
380 | |||
381 | /* | ||
382 | * Don't change to another disk for sequential reads: | ||
383 | */ | ||
384 | if (conf->next_seq_sect == this_sector) | ||
385 | goto rb_out; | ||
386 | if (this_sector == conf->mirrors[new_disk].head_position) | ||
387 | goto rb_out; | ||
388 | |||
389 | current_distance = abs(this_sector - conf->mirrors[disk].head_position); | ||
390 | |||
391 | /* Find the disk whose head is closest */ | ||
392 | |||
393 | do { | ||
394 | if (disk <= 0) | ||
395 | disk = conf->raid_disks; | ||
396 | disk--; | ||
397 | |||
398 | if ((rdev=conf->mirrors[disk].rdev) == NULL || | ||
399 | !rdev->in_sync) | ||
400 | continue; | ||
401 | |||
402 | if (!atomic_read(&rdev->nr_pending)) { | ||
403 | new_disk = disk; | ||
404 | new_rdev = rdev; | ||
405 | break; | ||
406 | } | ||
407 | new_distance = abs(this_sector - conf->mirrors[disk].head_position); | ||
408 | if (new_distance < current_distance) { | ||
409 | current_distance = new_distance; | ||
410 | new_disk = disk; | ||
411 | new_rdev = rdev; | ||
412 | } | ||
413 | } while (disk != conf->last_used); | ||
414 | |||
415 | rb_out: | ||
416 | |||
417 | |||
418 | if (new_disk >= 0) { | ||
419 | conf->next_seq_sect = this_sector + sectors; | ||
420 | conf->last_used = new_disk; | ||
421 | atomic_inc(&new_rdev->nr_pending); | ||
422 | if (!new_rdev->in_sync) { | ||
423 | /* cannot risk returning a device that failed | ||
424 | * before we inc'ed nr_pending | ||
425 | */ | ||
426 | atomic_dec(&new_rdev->nr_pending); | ||
427 | goto retry; | ||
428 | } | ||
429 | } | ||
430 | rcu_read_unlock(); | ||
431 | |||
432 | return new_disk; | ||
433 | } | ||
434 | |||
435 | static void unplug_slaves(mddev_t *mddev) | ||
436 | { | ||
437 | conf_t *conf = mddev_to_conf(mddev); | ||
438 | int i; | ||
439 | |||
440 | rcu_read_lock(); | ||
441 | for (i=0; i<mddev->raid_disks; i++) { | ||
442 | mdk_rdev_t *rdev = conf->mirrors[i].rdev; | ||
443 | if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { | ||
444 | request_queue_t *r_queue = bdev_get_queue(rdev->bdev); | ||
445 | |||
446 | atomic_inc(&rdev->nr_pending); | ||
447 | rcu_read_unlock(); | ||
448 | |||
449 | if (r_queue->unplug_fn) | ||
450 | r_queue->unplug_fn(r_queue); | ||
451 | |||
452 | rdev_dec_pending(rdev, mddev); | ||
453 | rcu_read_lock(); | ||
454 | } | ||
455 | } | ||
456 | rcu_read_unlock(); | ||
457 | } | ||
458 | |||
459 | static void raid1_unplug(request_queue_t *q) | ||
460 | { | ||
461 | unplug_slaves(q->queuedata); | ||
462 | } | ||
463 | |||
464 | static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, | ||
465 | sector_t *error_sector) | ||
466 | { | ||
467 | mddev_t *mddev = q->queuedata; | ||
468 | conf_t *conf = mddev_to_conf(mddev); | ||
469 | int i, ret = 0; | ||
470 | |||
471 | rcu_read_lock(); | ||
472 | for (i=0; i<mddev->raid_disks && ret == 0; i++) { | ||
473 | mdk_rdev_t *rdev = conf->mirrors[i].rdev; | ||
474 | if (rdev && !rdev->faulty) { | ||
475 | struct block_device *bdev = rdev->bdev; | ||
476 | request_queue_t *r_queue = bdev_get_queue(bdev); | ||
477 | |||
478 | if (!r_queue->issue_flush_fn) | ||
479 | ret = -EOPNOTSUPP; | ||
480 | else { | ||
481 | atomic_inc(&rdev->nr_pending); | ||
482 | rcu_read_unlock(); | ||
483 | ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, | ||
484 | error_sector); | ||
485 | rdev_dec_pending(rdev, mddev); | ||
486 | rcu_read_lock(); | ||
487 | } | ||
488 | } | ||
489 | } | ||
490 | rcu_read_unlock(); | ||
491 | return ret; | ||
492 | } | ||
493 | |||
494 | /* | ||
495 | * Throttle resync depth, so that we can both get proper overlapping of | ||
496 | * requests, but are still able to handle normal requests quickly. | ||
497 | */ | ||
498 | #define RESYNC_DEPTH 32 | ||
499 | |||
500 | static void device_barrier(conf_t *conf, sector_t sect) | ||
501 | { | ||
502 | spin_lock_irq(&conf->resync_lock); | ||
503 | wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), | ||
504 | conf->resync_lock, unplug_slaves(conf->mddev)); | ||
505 | |||
506 | if (!conf->barrier++) { | ||
507 | wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, | ||
508 | conf->resync_lock, unplug_slaves(conf->mddev)); | ||
509 | if (conf->nr_pending) | ||
510 | BUG(); | ||
511 | } | ||
512 | wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, | ||
513 | conf->resync_lock, unplug_slaves(conf->mddev)); | ||
514 | conf->next_resync = sect; | ||
515 | spin_unlock_irq(&conf->resync_lock); | ||
516 | } | ||
517 | |||
518 | static int make_request(request_queue_t *q, struct bio * bio) | ||
519 | { | ||
520 | mddev_t *mddev = q->queuedata; | ||
521 | conf_t *conf = mddev_to_conf(mddev); | ||
522 | mirror_info_t *mirror; | ||
523 | r1bio_t *r1_bio; | ||
524 | struct bio *read_bio; | ||
525 | int i, disks; | ||
526 | mdk_rdev_t *rdev; | ||
527 | |||
528 | /* | ||
529 | * Register the new request and wait if the reconstruction | ||
530 | * thread has put up a barrier for new requests. | ||
531 | * Continue immediately if no resync is active currently. | ||
532 | */ | ||
533 | spin_lock_irq(&conf->resync_lock); | ||
534 | wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, ); | ||
535 | conf->nr_pending++; | ||
536 | spin_unlock_irq(&conf->resync_lock); | ||
537 | |||
538 | if (bio_data_dir(bio)==WRITE) { | ||
539 | disk_stat_inc(mddev->gendisk, writes); | ||
540 | disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); | ||
541 | } else { | ||
542 | disk_stat_inc(mddev->gendisk, reads); | ||
543 | disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio)); | ||
544 | } | ||
545 | |||
546 | /* | ||
547 | * make_request() can abort the operation when READA is being | ||
548 | * used and no empty request is available. | ||
549 | * | ||
550 | */ | ||
551 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | ||
552 | |||
553 | r1_bio->master_bio = bio; | ||
554 | r1_bio->sectors = bio->bi_size >> 9; | ||
555 | |||
556 | r1_bio->mddev = mddev; | ||
557 | r1_bio->sector = bio->bi_sector; | ||
558 | |||
559 | r1_bio->state = 0; | ||
560 | |||
561 | if (bio_data_dir(bio) == READ) { | ||
562 | /* | ||
563 | * read balancing logic: | ||
564 | */ | ||
565 | int rdisk = read_balance(conf, r1_bio); | ||
566 | |||
567 | if (rdisk < 0) { | ||
568 | /* couldn't find anywhere to read from */ | ||
569 | raid_end_bio_io(r1_bio); | ||
570 | return 0; | ||
571 | } | ||
572 | mirror = conf->mirrors + rdisk; | ||
573 | |||
574 | r1_bio->read_disk = rdisk; | ||
575 | |||
576 | read_bio = bio_clone(bio, GFP_NOIO); | ||
577 | |||
578 | r1_bio->bios[rdisk] = read_bio; | ||
579 | |||
580 | read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; | ||
581 | read_bio->bi_bdev = mirror->rdev->bdev; | ||
582 | read_bio->bi_end_io = raid1_end_read_request; | ||
583 | read_bio->bi_rw = READ; | ||
584 | read_bio->bi_private = r1_bio; | ||
585 | |||
586 | generic_make_request(read_bio); | ||
587 | return 0; | ||
588 | } | ||
589 | |||
590 | /* | ||
591 | * WRITE: | ||
592 | */ | ||
593 | /* first select target devices under spinlock and | ||
594 | * inc refcount on their rdev. Record them by setting | ||
595 | * bios[x] to bio | ||
596 | */ | ||
597 | disks = conf->raid_disks; | ||
598 | rcu_read_lock(); | ||
599 | for (i = 0; i < disks; i++) { | ||
600 | if ((rdev=conf->mirrors[i].rdev) != NULL && | ||
601 | !rdev->faulty) { | ||
602 | atomic_inc(&rdev->nr_pending); | ||
603 | if (rdev->faulty) { | ||
604 | atomic_dec(&rdev->nr_pending); | ||
605 | r1_bio->bios[i] = NULL; | ||
606 | } else | ||
607 | r1_bio->bios[i] = bio; | ||
608 | } else | ||
609 | r1_bio->bios[i] = NULL; | ||
610 | } | ||
611 | rcu_read_unlock(); | ||
612 | |||
613 | atomic_set(&r1_bio->remaining, 1); | ||
614 | md_write_start(mddev); | ||
615 | for (i = 0; i < disks; i++) { | ||
616 | struct bio *mbio; | ||
617 | if (!r1_bio->bios[i]) | ||
618 | continue; | ||
619 | |||
620 | mbio = bio_clone(bio, GFP_NOIO); | ||
621 | r1_bio->bios[i] = mbio; | ||
622 | |||
623 | mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; | ||
624 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
625 | mbio->bi_end_io = raid1_end_write_request; | ||
626 | mbio->bi_rw = WRITE; | ||
627 | mbio->bi_private = r1_bio; | ||
628 | |||
629 | atomic_inc(&r1_bio->remaining); | ||
630 | generic_make_request(mbio); | ||
631 | } | ||
632 | |||
633 | if (atomic_dec_and_test(&r1_bio->remaining)) { | ||
634 | md_write_end(mddev); | ||
635 | raid_end_bio_io(r1_bio); | ||
636 | } | ||
637 | |||
638 | return 0; | ||
639 | } | ||
640 | |||
641 | static void status(struct seq_file *seq, mddev_t *mddev) | ||
642 | { | ||
643 | conf_t *conf = mddev_to_conf(mddev); | ||
644 | int i; | ||
645 | |||
646 | seq_printf(seq, " [%d/%d] [", conf->raid_disks, | ||
647 | conf->working_disks); | ||
648 | for (i = 0; i < conf->raid_disks; i++) | ||
649 | seq_printf(seq, "%s", | ||
650 | conf->mirrors[i].rdev && | ||
651 | conf->mirrors[i].rdev->in_sync ? "U" : "_"); | ||
652 | seq_printf(seq, "]"); | ||
653 | } | ||
654 | |||
655 | |||
656 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) | ||
657 | { | ||
658 | char b[BDEVNAME_SIZE]; | ||
659 | conf_t *conf = mddev_to_conf(mddev); | ||
660 | |||
661 | /* | ||
662 | * If it is not operational, then we have already marked it as dead | ||
663 | * else if it is the last working disk, ignore the error, let the | ||
664 | * next level up know. | ||
665 | * else mark the drive as failed | ||
666 | */ | ||
667 | if (rdev->in_sync | ||
668 | && conf->working_disks == 1) | ||
669 | /* | ||
670 | * Don't fail the drive, act as though we were just a | ||
671 | * normal single drive | ||
672 | */ | ||
673 | return; | ||
674 | if (rdev->in_sync) { | ||
675 | mddev->degraded++; | ||
676 | conf->working_disks--; | ||
677 | /* | ||
678 | * if recovery is running, make sure it aborts. | ||
679 | */ | ||
680 | set_bit(MD_RECOVERY_ERR, &mddev->recovery); | ||
681 | } | ||
682 | rdev->in_sync = 0; | ||
683 | rdev->faulty = 1; | ||
684 | mddev->sb_dirty = 1; | ||
685 | printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n" | ||
686 | " Operation continuing on %d devices\n", | ||
687 | bdevname(rdev->bdev,b), conf->working_disks); | ||
688 | } | ||
689 | |||
690 | static void print_conf(conf_t *conf) | ||
691 | { | ||
692 | int i; | ||
693 | mirror_info_t *tmp; | ||
694 | |||
695 | printk("RAID1 conf printout:\n"); | ||
696 | if (!conf) { | ||
697 | printk("(!conf)\n"); | ||
698 | return; | ||
699 | } | ||
700 | printk(" --- wd:%d rd:%d\n", conf->working_disks, | ||
701 | conf->raid_disks); | ||
702 | |||
703 | for (i = 0; i < conf->raid_disks; i++) { | ||
704 | char b[BDEVNAME_SIZE]; | ||
705 | tmp = conf->mirrors + i; | ||
706 | if (tmp->rdev) | ||
707 | printk(" disk %d, wo:%d, o:%d, dev:%s\n", | ||
708 | i, !tmp->rdev->in_sync, !tmp->rdev->faulty, | ||
709 | bdevname(tmp->rdev->bdev,b)); | ||
710 | } | ||
711 | } | ||
712 | |||
713 | static void close_sync(conf_t *conf) | ||
714 | { | ||
715 | spin_lock_irq(&conf->resync_lock); | ||
716 | wait_event_lock_irq(conf->wait_resume, !conf->barrier, | ||
717 | conf->resync_lock, unplug_slaves(conf->mddev)); | ||
718 | spin_unlock_irq(&conf->resync_lock); | ||
719 | |||
720 | if (conf->barrier) BUG(); | ||
721 | if (waitqueue_active(&conf->wait_idle)) BUG(); | ||
722 | |||
723 | mempool_destroy(conf->r1buf_pool); | ||
724 | conf->r1buf_pool = NULL; | ||
725 | } | ||
726 | |||
727 | static int raid1_spare_active(mddev_t *mddev) | ||
728 | { | ||
729 | int i; | ||
730 | conf_t *conf = mddev->private; | ||
731 | mirror_info_t *tmp; | ||
732 | |||
733 | /* | ||
734 | * Find all failed disks within the RAID1 configuration | ||
735 | * and mark them readable | ||
736 | */ | ||
737 | for (i = 0; i < conf->raid_disks; i++) { | ||
738 | tmp = conf->mirrors + i; | ||
739 | if (tmp->rdev | ||
740 | && !tmp->rdev->faulty | ||
741 | && !tmp->rdev->in_sync) { | ||
742 | conf->working_disks++; | ||
743 | mddev->degraded--; | ||
744 | tmp->rdev->in_sync = 1; | ||
745 | } | ||
746 | } | ||
747 | |||
748 | print_conf(conf); | ||
749 | return 0; | ||
750 | } | ||
751 | |||
752 | |||
753 | static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | ||
754 | { | ||
755 | conf_t *conf = mddev->private; | ||
756 | int found = 0; | ||
757 | int mirror; | ||
758 | mirror_info_t *p; | ||
759 | |||
760 | for (mirror=0; mirror < mddev->raid_disks; mirror++) | ||
761 | if ( !(p=conf->mirrors+mirror)->rdev) { | ||
762 | |||
763 | blk_queue_stack_limits(mddev->queue, | ||
764 | rdev->bdev->bd_disk->queue); | ||
765 | /* as we don't honour merge_bvec_fn, we must never risk | ||
766 | * violating it, so limit ->max_sector to one PAGE, as | ||
767 | * a one page request is never in violation. | ||
768 | */ | ||
769 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | ||
770 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | ||
771 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | ||
772 | |||
773 | p->head_position = 0; | ||
774 | rdev->raid_disk = mirror; | ||
775 | found = 1; | ||
776 | p->rdev = rdev; | ||
777 | break; | ||
778 | } | ||
779 | |||
780 | print_conf(conf); | ||
781 | return found; | ||
782 | } | ||
783 | |||
784 | static int raid1_remove_disk(mddev_t *mddev, int number) | ||
785 | { | ||
786 | conf_t *conf = mddev->private; | ||
787 | int err = 0; | ||
788 | mdk_rdev_t *rdev; | ||
789 | mirror_info_t *p = conf->mirrors+ number; | ||
790 | |||
791 | print_conf(conf); | ||
792 | rdev = p->rdev; | ||
793 | if (rdev) { | ||
794 | if (rdev->in_sync || | ||
795 | atomic_read(&rdev->nr_pending)) { | ||
796 | err = -EBUSY; | ||
797 | goto abort; | ||
798 | } | ||
799 | p->rdev = NULL; | ||
800 | synchronize_kernel(); | ||
801 | if (atomic_read(&rdev->nr_pending)) { | ||
802 | /* lost the race, try later */ | ||
803 | err = -EBUSY; | ||
804 | p->rdev = rdev; | ||
805 | } | ||
806 | } | ||
807 | abort: | ||
808 | |||
809 | print_conf(conf); | ||
810 | return err; | ||
811 | } | ||
812 | |||
813 | |||
814 | static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) | ||
815 | { | ||
816 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
817 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | ||
818 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | ||
819 | |||
820 | if (bio->bi_size) | ||
821 | return 1; | ||
822 | |||
823 | if (r1_bio->bios[r1_bio->read_disk] != bio) | ||
824 | BUG(); | ||
825 | update_head_pos(r1_bio->read_disk, r1_bio); | ||
826 | /* | ||
827 | * we have read a block, now it needs to be re-written, | ||
828 | * or re-read if the read failed. | ||
829 | * We don't do much here, just schedule handling by raid1d | ||
830 | */ | ||
831 | if (!uptodate) | ||
832 | md_error(r1_bio->mddev, | ||
833 | conf->mirrors[r1_bio->read_disk].rdev); | ||
834 | else | ||
835 | set_bit(R1BIO_Uptodate, &r1_bio->state); | ||
836 | rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev); | ||
837 | reschedule_retry(r1_bio); | ||
838 | return 0; | ||
839 | } | ||
840 | |||
841 | static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) | ||
842 | { | ||
843 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
844 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | ||
845 | mddev_t *mddev = r1_bio->mddev; | ||
846 | conf_t *conf = mddev_to_conf(mddev); | ||
847 | int i; | ||
848 | int mirror=0; | ||
849 | |||
850 | if (bio->bi_size) | ||
851 | return 1; | ||
852 | |||
853 | for (i = 0; i < conf->raid_disks; i++) | ||
854 | if (r1_bio->bios[i] == bio) { | ||
855 | mirror = i; | ||
856 | break; | ||
857 | } | ||
858 | if (!uptodate) | ||
859 | md_error(mddev, conf->mirrors[mirror].rdev); | ||
860 | update_head_pos(mirror, r1_bio); | ||
861 | |||
862 | if (atomic_dec_and_test(&r1_bio->remaining)) { | ||
863 | md_done_sync(mddev, r1_bio->sectors, uptodate); | ||
864 | put_buf(r1_bio); | ||
865 | } | ||
866 | rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); | ||
867 | return 0; | ||
868 | } | ||
869 | |||
870 | static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) | ||
871 | { | ||
872 | conf_t *conf = mddev_to_conf(mddev); | ||
873 | int i; | ||
874 | int disks = conf->raid_disks; | ||
875 | struct bio *bio, *wbio; | ||
876 | |||
877 | bio = r1_bio->bios[r1_bio->read_disk]; | ||
878 | |||
879 | /* | ||
880 | * schedule writes | ||
881 | */ | ||
882 | if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { | ||
883 | /* | ||
884 | * There is no point trying a read-for-reconstruct as | ||
885 | * reconstruct is about to be aborted | ||
886 | */ | ||
887 | char b[BDEVNAME_SIZE]; | ||
888 | printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error" | ||
889 | " for block %llu\n", | ||
890 | bdevname(bio->bi_bdev,b), | ||
891 | (unsigned long long)r1_bio->sector); | ||
892 | md_done_sync(mddev, r1_bio->sectors, 0); | ||
893 | put_buf(r1_bio); | ||
894 | return; | ||
895 | } | ||
896 | |||
897 | atomic_set(&r1_bio->remaining, 1); | ||
898 | for (i = 0; i < disks ; i++) { | ||
899 | wbio = r1_bio->bios[i]; | ||
900 | if (wbio->bi_end_io != end_sync_write) | ||
901 | continue; | ||
902 | |||
903 | atomic_inc(&conf->mirrors[i].rdev->nr_pending); | ||
904 | atomic_inc(&r1_bio->remaining); | ||
905 | md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); | ||
906 | generic_make_request(wbio); | ||
907 | } | ||
908 | |||
909 | if (atomic_dec_and_test(&r1_bio->remaining)) { | ||
910 | md_done_sync(mddev, r1_bio->sectors, 1); | ||
911 | put_buf(r1_bio); | ||
912 | } | ||
913 | } | ||
914 | |||
915 | /* | ||
916 | * This is a kernel thread which: | ||
917 | * | ||
918 | * 1. Retries failed read operations on working mirrors. | ||
919 | * 2. Updates the raid superblock when problems are encountered. | ||
920 | * 3. Performs writes following reads for array synchronising. | ||
921 | */ | ||
922 | |||
923 | static void raid1d(mddev_t *mddev) | ||
924 | { | ||
925 | r1bio_t *r1_bio; | ||
926 | struct bio *bio; | ||
927 | unsigned long flags; | ||
928 | conf_t *conf = mddev_to_conf(mddev); | ||
929 | struct list_head *head = &conf->retry_list; | ||
930 | int unplug=0; | ||
931 | mdk_rdev_t *rdev; | ||
932 | |||
933 | md_check_recovery(mddev); | ||
934 | md_handle_safemode(mddev); | ||
935 | |||
936 | for (;;) { | ||
937 | char b[BDEVNAME_SIZE]; | ||
938 | spin_lock_irqsave(&conf->device_lock, flags); | ||
939 | if (list_empty(head)) | ||
940 | break; | ||
941 | r1_bio = list_entry(head->prev, r1bio_t, retry_list); | ||
942 | list_del(head->prev); | ||
943 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
944 | |||
945 | mddev = r1_bio->mddev; | ||
946 | conf = mddev_to_conf(mddev); | ||
947 | if (test_bit(R1BIO_IsSync, &r1_bio->state)) { | ||
948 | sync_request_write(mddev, r1_bio); | ||
949 | unplug = 1; | ||
950 | } else { | ||
951 | int disk; | ||
952 | bio = r1_bio->bios[r1_bio->read_disk]; | ||
953 | if ((disk=read_balance(conf, r1_bio)) == -1) { | ||
954 | printk(KERN_ALERT "raid1: %s: unrecoverable I/O" | ||
955 | " read error for block %llu\n", | ||
956 | bdevname(bio->bi_bdev,b), | ||
957 | (unsigned long long)r1_bio->sector); | ||
958 | raid_end_bio_io(r1_bio); | ||
959 | } else { | ||
960 | r1_bio->bios[r1_bio->read_disk] = NULL; | ||
961 | r1_bio->read_disk = disk; | ||
962 | bio_put(bio); | ||
963 | bio = bio_clone(r1_bio->master_bio, GFP_NOIO); | ||
964 | r1_bio->bios[r1_bio->read_disk] = bio; | ||
965 | rdev = conf->mirrors[disk].rdev; | ||
966 | if (printk_ratelimit()) | ||
967 | printk(KERN_ERR "raid1: %s: redirecting sector %llu to" | ||
968 | " another mirror\n", | ||
969 | bdevname(rdev->bdev,b), | ||
970 | (unsigned long long)r1_bio->sector); | ||
971 | bio->bi_sector = r1_bio->sector + rdev->data_offset; | ||
972 | bio->bi_bdev = rdev->bdev; | ||
973 | bio->bi_end_io = raid1_end_read_request; | ||
974 | bio->bi_rw = READ; | ||
975 | bio->bi_private = r1_bio; | ||
976 | unplug = 1; | ||
977 | generic_make_request(bio); | ||
978 | } | ||
979 | } | ||
980 | } | ||
981 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
982 | if (unplug) | ||
983 | unplug_slaves(mddev); | ||
984 | } | ||
985 | |||
986 | |||
987 | static int init_resync(conf_t *conf) | ||
988 | { | ||
989 | int buffs; | ||
990 | |||
991 | buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; | ||
992 | if (conf->r1buf_pool) | ||
993 | BUG(); | ||
994 | conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free, | ||
995 | conf->poolinfo); | ||
996 | if (!conf->r1buf_pool) | ||
997 | return -ENOMEM; | ||
998 | conf->next_resync = 0; | ||
999 | return 0; | ||
1000 | } | ||
1001 | |||
1002 | /* | ||
1003 | * perform a "sync" on one "block" | ||
1004 | * | ||
1005 | * We need to make sure that no normal I/O request - particularly write | ||
1006 | * requests - conflicts with active sync requests. | ||
1007 | * | ||
1008 | * This is achieved by tracking pending requests and a 'barrier' concept | ||
1009 | * that can be installed to exclude normal IO requests. | ||
1010 | */ | ||
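The pending/barrier scheme just described can be pictured with an ordinary mutex and condition variable. The pthread sketch below is only an analogue: new normal I/O waits while a barrier is up, and resync raises the barrier and then waits for in-flight normal I/O to drain. It collapses the driver's two wait queues (wait_idle, wait_resume) into one condition variable, and resync_gate, normal_io_start and the other names are invented for the illustration.

/* Rough pthread analogue of the nr_pending / barrier exclusion used by
 * the raid1 resync code.  Simplified: one condition variable stands in
 * for the driver's two wait queues.
 */
#include <pthread.h>

struct resync_gate {
	pthread_mutex_t lock;
	pthread_cond_t  wait;
	int nr_pending;         /* normal requests in flight        */
	int barrier;            /* raised while resync owns a block */
};

/* Normal I/O entry: wait until no barrier is up, then count ourselves in. */
static void normal_io_start(struct resync_gate *g)
{
	pthread_mutex_lock(&g->lock);
	while (g->barrier)
		pthread_cond_wait(&g->wait, &g->lock);
	g->nr_pending++;
	pthread_mutex_unlock(&g->lock);
}

/* Normal I/O completion: last one out lets a waiting resync proceed. */
static void normal_io_end(struct resync_gate *g)
{
	pthread_mutex_lock(&g->lock);
	if (--g->nr_pending == 0)
		pthread_cond_broadcast(&g->wait);
	pthread_mutex_unlock(&g->lock);
}

/* Resync entry: raise the barrier, then wait for pending I/O to drain. */
static void resync_start(struct resync_gate *g)
{
	pthread_mutex_lock(&g->lock);
	g->barrier++;
	while (g->nr_pending)
		pthread_cond_wait(&g->wait, &g->lock);
	pthread_mutex_unlock(&g->lock);
}

/* Resync completion: drop the barrier and let normal I/O resume. */
static void resync_end(struct resync_gate *g)
{
	pthread_mutex_lock(&g->lock);
	if (--g->barrier == 0)
		pthread_cond_broadcast(&g->wait);
	pthread_mutex_unlock(&g->lock);
}

int main(void)
{
	struct resync_gate g = { .lock = PTHREAD_MUTEX_INITIALIZER,
				 .wait = PTHREAD_COND_INITIALIZER };

	normal_io_start(&g);    /* a write arrives ...          */
	normal_io_end(&g);      /* ... and completes            */
	resync_start(&g);       /* resync claims the next block */
	resync_end(&g);         /* and releases it              */
	return 0;
}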
1011 | |||
1012 | static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) | ||
1013 | { | ||
1014 | conf_t *conf = mddev_to_conf(mddev); | ||
1015 | mirror_info_t *mirror; | ||
1016 | r1bio_t *r1_bio; | ||
1017 | struct bio *bio; | ||
1018 | sector_t max_sector, nr_sectors; | ||
1019 | int disk; | ||
1020 | int i; | ||
1021 | int write_targets = 0; | ||
1022 | |||
1023 | if (!conf->r1buf_pool) | ||
1024 | if (init_resync(conf)) | ||
1025 | return -ENOMEM; | ||
1026 | |||
1027 | max_sector = mddev->size << 1; | ||
1028 | if (sector_nr >= max_sector) { | ||
1029 | close_sync(conf); | ||
1030 | return 0; | ||
1031 | } | ||
1032 | |||
1033 | /* | ||
1034 | * If there is non-resync activity waiting for us then | ||
1035 | * put in a delay to throttle resync. | ||
1036 | */ | ||
1037 | if (!go_faster && waitqueue_active(&conf->wait_resume)) | ||
1038 | msleep_interruptible(1000); | ||
1039 | device_barrier(conf, sector_nr + RESYNC_SECTORS); | ||
1040 | |||
1041 | /* | ||
1042 | * If reconstructing, and >1 working disk, | ||
1043 | * could dedicate one to rebuild and others to | ||
1044 | * service read requests .. | ||
1045 | */ | ||
1046 | disk = conf->last_used; | ||
1047 | /* make sure disk is operational */ | ||
1048 | |||
1049 | while (conf->mirrors[disk].rdev == NULL || | ||
1050 | !conf->mirrors[disk].rdev->in_sync) { | ||
1051 | if (disk <= 0) | ||
1052 | disk = conf->raid_disks; | ||
1053 | disk--; | ||
1054 | if (disk == conf->last_used) | ||
1055 | break; | ||
1056 | } | ||
1057 | conf->last_used = disk; | ||
1058 | atomic_inc(&conf->mirrors[disk].rdev->nr_pending); | ||
1059 | |||
1060 | |||
1061 | mirror = conf->mirrors + disk; | ||
1062 | |||
1063 | r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); | ||
1064 | |||
1065 | spin_lock_irq(&conf->resync_lock); | ||
1066 | conf->nr_pending++; | ||
1067 | spin_unlock_irq(&conf->resync_lock); | ||
1068 | |||
1069 | r1_bio->mddev = mddev; | ||
1070 | r1_bio->sector = sector_nr; | ||
1071 | set_bit(R1BIO_IsSync, &r1_bio->state); | ||
1072 | r1_bio->read_disk = disk; | ||
1073 | |||
1074 | for (i=0; i < conf->raid_disks; i++) { | ||
1075 | bio = r1_bio->bios[i]; | ||
1076 | |||
1077 | /* take from bio_init */ | ||
1078 | bio->bi_next = NULL; | ||
1079 | bio->bi_flags |= 1 << BIO_UPTODATE; | ||
1080 | bio->bi_rw = 0; | ||
1081 | bio->bi_vcnt = 0; | ||
1082 | bio->bi_idx = 0; | ||
1083 | bio->bi_phys_segments = 0; | ||
1084 | bio->bi_hw_segments = 0; | ||
1085 | bio->bi_size = 0; | ||
1086 | bio->bi_end_io = NULL; | ||
1087 | bio->bi_private = NULL; | ||
1088 | |||
1089 | if (i == disk) { | ||
1090 | bio->bi_rw = READ; | ||
1091 | bio->bi_end_io = end_sync_read; | ||
1092 | } else if (conf->mirrors[i].rdev && | ||
1093 | !conf->mirrors[i].rdev->faulty && | ||
1094 | (!conf->mirrors[i].rdev->in_sync || | ||
1095 | sector_nr + RESYNC_SECTORS > mddev->recovery_cp)) { | ||
1096 | bio->bi_rw = WRITE; | ||
1097 | bio->bi_end_io = end_sync_write; | ||
1098 | write_targets ++; | ||
1099 | } else | ||
1100 | continue; | ||
1101 | bio->bi_sector = sector_nr + conf->mirrors[i].rdev->data_offset; | ||
1102 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
1103 | bio->bi_private = r1_bio; | ||
1104 | } | ||
1105 | if (write_targets == 0) { | ||
1106 | /* There is nowhere to write, so all non-sync | ||
1107 | * drives must be failed - so we are finished | ||
1108 | */ | ||
1109 | int rv = max_sector - sector_nr; | ||
1110 | md_done_sync(mddev, rv, 1); | ||
1111 | put_buf(r1_bio); | ||
1112 | rdev_dec_pending(conf->mirrors[disk].rdev, mddev); | ||
1113 | return rv; | ||
1114 | } | ||
1115 | |||
1116 | nr_sectors = 0; | ||
1117 | do { | ||
1118 | struct page *page; | ||
1119 | int len = PAGE_SIZE; | ||
1120 | if (sector_nr + (len>>9) > max_sector) | ||
1121 | len = (max_sector - sector_nr) << 9; | ||
1122 | if (len == 0) | ||
1123 | break; | ||
1124 | for (i=0 ; i < conf->raid_disks; i++) { | ||
1125 | bio = r1_bio->bios[i]; | ||
1126 | if (bio->bi_end_io) { | ||
1127 | page = r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page; | ||
1128 | if (bio_add_page(bio, page, len, 0) == 0) { | ||
1129 | /* stop here */ | ||
1130 | r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page = page; | ||
1131 | while (i > 0) { | ||
1132 | i--; | ||
1133 | bio = r1_bio->bios[i]; | ||
1134 | if (bio->bi_end_io==NULL) continue; | ||
1135 | /* remove last page from this bio */ | ||
1136 | bio->bi_vcnt--; | ||
1137 | bio->bi_size -= len; | ||
1138 | bio->bi_flags &= ~(1<< BIO_SEG_VALID); | ||
1139 | } | ||
1140 | goto bio_full; | ||
1141 | } | ||
1142 | } | ||
1143 | } | ||
1144 | nr_sectors += len>>9; | ||
1145 | sector_nr += len>>9; | ||
1146 | } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES); | ||
1147 | bio_full: | ||
1148 | bio = r1_bio->bios[disk]; | ||
1149 | r1_bio->sectors = nr_sectors; | ||
1150 | |||
1151 | md_sync_acct(mirror->rdev->bdev, nr_sectors); | ||
1152 | |||
1153 | generic_make_request(bio); | ||
1154 | |||
1155 | return nr_sectors; | ||
1156 | } | ||
1157 | |||
1158 | static int run(mddev_t *mddev) | ||
1159 | { | ||
1160 | conf_t *conf; | ||
1161 | int i, j, disk_idx; | ||
1162 | mirror_info_t *disk; | ||
1163 | mdk_rdev_t *rdev; | ||
1164 | struct list_head *tmp; | ||
1165 | |||
1166 | if (mddev->level != 1) { | ||
1167 | printk("raid1: %s: raid level not set to mirroring (%d)\n", | ||
1168 | mdname(mddev), mddev->level); | ||
1169 | goto out; | ||
1170 | } | ||
1171 | /* | ||
1172 | * copy the already verified devices into our private RAID1 | ||
1173 | * bookkeeping area. [whatever we allocate in run(), | ||
1174 | * should be freed in stop()] | ||
1175 | */ | ||
1176 | conf = kmalloc(sizeof(conf_t), GFP_KERNEL); | ||
1177 | mddev->private = conf; | ||
1178 | if (!conf) | ||
1179 | goto out_no_mem; | ||
1180 | |||
1181 | memset(conf, 0, sizeof(*conf)); | ||
1182 | conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks, | ||
1183 | GFP_KERNEL); | ||
1184 | if (!conf->mirrors) | ||
1185 | goto out_no_mem; | ||
1186 | |||
1187 | memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks); | ||
1188 | |||
1189 | conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL); | ||
1190 | if (!conf->poolinfo) | ||
1191 | goto out_no_mem; | ||
1192 | conf->poolinfo->mddev = mddev; | ||
1193 | conf->poolinfo->raid_disks = mddev->raid_disks; | ||
1194 | conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, | ||
1195 | r1bio_pool_free, | ||
1196 | conf->poolinfo); | ||
1197 | if (!conf->r1bio_pool) | ||
1198 | goto out_no_mem; | ||
1199 | |||
1200 | mddev->queue->unplug_fn = raid1_unplug; | ||
1201 | |||
1202 | mddev->queue->issue_flush_fn = raid1_issue_flush; | ||
1203 | |||
1204 | ITERATE_RDEV(mddev, rdev, tmp) { | ||
1205 | disk_idx = rdev->raid_disk; | ||
1206 | if (disk_idx >= mddev->raid_disks | ||
1207 | || disk_idx < 0) | ||
1208 | continue; | ||
1209 | disk = conf->mirrors + disk_idx; | ||
1210 | |||
1211 | disk->rdev = rdev; | ||
1212 | |||
1213 | blk_queue_stack_limits(mddev->queue, | ||
1214 | rdev->bdev->bd_disk->queue); | ||
1215 | /* as we don't honour merge_bvec_fn, we must never risk | ||
1216 | * violating it, so limit ->max_sector to one PAGE, as | ||
1217 | * a one page request is never in violation. | ||
1218 | */ | ||
1219 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | ||
1220 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | ||
1221 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | ||
1222 | |||
1223 | disk->head_position = 0; | ||
1224 | if (!rdev->faulty && rdev->in_sync) | ||
1225 | conf->working_disks++; | ||
1226 | } | ||
1227 | conf->raid_disks = mddev->raid_disks; | ||
1228 | conf->mddev = mddev; | ||
1229 | spin_lock_init(&conf->device_lock); | ||
1230 | INIT_LIST_HEAD(&conf->retry_list); | ||
1231 | if (conf->working_disks == 1) | ||
1232 | mddev->recovery_cp = MaxSector; | ||
1233 | |||
1234 | spin_lock_init(&conf->resync_lock); | ||
1235 | init_waitqueue_head(&conf->wait_idle); | ||
1236 | init_waitqueue_head(&conf->wait_resume); | ||
1237 | |||
1238 | if (!conf->working_disks) { | ||
1239 | printk(KERN_ERR "raid1: no operational mirrors for %s\n", | ||
1240 | mdname(mddev)); | ||
1241 | goto out_free_conf; | ||
1242 | } | ||
1243 | |||
1244 | mddev->degraded = 0; | ||
1245 | for (i = 0; i < conf->raid_disks; i++) { | ||
1246 | |||
1247 | disk = conf->mirrors + i; | ||
1248 | |||
1249 | if (!disk->rdev) { | ||
1250 | disk->head_position = 0; | ||
1251 | mddev->degraded++; | ||
1252 | } | ||
1253 | } | ||
1254 | |||
1255 | /* | ||
1256 | * find the first working one and use it as a starting point | ||
1257 | * for read balancing. | ||
1258 | */ | ||
1259 | for (j = 0; j < conf->raid_disks && | ||
1260 | (!conf->mirrors[j].rdev || | ||
1261 | !conf->mirrors[j].rdev->in_sync) ; j++) | ||
1262 | /* nothing */; | ||
1263 | conf->last_used = j; | ||
1264 | |||
1265 | |||
1266 | |||
1267 | { | ||
1268 | mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1"); | ||
1269 | if (!mddev->thread) { | ||
1270 | printk(KERN_ERR | ||
1271 | "raid1: couldn't allocate thread for %s\n", | ||
1272 | mdname(mddev)); | ||
1273 | goto out_free_conf; | ||
1274 | } | ||
1275 | } | ||
1276 | printk(KERN_INFO | ||
1277 | "raid1: raid set %s active with %d out of %d mirrors\n", | ||
1278 | mdname(mddev), mddev->raid_disks - mddev->degraded, | ||
1279 | mddev->raid_disks); | ||
1280 | /* | ||
1281 | * Ok, everything is just fine now | ||
1282 | */ | ||
1283 | mddev->array_size = mddev->size; | ||
1284 | |||
1285 | return 0; | ||
1286 | |||
1287 | out_no_mem: | ||
1288 | printk(KERN_ERR "raid1: couldn't allocate memory for %s\n", | ||
1289 | mdname(mddev)); | ||
1290 | |||
1291 | out_free_conf: | ||
1292 | if (conf) { | ||
1293 | if (conf->r1bio_pool) | ||
1294 | mempool_destroy(conf->r1bio_pool); | ||
1295 | if (conf->mirrors) | ||
1296 | kfree(conf->mirrors); | ||
1297 | if (conf->poolinfo) | ||
1298 | kfree(conf->poolinfo); | ||
1299 | kfree(conf); | ||
1300 | mddev->private = NULL; | ||
1301 | } | ||
1302 | out: | ||
1303 | return -EIO; | ||
1304 | } | ||
1305 | |||
1306 | static int stop(mddev_t *mddev) | ||
1307 | { | ||
1308 | conf_t *conf = mddev_to_conf(mddev); | ||
1309 | |||
1310 | md_unregister_thread(mddev->thread); | ||
1311 | mddev->thread = NULL; | ||
1312 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | ||
1313 | if (conf->r1bio_pool) | ||
1314 | mempool_destroy(conf->r1bio_pool); | ||
1315 | if (conf->mirrors) | ||
1316 | kfree(conf->mirrors); | ||
1317 | if (conf->poolinfo) | ||
1318 | kfree(conf->poolinfo); | ||
1319 | kfree(conf); | ||
1320 | mddev->private = NULL; | ||
1321 | return 0; | ||
1322 | } | ||
1323 | |||
1324 | static int raid1_resize(mddev_t *mddev, sector_t sectors) | ||
1325 | { | ||
1326 | /* no resync is happening, and there is enough space | ||
1327 | * on all devices, so we can resize. | ||
1328 | * We need to make sure resync covers any new space. | ||
1329 | * If the array is shrinking we should possibly wait until | ||
1330 | * any io in the removed space completes, but it hardly seems | ||
1331 | * worth it. | ||
1332 | */ | ||
1333 | mddev->array_size = sectors>>1; | ||
1334 | set_capacity(mddev->gendisk, mddev->array_size << 1); | ||
1335 | mddev->changed = 1; | ||
1336 | if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) { | ||
1337 | mddev->recovery_cp = mddev->size << 1; | ||
1338 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
1339 | } | ||
1340 | mddev->size = mddev->array_size; | ||
1341 | return 0; | ||
1342 | } | ||
1343 | |||
1344 | static int raid1_reshape(mddev_t *mddev, int raid_disks) | ||
1345 | { | ||
1346 | /* We need to: | ||
1347 | * 1/ resize the r1bio_pool | ||
1348 | * 2/ resize conf->mirrors | ||
1349 | * | ||
1350 | * We allocate a new r1bio_pool if we can. | ||
1351 | * Then raise a device barrier and wait until all IO stops. | ||
1352 | * Then resize conf->mirrors and swap in the new r1bio pool. | ||
1353 | */ | ||
1354 | mempool_t *newpool, *oldpool; | ||
1355 | struct pool_info *newpoolinfo; | ||
1356 | mirror_info_t *newmirrors; | ||
1357 | conf_t *conf = mddev_to_conf(mddev); | ||
1358 | |||
1359 | int d; | ||
1360 | |||
1361 | for (d= raid_disks; d < conf->raid_disks; d++) | ||
1362 | if (conf->mirrors[d].rdev) | ||
1363 | return -EBUSY; | ||
1364 | |||
1365 | newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL); | ||
1366 | if (!newpoolinfo) | ||
1367 | return -ENOMEM; | ||
1368 | newpoolinfo->mddev = mddev; | ||
1369 | newpoolinfo->raid_disks = raid_disks; | ||
1370 | |||
1371 | newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, | ||
1372 | r1bio_pool_free, newpoolinfo); | ||
1373 | if (!newpool) { | ||
1374 | kfree(newpoolinfo); | ||
1375 | return -ENOMEM; | ||
1376 | } | ||
1377 | newmirrors = kmalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL); | ||
1378 | if (!newmirrors) { | ||
1379 | kfree(newpoolinfo); | ||
1380 | mempool_destroy(newpool); | ||
1381 | return -ENOMEM; | ||
1382 | } | ||
1383 | memset(newmirrors, 0, sizeof(struct mirror_info)*raid_disks); | ||
1384 | |||
1385 | spin_lock_irq(&conf->resync_lock); | ||
1386 | conf->barrier++; | ||
1387 | wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, | ||
1388 | conf->resync_lock, unplug_slaves(mddev)); | ||
1389 | spin_unlock_irq(&conf->resync_lock); | ||
1390 | |||
1391 | /* ok, everything is stopped */ | ||
1392 | oldpool = conf->r1bio_pool; | ||
1393 | conf->r1bio_pool = newpool; | ||
1394 | for (d=0; d < raid_disks && d < conf->raid_disks; d++) | ||
1395 | newmirrors[d] = conf->mirrors[d]; | ||
1396 | kfree(conf->mirrors); | ||
1397 | conf->mirrors = newmirrors; | ||
1398 | kfree(conf->poolinfo); | ||
1399 | conf->poolinfo = newpoolinfo; | ||
1400 | |||
1401 | mddev->degraded += (raid_disks - conf->raid_disks); | ||
1402 | conf->raid_disks = mddev->raid_disks = raid_disks; | ||
1403 | |||
1404 | spin_lock_irq(&conf->resync_lock); | ||
1405 | conf->barrier--; | ||
1406 | spin_unlock_irq(&conf->resync_lock); | ||
1407 | wake_up(&conf->wait_resume); | ||
1408 | wake_up(&conf->wait_idle); | ||
1409 | |||
1410 | |||
1411 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
1412 | md_wakeup_thread(mddev->thread); | ||
1413 | |||
1414 | mempool_destroy(oldpool); | ||
1415 | return 0; | ||
1416 | } | ||
1417 | |||
1418 | |||
1419 | static mdk_personality_t raid1_personality = | ||
1420 | { | ||
1421 | .name = "raid1", | ||
1422 | .owner = THIS_MODULE, | ||
1423 | .make_request = make_request, | ||
1424 | .run = run, | ||
1425 | .stop = stop, | ||
1426 | .status = status, | ||
1427 | .error_handler = error, | ||
1428 | .hot_add_disk = raid1_add_disk, | ||
1429 | .hot_remove_disk= raid1_remove_disk, | ||
1430 | .spare_active = raid1_spare_active, | ||
1431 | .sync_request = sync_request, | ||
1432 | .resize = raid1_resize, | ||
1433 | .reshape = raid1_reshape, | ||
1434 | }; | ||
1435 | |||
1436 | static int __init raid_init(void) | ||
1437 | { | ||
1438 | return register_md_personality(RAID1, &raid1_personality); | ||
1439 | } | ||
1440 | |||
1441 | static void raid_exit(void) | ||
1442 | { | ||
1443 | unregister_md_personality(RAID1); | ||
1444 | } | ||
1445 | |||
1446 | module_init(raid_init); | ||
1447 | module_exit(raid_exit); | ||
1448 | MODULE_LICENSE("GPL"); | ||
1449 | MODULE_ALIAS("md-personality-3"); /* RAID1 */ | ||
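Summing up the policy in read_balance() above: a sequential read stays on the disk that served the previous one, an idle disk is otherwise taken immediately, and failing that the disk whose last known head position is closest to the request wins. The stand-alone sketch below restates that preference order with invented names (mirror_sketch, pick_read_disk); the resync-window special case and the retry-on-failure path are left out.

/* Stand-alone sketch of the raid1 read-balancing preference order:
 * sequential reads stick with the last disk, an idle disk otherwise wins,
 * and failing that the smallest head-position distance is chosen.
 */
#include <stdio.h>
#include <stdint.h>

struct mirror_sketch {
	int in_sync;            /* usable for reads                  */
	int nr_pending;         /* requests currently queued to it   */
	uint64_t head_position; /* sector after the last completion  */
};

static int pick_read_disk(const struct mirror_sketch *m, int nmirrors,
			  int last_used, uint64_t next_seq_sect,
			  uint64_t sector)
{
	uint64_t best_dist = UINT64_MAX;
	int best = -1;

	/* Sequential read: keep using the disk that served the previous one. */
	if (sector == next_seq_sect && m[last_used].in_sync)
		return last_used;

	for (int i = 0; i < nmirrors; i++) {
		uint64_t dist;

		if (!m[i].in_sync)
			continue;
		if (m[i].nr_pending == 0)
			return i;               /* an idle disk always wins */
		dist = sector > m[i].head_position ?
		       sector - m[i].head_position : m[i].head_position - sector;
		if (dist < best_dist) {
			best_dist = dist;
			best = i;
		}
	}
	return best;    /* -1 if no mirror is usable */
}

int main(void)
{
	struct mirror_sketch m[2] = {
		{ 1, 3, 1000 },     /* busy, head near sector 1000 */
		{ 1, 5, 90000 },    /* busier, head far away       */
	};

	printf("read sector 1200 from disk %d\n",
	       pick_read_disk(m, 2, 0, 0, 1200));
	return 0;
}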
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c new file mode 100644 index 000000000000..b100bfe4fdca --- /dev/null +++ b/drivers/md/raid10.c | |||
@@ -0,0 +1,1787 @@ | |||
1 | /* | ||
2 | * raid10.c : Multiple Devices driver for Linux | ||
3 | * | ||
4 | * Copyright (C) 2000-2004 Neil Brown | ||
5 | * | ||
6 | * RAID-10 support for md. | ||
7 | * | ||
8 | * Based on code in raid1.c. See raid1.c for further copyright information. | ||
9 | * | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or modify | ||
12 | * it under the terms of the GNU General Public License as published by | ||
13 | * the Free Software Foundation; either version 2, or (at your option) | ||
14 | * any later version. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * (for example /usr/src/linux/COPYING); if not, write to the Free | ||
18 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
19 | */ | ||
20 | |||
21 | #include <linux/raid/raid10.h> | ||
22 | |||
23 | /* | ||
24 | * RAID10 provides a combination of RAID0 and RAID1 functionality. | ||
25 | * The layout of data is defined by | ||
26 | * chunk_size | ||
27 | * raid_disks | ||
28 | * near_copies (stored in low byte of layout) | ||
29 | * far_copies (stored in second byte of layout) | ||
30 | * | ||
31 | * The data to be stored is divided into chunks using chunksize. | ||
32 | * Each device is divided into far_copies sections. | ||
33 | * In each section, chunks are laid out in a style similar to raid0, but | ||
34 | * near_copies copies of each chunk are stored (each on a different drive). | ||
35 | * The starting device for each section is offset near_copies from the starting | ||
36 | * device of the previous section. | ||
37 | * Thus there are (near_copies*far_copies) copies of each chunk, and each is on a different | ||
38 | * drive. | ||
39 | * near_copies and far_copies must be at least one, and their product is at most | ||
40 | * raid_disks. | ||
41 | */ | ||
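To make the layout description above concrete, the sketch below enumerates where the copies of a single logical chunk land for given near_copies and far_copies values. It works in whole chunks, takes the per-device section size as a parameter, and follows the rules stated in the comment rather than the driver's own mapping code; find_copies and section_chunks are invented names, so treat the output as an illustration only.

/* Enumerate where the copies of one logical chunk land under the raid10
 * layout described above.  Addresses are in whole chunks; section_chunks
 * is the number of chunks in each far-copy section of a device.
 */
#include <stdio.h>

static void find_copies(unsigned int raid_disks, unsigned int near_copies,
			unsigned int far_copies, unsigned long section_chunks,
			unsigned long chunk)
{
	/* raid0-style placement of the first copy */
	unsigned long stripe = chunk * near_copies;
	unsigned int dev = stripe % raid_disks;
	unsigned long dev_chunk = stripe / raid_disks;

	for (unsigned int n = 0; n < near_copies; n++) {
		unsigned int d = dev;
		unsigned long c = dev_chunk;

		for (unsigned int f = 0; f < far_copies; f++) {
			printf("copy %u.%u of chunk %lu -> disk %u, chunk %lu\n",
			       n, f, chunk, d, c);
			d = (d + near_copies) % raid_disks; /* next section starts
							       near_copies disks on */
			c += section_chunks;                /* and one section further in */
		}
		dev = (dev + 1) % raid_disks;   /* near copies go to the next disk */
		if (dev == 0)
			dev_chunk++;            /* wrapped around: next stripe */
	}
}

int main(void)
{
	/* 4 disks, 2 near copies, 2 far copies, 1024 chunks per section */
	find_copies(4, 2, 2, 1024, 5);
	return 0;
}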
42 | |||
43 | /* | ||
44 | * Number of guaranteed r10bios in case of extreme VM load: | ||
45 | */ | ||
46 | #define NR_RAID10_BIOS 256 | ||
47 | |||
48 | static void unplug_slaves(mddev_t *mddev); | ||
49 | |||
50 | static void * r10bio_pool_alloc(unsigned int __nocast gfp_flags, void *data) | ||
51 | { | ||
52 | conf_t *conf = data; | ||
53 | r10bio_t *r10_bio; | ||
54 | int size = offsetof(struct r10bio_s, devs[conf->copies]); | ||
55 | |||
56 | /* allocate a r10bio with room for raid_disks entries in the bios array */ | ||
57 | r10_bio = kmalloc(size, gfp_flags); | ||
58 | if (r10_bio) | ||
59 | memset(r10_bio, 0, size); | ||
60 | else | ||
61 | unplug_slaves(conf->mddev); | ||
62 | |||
63 | return r10_bio; | ||
64 | } | ||
65 | |||
66 | static void r10bio_pool_free(void *r10_bio, void *data) | ||
67 | { | ||
68 | kfree(r10_bio); | ||
69 | } | ||
70 | |||
71 | #define RESYNC_BLOCK_SIZE (64*1024) | ||
72 | //#define RESYNC_BLOCK_SIZE PAGE_SIZE | ||
73 | #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) | ||
74 | #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) | ||
75 | #define RESYNC_WINDOW (2048*1024) | ||
76 | |||
77 | /* | ||
78 | * When performing a resync, we need to read and compare, so | ||
79 | * we need as many pages as there are copies. | ||
80 | * When performing a recovery, we need 2 bios, one for read, | ||
81 | * one for write (we recover only one drive per r10buf) | ||
82 | * | ||
83 | */ | ||
84 | static void * r10buf_pool_alloc(unsigned int __nocast gfp_flags, void *data) | ||
85 | { | ||
86 | conf_t *conf = data; | ||
87 | struct page *page; | ||
88 | r10bio_t *r10_bio; | ||
89 | struct bio *bio; | ||
90 | int i, j; | ||
91 | int nalloc; | ||
92 | |||
93 | r10_bio = r10bio_pool_alloc(gfp_flags, conf); | ||
94 | if (!r10_bio) { | ||
95 | unplug_slaves(conf->mddev); | ||
96 | return NULL; | ||
97 | } | ||
98 | |||
99 | if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) | ||
100 | nalloc = conf->copies; /* resync */ | ||
101 | else | ||
102 | nalloc = 2; /* recovery */ | ||
103 | |||
104 | /* | ||
105 | * Allocate bios. | ||
106 | */ | ||
107 | for (j = nalloc ; j-- ; ) { | ||
108 | bio = bio_alloc(gfp_flags, RESYNC_PAGES); | ||
109 | if (!bio) | ||
110 | goto out_free_bio; | ||
111 | r10_bio->devs[j].bio = bio; | ||
112 | } | ||
113 | /* | ||
114 | * Allocate RESYNC_PAGES data pages and attach them | ||
115 | * where needed. | ||
116 | */ | ||
117 | for (j = 0 ; j < nalloc; j++) { | ||
118 | bio = r10_bio->devs[j].bio; | ||
119 | for (i = 0; i < RESYNC_PAGES; i++) { | ||
120 | page = alloc_page(gfp_flags); | ||
121 | if (unlikely(!page)) | ||
122 | goto out_free_pages; | ||
123 | |||
124 | bio->bi_io_vec[i].bv_page = page; | ||
125 | } | ||
126 | } | ||
127 | |||
128 | return r10_bio; | ||
129 | |||
130 | out_free_pages: | ||
131 | for ( ; i > 0 ; i--) | ||
132 | __free_page(bio->bi_io_vec[i-1].bv_page); | ||
133 | while (j--) | ||
134 | for (i = 0; i < RESYNC_PAGES ; i++) | ||
135 | __free_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); | ||
136 | j = -1; | ||
137 | out_free_bio: | ||
138 | while ( ++j < nalloc ) | ||
139 | bio_put(r10_bio->devs[j].bio); | ||
140 | r10bio_pool_free(r10_bio, conf); | ||
141 | return NULL; | ||
142 | } | ||
143 | |||
144 | static void r10buf_pool_free(void *__r10_bio, void *data) | ||
145 | { | ||
146 | int i; | ||
147 | conf_t *conf = data; | ||
148 | r10bio_t *r10bio = __r10_bio; | ||
149 | int j; | ||
150 | |||
151 | for (j=0; j < conf->copies; j++) { | ||
152 | struct bio *bio = r10bio->devs[j].bio; | ||
153 | if (bio) { | ||
154 | for (i = 0; i < RESYNC_PAGES; i++) { | ||
155 | __free_page(bio->bi_io_vec[i].bv_page); | ||
156 | bio->bi_io_vec[i].bv_page = NULL; | ||
157 | } | ||
158 | bio_put(bio); | ||
159 | } | ||
160 | } | ||
161 | r10bio_pool_free(r10bio, conf); | ||
162 | } | ||
163 | |||
164 | static void put_all_bios(conf_t *conf, r10bio_t *r10_bio) | ||
165 | { | ||
166 | int i; | ||
167 | |||
168 | for (i = 0; i < conf->copies; i++) { | ||
169 | struct bio **bio = & r10_bio->devs[i].bio; | ||
170 | if (*bio) | ||
171 | bio_put(*bio); | ||
172 | *bio = NULL; | ||
173 | } | ||
174 | } | ||
175 | |||
176 | static inline void free_r10bio(r10bio_t *r10_bio) | ||
177 | { | ||
178 | unsigned long flags; | ||
179 | |||
180 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | ||
181 | |||
182 | /* | ||
183 | * Wake up any possible resync thread that waits for the device | ||
184 | * to go idle. | ||
185 | */ | ||
186 | spin_lock_irqsave(&conf->resync_lock, flags); | ||
187 | if (!--conf->nr_pending) { | ||
188 | wake_up(&conf->wait_idle); | ||
189 | wake_up(&conf->wait_resume); | ||
190 | } | ||
191 | spin_unlock_irqrestore(&conf->resync_lock, flags); | ||
192 | |||
193 | put_all_bios(conf, r10_bio); | ||
194 | mempool_free(r10_bio, conf->r10bio_pool); | ||
195 | } | ||
196 | |||
197 | static inline void put_buf(r10bio_t *r10_bio) | ||
198 | { | ||
199 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | ||
200 | unsigned long flags; | ||
201 | |||
202 | mempool_free(r10_bio, conf->r10buf_pool); | ||
203 | |||
204 | spin_lock_irqsave(&conf->resync_lock, flags); | ||
205 | if (!conf->barrier) | ||
206 | BUG(); | ||
207 | --conf->barrier; | ||
208 | wake_up(&conf->wait_resume); | ||
209 | wake_up(&conf->wait_idle); | ||
210 | |||
211 | if (!--conf->nr_pending) { | ||
212 | wake_up(&conf->wait_idle); | ||
213 | wake_up(&conf->wait_resume); | ||
214 | } | ||
215 | spin_unlock_irqrestore(&conf->resync_lock, flags); | ||
216 | } | ||
217 | |||
218 | static void reschedule_retry(r10bio_t *r10_bio) | ||
219 | { | ||
220 | unsigned long flags; | ||
221 | mddev_t *mddev = r10_bio->mddev; | ||
222 | conf_t *conf = mddev_to_conf(mddev); | ||
223 | |||
224 | spin_lock_irqsave(&conf->device_lock, flags); | ||
225 | list_add(&r10_bio->retry_list, &conf->retry_list); | ||
226 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
227 | |||
228 | md_wakeup_thread(mddev->thread); | ||
229 | } | ||
230 | |||
231 | /* | ||
232 | * raid_end_bio_io() is called when we have finished servicing a mirrored | ||
233 | * operation and are ready to return a success/failure code to the buffer | ||
234 | * cache layer. | ||
235 | */ | ||
236 | static void raid_end_bio_io(r10bio_t *r10_bio) | ||
237 | { | ||
238 | struct bio *bio = r10_bio->master_bio; | ||
239 | |||
240 | bio_endio(bio, bio->bi_size, | ||
241 | test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO); | ||
242 | free_r10bio(r10_bio); | ||
243 | } | ||
244 | |||
245 | /* | ||
246 | * Update disk head position estimator based on IRQ completion info. | ||
247 | */ | ||
248 | static inline void update_head_pos(int slot, r10bio_t *r10_bio) | ||
249 | { | ||
250 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | ||
251 | |||
252 | conf->mirrors[r10_bio->devs[slot].devnum].head_position = | ||
253 | r10_bio->devs[slot].addr + (r10_bio->sectors); | ||
254 | } | ||
255 | |||
256 | static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int error) | ||
257 | { | ||
258 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
259 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | ||
260 | int slot, dev; | ||
261 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | ||
262 | |||
263 | if (bio->bi_size) | ||
264 | return 1; | ||
265 | |||
266 | slot = r10_bio->read_slot; | ||
267 | dev = r10_bio->devs[slot].devnum; | ||
268 | /* | ||
269 | * this branch is our 'one mirror IO has finished' event handler: | ||
270 | */ | ||
271 | if (!uptodate) | ||
272 | md_error(r10_bio->mddev, conf->mirrors[dev].rdev); | ||
273 | else | ||
274 | /* | ||
275 | * Set R10BIO_Uptodate in our master bio, so that | ||
276 | * we will return a good error code to the higher | ||
277 | * levels even if IO on some other mirrored buffer fails. | ||
278 | * | ||
279 | * The 'master' represents the composite IO operation to | ||
280 | * user-side. So if something waits for IO, then it will | ||
281 | * wait for the 'master' bio. | ||
282 | */ | ||
283 | set_bit(R10BIO_Uptodate, &r10_bio->state); | ||
284 | |||
285 | update_head_pos(slot, r10_bio); | ||
286 | |||
287 | /* | ||
288 | * we have only one bio on the read side | ||
289 | */ | ||
290 | if (uptodate) | ||
291 | raid_end_bio_io(r10_bio); | ||
292 | else { | ||
293 | /* | ||
294 | * oops, read error: | ||
295 | */ | ||
296 | char b[BDEVNAME_SIZE]; | ||
297 | if (printk_ratelimit()) | ||
298 | printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n", | ||
299 | bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); | ||
300 | reschedule_retry(r10_bio); | ||
301 | } | ||
302 | |||
303 | rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); | ||
304 | return 0; | ||
305 | } | ||
306 | |||
307 | static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, int error) | ||
308 | { | ||
309 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
310 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | ||
311 | int slot, dev; | ||
312 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | ||
313 | |||
314 | if (bio->bi_size) | ||
315 | return 1; | ||
316 | |||
317 | for (slot = 0; slot < conf->copies; slot++) | ||
318 | if (r10_bio->devs[slot].bio == bio) | ||
319 | break; | ||
320 | dev = r10_bio->devs[slot].devnum; | ||
321 | |||
322 | /* | ||
323 | * this branch is our 'one mirror IO has finished' event handler: | ||
324 | */ | ||
325 | if (!uptodate) | ||
326 | md_error(r10_bio->mddev, conf->mirrors[dev].rdev); | ||
327 | else | ||
328 | /* | ||
329 | * Set R10BIO_Uptodate in our master bio, so that | ||
330 | * we will return a good error code to the higher | ||
331 | * levels even if IO on some other mirrored buffer fails. | ||
332 | * | ||
333 | * The 'master' represents the composite IO operation to | ||
334 | * user-side. So if something waits for IO, then it will | ||
335 | * wait for the 'master' bio. | ||
336 | */ | ||
337 | set_bit(R10BIO_Uptodate, &r10_bio->state); | ||
338 | |||
339 | update_head_pos(slot, r10_bio); | ||
340 | |||
341 | /* | ||
342 | * | ||
343 | * Let's see if all mirrored write operations have finished | ||
344 | * already. | ||
345 | */ | ||
346 | if (atomic_dec_and_test(&r10_bio->remaining)) { | ||
347 | md_write_end(r10_bio->mddev); | ||
348 | raid_end_bio_io(r10_bio); | ||
349 | } | ||
350 | |||
351 | rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); | ||
352 | return 0; | ||
353 | } | ||
354 | |||
355 | |||
356 | /* | ||
357 | * RAID10 layout manager | ||
358 | * As well as the chunk size and raid_disks count, there are two | ||
359 | * parameters: near_copies and far_copies. | ||
360 | * near_copies * far_copies must be <= raid_disks. | ||
361 | * Normally one of these will be 1. | ||
362 | * If both are 1, we get raid0. | ||
363 | * If near_copies == raid_disks, we get raid1. | ||
364 | * | ||
365 | * Chunks are laid out in raid0 style with near_copies copies of the | ||
366 | * first chunk, followed by near_copies copies of the next chunk and | ||
367 | * so on. | ||
368 | * If far_copies > 1, then after 1/far_copies of the array has been assigned | ||
369 | * as described above, we start again with a device offset of near_copies. | ||
370 | * So we effectively have another copy of the whole array further down all | ||
371 | * the drives, but with blocks on different drives. | ||
372 | * With this layout, a block is never stored twice on the same device. | ||
373 | * | ||
374 | * raid10_find_phys finds the sector offset of a given virtual sector | ||
375 | * on each device that it is on. If a block isn't on a device, | ||
376 | * that entry in the array is set to MaxSector. | ||
377 | * | ||
378 | * raid10_find_virt does the reverse mapping, from a device and a | ||
379 | * sector offset to a virtual address | ||
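| * | ||
| * A worked example (an illustrative geometry, not a requirement): with | ||
| * raid_disks=4, near_copies=2, far_copies=1 (layout 0x102), virtual | ||
| * chunk 0 is stored on devices 0 and 1, chunk 1 on devices 2 and 3, | ||
| * chunk 2 on devices 0 and 1 again, and so on, for the capacity of two | ||
| * drives.  With raid_disks=2, near_copies=1, far_copies=2 (layout | ||
| * 0x201), the front half of each device holds one raid0-style copy of | ||
| * the array and the back half (offset by conf->stride) holds a second | ||
| * copy shifted by one device, so chunk 0 lives on the front of device 0 | ||
| * and the back of device 1. | ||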
380 | */ | ||
381 | |||
382 | static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio) | ||
383 | { | ||
384 | int n,f; | ||
385 | sector_t sector; | ||
386 | sector_t chunk; | ||
387 | sector_t stripe; | ||
388 | int dev; | ||
389 | |||
390 | int slot = 0; | ||
391 | |||
392 | /* now calculate first sector/dev */ | ||
393 | chunk = r10bio->sector >> conf->chunk_shift; | ||
394 | sector = r10bio->sector & conf->chunk_mask; | ||
395 | |||
396 | chunk *= conf->near_copies; | ||
397 | stripe = chunk; | ||
398 | dev = sector_div(stripe, conf->raid_disks); | ||
399 | |||
400 | sector += stripe << conf->chunk_shift; | ||
401 | |||
402 | /* and calculate all the others */ | ||
403 | for (n=0; n < conf->near_copies; n++) { | ||
404 | int d = dev; | ||
405 | sector_t s = sector; | ||
406 | r10bio->devs[slot].addr = sector; | ||
407 | r10bio->devs[slot].devnum = d; | ||
408 | slot++; | ||
409 | |||
410 | for (f = 1; f < conf->far_copies; f++) { | ||
411 | d += conf->near_copies; | ||
412 | if (d >= conf->raid_disks) | ||
413 | d -= conf->raid_disks; | ||
414 | s += conf->stride; | ||
415 | r10bio->devs[slot].devnum = d; | ||
416 | r10bio->devs[slot].addr = s; | ||
417 | slot++; | ||
418 | } | ||
419 | dev++; | ||
420 | if (dev >= conf->raid_disks) { | ||
421 | dev = 0; | ||
422 | sector += (conf->chunk_mask + 1); | ||
423 | } | ||
424 | } | ||
425 | BUG_ON(slot != conf->copies); | ||
426 | } | ||
427 | |||
428 | static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev) | ||
429 | { | ||
430 | sector_t offset, chunk, vchunk; | ||
431 | |||
432 | while (sector > conf->stride) { | ||
433 | sector -= conf->stride; | ||
434 | if (dev < conf->near_copies) | ||
435 | dev += conf->raid_disks - conf->near_copies; | ||
436 | else | ||
437 | dev -= conf->near_copies; | ||
438 | } | ||
439 | |||
440 | offset = sector & conf->chunk_mask; | ||
441 | chunk = sector >> conf->chunk_shift; | ||
442 | vchunk = chunk * conf->raid_disks + dev; | ||
443 | sector_div(vchunk, conf->near_copies); | ||
444 | return (vchunk << conf->chunk_shift) + offset; | ||
445 | } | ||
446 | |||
447 | /** | ||
448 | * raid10_mergeable_bvec -- tell the bio layer if two requests can be merged | ||
449 | * @q: request queue | ||
450 | * @bio: the bio that has been built up so far | ||
451 | * @biovec: the request that could be merged to it. | ||
452 | * | ||
453 | * Return amount of bytes we can accept at this offset | ||
454 | * If near_copies == raid_disks, there are no striping issues, | ||
455 | * but in that case, the function isn't called at all. | ||
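| * | ||
| * A hypothetical example (assuming 64KiB chunks and 4KiB pages): an empty | ||
| * bio that starts 1KiB before a chunk boundary could only take 1KiB here, | ||
| * but because it is still empty we accept a whole page anyway and rely on | ||
| * the single-page split in make_request() to break it at the boundary. | ||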
456 | */ | ||
457 | static int raid10_mergeable_bvec(request_queue_t *q, struct bio *bio, | ||
458 | struct bio_vec *bio_vec) | ||
459 | { | ||
460 | mddev_t *mddev = q->queuedata; | ||
461 | sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); | ||
462 | int max; | ||
463 | unsigned int chunk_sectors = mddev->chunk_size >> 9; | ||
464 | unsigned int bio_sectors = bio->bi_size >> 9; | ||
465 | |||
466 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; | ||
467 | if (max < 0) max = 0; /* bio_add cannot handle a negative return */ | ||
468 | if (max <= bio_vec->bv_len && bio_sectors == 0) | ||
469 | return bio_vec->bv_len; | ||
470 | else | ||
471 | return max; | ||
472 | } | ||
473 | |||
474 | /* | ||
475 | * This routine returns the disk from which the requested read should | ||
476 | * be done. There is a per-array 'next expected sequential IO' sector | ||
477 | * number - if this matches on the next IO then we use the last disk. | ||
478 | * There is also a per-disk 'last known head position' sector that is | ||
479 | * maintained from IRQ contexts; both the normal and the resync IO | ||
480 | * completion handlers update this position correctly. If there is no | ||
481 | * perfect sequential match then we pick the disk whose head is closest. | ||
482 | * | ||
483 | * If there are 2 mirrors in the same 2 devices, performance degrades | ||
484 | * because position is mirror, not device based. | ||
485 | * | ||
486 | * The rdev for the device selected will have nr_pending incremented. | ||
487 | */ | ||
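| /* For example (purely illustrative): if a sector's copies live on devices | ||
| * 2 and 5, the first copy in slot order whose device is idle | ||
| * (nr_pending == 0) is taken outright; if both are busy, the copy whose | ||
| * recorded head_position is closest to the target sector wins. | ||
| */ | ||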
488 | |||
489 | /* | ||
490 | * FIXME: possibly should rethink readbalancing and do it differently | ||
491 | * depending on near_copies / far_copies geometry. | ||
492 | */ | ||
493 | static int read_balance(conf_t *conf, r10bio_t *r10_bio) | ||
494 | { | ||
495 | const unsigned long this_sector = r10_bio->sector; | ||
496 | int disk, slot, nslot; | ||
497 | const int sectors = r10_bio->sectors; | ||
498 | sector_t new_distance, current_distance; | ||
499 | |||
500 | raid10_find_phys(conf, r10_bio); | ||
501 | rcu_read_lock(); | ||
502 | /* | ||
503 | * Check if we can balance. We can balance on the whole | ||
504 | * device if no resync is going on, or below the resync window. | ||
505 | * We take the first readable disk when above the resync window. | ||
506 | */ | ||
507 | if (conf->mddev->recovery_cp < MaxSector | ||
508 | && (this_sector + sectors >= conf->next_resync)) { | ||
509 | /* make sure that disk is operational */ | ||
510 | slot = 0; | ||
511 | disk = r10_bio->devs[slot].devnum; | ||
512 | |||
513 | while (!conf->mirrors[disk].rdev || | ||
514 | !conf->mirrors[disk].rdev->in_sync) { | ||
515 | slot++; | ||
516 | if (slot == conf->copies) { | ||
517 | slot = 0; | ||
518 | disk = -1; | ||
519 | break; | ||
520 | } | ||
521 | disk = r10_bio->devs[slot].devnum; | ||
522 | } | ||
523 | goto rb_out; | ||
524 | } | ||
525 | |||
526 | |||
527 | /* make sure the disk is operational */ | ||
528 | slot = 0; | ||
529 | disk = r10_bio->devs[slot].devnum; | ||
530 | while (!conf->mirrors[disk].rdev || | ||
531 | !conf->mirrors[disk].rdev->in_sync) { | ||
532 | slot ++; | ||
533 | if (slot == conf->copies) { | ||
534 | disk = -1; | ||
535 | goto rb_out; | ||
536 | } | ||
537 | disk = r10_bio->devs[slot].devnum; | ||
538 | } | ||
539 | |||
540 | |||
541 | current_distance = abs(this_sector - conf->mirrors[disk].head_position); | ||
542 | |||
543 | /* Find the disk whose head is closest */ | ||
544 | |||
545 | for (nslot = slot; nslot < conf->copies; nslot++) { | ||
546 | int ndisk = r10_bio->devs[nslot].devnum; | ||
547 | |||
548 | |||
549 | if (!conf->mirrors[ndisk].rdev || | ||
550 | !conf->mirrors[ndisk].rdev->in_sync) | ||
551 | continue; | ||
552 | |||
553 | if (!atomic_read(&conf->mirrors[ndisk].rdev->nr_pending)) { | ||
554 | disk = ndisk; | ||
555 | slot = nslot; | ||
556 | break; | ||
557 | } | ||
558 | new_distance = abs(r10_bio->devs[nslot].addr - | ||
559 | conf->mirrors[ndisk].head_position); | ||
560 | if (new_distance < current_distance) { | ||
561 | current_distance = new_distance; | ||
562 | disk = ndisk; | ||
563 | slot = nslot; | ||
564 | } | ||
565 | } | ||
566 | |||
567 | rb_out: | ||
568 | r10_bio->read_slot = slot; | ||
569 | /* conf->next_seq_sect = this_sector + sectors;*/ | ||
570 | |||
571 | if (disk >= 0 && conf->mirrors[disk].rdev) | ||
572 | atomic_inc(&conf->mirrors[disk].rdev->nr_pending); | ||
573 | rcu_read_unlock(); | ||
574 | |||
575 | return disk; | ||
576 | } | ||
577 | |||
578 | static void unplug_slaves(mddev_t *mddev) | ||
579 | { | ||
580 | conf_t *conf = mddev_to_conf(mddev); | ||
581 | int i; | ||
582 | |||
583 | rcu_read_lock(); | ||
584 | for (i=0; i<mddev->raid_disks; i++) { | ||
585 | mdk_rdev_t *rdev = conf->mirrors[i].rdev; | ||
586 | if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { | ||
587 | request_queue_t *r_queue = bdev_get_queue(rdev->bdev); | ||
588 | |||
589 | atomic_inc(&rdev->nr_pending); | ||
590 | rcu_read_unlock(); | ||
591 | |||
592 | if (r_queue->unplug_fn) | ||
593 | r_queue->unplug_fn(r_queue); | ||
594 | |||
595 | rdev_dec_pending(rdev, mddev); | ||
596 | rcu_read_lock(); | ||
597 | } | ||
598 | } | ||
599 | rcu_read_unlock(); | ||
600 | } | ||
601 | |||
602 | static void raid10_unplug(request_queue_t *q) | ||
603 | { | ||
604 | unplug_slaves(q->queuedata); | ||
605 | } | ||
606 | |||
607 | static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk, | ||
608 | sector_t *error_sector) | ||
609 | { | ||
610 | mddev_t *mddev = q->queuedata; | ||
611 | conf_t *conf = mddev_to_conf(mddev); | ||
612 | int i, ret = 0; | ||
613 | |||
614 | rcu_read_lock(); | ||
615 | for (i=0; i<mddev->raid_disks && ret == 0; i++) { | ||
616 | mdk_rdev_t *rdev = conf->mirrors[i].rdev; | ||
617 | if (rdev && !rdev->faulty) { | ||
618 | struct block_device *bdev = rdev->bdev; | ||
619 | request_queue_t *r_queue = bdev_get_queue(bdev); | ||
620 | |||
621 | if (!r_queue->issue_flush_fn) | ||
622 | ret = -EOPNOTSUPP; | ||
623 | else { | ||
624 | atomic_inc(&rdev->nr_pending); | ||
625 | rcu_read_unlock(); | ||
626 | ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, | ||
627 | error_sector); | ||
628 | rdev_dec_pending(rdev, mddev); | ||
629 | rcu_read_lock(); | ||
630 | } | ||
631 | } | ||
632 | } | ||
633 | rcu_read_unlock(); | ||
634 | return ret; | ||
635 | } | ||
636 | |||
637 | /* | ||
638 | * Throttle resync depth, so that we can both get proper overlapping of | ||
639 | * requests, but are still able to handle normal requests quickly. | ||
640 | */ | ||
641 | #define RESYNC_DEPTH 32 | ||
642 | |||
643 | static void device_barrier(conf_t *conf, sector_t sect) | ||
644 | { | ||
645 | spin_lock_irq(&conf->resync_lock); | ||
646 | wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), | ||
647 | conf->resync_lock, unplug_slaves(conf->mddev)); | ||
648 | |||
649 | if (!conf->barrier++) { | ||
650 | wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, | ||
651 | conf->resync_lock, unplug_slaves(conf->mddev)); | ||
652 | if (conf->nr_pending) | ||
653 | BUG(); | ||
654 | } | ||
655 | wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, | ||
656 | conf->resync_lock, unplug_slaves(conf->mddev)); | ||
657 | conf->next_resync = sect; | ||
658 | spin_unlock_irq(&conf->resync_lock); | ||
659 | } | ||
660 | |||
661 | static int make_request(request_queue_t *q, struct bio * bio) | ||
662 | { | ||
663 | mddev_t *mddev = q->queuedata; | ||
664 | conf_t *conf = mddev_to_conf(mddev); | ||
665 | mirror_info_t *mirror; | ||
666 | r10bio_t *r10_bio; | ||
667 | struct bio *read_bio; | ||
668 | int i; | ||
669 | int chunk_sects = conf->chunk_mask + 1; | ||
670 | |||
671 | /* If this request crosses a chunk boundary, we need to | ||
672 | * split it. This will only happen for 1 PAGE (or less) requests. | ||
673 | */ | ||
674 | if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9) | ||
675 | > chunk_sects && | ||
676 | conf->near_copies < conf->raid_disks)) { | ||
677 | struct bio_pair *bp; | ||
678 | /* Sanity check -- queue functions should prevent this happening */ | ||
679 | if (bio->bi_vcnt != 1 || | ||
680 | bio->bi_idx != 0) | ||
681 | goto bad_map; | ||
682 | /* This is a one page bio that upper layers | ||
683 | * refuse to split for us, so we need to split it. | ||
684 | */ | ||
685 | bp = bio_split(bio, bio_split_pool, | ||
686 | chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); | ||
687 | if (make_request(q, &bp->bio1)) | ||
688 | generic_make_request(&bp->bio1); | ||
689 | if (make_request(q, &bp->bio2)) | ||
690 | generic_make_request(&bp->bio2); | ||
691 | |||
692 | bio_pair_release(bp); | ||
693 | return 0; | ||
694 | bad_map: | ||
695 | printk("raid10_make_request bug: can't convert block across chunks" | ||
696 | " or bigger than %dk %llu %d\n", chunk_sects/2, | ||
697 | (unsigned long long)bio->bi_sector, bio->bi_size >> 10); | ||
698 | |||
699 | bio_io_error(bio, bio->bi_size); | ||
700 | return 0; | ||
701 | } | ||
702 | |||
703 | /* | ||
704 | * Register the new request and wait if the reconstruction | ||
705 | * thread has put up a barrier for new requests. | ||
706 | * Continue immediately if no resync is active currently. | ||
707 | */ | ||
708 | spin_lock_irq(&conf->resync_lock); | ||
709 | wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, ); | ||
710 | conf->nr_pending++; | ||
711 | spin_unlock_irq(&conf->resync_lock); | ||
712 | |||
713 | if (bio_data_dir(bio)==WRITE) { | ||
714 | disk_stat_inc(mddev->gendisk, writes); | ||
715 | disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); | ||
716 | } else { | ||
717 | disk_stat_inc(mddev->gendisk, reads); | ||
718 | disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio)); | ||
719 | } | ||
720 | |||
721 | r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); | ||
722 | |||
723 | r10_bio->master_bio = bio; | ||
724 | r10_bio->sectors = bio->bi_size >> 9; | ||
725 | |||
726 | r10_bio->mddev = mddev; | ||
727 | r10_bio->sector = bio->bi_sector; | ||
728 | |||
729 | if (bio_data_dir(bio) == READ) { | ||
730 | /* | ||
731 | * read balancing logic: | ||
732 | */ | ||
733 | int disk = read_balance(conf, r10_bio); | ||
734 | int slot = r10_bio->read_slot; | ||
735 | if (disk < 0) { | ||
736 | raid_end_bio_io(r10_bio); | ||
737 | return 0; | ||
738 | } | ||
739 | mirror = conf->mirrors + disk; | ||
740 | |||
741 | read_bio = bio_clone(bio, GFP_NOIO); | ||
742 | |||
743 | r10_bio->devs[slot].bio = read_bio; | ||
744 | |||
745 | read_bio->bi_sector = r10_bio->devs[slot].addr + | ||
746 | mirror->rdev->data_offset; | ||
747 | read_bio->bi_bdev = mirror->rdev->bdev; | ||
748 | read_bio->bi_end_io = raid10_end_read_request; | ||
749 | read_bio->bi_rw = READ; | ||
750 | read_bio->bi_private = r10_bio; | ||
751 | |||
752 | generic_make_request(read_bio); | ||
753 | return 0; | ||
754 | } | ||
755 | |||
756 | /* | ||
757 | * WRITE: | ||
758 | */ | ||
759 | /* first select target devices under spinlock and | ||
760 | * inc refcount on their rdev. Record them by setting | ||
761 | * bios[x] to bio | ||
762 | */ | ||
763 | raid10_find_phys(conf, r10_bio); | ||
764 | rcu_read_lock(); | ||
765 | for (i = 0; i < conf->copies; i++) { | ||
766 | int d = r10_bio->devs[i].devnum; | ||
767 | if (conf->mirrors[d].rdev && | ||
768 | !conf->mirrors[d].rdev->faulty) { | ||
769 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | ||
770 | r10_bio->devs[i].bio = bio; | ||
771 | } else | ||
772 | r10_bio->devs[i].bio = NULL; | ||
773 | } | ||
774 | rcu_read_unlock(); | ||
775 | |||
776 | atomic_set(&r10_bio->remaining, 1); | ||
777 | md_write_start(mddev); | ||
778 | for (i = 0; i < conf->copies; i++) { | ||
779 | struct bio *mbio; | ||
780 | int d = r10_bio->devs[i].devnum; | ||
781 | if (!r10_bio->devs[i].bio) | ||
782 | continue; | ||
783 | |||
784 | mbio = bio_clone(bio, GFP_NOIO); | ||
785 | r10_bio->devs[i].bio = mbio; | ||
786 | |||
787 | mbio->bi_sector = r10_bio->devs[i].addr+ | ||
788 | conf->mirrors[d].rdev->data_offset; | ||
789 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; | ||
790 | mbio->bi_end_io = raid10_end_write_request; | ||
791 | mbio->bi_rw = WRITE; | ||
792 | mbio->bi_private = r10_bio; | ||
793 | |||
794 | atomic_inc(&r10_bio->remaining); | ||
795 | generic_make_request(mbio); | ||
796 | } | ||
797 | |||
798 | if (atomic_dec_and_test(&r10_bio->remaining)) { | ||
799 | md_write_end(mddev); | ||
800 | raid_end_bio_io(r10_bio); | ||
801 | } | ||
802 | |||
803 | return 0; | ||
804 | } | ||
805 | |||
806 | static void status(struct seq_file *seq, mddev_t *mddev) | ||
807 | { | ||
808 | conf_t *conf = mddev_to_conf(mddev); | ||
809 | int i; | ||
810 | |||
811 | if (conf->near_copies < conf->raid_disks) | ||
812 | seq_printf(seq, " %dK chunks", mddev->chunk_size/1024); | ||
813 | if (conf->near_copies > 1) | ||
814 | seq_printf(seq, " %d near-copies", conf->near_copies); | ||
815 | if (conf->far_copies > 1) | ||
816 | seq_printf(seq, " %d far-copies", conf->far_copies); | ||
817 | |||
818 | seq_printf(seq, " [%d/%d] [", conf->raid_disks, | ||
819 | conf->working_disks); | ||
820 | for (i = 0; i < conf->raid_disks; i++) | ||
821 | seq_printf(seq, "%s", | ||
822 | conf->mirrors[i].rdev && | ||
823 | conf->mirrors[i].rdev->in_sync ? "U" : "_"); | ||
824 | seq_printf(seq, "]"); | ||
825 | } | ||
826 | |||
827 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) | ||
828 | { | ||
829 | char b[BDEVNAME_SIZE]; | ||
830 | conf_t *conf = mddev_to_conf(mddev); | ||
831 | |||
832 | /* | ||
833 | * If it is not operational, then we have already marked it as dead | ||
834 | * else if it is the last working disk, ignore the error, let the | ||
835 | * next level up know. | ||
836 | * else mark the drive as failed | ||
837 | */ | ||
838 | if (rdev->in_sync | ||
839 | && conf->working_disks == 1) | ||
840 | /* | ||
841 | * Don't fail the drive, just return an IO error. | ||
842 | * The test should really be more sophisticated than | ||
843 | * "working_disks == 1", but it isn't critical, and | ||
844 | * can wait until we do more sophisticated "is the drive | ||
845 | * really dead" tests... | ||
846 | */ | ||
847 | return; | ||
848 | if (rdev->in_sync) { | ||
849 | mddev->degraded++; | ||
850 | conf->working_disks--; | ||
851 | /* | ||
852 | * if recovery is running, make sure it aborts. | ||
853 | */ | ||
854 | set_bit(MD_RECOVERY_ERR, &mddev->recovery); | ||
855 | } | ||
856 | rdev->in_sync = 0; | ||
857 | rdev->faulty = 1; | ||
858 | mddev->sb_dirty = 1; | ||
859 | printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n" | ||
860 | " Operation continuing on %d devices\n", | ||
861 | bdevname(rdev->bdev,b), conf->working_disks); | ||
862 | } | ||
863 | |||
864 | static void print_conf(conf_t *conf) | ||
865 | { | ||
866 | int i; | ||
867 | mirror_info_t *tmp; | ||
868 | |||
869 | printk("RAID10 conf printout:\n"); | ||
870 | if (!conf) { | ||
871 | printk("(!conf)\n"); | ||
872 | return; | ||
873 | } | ||
874 | printk(" --- wd:%d rd:%d\n", conf->working_disks, | ||
875 | conf->raid_disks); | ||
876 | |||
877 | for (i = 0; i < conf->raid_disks; i++) { | ||
878 | char b[BDEVNAME_SIZE]; | ||
879 | tmp = conf->mirrors + i; | ||
880 | if (tmp->rdev) | ||
881 | printk(" disk %d, wo:%d, o:%d, dev:%s\n", | ||
882 | i, !tmp->rdev->in_sync, !tmp->rdev->faulty, | ||
883 | bdevname(tmp->rdev->bdev,b)); | ||
884 | } | ||
885 | } | ||
886 | |||
887 | static void close_sync(conf_t *conf) | ||
888 | { | ||
889 | spin_lock_irq(&conf->resync_lock); | ||
890 | wait_event_lock_irq(conf->wait_resume, !conf->barrier, | ||
891 | conf->resync_lock, unplug_slaves(conf->mddev)); | ||
892 | spin_unlock_irq(&conf->resync_lock); | ||
893 | |||
894 | if (conf->barrier) BUG(); | ||
895 | if (waitqueue_active(&conf->wait_idle)) BUG(); | ||
896 | |||
897 | mempool_destroy(conf->r10buf_pool); | ||
898 | conf->r10buf_pool = NULL; | ||
899 | } | ||
900 | |||
901 | static int raid10_spare_active(mddev_t *mddev) | ||
902 | { | ||
903 | int i; | ||
904 | conf_t *conf = mddev->private; | ||
905 | mirror_info_t *tmp; | ||
906 | |||
907 | /* | ||
908 | * Find all non-in_sync disks within the RAID10 configuration | ||
909 | * and mark them in_sync | ||
910 | */ | ||
911 | for (i = 0; i < conf->raid_disks; i++) { | ||
912 | tmp = conf->mirrors + i; | ||
913 | if (tmp->rdev | ||
914 | && !tmp->rdev->faulty | ||
915 | && !tmp->rdev->in_sync) { | ||
916 | conf->working_disks++; | ||
917 | mddev->degraded--; | ||
918 | tmp->rdev->in_sync = 1; | ||
919 | } | ||
920 | } | ||
921 | |||
922 | print_conf(conf); | ||
923 | return 0; | ||
924 | } | ||
925 | |||
926 | |||
927 | static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | ||
928 | { | ||
929 | conf_t *conf = mddev->private; | ||
930 | int found = 0; | ||
931 | int mirror; | ||
932 | mirror_info_t *p; | ||
933 | |||
934 | if (mddev->recovery_cp < MaxSector) | ||
935 | /* only hot-add to in-sync arrays, as recovery is | ||
936 | * very different from resync | ||
937 | */ | ||
938 | return 0; | ||
939 | |||
940 | for (mirror=0; mirror < mddev->raid_disks; mirror++) | ||
941 | if ( !(p=conf->mirrors+mirror)->rdev) { | ||
942 | |||
943 | blk_queue_stack_limits(mddev->queue, | ||
944 | rdev->bdev->bd_disk->queue); | ||
945 | /* as we don't honour merge_bvec_fn, we must never risk | ||
946 | * violating it, so limit ->max_sectors to one PAGE, as | ||
947 | * a one page request is never in violation. | ||
948 | */ | ||
949 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | ||
950 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | ||
951 | mddev->queue->max_sectors = (PAGE_SIZE>>9); | ||
952 | |||
953 | p->head_position = 0; | ||
954 | rdev->raid_disk = mirror; | ||
955 | found = 1; | ||
956 | p->rdev = rdev; | ||
957 | break; | ||
958 | } | ||
959 | |||
960 | print_conf(conf); | ||
961 | return found; | ||
962 | } | ||
963 | |||
964 | static int raid10_remove_disk(mddev_t *mddev, int number) | ||
965 | { | ||
966 | conf_t *conf = mddev->private; | ||
967 | int err = 0; | ||
968 | mdk_rdev_t *rdev; | ||
969 | mirror_info_t *p = conf->mirrors+ number; | ||
970 | |||
971 | print_conf(conf); | ||
972 | rdev = p->rdev; | ||
973 | if (rdev) { | ||
974 | if (rdev->in_sync || | ||
975 | atomic_read(&rdev->nr_pending)) { | ||
976 | err = -EBUSY; | ||
977 | goto abort; | ||
978 | } | ||
979 | p->rdev = NULL; | ||
980 | synchronize_kernel(); | ||
981 | if (atomic_read(&rdev->nr_pending)) { | ||
982 | /* lost the race, try later */ | ||
983 | err = -EBUSY; | ||
984 | p->rdev = rdev; | ||
985 | } | ||
986 | } | ||
987 | abort: | ||
988 | |||
989 | print_conf(conf); | ||
990 | return err; | ||
991 | } | ||
992 | |||
993 | |||
994 | static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) | ||
995 | { | ||
996 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
997 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | ||
998 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | ||
999 | int i,d; | ||
1000 | |||
1001 | if (bio->bi_size) | ||
1002 | return 1; | ||
1003 | |||
1004 | for (i=0; i<conf->copies; i++) | ||
1005 | if (r10_bio->devs[i].bio == bio) | ||
1006 | break; | ||
1007 | if (i == conf->copies) | ||
1008 | BUG(); | ||
1009 | update_head_pos(i, r10_bio); | ||
1010 | d = r10_bio->devs[i].devnum; | ||
1011 | if (!uptodate) | ||
1012 | md_error(r10_bio->mddev, | ||
1013 | conf->mirrors[d].rdev); | ||
1014 | |||
1015 | /* for reconstruct, we always reschedule after a read. | ||
1016 | * for resync, only after all reads | ||
1017 | */ | ||
1018 | if (test_bit(R10BIO_IsRecover, &r10_bio->state) || | ||
1019 | atomic_dec_and_test(&r10_bio->remaining)) { | ||
1020 | /* we have read all the blocks, | ||
1021 | * do the comparison in process context in raid10d | ||
1022 | */ | ||
1023 | reschedule_retry(r10_bio); | ||
1024 | } | ||
1025 | rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev); | ||
1026 | return 0; | ||
1027 | } | ||
1028 | |||
1029 | static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) | ||
1030 | { | ||
1031 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
1032 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | ||
1033 | mddev_t *mddev = r10_bio->mddev; | ||
1034 | conf_t *conf = mddev_to_conf(mddev); | ||
1035 | int i,d; | ||
1036 | |||
1037 | if (bio->bi_size) | ||
1038 | return 1; | ||
1039 | |||
1040 | for (i = 0; i < conf->copies; i++) | ||
1041 | if (r10_bio->devs[i].bio == bio) | ||
1042 | break; | ||
1043 | d = r10_bio->devs[i].devnum; | ||
1044 | |||
1045 | if (!uptodate) | ||
1046 | md_error(mddev, conf->mirrors[d].rdev); | ||
1047 | update_head_pos(i, r10_bio); | ||
1048 | |||
1049 | while (atomic_dec_and_test(&r10_bio->remaining)) { | ||
1050 | if (r10_bio->master_bio == NULL) { | ||
1051 | /* the primary of several recovery bios */ | ||
1052 | md_done_sync(mddev, r10_bio->sectors, 1); | ||
1053 | put_buf(r10_bio); | ||
1054 | break; | ||
1055 | } else { | ||
1056 | r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; | ||
1057 | put_buf(r10_bio); | ||
1058 | r10_bio = r10_bio2; | ||
1059 | } | ||
1060 | } | ||
1061 | rdev_dec_pending(conf->mirrors[d].rdev, mddev); | ||
1062 | return 0; | ||
1063 | } | ||
1064 | |||
1065 | /* | ||
1066 | * Note: sync and recovery are handled very differently for raid10. | ||
1067 | * This code is for resync. | ||
1068 | * For resync, we read through virtual addresses and read all blocks. | ||
1069 | * If there is any error, we schedule a write. The lowest numbered | ||
1070 | * drive is authoritative. | ||
1071 | * However, requests come in for physical addresses, so we need to map. | ||
1072 | * For every physical address there are raid_disks/copies virtual addresses, | ||
1073 | * which is always at least one, but is not necessarily an integer. | ||
1074 | * This means that a physical address can span multiple chunks, so we may | ||
1075 | * have to submit multiple io requests for a single sync request. | ||
1076 | */ | ||
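| /* For instance (an illustrative geometry): with raid_disks=3 and copies=2, | ||
| * raid_disks/copies is 1.5, so the data held in one physical chunk of a | ||
| * device can straddle a virtual chunk boundary and need an extra request. | ||
| */ | ||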
1077 | /* | ||
1078 | * We check if all blocks are in-sync and only write to blocks that | ||
1079 | * aren't in sync | ||
1080 | */ | ||
1081 | static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) | ||
1082 | { | ||
1083 | conf_t *conf = mddev_to_conf(mddev); | ||
1084 | int i, first; | ||
1085 | struct bio *tbio, *fbio; | ||
1086 | |||
1087 | atomic_set(&r10_bio->remaining, 1); | ||
1088 | |||
1089 | /* find the first device with a block */ | ||
1090 | for (i=0; i<conf->copies; i++) | ||
1091 | if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) | ||
1092 | break; | ||
1093 | |||
1094 | if (i == conf->copies) | ||
1095 | goto done; | ||
1096 | |||
1097 | first = i; | ||
1098 | fbio = r10_bio->devs[i].bio; | ||
1099 | |||
1100 | /* now find blocks with errors */ | ||
1101 | for (i=first+1 ; i < conf->copies ; i++) { | ||
1102 | int vcnt, j, d; | ||
1103 | |||
1104 | if (!test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) | ||
1105 | continue; | ||
1106 | /* We know that the bi_io_vec layout is the same for | ||
1107 | * both 'first' and 'i', so we just compare them. | ||
1108 | * All vec entries are PAGE_SIZE; | ||
1109 | */ | ||
1110 | tbio = r10_bio->devs[i].bio; | ||
1111 | vcnt = r10_bio->sectors >> (PAGE_SHIFT-9); | ||
1112 | for (j = 0; j < vcnt; j++) | ||
1113 | if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), | ||
1114 | page_address(tbio->bi_io_vec[j].bv_page), | ||
1115 | PAGE_SIZE)) | ||
1116 | break; | ||
1117 | if (j == vcnt) | ||
1118 | continue; | ||
1119 | /* Ok, we need to write this bio | ||
1120 | * First we need to fixup bv_offset, bv_len and | ||
1121 | * bi_vecs, as the read request might have corrupted these | ||
1122 | */ | ||
1123 | tbio->bi_vcnt = vcnt; | ||
1124 | tbio->bi_size = r10_bio->sectors << 9; | ||
1125 | tbio->bi_idx = 0; | ||
1126 | tbio->bi_phys_segments = 0; | ||
1127 | tbio->bi_hw_segments = 0; | ||
1128 | tbio->bi_hw_front_size = 0; | ||
1129 | tbio->bi_hw_back_size = 0; | ||
1130 | tbio->bi_flags &= ~(BIO_POOL_MASK - 1); | ||
1131 | tbio->bi_flags |= 1 << BIO_UPTODATE; | ||
1132 | tbio->bi_next = NULL; | ||
1133 | tbio->bi_rw = WRITE; | ||
1134 | tbio->bi_private = r10_bio; | ||
1135 | tbio->bi_sector = r10_bio->devs[i].addr; | ||
1136 | |||
1137 | for (j=0; j < vcnt ; j++) { | ||
1138 | tbio->bi_io_vec[j].bv_offset = 0; | ||
1139 | tbio->bi_io_vec[j].bv_len = PAGE_SIZE; | ||
1140 | |||
1141 | memcpy(page_address(tbio->bi_io_vec[j].bv_page), | ||
1142 | page_address(fbio->bi_io_vec[j].bv_page), | ||
1143 | PAGE_SIZE); | ||
1144 | } | ||
1145 | tbio->bi_end_io = end_sync_write; | ||
1146 | |||
1147 | d = r10_bio->devs[i].devnum; | ||
1148 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | ||
1149 | atomic_inc(&r10_bio->remaining); | ||
1150 | md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9); | ||
1151 | |||
1152 | tbio->bi_sector += conf->mirrors[d].rdev->data_offset; | ||
1153 | tbio->bi_bdev = conf->mirrors[d].rdev->bdev; | ||
1154 | generic_make_request(tbio); | ||
1155 | } | ||
1156 | |||
1157 | done: | ||
1158 | if (atomic_dec_and_test(&r10_bio->remaining)) { | ||
1159 | md_done_sync(mddev, r10_bio->sectors, 1); | ||
1160 | put_buf(r10_bio); | ||
1161 | } | ||
1162 | } | ||
1163 | |||
1164 | /* | ||
1165 | * Now for the recovery code. | ||
1166 | * Recovery happens across physical sectors. | ||
1167 | * We recover all non-in_sync drives by finding the virtual address of | ||
1168 | * each, and then choose a working drive that also has that virt address. | ||
1169 | * There is a separate r10_bio for each non-in_sync drive. | ||
1170 | * Only the first two slots are in use, the first for reading and | ||
1171 | * the second for writing. | ||
1172 | * | ||
1173 | */ | ||
1174 | |||
1175 | static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) | ||
1176 | { | ||
1177 | conf_t *conf = mddev_to_conf(mddev); | ||
1178 | int i, d; | ||
1179 | struct bio *bio, *wbio; | ||
1180 | |||
1181 | |||
1182 | /* move the pages across to the second bio | ||
1183 | * and submit the write request | ||
1184 | */ | ||
1185 | bio = r10_bio->devs[0].bio; | ||
1186 | wbio = r10_bio->devs[1].bio; | ||
1187 | for (i=0; i < wbio->bi_vcnt; i++) { | ||
1188 | struct page *p = bio->bi_io_vec[i].bv_page; | ||
1189 | bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page; | ||
1190 | wbio->bi_io_vec[i].bv_page = p; | ||
1191 | } | ||
1192 | d = r10_bio->devs[1].devnum; | ||
1193 | |||
1194 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | ||
1195 | md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); | ||
1196 | generic_make_request(wbio); | ||
1197 | } | ||
1198 | |||
1199 | |||
1200 | /* | ||
1201 | * This is a kernel thread which: | ||
1202 | * | ||
1203 | * 1. Retries failed read operations on working mirrors. | ||
1204 | * 2. Updates the raid superblock when problems are encountered. | ||
1205 | * 3. Performs writes following reads for array synchronising. | ||
1206 | */ | ||
1207 | |||
1208 | static void raid10d(mddev_t *mddev) | ||
1209 | { | ||
1210 | r10bio_t *r10_bio; | ||
1211 | struct bio *bio; | ||
1212 | unsigned long flags; | ||
1213 | conf_t *conf = mddev_to_conf(mddev); | ||
1214 | struct list_head *head = &conf->retry_list; | ||
1215 | int unplug=0; | ||
1216 | mdk_rdev_t *rdev; | ||
1217 | |||
1218 | md_check_recovery(mddev); | ||
1219 | md_handle_safemode(mddev); | ||
1220 | |||
1221 | for (;;) { | ||
1222 | char b[BDEVNAME_SIZE]; | ||
1223 | spin_lock_irqsave(&conf->device_lock, flags); | ||
1224 | if (list_empty(head)) | ||
1225 | break; | ||
1226 | r10_bio = list_entry(head->prev, r10bio_t, retry_list); | ||
1227 | list_del(head->prev); | ||
1228 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
1229 | |||
1230 | mddev = r10_bio->mddev; | ||
1231 | conf = mddev_to_conf(mddev); | ||
1232 | if (test_bit(R10BIO_IsSync, &r10_bio->state)) { | ||
1233 | sync_request_write(mddev, r10_bio); | ||
1234 | unplug = 1; | ||
1235 | } else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) { | ||
1236 | recovery_request_write(mddev, r10_bio); | ||
1237 | unplug = 1; | ||
1238 | } else { | ||
1239 | int mirror; | ||
1240 | bio = r10_bio->devs[r10_bio->read_slot].bio; | ||
1241 | r10_bio->devs[r10_bio->read_slot].bio = NULL; | ||
1242 | bio_put(bio); | ||
1243 | mirror = read_balance(conf, r10_bio); | ||
1244 | if (mirror == -1) { | ||
1245 | printk(KERN_ALERT "raid10: %s: unrecoverable I/O" | ||
1246 | " read error for block %llu\n", | ||
1247 | bdevname(bio->bi_bdev,b), | ||
1248 | (unsigned long long)r10_bio->sector); | ||
1249 | raid_end_bio_io(r10_bio); | ||
1250 | } else { | ||
1251 | rdev = conf->mirrors[mirror].rdev; | ||
1252 | if (printk_ratelimit()) | ||
1253 | printk(KERN_ERR "raid10: %s: redirecting sector %llu to" | ||
1254 | " another mirror\n", | ||
1255 | bdevname(rdev->bdev,b), | ||
1256 | (unsigned long long)r10_bio->sector); | ||
1257 | bio = bio_clone(r10_bio->master_bio, GFP_NOIO); | ||
1258 | r10_bio->devs[r10_bio->read_slot].bio = bio; | ||
1259 | bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr | ||
1260 | + rdev->data_offset; | ||
1261 | bio->bi_bdev = rdev->bdev; | ||
1262 | bio->bi_rw = READ; | ||
1263 | bio->bi_private = r10_bio; | ||
1264 | bio->bi_end_io = raid10_end_read_request; | ||
1265 | unplug = 1; | ||
1266 | generic_make_request(bio); | ||
1267 | } | ||
1268 | } | ||
1269 | } | ||
1270 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
1271 | if (unplug) | ||
1272 | unplug_slaves(mddev); | ||
1273 | } | ||
1274 | |||
1275 | |||
1276 | static int init_resync(conf_t *conf) | ||
1277 | { | ||
1278 | int buffs; | ||
1279 | |||
1280 | buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; | ||
1281 | if (conf->r10buf_pool) | ||
1282 | BUG(); | ||
1283 | conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); | ||
1284 | if (!conf->r10buf_pool) | ||
1285 | return -ENOMEM; | ||
1286 | conf->next_resync = 0; | ||
1287 | return 0; | ||
1288 | } | ||
1289 | |||
1290 | /* | ||
1291 | * perform a "sync" on one "block" | ||
1292 | * | ||
1293 | * We need to make sure that no normal I/O request - particularly write | ||
1294 | * requests - conflict with active sync requests. | ||
1295 | * | ||
1296 | * This is achieved by tracking pending requests and a 'barrier' concept | ||
1297 | * that can be installed to exclude normal IO requests. | ||
1298 | * | ||
1299 | * Resync and recovery are handled very differently. | ||
1300 | * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery. | ||
1301 | * | ||
1302 | * For resync, we iterate over virtual addresses, read all copies, | ||
1303 | * and update if there are differences. If only one copy is live, | ||
1304 | * skip it. | ||
1305 | * For recovery, we iterate over physical addresses, read a good | ||
1306 | * value for each non-in_sync drive, and over-write. | ||
1307 | * | ||
1308 | * So, for recovery we may have several outstanding complex requests for a | ||
1309 | * given address, one for each out-of-sync device. We model this by allocating | ||
1310 | * a number of r10_bio structures, one for each out-of-sync device. | ||
1311 | * As we set up these structures, we collect all the bios together into a list | ||
1312 | * which we then process collectively to add pages, and then process again | ||
1313 | * to pass to generic_make_request. | ||
1314 | * | ||
1315 | * The r10_bio structures are linked using a borrowed master_bio pointer. | ||
1316 | * This link is counted in ->remaining. When the r10_bio that points to NULL | ||
1317 | * has its remaining count decremented to 0, the whole complex operation | ||
1318 | * is complete. | ||
1319 | * | ||
1320 | */ | ||
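| /* A concrete (hypothetical) instance of the above: recovering two | ||
| * out-of-sync devices at once allocates two r10_bios; the second one's | ||
| * master_bio points at the first, the first's master_bio stays NULL, and | ||
| * the link bumps the first's ->remaining, so md_done_sync() is only | ||
| * reported from end_sync_write() after both writes have completed. | ||
| */ | ||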
1321 | |||
1322 | static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) | ||
1323 | { | ||
1324 | conf_t *conf = mddev_to_conf(mddev); | ||
1325 | r10bio_t *r10_bio; | ||
1326 | struct bio *biolist = NULL, *bio; | ||
1327 | sector_t max_sector, nr_sectors; | ||
1328 | int disk; | ||
1329 | int i; | ||
1330 | |||
1331 | sector_t sectors_skipped = 0; | ||
1332 | int chunks_skipped = 0; | ||
1333 | |||
1334 | if (!conf->r10buf_pool) | ||
1335 | if (init_resync(conf)) | ||
1336 | return -ENOMEM; | ||
1337 | |||
1338 | skipped: | ||
1339 | max_sector = mddev->size << 1; | ||
1340 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | ||
1341 | max_sector = mddev->resync_max_sectors; | ||
1342 | if (sector_nr >= max_sector) { | ||
1343 | close_sync(conf); | ||
1344 | return sectors_skipped; | ||
1345 | } | ||
1346 | if (chunks_skipped >= conf->raid_disks) { | ||
1347 | /* if there has been nothing to do on any drive, | ||
1348 | * then there is nothing to do at all.. | ||
1349 | */ | ||
1350 | sector_t sec = max_sector - sector_nr; | ||
1351 | md_done_sync(mddev, sec, 1); | ||
1352 | return sec + sectors_skipped; | ||
1353 | } | ||
1354 | |||
1355 | /* make sure whole request will fit in a chunk - if chunks | ||
1356 | * are meaningful | ||
1357 | */ | ||
1358 | if (conf->near_copies < conf->raid_disks && | ||
1359 | max_sector > (sector_nr | conf->chunk_mask)) | ||
1360 | max_sector = (sector_nr | conf->chunk_mask) + 1; | ||
1361 | /* | ||
1362 | * If there is non-resync activity waiting for us then | ||
1363 | * put in a delay to throttle resync. | ||
1364 | */ | ||
1365 | if (!go_faster && waitqueue_active(&conf->wait_resume)) | ||
1366 | msleep_interruptible(1000); | ||
1367 | device_barrier(conf, sector_nr + RESYNC_SECTORS); | ||
1368 | |||
1369 | /* Again, very different code for resync and recovery. | ||
1370 | * Both must result in an r10bio with a list of bios that | ||
1371 | * have bi_end_io, bi_sector, bi_bdev set, | ||
1372 | * and bi_private set to the r10bio. | ||
1373 | * For recovery, we may actually create several r10bios | ||
1374 | * with 2 bios in each, that correspond to the bios in the main one. | ||
1375 | * In this case, the subordinate r10bios link back through a | ||
1376 | * borrowed master_bio pointer, and the counter in the master | ||
1377 | * includes a ref from each subordinate. | ||
1378 | */ | ||
1379 | /* First, we decide what to do and set ->bi_end_io | ||
1380 | * to end_sync_read if we want to read, and | ||
1381 | * end_sync_write if we will want to write. | ||
1382 | */ | ||
1383 | |||
1384 | if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { | ||
1385 | /* recovery... the complicated one */ | ||
1386 | int i, j, k; | ||
1387 | r10_bio = NULL; | ||
1388 | |||
1389 | for (i=0 ; i<conf->raid_disks; i++) | ||
1390 | if (conf->mirrors[i].rdev && | ||
1391 | !conf->mirrors[i].rdev->in_sync) { | ||
1392 | /* want to reconstruct this device */ | ||
1393 | r10bio_t *rb2 = r10_bio; | ||
1394 | |||
1395 | r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); | ||
1396 | spin_lock_irq(&conf->resync_lock); | ||
1397 | conf->nr_pending++; | ||
1398 | if (rb2) conf->barrier++; | ||
1399 | spin_unlock_irq(&conf->resync_lock); | ||
1400 | atomic_set(&r10_bio->remaining, 0); | ||
1401 | |||
1402 | r10_bio->master_bio = (struct bio*)rb2; | ||
1403 | if (rb2) | ||
1404 | atomic_inc(&rb2->remaining); | ||
1405 | r10_bio->mddev = mddev; | ||
1406 | set_bit(R10BIO_IsRecover, &r10_bio->state); | ||
1407 | r10_bio->sector = raid10_find_virt(conf, sector_nr, i); | ||
1408 | raid10_find_phys(conf, r10_bio); | ||
1409 | for (j=0; j<conf->copies;j++) { | ||
1410 | int d = r10_bio->devs[j].devnum; | ||
1411 | if (conf->mirrors[d].rdev && | ||
1412 | conf->mirrors[d].rdev->in_sync) { | ||
1413 | /* This is where we read from */ | ||
1414 | bio = r10_bio->devs[0].bio; | ||
1415 | bio->bi_next = biolist; | ||
1416 | biolist = bio; | ||
1417 | bio->bi_private = r10_bio; | ||
1418 | bio->bi_end_io = end_sync_read; | ||
1419 | bio->bi_rw = 0; | ||
1420 | bio->bi_sector = r10_bio->devs[j].addr + | ||
1421 | conf->mirrors[d].rdev->data_offset; | ||
1422 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; | ||
1423 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | ||
1424 | atomic_inc(&r10_bio->remaining); | ||
1425 | /* and we write to 'i' */ | ||
1426 | |||
1427 | for (k=0; k<conf->copies; k++) | ||
1428 | if (r10_bio->devs[k].devnum == i) | ||
1429 | break; | ||
1430 | bio = r10_bio->devs[1].bio; | ||
1431 | bio->bi_next = biolist; | ||
1432 | biolist = bio; | ||
1433 | bio->bi_private = r10_bio; | ||
1434 | bio->bi_end_io = end_sync_write; | ||
1435 | bio->bi_rw = 1; | ||
1436 | bio->bi_sector = r10_bio->devs[k].addr + | ||
1437 | conf->mirrors[i].rdev->data_offset; | ||
1438 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
1439 | |||
1440 | r10_bio->devs[0].devnum = d; | ||
1441 | r10_bio->devs[1].devnum = i; | ||
1442 | |||
1443 | break; | ||
1444 | } | ||
1445 | } | ||
1446 | if (j == conf->copies) { | ||
1447 | BUG(); | ||
1448 | } | ||
1449 | } | ||
1450 | if (biolist == NULL) { | ||
1451 | while (r10_bio) { | ||
1452 | r10bio_t *rb2 = r10_bio; | ||
1453 | r10_bio = (r10bio_t*) rb2->master_bio; | ||
1454 | rb2->master_bio = NULL; | ||
1455 | put_buf(rb2); | ||
1456 | } | ||
1457 | goto giveup; | ||
1458 | } | ||
1459 | } else { | ||
1460 | /* resync. Schedule a read for every block at this virt offset */ | ||
1461 | int count = 0; | ||
1462 | r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); | ||
1463 | |||
1464 | spin_lock_irq(&conf->resync_lock); | ||
1465 | conf->nr_pending++; | ||
1466 | spin_unlock_irq(&conf->resync_lock); | ||
1467 | |||
1468 | r10_bio->mddev = mddev; | ||
1469 | atomic_set(&r10_bio->remaining, 0); | ||
1470 | |||
1471 | r10_bio->master_bio = NULL; | ||
1472 | r10_bio->sector = sector_nr; | ||
1473 | set_bit(R10BIO_IsSync, &r10_bio->state); | ||
1474 | raid10_find_phys(conf, r10_bio); | ||
1475 | r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1; | ||
1476 | |||
1477 | for (i=0; i<conf->copies; i++) { | ||
1478 | int d = r10_bio->devs[i].devnum; | ||
1479 | bio = r10_bio->devs[i].bio; | ||
1480 | bio->bi_end_io = NULL; | ||
1481 | if (conf->mirrors[d].rdev == NULL || | ||
1482 | conf->mirrors[d].rdev->faulty) | ||
1483 | continue; | ||
1484 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | ||
1485 | atomic_inc(&r10_bio->remaining); | ||
1486 | bio->bi_next = biolist; | ||
1487 | biolist = bio; | ||
1488 | bio->bi_private = r10_bio; | ||
1489 | bio->bi_end_io = end_sync_read; | ||
1490 | bio->bi_rw = 0; | ||
1491 | bio->bi_sector = r10_bio->devs[i].addr + | ||
1492 | conf->mirrors[d].rdev->data_offset; | ||
1493 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; | ||
1494 | count++; | ||
1495 | } | ||
1496 | |||
1497 | if (count < 2) { | ||
1498 | for (i=0; i<conf->copies; i++) { | ||
1499 | int d = r10_bio->devs[i].devnum; | ||
1500 | if (r10_bio->devs[i].bio->bi_end_io) | ||
1501 | rdev_dec_pending(conf->mirrors[d].rdev, mddev); | ||
1502 | } | ||
1503 | put_buf(r10_bio); | ||
1504 | biolist = NULL; | ||
1505 | goto giveup; | ||
1506 | } | ||
1507 | } | ||
1508 | |||
1509 | for (bio = biolist; bio ; bio=bio->bi_next) { | ||
1510 | |||
1511 | bio->bi_flags &= ~(BIO_POOL_MASK - 1); | ||
1512 | if (bio->bi_end_io) | ||
1513 | bio->bi_flags |= 1 << BIO_UPTODATE; | ||
1514 | bio->bi_vcnt = 0; | ||
1515 | bio->bi_idx = 0; | ||
1516 | bio->bi_phys_segments = 0; | ||
1517 | bio->bi_hw_segments = 0; | ||
1518 | bio->bi_size = 0; | ||
1519 | } | ||
1520 | |||
1521 | nr_sectors = 0; | ||
1522 | do { | ||
1523 | struct page *page; | ||
1524 | int len = PAGE_SIZE; | ||
1525 | disk = 0; | ||
1526 | if (sector_nr + (len>>9) > max_sector) | ||
1527 | len = (max_sector - sector_nr) << 9; | ||
1528 | if (len == 0) | ||
1529 | break; | ||
1530 | for (bio= biolist ; bio ; bio=bio->bi_next) { | ||
1531 | page = bio->bi_io_vec[bio->bi_vcnt].bv_page; | ||
1532 | if (bio_add_page(bio, page, len, 0) == 0) { | ||
1533 | /* stop here */ | ||
1534 | struct bio *bio2; | ||
1535 | bio->bi_io_vec[bio->bi_vcnt].bv_page = page; | ||
1536 | for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) { | ||
1537 | /* remove last page from this bio */ | ||
1538 | bio2->bi_vcnt--; | ||
1539 | bio2->bi_size -= len; | ||
1540 | bio2->bi_flags &= ~(1<< BIO_SEG_VALID); | ||
1541 | } | ||
1542 | goto bio_full; | ||
1543 | } | ||
1544 | disk = i; | ||
1545 | } | ||
1546 | nr_sectors += len>>9; | ||
1547 | sector_nr += len>>9; | ||
1548 | } while (biolist->bi_vcnt < RESYNC_PAGES); | ||
1549 | bio_full: | ||
1550 | r10_bio->sectors = nr_sectors; | ||
1551 | |||
1552 | while (biolist) { | ||
1553 | bio = biolist; | ||
1554 | biolist = biolist->bi_next; | ||
1555 | |||
1556 | bio->bi_next = NULL; | ||
1557 | r10_bio = bio->bi_private; | ||
1558 | r10_bio->sectors = nr_sectors; | ||
1559 | |||
1560 | if (bio->bi_end_io == end_sync_read) { | ||
1561 | md_sync_acct(bio->bi_bdev, nr_sectors); | ||
1562 | generic_make_request(bio); | ||
1563 | } | ||
1564 | } | ||
1565 | |||
1566 | return sectors_skipped + nr_sectors; | ||
1567 | giveup: | ||
1568 | /* There is nowhere to write, so all non-sync | ||
1569 | * drives must be failed, so try the next chunk... | ||
1570 | */ | ||
1571 | { | ||
1572 | int sec = max_sector - sector_nr; | ||
1573 | sectors_skipped += sec; | ||
1574 | chunks_skipped ++; | ||
1575 | sector_nr = max_sector; | ||
1576 | md_done_sync(mddev, sec, 1); | ||
1577 | goto skipped; | ||
1578 | } | ||
1579 | } | ||
1580 | |||
1581 | static int run(mddev_t *mddev) | ||
1582 | { | ||
1583 | conf_t *conf; | ||
1584 | int i, disk_idx; | ||
1585 | mirror_info_t *disk; | ||
1586 | mdk_rdev_t *rdev; | ||
1587 | struct list_head *tmp; | ||
1588 | int nc, fc; | ||
1589 | sector_t stride, size; | ||
1590 | |||
1591 | if (mddev->level != 10) { | ||
1592 | printk(KERN_ERR "raid10: %s: raid level not set correctly... (%d)\n", | ||
1593 | mdname(mddev), mddev->level); | ||
1594 | goto out; | ||
1595 | } | ||
1596 | nc = mddev->layout & 255; | ||
1597 | fc = (mddev->layout >> 8) & 255; | ||
1598 | if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || | ||
1599 | (mddev->layout >> 16)) { | ||
1600 | printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n", | ||
1601 | mdname(mddev), mddev->layout); | ||
1602 | goto out; | ||
1603 | } | ||
1604 | /* | ||
1605 | * copy the already verified devices into our private RAID10 | ||
1606 | * bookkeeping area. [whatever we allocate in run(), | ||
1607 | * should be freed in stop()] | ||
1608 | */ | ||
1609 | conf = kmalloc(sizeof(conf_t), GFP_KERNEL); | ||
1610 | mddev->private = conf; | ||
1611 | if (!conf) { | ||
1612 | printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", | ||
1613 | mdname(mddev)); | ||
1614 | goto out; | ||
1615 | } | ||
1616 | memset(conf, 0, sizeof(*conf)); | ||
1617 | conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks, | ||
1618 | GFP_KERNEL); | ||
1619 | if (!conf->mirrors) { | ||
1620 | printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", | ||
1621 | mdname(mddev)); | ||
1622 | goto out_free_conf; | ||
1623 | } | ||
1624 | memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks); | ||
1625 | |||
1626 | conf->near_copies = nc; | ||
1627 | conf->far_copies = fc; | ||
1628 | conf->copies = nc*fc; | ||
1629 | conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; | ||
1630 | conf->chunk_shift = ffz(~mddev->chunk_size) - 9; | ||
1631 | stride = mddev->size >> (conf->chunk_shift-1); | ||
1632 | sector_div(stride, fc); | ||
1633 | conf->stride = stride << conf->chunk_shift; | ||
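| /* e.g. (illustrative, assuming 64KiB chunks): chunk_mask is 127 sectors | ||
| * and chunk_shift is 7; stride works out to the length, in sectors, of | ||
| * each far-copy section of a device (chunks per device divided by | ||
| * far_copies, scaled back up to sectors). | ||
| */ | ||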
1634 | |||
1635 | conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, | ||
1636 | r10bio_pool_free, conf); | ||
1637 | if (!conf->r10bio_pool) { | ||
1638 | printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", | ||
1639 | mdname(mddev)); | ||
1640 | goto out_free_conf; | ||
1641 | } | ||
1642 | mddev->queue->unplug_fn = raid10_unplug; | ||
1643 | |||
1644 | mddev->queue->issue_flush_fn = raid10_issue_flush; | ||
1645 | |||
1646 | ITERATE_RDEV(mddev, rdev, tmp) { | ||
1647 | disk_idx = rdev->raid_disk; | ||
1648 | if (disk_idx >= mddev->raid_disks | ||
1649 | || disk_idx < 0) | ||
1650 | continue; | ||
1651 | disk = conf->mirrors + disk_idx; | ||
1652 | |||
1653 | disk->rdev = rdev; | ||
1654 | |||
1655 | blk_queue_stack_limits(mddev->queue, | ||
1656 | rdev->bdev->bd_disk->queue); | ||
1657 | /* as we don't honour merge_bvec_fn, we must never risk | ||
1658 | * violating it, so limit ->max_sectors to one PAGE, as | ||
1659 | * a one page request is never in violation. | ||
1660 | */ | ||
1661 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | ||
1662 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | ||
1663 | mddev->queue->max_sectors = (PAGE_SIZE>>9); | ||
1664 | |||
1665 | disk->head_position = 0; | ||
1666 | if (!rdev->faulty && rdev->in_sync) | ||
1667 | conf->working_disks++; | ||
1668 | } | ||
1669 | conf->raid_disks = mddev->raid_disks; | ||
1670 | conf->mddev = mddev; | ||
1671 | spin_lock_init(&conf->device_lock); | ||
1672 | INIT_LIST_HEAD(&conf->retry_list); | ||
1673 | |||
1674 | spin_lock_init(&conf->resync_lock); | ||
1675 | init_waitqueue_head(&conf->wait_idle); | ||
1676 | init_waitqueue_head(&conf->wait_resume); | ||
1677 | |||
1678 | if (!conf->working_disks) { | ||
1679 | printk(KERN_ERR "raid10: no operational mirrors for %s\n", | ||
1680 | mdname(mddev)); | ||
1681 | goto out_free_conf; | ||
1682 | } | ||
1683 | |||
1684 | mddev->degraded = 0; | ||
1685 | for (i = 0; i < conf->raid_disks; i++) { | ||
1686 | |||
1687 | disk = conf->mirrors + i; | ||
1688 | |||
1689 | if (!disk->rdev) { | ||
1690 | disk->head_position = 0; | ||
1691 | mddev->degraded++; | ||
1692 | } | ||
1693 | } | ||
1694 | |||
1695 | |||
1696 | mddev->thread = md_register_thread(raid10d, mddev, "%s_raid10"); | ||
1697 | if (!mddev->thread) { | ||
1698 | printk(KERN_ERR | ||
1699 | "raid10: couldn't allocate thread for %s\n", | ||
1700 | mdname(mddev)); | ||
1701 | goto out_free_conf; | ||
1702 | } | ||
1703 | |||
1704 | printk(KERN_INFO | ||
1705 | "raid10: raid set %s active with %d out of %d devices\n", | ||
1706 | mdname(mddev), mddev->raid_disks - mddev->degraded, | ||
1707 | mddev->raid_disks); | ||
1708 | /* | ||
1709 | * Ok, everything is just fine now | ||
1710 | */ | ||
1711 | size = conf->stride * conf->raid_disks; | ||
1712 | sector_div(size, conf->near_copies); | ||
1713 | mddev->array_size = size/2; | ||
1714 | mddev->resync_max_sectors = size; | ||
1715 | |||
1716 | /* Calculate max read-ahead size. | ||
1717 | * We need to read ahead at least two whole stripes.... | ||
1718 | * maybe... | ||
1719 | */ | ||
1720 | { | ||
1721 | int stripe = conf->raid_disks * mddev->chunk_size / PAGE_CACHE_SIZE; | ||
1722 | stripe /= conf->near_copies; | ||
1723 | if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) | ||
1724 | mddev->queue->backing_dev_info.ra_pages = 2* stripe; | ||
1725 | } | ||
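	/*
	 * Illustrative example (hypothetical numbers): with 4 disks, 64 KiB
	 * chunks, 4 KiB pages and near_copies = 2,
	 *   stripe = 4 * 65536 / 4096 = 64 pages, / near_copies = 32 pages,
	 * so ra_pages is raised to at least 64 pages (256 KiB), i.e. two
	 * stripes' worth of distinct data.
	 */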
1726 | |||
1727 | if (conf->near_copies < mddev->raid_disks) | ||
1728 | blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); | ||
1729 | return 0; | ||
1730 | |||
1731 | out_free_conf: | ||
1732 | if (conf->r10bio_pool) | ||
1733 | mempool_destroy(conf->r10bio_pool); | ||
1734 | if (conf->mirrors) | ||
1735 | kfree(conf->mirrors); | ||
1736 | kfree(conf); | ||
1737 | mddev->private = NULL; | ||
1738 | out: | ||
1739 | return -EIO; | ||
1740 | } | ||
1741 | |||
1742 | static int stop(mddev_t *mddev) | ||
1743 | { | ||
1744 | conf_t *conf = mddev_to_conf(mddev); | ||
1745 | |||
1746 | md_unregister_thread(mddev->thread); | ||
1747 | mddev->thread = NULL; | ||
1748 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | ||
1749 | if (conf->r10bio_pool) | ||
1750 | mempool_destroy(conf->r10bio_pool); | ||
1751 | if (conf->mirrors) | ||
1752 | kfree(conf->mirrors); | ||
1753 | kfree(conf); | ||
1754 | mddev->private = NULL; | ||
1755 | return 0; | ||
1756 | } | ||
1757 | |||
1758 | |||
1759 | static mdk_personality_t raid10_personality = | ||
1760 | { | ||
1761 | .name = "raid10", | ||
1762 | .owner = THIS_MODULE, | ||
1763 | .make_request = make_request, | ||
1764 | .run = run, | ||
1765 | .stop = stop, | ||
1766 | .status = status, | ||
1767 | .error_handler = error, | ||
1768 | .hot_add_disk = raid10_add_disk, | ||
1769 | .hot_remove_disk= raid10_remove_disk, | ||
1770 | .spare_active = raid10_spare_active, | ||
1771 | .sync_request = sync_request, | ||
1772 | }; | ||
1773 | |||
1774 | static int __init raid_init(void) | ||
1775 | { | ||
1776 | return register_md_personality(RAID10, &raid10_personality); | ||
1777 | } | ||
1778 | |||
1779 | static void raid_exit(void) | ||
1780 | { | ||
1781 | unregister_md_personality(RAID10); | ||
1782 | } | ||
1783 | |||
1784 | module_init(raid_init); | ||
1785 | module_exit(raid_exit); | ||
1786 | MODULE_LICENSE("GPL"); | ||
1787 | MODULE_ALIAS("md-personality-9"); /* RAID10 */ | ||
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c new file mode 100644 index 000000000000..52c3a81c4aa7 --- /dev/null +++ b/drivers/md/raid5.c | |||
@@ -0,0 +1,1965 @@ | |||
1 | /* | ||
2 | * raid5.c : Multiple Devices driver for Linux | ||
3 | * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman | ||
4 | * Copyright (C) 1999, 2000 Ingo Molnar | ||
5 | * | ||
6 | * RAID-5 management functions. | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify | ||
9 | * it under the terms of the GNU General Public License as published by | ||
10 | * the Free Software Foundation; either version 2, or (at your option) | ||
11 | * any later version. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * (for example /usr/src/linux/COPYING); if not, write to the Free | ||
15 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
16 | */ | ||
17 | |||
18 | |||
19 | #include <linux/config.h> | ||
20 | #include <linux/module.h> | ||
21 | #include <linux/slab.h> | ||
22 | #include <linux/raid/raid5.h> | ||
23 | #include <linux/highmem.h> | ||
24 | #include <linux/bitops.h> | ||
25 | #include <asm/atomic.h> | ||
26 | |||
27 | /* | ||
28 | * Stripe cache | ||
29 | */ | ||
30 | |||
31 | #define NR_STRIPES 256 | ||
32 | #define STRIPE_SIZE PAGE_SIZE | ||
33 | #define STRIPE_SHIFT (PAGE_SHIFT - 9) | ||
34 | #define STRIPE_SECTORS (STRIPE_SIZE>>9) | ||
35 | #define IO_THRESHOLD 1 | ||
36 | #define HASH_PAGES 1 | ||
37 | #define HASH_PAGES_ORDER 0 | ||
38 | #define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *)) | ||
39 | #define HASH_MASK (NR_HASH - 1) | ||
40 | |||
41 | #define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]) | ||
42 | |||
43 | /* bio's attached to a stripe+device for I/O are linked together in bi_sector | ||
44 | * order without overlap. There may be several bio's per stripe+device, and | ||
45 | * a bio could span several devices. | ||
46 | * When walking this list for a particular stripe+device, we must never proceed | ||
47 | * beyond a bio that extends past this device, as the next bio might no longer | ||
48 | * be valid. | ||
49 | * This macro is used to determine the 'next' bio in the list, given the sector | ||
50 | * of the current stripe+device | ||
51 | */ | ||
52 | #define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL) | ||
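/*
 * Example of the walk above (hypothetical bios, STRIPE_SECTORS = 8):
 * for the stripe chunk covering sectors 16-23 ("sect" = 16), a bio at
 * sector 16 with bi_size 2048 ends at sector 20 < 24, so r5_next_bio()
 * follows bi_next; a bio at sector 20 with bi_size 4096 ends at 28,
 * which is not below 24 - it extends past this device - so the walk
 * stops there and returns NULL.
 */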
53 | /* | ||
54 | * The following can be used to debug the driver | ||
55 | */ | ||
56 | #define RAID5_DEBUG 0 | ||
57 | #define RAID5_PARANOIA 1 | ||
58 | #if RAID5_PARANOIA && defined(CONFIG_SMP) | ||
59 | # define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock) | ||
60 | #else | ||
61 | # define CHECK_DEVLOCK() | ||
62 | #endif | ||
63 | |||
64 | #define PRINTK(x...) ((void)(RAID5_DEBUG && printk(x))) | ||
65 | #if RAID5_DEBUG | ||
66 | #define inline | ||
67 | #define __inline__ | ||
68 | #endif | ||
69 | |||
70 | static void print_raid5_conf (raid5_conf_t *conf); | ||
71 | |||
72 | static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) | ||
73 | { | ||
74 | if (atomic_dec_and_test(&sh->count)) { | ||
75 | if (!list_empty(&sh->lru)) | ||
76 | BUG(); | ||
77 | if (atomic_read(&conf->active_stripes)==0) | ||
78 | BUG(); | ||
79 | if (test_bit(STRIPE_HANDLE, &sh->state)) { | ||
80 | if (test_bit(STRIPE_DELAYED, &sh->state)) | ||
81 | list_add_tail(&sh->lru, &conf->delayed_list); | ||
82 | else | ||
83 | list_add_tail(&sh->lru, &conf->handle_list); | ||
84 | md_wakeup_thread(conf->mddev->thread); | ||
85 | } else { | ||
86 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
87 | atomic_dec(&conf->preread_active_stripes); | ||
88 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) | ||
89 | md_wakeup_thread(conf->mddev->thread); | ||
90 | } | ||
91 | list_add_tail(&sh->lru, &conf->inactive_list); | ||
92 | atomic_dec(&conf->active_stripes); | ||
93 | if (!conf->inactive_blocked || | ||
94 | atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4)) | ||
95 | wake_up(&conf->wait_for_stripe); | ||
96 | } | ||
97 | } | ||
98 | } | ||
99 | static void release_stripe(struct stripe_head *sh) | ||
100 | { | ||
101 | raid5_conf_t *conf = sh->raid_conf; | ||
102 | unsigned long flags; | ||
103 | |||
104 | spin_lock_irqsave(&conf->device_lock, flags); | ||
105 | __release_stripe(conf, sh); | ||
106 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
107 | } | ||
108 | |||
109 | static void remove_hash(struct stripe_head *sh) | ||
110 | { | ||
111 | PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector); | ||
112 | |||
113 | if (sh->hash_pprev) { | ||
114 | if (sh->hash_next) | ||
115 | sh->hash_next->hash_pprev = sh->hash_pprev; | ||
116 | *sh->hash_pprev = sh->hash_next; | ||
117 | sh->hash_pprev = NULL; | ||
118 | } | ||
119 | } | ||
120 | |||
121 | static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) | ||
122 | { | ||
123 | struct stripe_head **shp = &stripe_hash(conf, sh->sector); | ||
124 | |||
125 | PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector); | ||
126 | |||
127 | CHECK_DEVLOCK(); | ||
128 | if ((sh->hash_next = *shp) != NULL) | ||
129 | (*shp)->hash_pprev = &sh->hash_next; | ||
130 | *shp = sh; | ||
131 | sh->hash_pprev = shp; | ||
132 | } | ||
133 | |||
134 | |||
135 | /* find an idle stripe, make sure it is unhashed, and return it. */ | ||
136 | static struct stripe_head *get_free_stripe(raid5_conf_t *conf) | ||
137 | { | ||
138 | struct stripe_head *sh = NULL; | ||
139 | struct list_head *first; | ||
140 | |||
141 | CHECK_DEVLOCK(); | ||
142 | if (list_empty(&conf->inactive_list)) | ||
143 | goto out; | ||
144 | first = conf->inactive_list.next; | ||
145 | sh = list_entry(first, struct stripe_head, lru); | ||
146 | list_del_init(first); | ||
147 | remove_hash(sh); | ||
148 | atomic_inc(&conf->active_stripes); | ||
149 | out: | ||
150 | return sh; | ||
151 | } | ||
152 | |||
153 | static void shrink_buffers(struct stripe_head *sh, int num) | ||
154 | { | ||
155 | struct page *p; | ||
156 | int i; | ||
157 | |||
158 | for (i=0; i<num ; i++) { | ||
159 | p = sh->dev[i].page; | ||
160 | if (!p) | ||
161 | continue; | ||
162 | sh->dev[i].page = NULL; | ||
163 | page_cache_release(p); | ||
164 | } | ||
165 | } | ||
166 | |||
167 | static int grow_buffers(struct stripe_head *sh, int num) | ||
168 | { | ||
169 | int i; | ||
170 | |||
171 | for (i=0; i<num; i++) { | ||
172 | struct page *page; | ||
173 | |||
174 | if (!(page = alloc_page(GFP_KERNEL))) { | ||
175 | return 1; | ||
176 | } | ||
177 | sh->dev[i].page = page; | ||
178 | } | ||
179 | return 0; | ||
180 | } | ||
181 | |||
182 | static void raid5_build_block (struct stripe_head *sh, int i); | ||
183 | |||
184 | static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx) | ||
185 | { | ||
186 | raid5_conf_t *conf = sh->raid_conf; | ||
187 | int disks = conf->raid_disks, i; | ||
188 | |||
189 | if (atomic_read(&sh->count) != 0) | ||
190 | BUG(); | ||
191 | if (test_bit(STRIPE_HANDLE, &sh->state)) | ||
192 | BUG(); | ||
193 | |||
194 | CHECK_DEVLOCK(); | ||
195 | PRINTK("init_stripe called, stripe %llu\n", | ||
196 | (unsigned long long)sh->sector); | ||
197 | |||
198 | remove_hash(sh); | ||
199 | |||
200 | sh->sector = sector; | ||
201 | sh->pd_idx = pd_idx; | ||
202 | sh->state = 0; | ||
203 | |||
204 | for (i=disks; i--; ) { | ||
205 | struct r5dev *dev = &sh->dev[i]; | ||
206 | |||
207 | if (dev->toread || dev->towrite || dev->written || | ||
208 | test_bit(R5_LOCKED, &dev->flags)) { | ||
209 | printk("sector=%llx i=%d %p %p %p %d\n", | ||
210 | (unsigned long long)sh->sector, i, dev->toread, | ||
211 | dev->towrite, dev->written, | ||
212 | test_bit(R5_LOCKED, &dev->flags)); | ||
213 | BUG(); | ||
214 | } | ||
215 | dev->flags = 0; | ||
216 | raid5_build_block(sh, i); | ||
217 | } | ||
218 | insert_hash(conf, sh); | ||
219 | } | ||
220 | |||
221 | static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector) | ||
222 | { | ||
223 | struct stripe_head *sh; | ||
224 | |||
225 | CHECK_DEVLOCK(); | ||
226 | PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector); | ||
227 | for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next) | ||
228 | if (sh->sector == sector) | ||
229 | return sh; | ||
230 | PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector); | ||
231 | return NULL; | ||
232 | } | ||
233 | |||
234 | static void unplug_slaves(mddev_t *mddev); | ||
235 | static void raid5_unplug_device(request_queue_t *q); | ||
236 | |||
237 | static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, | ||
238 | int pd_idx, int noblock) | ||
239 | { | ||
240 | struct stripe_head *sh; | ||
241 | |||
242 | PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector); | ||
243 | |||
244 | spin_lock_irq(&conf->device_lock); | ||
245 | |||
246 | do { | ||
247 | sh = __find_stripe(conf, sector); | ||
248 | if (!sh) { | ||
249 | if (!conf->inactive_blocked) | ||
250 | sh = get_free_stripe(conf); | ||
251 | if (noblock && sh == NULL) | ||
252 | break; | ||
253 | if (!sh) { | ||
254 | conf->inactive_blocked = 1; | ||
255 | wait_event_lock_irq(conf->wait_for_stripe, | ||
256 | !list_empty(&conf->inactive_list) && | ||
257 | (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4) | ||
258 | || !conf->inactive_blocked), | ||
259 | conf->device_lock, | ||
260 | unplug_slaves(conf->mddev); | ||
261 | ); | ||
262 | conf->inactive_blocked = 0; | ||
263 | } else | ||
264 | init_stripe(sh, sector, pd_idx); | ||
265 | } else { | ||
266 | if (atomic_read(&sh->count)) { | ||
267 | if (!list_empty(&sh->lru)) | ||
268 | BUG(); | ||
269 | } else { | ||
270 | if (!test_bit(STRIPE_HANDLE, &sh->state)) | ||
271 | atomic_inc(&conf->active_stripes); | ||
272 | if (list_empty(&sh->lru)) | ||
273 | BUG(); | ||
274 | list_del_init(&sh->lru); | ||
275 | } | ||
276 | } | ||
277 | } while (sh == NULL); | ||
278 | |||
279 | if (sh) | ||
280 | atomic_inc(&sh->count); | ||
281 | |||
282 | spin_unlock_irq(&conf->device_lock); | ||
283 | return sh; | ||
284 | } | ||
285 | |||
286 | static int grow_stripes(raid5_conf_t *conf, int num) | ||
287 | { | ||
288 | struct stripe_head *sh; | ||
289 | kmem_cache_t *sc; | ||
290 | int devs = conf->raid_disks; | ||
291 | |||
292 | sprintf(conf->cache_name, "raid5/%s", mdname(conf->mddev)); | ||
293 | |||
294 | sc = kmem_cache_create(conf->cache_name, | ||
295 | sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), | ||
296 | 0, 0, NULL, NULL); | ||
297 | if (!sc) | ||
298 | return 1; | ||
299 | conf->slab_cache = sc; | ||
300 | while (num--) { | ||
301 | sh = kmem_cache_alloc(sc, GFP_KERNEL); | ||
302 | if (!sh) | ||
303 | return 1; | ||
304 | memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev)); | ||
305 | sh->raid_conf = conf; | ||
306 | spin_lock_init(&sh->lock); | ||
307 | |||
308 | if (grow_buffers(sh, conf->raid_disks)) { | ||
309 | shrink_buffers(sh, conf->raid_disks); | ||
310 | kmem_cache_free(sc, sh); | ||
311 | return 1; | ||
312 | } | ||
313 | /* we just created an active stripe so... */ | ||
314 | atomic_set(&sh->count, 1); | ||
315 | atomic_inc(&conf->active_stripes); | ||
316 | INIT_LIST_HEAD(&sh->lru); | ||
317 | release_stripe(sh); | ||
318 | } | ||
319 | return 0; | ||
320 | } | ||
321 | |||
322 | static void shrink_stripes(raid5_conf_t *conf) | ||
323 | { | ||
324 | struct stripe_head *sh; | ||
325 | |||
326 | while (1) { | ||
327 | spin_lock_irq(&conf->device_lock); | ||
328 | sh = get_free_stripe(conf); | ||
329 | spin_unlock_irq(&conf->device_lock); | ||
330 | if (!sh) | ||
331 | break; | ||
332 | if (atomic_read(&sh->count)) | ||
333 | BUG(); | ||
334 | shrink_buffers(sh, conf->raid_disks); | ||
335 | kmem_cache_free(conf->slab_cache, sh); | ||
336 | atomic_dec(&conf->active_stripes); | ||
337 | } | ||
338 | kmem_cache_destroy(conf->slab_cache); | ||
339 | conf->slab_cache = NULL; | ||
340 | } | ||
341 | |||
342 | static int raid5_end_read_request (struct bio * bi, unsigned int bytes_done, | ||
343 | int error) | ||
344 | { | ||
345 | struct stripe_head *sh = bi->bi_private; | ||
346 | raid5_conf_t *conf = sh->raid_conf; | ||
347 | int disks = conf->raid_disks, i; | ||
348 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); | ||
349 | |||
350 | if (bi->bi_size) | ||
351 | return 1; | ||
352 | |||
353 | for (i=0 ; i<disks; i++) | ||
354 | if (bi == &sh->dev[i].req) | ||
355 | break; | ||
356 | |||
357 | PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n", | ||
358 | (unsigned long long)sh->sector, i, atomic_read(&sh->count), | ||
359 | uptodate); | ||
360 | if (i == disks) { | ||
361 | BUG(); | ||
362 | return 0; | ||
363 | } | ||
364 | |||
365 | if (uptodate) { | ||
366 | #if 0 | ||
367 | struct bio *bio; | ||
368 | unsigned long flags; | ||
369 | spin_lock_irqsave(&conf->device_lock, flags); | ||
370 | /* we can return a buffer if we bypassed the cache or | ||
371 | * if the top buffer is not in highmem. If there are | ||
372 | * multiple buffers, leave the extra work to | ||
373 | * handle_stripe | ||
374 | */ | ||
375 | buffer = sh->bh_read[i]; | ||
376 | if (buffer && | ||
377 | (!PageHighMem(buffer->b_page) | ||
378 | || buffer->b_page == bh->b_page ) | ||
379 | ) { | ||
380 | sh->bh_read[i] = buffer->b_reqnext; | ||
381 | buffer->b_reqnext = NULL; | ||
382 | } else | ||
383 | buffer = NULL; | ||
384 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
385 | if (sh->bh_page[i]==bh->b_page) | ||
386 | set_buffer_uptodate(bh); | ||
387 | if (buffer) { | ||
388 | if (buffer->b_page != bh->b_page) | ||
389 | memcpy(buffer->b_data, bh->b_data, bh->b_size); | ||
390 | buffer->b_end_io(buffer, 1); | ||
391 | } | ||
392 | #else | ||
393 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | ||
394 | #endif | ||
395 | } else { | ||
396 | md_error(conf->mddev, conf->disks[i].rdev); | ||
397 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); | ||
398 | } | ||
399 | rdev_dec_pending(conf->disks[i].rdev, conf->mddev); | ||
400 | #if 0 | ||
401 | /* must restore b_page before unlocking buffer... */ | ||
402 | if (sh->bh_page[i] != bh->b_page) { | ||
403 | bh->b_page = sh->bh_page[i]; | ||
404 | bh->b_data = page_address(bh->b_page); | ||
405 | clear_buffer_uptodate(bh); | ||
406 | } | ||
407 | #endif | ||
408 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
409 | set_bit(STRIPE_HANDLE, &sh->state); | ||
410 | release_stripe(sh); | ||
411 | return 0; | ||
412 | } | ||
413 | |||
414 | static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done, | ||
415 | int error) | ||
416 | { | ||
417 | struct stripe_head *sh = bi->bi_private; | ||
418 | raid5_conf_t *conf = sh->raid_conf; | ||
419 | int disks = conf->raid_disks, i; | ||
420 | unsigned long flags; | ||
421 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); | ||
422 | |||
423 | if (bi->bi_size) | ||
424 | return 1; | ||
425 | |||
426 | for (i=0 ; i<disks; i++) | ||
427 | if (bi == &sh->dev[i].req) | ||
428 | break; | ||
429 | |||
430 | PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n", | ||
431 | (unsigned long long)sh->sector, i, atomic_read(&sh->count), | ||
432 | uptodate); | ||
433 | if (i == disks) { | ||
434 | BUG(); | ||
435 | return 0; | ||
436 | } | ||
437 | |||
438 | spin_lock_irqsave(&conf->device_lock, flags); | ||
439 | if (!uptodate) | ||
440 | md_error(conf->mddev, conf->disks[i].rdev); | ||
441 | |||
442 | rdev_dec_pending(conf->disks[i].rdev, conf->mddev); | ||
443 | |||
444 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
445 | set_bit(STRIPE_HANDLE, &sh->state); | ||
446 | __release_stripe(conf, sh); | ||
447 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
448 | return 0; | ||
449 | } | ||
450 | |||
451 | |||
452 | static sector_t compute_blocknr(struct stripe_head *sh, int i); | ||
453 | |||
454 | static void raid5_build_block (struct stripe_head *sh, int i) | ||
455 | { | ||
456 | struct r5dev *dev = &sh->dev[i]; | ||
457 | |||
458 | bio_init(&dev->req); | ||
459 | dev->req.bi_io_vec = &dev->vec; | ||
460 | dev->req.bi_vcnt++; | ||
461 | dev->req.bi_max_vecs++; | ||
462 | dev->vec.bv_page = dev->page; | ||
463 | dev->vec.bv_len = STRIPE_SIZE; | ||
464 | dev->vec.bv_offset = 0; | ||
465 | |||
466 | dev->req.bi_sector = sh->sector; | ||
467 | dev->req.bi_private = sh; | ||
468 | |||
469 | dev->flags = 0; | ||
470 | if (i != sh->pd_idx) | ||
471 | dev->sector = compute_blocknr(sh, i); | ||
472 | } | ||
473 | |||
474 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) | ||
475 | { | ||
476 | char b[BDEVNAME_SIZE]; | ||
477 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; | ||
478 | PRINTK("raid5: error called\n"); | ||
479 | |||
480 | if (!rdev->faulty) { | ||
481 | mddev->sb_dirty = 1; | ||
482 | if (rdev->in_sync) { | ||
483 | conf->working_disks--; | ||
484 | mddev->degraded++; | ||
485 | conf->failed_disks++; | ||
486 | rdev->in_sync = 0; | ||
487 | /* | ||
488 | * if recovery was running, make sure it aborts. | ||
489 | */ | ||
490 | set_bit(MD_RECOVERY_ERR, &mddev->recovery); | ||
491 | } | ||
492 | rdev->faulty = 1; | ||
493 | printk (KERN_ALERT | ||
494 | "raid5: Disk failure on %s, disabling device." | ||
495 | " Operation continuing on %d devices\n", | ||
496 | bdevname(rdev->bdev,b), conf->working_disks); | ||
497 | } | ||
498 | } | ||
499 | |||
500 | /* | ||
501 | * Input: a 'big' sector number, | ||
502 | * Output: indexes of the data and parity disks, and the sector # in them. | ||
503 | */ | ||
504 | static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, | ||
505 | unsigned int data_disks, unsigned int * dd_idx, | ||
506 | unsigned int * pd_idx, raid5_conf_t *conf) | ||
507 | { | ||
508 | long stripe; | ||
509 | unsigned long chunk_number; | ||
510 | unsigned int chunk_offset; | ||
511 | sector_t new_sector; | ||
512 | int sectors_per_chunk = conf->chunk_size >> 9; | ||
513 | |||
514 | /* First compute the information on this sector */ | ||
515 | |||
516 | /* | ||
517 | * Compute the chunk number and the sector offset inside the chunk | ||
518 | */ | ||
519 | chunk_offset = sector_div(r_sector, sectors_per_chunk); | ||
520 | chunk_number = r_sector; | ||
521 | BUG_ON(r_sector != chunk_number); | ||
522 | |||
523 | /* | ||
524 | * Compute the stripe number | ||
525 | */ | ||
526 | stripe = chunk_number / data_disks; | ||
527 | |||
528 | /* | ||
529 | * Compute the data disk and parity disk indexes inside the stripe | ||
530 | */ | ||
531 | *dd_idx = chunk_number % data_disks; | ||
532 | |||
533 | /* | ||
534 | * Select the parity disk based on the user selected algorithm. | ||
535 | */ | ||
536 | if (conf->level == 4) | ||
537 | *pd_idx = data_disks; | ||
538 | else switch (conf->algorithm) { | ||
539 | case ALGORITHM_LEFT_ASYMMETRIC: | ||
540 | *pd_idx = data_disks - stripe % raid_disks; | ||
541 | if (*dd_idx >= *pd_idx) | ||
542 | (*dd_idx)++; | ||
543 | break; | ||
544 | case ALGORITHM_RIGHT_ASYMMETRIC: | ||
545 | *pd_idx = stripe % raid_disks; | ||
546 | if (*dd_idx >= *pd_idx) | ||
547 | (*dd_idx)++; | ||
548 | break; | ||
549 | case ALGORITHM_LEFT_SYMMETRIC: | ||
550 | *pd_idx = data_disks - stripe % raid_disks; | ||
551 | *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; | ||
552 | break; | ||
553 | case ALGORITHM_RIGHT_SYMMETRIC: | ||
554 | *pd_idx = stripe % raid_disks; | ||
555 | *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; | ||
556 | break; | ||
557 | default: | ||
558 | printk("raid5: unsupported algorithm %d\n", | ||
559 | conf->algorithm); | ||
560 | } | ||
561 | |||
562 | /* | ||
563 | * Finally, compute the new sector number | ||
564 | */ | ||
565 | new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; | ||
566 | return new_sector; | ||
567 | } | ||
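/*
 * Worked example of the mapping above (hypothetical layout: 4 disks,
 * 3 data disks, 64 KiB chunks so sectors_per_chunk = 128, RAID-5 with
 * ALGORITHM_LEFT_SYMMETRIC).  For r_sector = 1000:
 *   chunk_offset = 1000 % 128 = 104,   chunk_number = 1000 / 128 = 7
 *   stripe = 7 / 3 = 2,                dd_idx (before rotation) = 7 % 3 = 1
 *   pd_idx = 3 - (2 % 4) = 1
 *   dd_idx = (pd_idx + 1 + 1) % 4 = 3
 *   new_sector = 2 * 128 + 104 = 360
 * i.e. logical sector 1000 lives at sector 360 of disk 3, with that
 * stripe's parity on disk 1.  compute_blocknr() below inverts this:
 * from (sector 360, disk 3, pd_idx 1) it recovers chunk_number = 2*3 + 1 = 7
 * and r_sector = 7 * 128 + 104 = 1000.
 */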
568 | |||
569 | |||
570 | static sector_t compute_blocknr(struct stripe_head *sh, int i) | ||
571 | { | ||
572 | raid5_conf_t *conf = sh->raid_conf; | ||
573 | int raid_disks = conf->raid_disks, data_disks = raid_disks - 1; | ||
574 | sector_t new_sector = sh->sector, check; | ||
575 | int sectors_per_chunk = conf->chunk_size >> 9; | ||
576 | sector_t stripe; | ||
577 | int chunk_offset; | ||
578 | int chunk_number, dummy1, dummy2, dd_idx = i; | ||
579 | sector_t r_sector; | ||
580 | |||
581 | chunk_offset = sector_div(new_sector, sectors_per_chunk); | ||
582 | stripe = new_sector; | ||
583 | BUG_ON(new_sector != stripe); | ||
584 | |||
585 | |||
586 | switch (conf->algorithm) { | ||
587 | case ALGORITHM_LEFT_ASYMMETRIC: | ||
588 | case ALGORITHM_RIGHT_ASYMMETRIC: | ||
589 | if (i > sh->pd_idx) | ||
590 | i--; | ||
591 | break; | ||
592 | case ALGORITHM_LEFT_SYMMETRIC: | ||
593 | case ALGORITHM_RIGHT_SYMMETRIC: | ||
594 | if (i < sh->pd_idx) | ||
595 | i += raid_disks; | ||
596 | i -= (sh->pd_idx + 1); | ||
597 | break; | ||
598 | default: | ||
599 | printk("raid5: unsupported algorithm %d\n", | ||
600 | conf->algorithm); | ||
601 | } | ||
602 | |||
603 | chunk_number = stripe * data_disks + i; | ||
604 | r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; | ||
605 | |||
606 | check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); | ||
607 | if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { | ||
608 | printk("compute_blocknr: map not correct\n"); | ||
609 | return 0; | ||
610 | } | ||
611 | return r_sector; | ||
612 | } | ||
613 | |||
614 | |||
615 | |||
616 | /* | ||
617 | * Copy data between a page in the stripe cache, and a bio. | ||
618 | * There are no alignment or size guarantees between the page or the | ||
619 | * bio except that there is some overlap. | ||
620 | * All iovecs in the bio must be considered. | ||
621 | */ | ||
622 | static void copy_data(int frombio, struct bio *bio, | ||
623 | struct page *page, | ||
624 | sector_t sector) | ||
625 | { | ||
626 | char *pa = page_address(page); | ||
627 | struct bio_vec *bvl; | ||
628 | int i; | ||
629 | int page_offset; | ||
630 | |||
631 | if (bio->bi_sector >= sector) | ||
632 | page_offset = (signed)(bio->bi_sector - sector) * 512; | ||
633 | else | ||
634 | page_offset = (signed)(sector - bio->bi_sector) * -512; | ||
635 | bio_for_each_segment(bvl, bio, i) { | ||
636 | int len = bio_iovec_idx(bio,i)->bv_len; | ||
637 | int clen; | ||
638 | int b_offset = 0; | ||
639 | |||
640 | if (page_offset < 0) { | ||
641 | b_offset = -page_offset; | ||
642 | page_offset += b_offset; | ||
643 | len -= b_offset; | ||
644 | } | ||
645 | |||
646 | if (len > 0 && page_offset + len > STRIPE_SIZE) | ||
647 | clen = STRIPE_SIZE - page_offset; | ||
648 | else clen = len; | ||
649 | |||
650 | if (clen > 0) { | ||
651 | char *ba = __bio_kmap_atomic(bio, i, KM_USER0); | ||
652 | if (frombio) | ||
653 | memcpy(pa+page_offset, ba+b_offset, clen); | ||
654 | else | ||
655 | memcpy(ba+b_offset, pa+page_offset, clen); | ||
656 | __bio_kunmap_atomic(ba, KM_USER0); | ||
657 | } | ||
658 | if (clen < len) /* hit end of page */ | ||
659 | break; | ||
660 | page_offset += len; | ||
661 | } | ||
662 | } | ||
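/*
 * Example of the offset handling above (hypothetical bios,
 * STRIPE_SIZE = 4096, so the page covers sectors 'sector'..sector+7):
 * a bio starting at sector+2 gets page_offset = 2*512 = 1024 and its
 * data is copied 1 KiB into the page; a bio starting at sector-1
 * (it began in the previous stripe) gets page_offset = -512, so the
 * first 512 bytes of that segment (b_offset) are skipped and copying
 * starts at offset 0; any segment running past byte 4096 is clipped
 * to clen = STRIPE_SIZE - page_offset and the loop then stops.
 */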
663 | |||
664 | #define check_xor() do { \ | ||
665 | if (count == MAX_XOR_BLOCKS) { \ | ||
666 | xor_block(count, STRIPE_SIZE, ptr); \ | ||
667 | count = 1; \ | ||
668 | } \ | ||
669 | } while(0) | ||
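/*
 * ptr[0] is the buffer being accumulated into: xor_block() xors
 * ptr[1]..ptr[count-1] into ptr[0], so once MAX_XOR_BLOCKS pointers
 * have been gathered the batch is flushed and count drops back to 1,
 * keeping only the destination for the next batch of sources.
 */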
670 | |||
671 | |||
672 | static void compute_block(struct stripe_head *sh, int dd_idx) | ||
673 | { | ||
674 | raid5_conf_t *conf = sh->raid_conf; | ||
675 | int i, count, disks = conf->raid_disks; | ||
676 | void *ptr[MAX_XOR_BLOCKS], *p; | ||
677 | |||
678 | PRINTK("compute_block, stripe %llu, idx %d\n", | ||
679 | (unsigned long long)sh->sector, dd_idx); | ||
680 | |||
681 | ptr[0] = page_address(sh->dev[dd_idx].page); | ||
682 | memset(ptr[0], 0, STRIPE_SIZE); | ||
683 | count = 1; | ||
684 | for (i = disks ; i--; ) { | ||
685 | if (i == dd_idx) | ||
686 | continue; | ||
687 | p = page_address(sh->dev[i].page); | ||
688 | if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) | ||
689 | ptr[count++] = p; | ||
690 | else | ||
691 | printk("compute_block() %d, stripe %llu, %d" | ||
692 | " not present\n", dd_idx, | ||
693 | (unsigned long long)sh->sector, i); | ||
694 | |||
695 | check_xor(); | ||
696 | } | ||
697 | if (count != 1) | ||
698 | xor_block(count, STRIPE_SIZE, ptr); | ||
699 | set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); | ||
700 | } | ||
701 | |||
702 | static void compute_parity(struct stripe_head *sh, int method) | ||
703 | { | ||
704 | raid5_conf_t *conf = sh->raid_conf; | ||
705 | int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count; | ||
706 | void *ptr[MAX_XOR_BLOCKS]; | ||
707 | struct bio *chosen; | ||
708 | |||
709 | PRINTK("compute_parity, stripe %llu, method %d\n", | ||
710 | (unsigned long long)sh->sector, method); | ||
711 | |||
712 | count = 1; | ||
713 | ptr[0] = page_address(sh->dev[pd_idx].page); | ||
714 | switch(method) { | ||
715 | case READ_MODIFY_WRITE: | ||
716 | if (!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags)) | ||
717 | BUG(); | ||
718 | for (i=disks ; i-- ;) { | ||
719 | if (i==pd_idx) | ||
720 | continue; | ||
721 | if (sh->dev[i].towrite && | ||
722 | test_bit(R5_UPTODATE, &sh->dev[i].flags)) { | ||
723 | ptr[count++] = page_address(sh->dev[i].page); | ||
724 | chosen = sh->dev[i].towrite; | ||
725 | sh->dev[i].towrite = NULL; | ||
726 | |||
727 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
728 | wake_up(&conf->wait_for_overlap); | ||
729 | |||
730 | if (sh->dev[i].written) BUG(); | ||
731 | sh->dev[i].written = chosen; | ||
732 | check_xor(); | ||
733 | } | ||
734 | } | ||
735 | break; | ||
736 | case RECONSTRUCT_WRITE: | ||
737 | memset(ptr[0], 0, STRIPE_SIZE); | ||
738 | for (i= disks; i-- ;) | ||
739 | if (i!=pd_idx && sh->dev[i].towrite) { | ||
740 | chosen = sh->dev[i].towrite; | ||
741 | sh->dev[i].towrite = NULL; | ||
742 | |||
743 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
744 | wake_up(&conf->wait_for_overlap); | ||
745 | |||
746 | if (sh->dev[i].written) BUG(); | ||
747 | sh->dev[i].written = chosen; | ||
748 | } | ||
749 | break; | ||
750 | case CHECK_PARITY: | ||
751 | break; | ||
752 | } | ||
753 | if (count>1) { | ||
754 | xor_block(count, STRIPE_SIZE, ptr); | ||
755 | count = 1; | ||
756 | } | ||
757 | |||
758 | for (i = disks; i--;) | ||
759 | if (sh->dev[i].written) { | ||
760 | sector_t sector = sh->dev[i].sector; | ||
761 | struct bio *wbi = sh->dev[i].written; | ||
762 | while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { | ||
763 | copy_data(1, wbi, sh->dev[i].page, sector); | ||
764 | wbi = r5_next_bio(wbi, sector); | ||
765 | } | ||
766 | |||
767 | set_bit(R5_LOCKED, &sh->dev[i].flags); | ||
768 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | ||
769 | } | ||
770 | |||
771 | switch(method) { | ||
772 | case RECONSTRUCT_WRITE: | ||
773 | case CHECK_PARITY: | ||
774 | for (i=disks; i--;) | ||
775 | if (i != pd_idx) { | ||
776 | ptr[count++] = page_address(sh->dev[i].page); | ||
777 | check_xor(); | ||
778 | } | ||
779 | break; | ||
780 | case READ_MODIFY_WRITE: | ||
781 | for (i = disks; i--;) | ||
782 | if (sh->dev[i].written) { | ||
783 | ptr[count++] = page_address(sh->dev[i].page); | ||
784 | check_xor(); | ||
785 | } | ||
786 | } | ||
787 | if (count != 1) | ||
788 | xor_block(count, STRIPE_SIZE, ptr); | ||
789 | |||
790 | if (method != CHECK_PARITY) { | ||
791 | set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
792 | set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); | ||
793 | } else | ||
794 | clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
795 | } | ||
796 | |||
797 | /* | ||
798 | * Each stripe/dev can have one or more bion attached. | ||
799 | * toread/towrite point to the first in a chain. | ||
800 | * The bi_next chain must be in order. | ||
801 | */ | ||
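/*
 * Example (hypothetical bios, STRIPE_SECTORS = 8, device chunk at
 * sectors 0-7): write bios for sectors 0-3 and 4-7 are linked on
 * ->towrite in bi_sector order and together cover the whole page, so
 * R5_OVERWRITE gets set; a further bio for sectors 2-5 overlaps the
 * first one, so add_stripe_bio() sets R5_Overlap, returns 0, and the
 * caller backs off and waits on wait_for_overlap before retrying.
 */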
802 | static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) | ||
803 | { | ||
804 | struct bio **bip; | ||
805 | raid5_conf_t *conf = sh->raid_conf; | ||
806 | |||
807 | PRINTK("adding bh b#%llu to stripe s#%llu\n", | ||
808 | (unsigned long long)bi->bi_sector, | ||
809 | (unsigned long long)sh->sector); | ||
810 | |||
811 | |||
812 | spin_lock(&sh->lock); | ||
813 | spin_lock_irq(&conf->device_lock); | ||
814 | if (forwrite) | ||
815 | bip = &sh->dev[dd_idx].towrite; | ||
816 | else | ||
817 | bip = &sh->dev[dd_idx].toread; | ||
818 | while (*bip && (*bip)->bi_sector < bi->bi_sector) { | ||
819 | if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) | ||
820 | goto overlap; | ||
821 | bip = & (*bip)->bi_next; | ||
822 | } | ||
823 | if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) | ||
824 | goto overlap; | ||
825 | |||
826 | if (*bip && bi->bi_next && (*bip) != bi->bi_next) | ||
827 | BUG(); | ||
828 | if (*bip) | ||
829 | bi->bi_next = *bip; | ||
830 | *bip = bi; | ||
831 | bi->bi_phys_segments ++; | ||
832 | spin_unlock_irq(&conf->device_lock); | ||
833 | spin_unlock(&sh->lock); | ||
834 | |||
835 | PRINTK("added bi b#%llu to stripe s#%llu, disk %d.\n", | ||
836 | (unsigned long long)bi->bi_sector, | ||
837 | (unsigned long long)sh->sector, dd_idx); | ||
838 | |||
839 | if (forwrite) { | ||
840 | /* check if page is covered */ | ||
841 | sector_t sector = sh->dev[dd_idx].sector; | ||
842 | for (bi=sh->dev[dd_idx].towrite; | ||
843 | sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && | ||
844 | bi && bi->bi_sector <= sector; | ||
845 | bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { | ||
846 | if (bi->bi_sector + (bi->bi_size>>9) >= sector) | ||
847 | sector = bi->bi_sector + (bi->bi_size>>9); | ||
848 | } | ||
849 | if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) | ||
850 | set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); | ||
851 | } | ||
852 | return 1; | ||
853 | |||
854 | overlap: | ||
855 | set_bit(R5_Overlap, &sh->dev[dd_idx].flags); | ||
856 | spin_unlock_irq(&conf->device_lock); | ||
857 | spin_unlock(&sh->lock); | ||
858 | return 0; | ||
859 | } | ||
860 | |||
861 | |||
862 | /* | ||
863 | * handle_stripe - do things to a stripe. | ||
864 | * | ||
865 | * We lock the stripe and then examine the state of various bits | ||
866 | * to see what needs to be done. | ||
867 | * Possible results: | ||
868 | * return some read requests which now have data | ||
869 | * return some write requests which are safely on disc | ||
870 | * schedule a read on some buffers | ||
871 | * schedule a write of some buffers | ||
872 | * return confirmation of parity correctness | ||
873 | * | ||
874 | * Parity calculations are done inside the stripe lock | ||
875 | * requests are taken off the toread/towrite lists, and the cache pages | ||
876 | * get R5_LOCKED set before the stripe lock is released. | ||
877 | * | ||
878 | */ | ||
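/*
 * In rough outline a single pass below does, in order: complete reads
 * whose data is already up to date; if two or more members have failed,
 * fail the requests that can no longer be served; return written blocks
 * once the parity (or the failed member's block) is safe; schedule
 * reads, or compute a missing block, for pending requests and resync;
 * for writes, choose read-modify-write or reconstruct-write, compute
 * parity and mark the blocks Wantwrite; for resync, check the parity
 * and rewrite a bad block; finally issue the queued read/write bios.
 */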
879 | |||
880 | static void handle_stripe(struct stripe_head *sh) | ||
881 | { | ||
882 | raid5_conf_t *conf = sh->raid_conf; | ||
883 | int disks = conf->raid_disks; | ||
884 | struct bio *return_bi= NULL; | ||
885 | struct bio *bi; | ||
886 | int i; | ||
887 | int syncing; | ||
888 | int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; | ||
889 | int non_overwrite = 0; | ||
890 | int failed_num=0; | ||
891 | struct r5dev *dev; | ||
892 | |||
893 | PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n", | ||
894 | (unsigned long long)sh->sector, atomic_read(&sh->count), | ||
895 | sh->pd_idx); | ||
896 | |||
897 | spin_lock(&sh->lock); | ||
898 | clear_bit(STRIPE_HANDLE, &sh->state); | ||
899 | clear_bit(STRIPE_DELAYED, &sh->state); | ||
900 | |||
901 | syncing = test_bit(STRIPE_SYNCING, &sh->state); | ||
902 | /* Now to look around and see what can be done */ | ||
903 | |||
904 | for (i=disks; i--; ) { | ||
905 | mdk_rdev_t *rdev; | ||
906 | dev = &sh->dev[i]; | ||
907 | clear_bit(R5_Insync, &dev->flags); | ||
908 | clear_bit(R5_Syncio, &dev->flags); | ||
909 | |||
910 | PRINTK("check %d: state 0x%lx read %p write %p written %p\n", | ||
911 | i, dev->flags, dev->toread, dev->towrite, dev->written); | ||
912 | /* maybe we can reply to a read */ | ||
913 | if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { | ||
914 | struct bio *rbi, *rbi2; | ||
915 | PRINTK("Return read for disc %d\n", i); | ||
916 | spin_lock_irq(&conf->device_lock); | ||
917 | rbi = dev->toread; | ||
918 | dev->toread = NULL; | ||
919 | if (test_and_clear_bit(R5_Overlap, &dev->flags)) | ||
920 | wake_up(&conf->wait_for_overlap); | ||
921 | spin_unlock_irq(&conf->device_lock); | ||
922 | while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { | ||
923 | copy_data(0, rbi, dev->page, dev->sector); | ||
924 | rbi2 = r5_next_bio(rbi, dev->sector); | ||
925 | spin_lock_irq(&conf->device_lock); | ||
926 | if (--rbi->bi_phys_segments == 0) { | ||
927 | rbi->bi_next = return_bi; | ||
928 | return_bi = rbi; | ||
929 | } | ||
930 | spin_unlock_irq(&conf->device_lock); | ||
931 | rbi = rbi2; | ||
932 | } | ||
933 | } | ||
934 | |||
935 | /* now count some things */ | ||
936 | if (test_bit(R5_LOCKED, &dev->flags)) locked++; | ||
937 | if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; | ||
938 | |||
939 | |||
940 | if (dev->toread) to_read++; | ||
941 | if (dev->towrite) { | ||
942 | to_write++; | ||
943 | if (!test_bit(R5_OVERWRITE, &dev->flags)) | ||
944 | non_overwrite++; | ||
945 | } | ||
946 | if (dev->written) written++; | ||
947 | rdev = conf->disks[i].rdev; /* FIXME, should I be looking at rdev */ | ||
948 | if (!rdev || !rdev->in_sync) { | ||
949 | failed++; | ||
950 | failed_num = i; | ||
951 | } else | ||
952 | set_bit(R5_Insync, &dev->flags); | ||
953 | } | ||
954 | PRINTK("locked=%d uptodate=%d to_read=%d" | ||
955 | " to_write=%d failed=%d failed_num=%d\n", | ||
956 | locked, uptodate, to_read, to_write, failed, failed_num); | ||
957 | /* check if the array has lost two devices and, if so, some requests might | ||
958 | * need to be failed | ||
959 | */ | ||
960 | if (failed > 1 && to_read+to_write+written) { | ||
961 | spin_lock_irq(&conf->device_lock); | ||
962 | for (i=disks; i--; ) { | ||
963 | /* fail all writes first */ | ||
964 | bi = sh->dev[i].towrite; | ||
965 | sh->dev[i].towrite = NULL; | ||
966 | if (bi) to_write--; | ||
967 | |||
968 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
969 | wake_up(&conf->wait_for_overlap); | ||
970 | |||
971 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ | ||
972 | struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); | ||
973 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
974 | if (--bi->bi_phys_segments == 0) { | ||
975 | md_write_end(conf->mddev); | ||
976 | bi->bi_next = return_bi; | ||
977 | return_bi = bi; | ||
978 | } | ||
979 | bi = nextbi; | ||
980 | } | ||
981 | /* and fail all 'written' */ | ||
982 | bi = sh->dev[i].written; | ||
983 | sh->dev[i].written = NULL; | ||
984 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { | ||
985 | struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); | ||
986 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
987 | if (--bi->bi_phys_segments == 0) { | ||
988 | md_write_end(conf->mddev); | ||
989 | bi->bi_next = return_bi; | ||
990 | return_bi = bi; | ||
991 | } | ||
992 | bi = bi2; | ||
993 | } | ||
994 | |||
995 | /* fail any reads if this device is non-operational */ | ||
996 | if (!test_bit(R5_Insync, &sh->dev[i].flags)) { | ||
997 | bi = sh->dev[i].toread; | ||
998 | sh->dev[i].toread = NULL; | ||
999 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
1000 | wake_up(&conf->wait_for_overlap); | ||
1001 | if (bi) to_read--; | ||
1002 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ | ||
1003 | struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); | ||
1004 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
1005 | if (--bi->bi_phys_segments == 0) { | ||
1006 | bi->bi_next = return_bi; | ||
1007 | return_bi = bi; | ||
1008 | } | ||
1009 | bi = nextbi; | ||
1010 | } | ||
1011 | } | ||
1012 | } | ||
1013 | spin_unlock_irq(&conf->device_lock); | ||
1014 | } | ||
1015 | if (failed > 1 && syncing) { | ||
1016 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | ||
1017 | clear_bit(STRIPE_SYNCING, &sh->state); | ||
1018 | syncing = 0; | ||
1019 | } | ||
1020 | |||
1021 | /* might be able to return some write requests if the parity block | ||
1022 | * is safe, or on a failed drive | ||
1023 | */ | ||
1024 | dev = &sh->dev[sh->pd_idx]; | ||
1025 | if ( written && | ||
1026 | ( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) && | ||
1027 | test_bit(R5_UPTODATE, &dev->flags)) | ||
1028 | || (failed == 1 && failed_num == sh->pd_idx)) | ||
1029 | ) { | ||
1030 | /* any written block on an uptodate or failed drive can be returned. | ||
1031 | * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but | ||
1032 | * never LOCKED, so we don't need to test 'failed' directly. | ||
1033 | */ | ||
1034 | for (i=disks; i--; ) | ||
1035 | if (sh->dev[i].written) { | ||
1036 | dev = &sh->dev[i]; | ||
1037 | if (!test_bit(R5_LOCKED, &dev->flags) && | ||
1038 | test_bit(R5_UPTODATE, &dev->flags) ) { | ||
1039 | /* We can return any write requests */ | ||
1040 | struct bio *wbi, *wbi2; | ||
1041 | PRINTK("Return write for disc %d\n", i); | ||
1042 | spin_lock_irq(&conf->device_lock); | ||
1043 | wbi = dev->written; | ||
1044 | dev->written = NULL; | ||
1045 | while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { | ||
1046 | wbi2 = r5_next_bio(wbi, dev->sector); | ||
1047 | if (--wbi->bi_phys_segments == 0) { | ||
1048 | md_write_end(conf->mddev); | ||
1049 | wbi->bi_next = return_bi; | ||
1050 | return_bi = wbi; | ||
1051 | } | ||
1052 | wbi = wbi2; | ||
1053 | } | ||
1054 | spin_unlock_irq(&conf->device_lock); | ||
1055 | } | ||
1056 | } | ||
1057 | } | ||
1058 | |||
1059 | /* Now we might consider reading some blocks, either to check/generate | ||
1060 | * parity, or to satisfy requests | ||
1061 | * or to load a block that is being partially written. | ||
1062 | */ | ||
1063 | if (to_read || non_overwrite || (syncing && (uptodate < disks))) { | ||
1064 | for (i=disks; i--;) { | ||
1065 | dev = &sh->dev[i]; | ||
1066 | if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && | ||
1067 | (dev->toread || | ||
1068 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || | ||
1069 | syncing || | ||
1070 | (failed && (sh->dev[failed_num].toread || | ||
1071 | (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags)))) | ||
1072 | ) | ||
1073 | ) { | ||
1074 | /* we would like to get this block, possibly | ||
1075 | * by computing it, but we might not be able to | ||
1076 | */ | ||
1077 | if (uptodate == disks-1) { | ||
1078 | PRINTK("Computing block %d\n", i); | ||
1079 | compute_block(sh, i); | ||
1080 | uptodate++; | ||
1081 | } else if (test_bit(R5_Insync, &dev->flags)) { | ||
1082 | set_bit(R5_LOCKED, &dev->flags); | ||
1083 | set_bit(R5_Wantread, &dev->flags); | ||
1084 | #if 0 | ||
1085 | /* if I am just reading this block and we don't have | ||
1086 | a failed drive, or any pending writes then sidestep the cache */ | ||
1087 | if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext && | ||
1088 | ! syncing && !failed && !to_write) { | ||
1089 | sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page; | ||
1090 | sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data; | ||
1091 | } | ||
1092 | #endif | ||
1093 | locked++; | ||
1094 | PRINTK("Reading block %d (sync=%d)\n", | ||
1095 | i, syncing); | ||
1096 | if (syncing) | ||
1097 | md_sync_acct(conf->disks[i].rdev->bdev, | ||
1098 | STRIPE_SECTORS); | ||
1099 | } | ||
1100 | } | ||
1101 | } | ||
1102 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1103 | } | ||
1104 | |||
1105 | /* now to consider writing and what else, if anything, should be read */ | ||
1106 | if (to_write) { | ||
1107 | int rmw=0, rcw=0; | ||
1108 | for (i=disks ; i--;) { | ||
1109 | /* would I have to read this buffer for read_modify_write */ | ||
1110 | dev = &sh->dev[i]; | ||
1111 | if ((dev->towrite || i == sh->pd_idx) && | ||
1112 | (!test_bit(R5_LOCKED, &dev->flags) | ||
1113 | #if 0 | ||
1114 | || sh->bh_page[i]!=bh->b_page | ||
1115 | #endif | ||
1116 | ) && | ||
1117 | !test_bit(R5_UPTODATE, &dev->flags)) { | ||
1118 | if (test_bit(R5_Insync, &dev->flags) | ||
1119 | /* && !(!mddev->insync && i == sh->pd_idx) */ | ||
1120 | ) | ||
1121 | rmw++; | ||
1122 | else rmw += 2*disks; /* cannot read it */ | ||
1123 | } | ||
1124 | /* Would I have to read this buffer for reconstruct_write */ | ||
1125 | if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && | ||
1126 | (!test_bit(R5_LOCKED, &dev->flags) | ||
1127 | #if 0 | ||
1128 | || sh->bh_page[i] != bh->b_page | ||
1129 | #endif | ||
1130 | ) && | ||
1131 | !test_bit(R5_UPTODATE, &dev->flags)) { | ||
1132 | if (test_bit(R5_Insync, &dev->flags)) rcw++; | ||
1133 | else rcw += 2*disks; | ||
1134 | } | ||
1135 | } | ||
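		/*
		 * Example of the trade-off (hypothetical 5-disk set, 4 data
		 * + parity, nothing cached yet): rewriting one whole data
		 * block makes rmw = 2 (old data + old parity to read) and
		 * rcw = 3 (the other three data blocks), so read-modify-write
		 * wins; fully overwriting three of the four data blocks makes
		 * rmw = 4 but rcw = 1, so reconstruct-write is chosen instead.
		 */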
1136 | PRINTK("for sector %llu, rmw=%d rcw=%d\n", | ||
1137 | (unsigned long long)sh->sector, rmw, rcw); | ||
1138 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1139 | if (rmw < rcw && rmw > 0) | ||
1140 | /* prefer read-modify-write, but need to get some data */ | ||
1141 | for (i=disks; i--;) { | ||
1142 | dev = &sh->dev[i]; | ||
1143 | if ((dev->towrite || i == sh->pd_idx) && | ||
1144 | !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && | ||
1145 | test_bit(R5_Insync, &dev->flags)) { | ||
1146 | if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
1147 | { | ||
1148 | PRINTK("Read_old block %d for r-m-w\n", i); | ||
1149 | set_bit(R5_LOCKED, &dev->flags); | ||
1150 | set_bit(R5_Wantread, &dev->flags); | ||
1151 | locked++; | ||
1152 | } else { | ||
1153 | set_bit(STRIPE_DELAYED, &sh->state); | ||
1154 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1155 | } | ||
1156 | } | ||
1157 | } | ||
1158 | if (rcw <= rmw && rcw > 0) | ||
1159 | /* want reconstruct write, but need to get some data */ | ||
1160 | for (i=disks; i--;) { | ||
1161 | dev = &sh->dev[i]; | ||
1162 | if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && | ||
1163 | !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && | ||
1164 | test_bit(R5_Insync, &dev->flags)) { | ||
1165 | if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
1166 | { | ||
1167 | PRINTK("Read_old block %d for Reconstruct\n", i); | ||
1168 | set_bit(R5_LOCKED, &dev->flags); | ||
1169 | set_bit(R5_Wantread, &dev->flags); | ||
1170 | locked++; | ||
1171 | } else { | ||
1172 | set_bit(STRIPE_DELAYED, &sh->state); | ||
1173 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1174 | } | ||
1175 | } | ||
1176 | } | ||
1177 | /* now if nothing is locked, and if we have enough data, we can start a write request */ | ||
1178 | if (locked == 0 && (rcw == 0 ||rmw == 0)) { | ||
1179 | PRINTK("Computing parity...\n"); | ||
1180 | compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE); | ||
1181 | /* now every locked buffer is ready to be written */ | ||
1182 | for (i=disks; i--;) | ||
1183 | if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { | ||
1184 | PRINTK("Writing block %d\n", i); | ||
1185 | locked++; | ||
1186 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
1187 | if (!test_bit(R5_Insync, &sh->dev[i].flags) | ||
1188 | || (i==sh->pd_idx && failed == 0)) | ||
1189 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1190 | } | ||
1191 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
1192 | atomic_dec(&conf->preread_active_stripes); | ||
1193 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) | ||
1194 | md_wakeup_thread(conf->mddev->thread); | ||
1195 | } | ||
1196 | } | ||
1197 | } | ||
1198 | |||
1199 | /* maybe we need to check and possibly fix the parity for this stripe | ||
1200 | * Any reads will already have been scheduled, so we just see if enough data | ||
1201 | * is available | ||
1202 | */ | ||
1203 | if (syncing && locked == 0 && | ||
1204 | !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) { | ||
1205 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1206 | if (failed == 0) { | ||
1207 | char *pagea; | ||
1208 | if (uptodate != disks) | ||
1209 | BUG(); | ||
1210 | compute_parity(sh, CHECK_PARITY); | ||
1211 | uptodate--; | ||
1212 | pagea = page_address(sh->dev[sh->pd_idx].page); | ||
1213 | if ((*(u32*)pagea) == 0 && | ||
1214 | !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) { | ||
1215 | /* parity is correct (on disc, not in buffer any more) */ | ||
1216 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1217 | } | ||
1218 | } | ||
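		/*
		 * The zero test above works because CHECK_PARITY xors every
		 * data block into the parity buffer in place: if the parity
		 * on disk was correct the buffer is now all zeroes.  Checking
		 * the first word for 0 and then memcmp()ing the page against
		 * itself shifted by 4 bytes proves every word equals the one
		 * before it, i.e. the whole page is zero, without needing a
		 * separate zero page to compare against.
		 */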
1219 | if (!test_bit(STRIPE_INSYNC, &sh->state)) { | ||
1220 | if (failed==0) | ||
1221 | failed_num = sh->pd_idx; | ||
1222 | /* should be able to compute the missing block and write it to spare */ | ||
1223 | if (!test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)) { | ||
1224 | if (uptodate+1 != disks) | ||
1225 | BUG(); | ||
1226 | compute_block(sh, failed_num); | ||
1227 | uptodate++; | ||
1228 | } | ||
1229 | if (uptodate != disks) | ||
1230 | BUG(); | ||
1231 | dev = &sh->dev[failed_num]; | ||
1232 | set_bit(R5_LOCKED, &dev->flags); | ||
1233 | set_bit(R5_Wantwrite, &dev->flags); | ||
1234 | locked++; | ||
1235 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1236 | set_bit(R5_Syncio, &dev->flags); | ||
1237 | } | ||
1238 | } | ||
1239 | if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | ||
1240 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); | ||
1241 | clear_bit(STRIPE_SYNCING, &sh->state); | ||
1242 | } | ||
1243 | |||
1244 | spin_unlock(&sh->lock); | ||
1245 | |||
1246 | while ((bi=return_bi)) { | ||
1247 | int bytes = bi->bi_size; | ||
1248 | |||
1249 | return_bi = bi->bi_next; | ||
1250 | bi->bi_next = NULL; | ||
1251 | bi->bi_size = 0; | ||
1252 | bi->bi_end_io(bi, bytes, 0); | ||
1253 | } | ||
1254 | for (i=disks; i-- ;) { | ||
1255 | int rw; | ||
1256 | struct bio *bi; | ||
1257 | mdk_rdev_t *rdev; | ||
1258 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) | ||
1259 | rw = 1; | ||
1260 | else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) | ||
1261 | rw = 0; | ||
1262 | else | ||
1263 | continue; | ||
1264 | |||
1265 | bi = &sh->dev[i].req; | ||
1266 | |||
1267 | bi->bi_rw = rw; | ||
1268 | if (rw) | ||
1269 | bi->bi_end_io = raid5_end_write_request; | ||
1270 | else | ||
1271 | bi->bi_end_io = raid5_end_read_request; | ||
1272 | |||
1273 | rcu_read_lock(); | ||
1274 | rdev = conf->disks[i].rdev; | ||
1275 | if (rdev && rdev->faulty) | ||
1276 | rdev = NULL; | ||
1277 | if (rdev) | ||
1278 | atomic_inc(&rdev->nr_pending); | ||
1279 | rcu_read_unlock(); | ||
1280 | |||
1281 | if (rdev) { | ||
1282 | if (test_bit(R5_Syncio, &sh->dev[i].flags)) | ||
1283 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); | ||
1284 | |||
1285 | bi->bi_bdev = rdev->bdev; | ||
1286 | PRINTK("for %llu schedule op %ld on disc %d\n", | ||
1287 | (unsigned long long)sh->sector, bi->bi_rw, i); | ||
1288 | atomic_inc(&sh->count); | ||
1289 | bi->bi_sector = sh->sector + rdev->data_offset; | ||
1290 | bi->bi_flags = 1 << BIO_UPTODATE; | ||
1291 | bi->bi_vcnt = 1; | ||
1292 | bi->bi_max_vecs = 1; | ||
1293 | bi->bi_idx = 0; | ||
1294 | bi->bi_io_vec = &sh->dev[i].vec; | ||
1295 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; | ||
1296 | bi->bi_io_vec[0].bv_offset = 0; | ||
1297 | bi->bi_size = STRIPE_SIZE; | ||
1298 | bi->bi_next = NULL; | ||
1299 | generic_make_request(bi); | ||
1300 | } else { | ||
1301 | PRINTK("skip op %ld on disc %d for sector %llu\n", | ||
1302 | bi->bi_rw, i, (unsigned long long)sh->sector); | ||
1303 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
1304 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1305 | } | ||
1306 | } | ||
1307 | } | ||
1308 | |||
1309 | static inline void raid5_activate_delayed(raid5_conf_t *conf) | ||
1310 | { | ||
1311 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { | ||
1312 | while (!list_empty(&conf->delayed_list)) { | ||
1313 | struct list_head *l = conf->delayed_list.next; | ||
1314 | struct stripe_head *sh; | ||
1315 | sh = list_entry(l, struct stripe_head, lru); | ||
1316 | list_del_init(l); | ||
1317 | clear_bit(STRIPE_DELAYED, &sh->state); | ||
1318 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
1319 | atomic_inc(&conf->preread_active_stripes); | ||
1320 | list_add_tail(&sh->lru, &conf->handle_list); | ||
1321 | } | ||
1322 | } | ||
1323 | } | ||
1324 | |||
1325 | static void unplug_slaves(mddev_t *mddev) | ||
1326 | { | ||
1327 | raid5_conf_t *conf = mddev_to_conf(mddev); | ||
1328 | int i; | ||
1329 | |||
1330 | rcu_read_lock(); | ||
1331 | for (i=0; i<mddev->raid_disks; i++) { | ||
1332 | mdk_rdev_t *rdev = conf->disks[i].rdev; | ||
1333 | if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { | ||
1334 | request_queue_t *r_queue = bdev_get_queue(rdev->bdev); | ||
1335 | |||
1336 | atomic_inc(&rdev->nr_pending); | ||
1337 | rcu_read_unlock(); | ||
1338 | |||
1339 | if (r_queue->unplug_fn) | ||
1340 | r_queue->unplug_fn(r_queue); | ||
1341 | |||
1342 | rdev_dec_pending(rdev, mddev); | ||
1343 | rcu_read_lock(); | ||
1344 | } | ||
1345 | } | ||
1346 | rcu_read_unlock(); | ||
1347 | } | ||
1348 | |||
1349 | static void raid5_unplug_device(request_queue_t *q) | ||
1350 | { | ||
1351 | mddev_t *mddev = q->queuedata; | ||
1352 | raid5_conf_t *conf = mddev_to_conf(mddev); | ||
1353 | unsigned long flags; | ||
1354 | |||
1355 | spin_lock_irqsave(&conf->device_lock, flags); | ||
1356 | |||
1357 | if (blk_remove_plug(q)) | ||
1358 | raid5_activate_delayed(conf); | ||
1359 | md_wakeup_thread(mddev->thread); | ||
1360 | |||
1361 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
1362 | |||
1363 | unplug_slaves(mddev); | ||
1364 | } | ||
1365 | |||
1366 | static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk, | ||
1367 | sector_t *error_sector) | ||
1368 | { | ||
1369 | mddev_t *mddev = q->queuedata; | ||
1370 | raid5_conf_t *conf = mddev_to_conf(mddev); | ||
1371 | int i, ret = 0; | ||
1372 | |||
1373 | rcu_read_lock(); | ||
1374 | for (i=0; i<mddev->raid_disks && ret == 0; i++) { | ||
1375 | mdk_rdev_t *rdev = conf->disks[i].rdev; | ||
1376 | if (rdev && !rdev->faulty) { | ||
1377 | struct block_device *bdev = rdev->bdev; | ||
1378 | request_queue_t *r_queue = bdev_get_queue(bdev); | ||
1379 | |||
1380 | if (!r_queue->issue_flush_fn) | ||
1381 | ret = -EOPNOTSUPP; | ||
1382 | else { | ||
1383 | atomic_inc(&rdev->nr_pending); | ||
1384 | rcu_read_unlock(); | ||
1385 | ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, | ||
1386 | error_sector); | ||
1387 | rdev_dec_pending(rdev, mddev); | ||
1388 | rcu_read_lock(); | ||
1389 | } | ||
1390 | } | ||
1391 | } | ||
1392 | rcu_read_unlock(); | ||
1393 | return ret; | ||
1394 | } | ||
1395 | |||
1396 | static inline void raid5_plug_device(raid5_conf_t *conf) | ||
1397 | { | ||
1398 | spin_lock_irq(&conf->device_lock); | ||
1399 | blk_plug_device(conf->mddev->queue); | ||
1400 | spin_unlock_irq(&conf->device_lock); | ||
1401 | } | ||
1402 | |||
1403 | static int make_request (request_queue_t *q, struct bio * bi) | ||
1404 | { | ||
1405 | mddev_t *mddev = q->queuedata; | ||
1406 | raid5_conf_t *conf = mddev_to_conf(mddev); | ||
1407 | const unsigned int raid_disks = conf->raid_disks; | ||
1408 | const unsigned int data_disks = raid_disks - 1; | ||
1409 | unsigned int dd_idx, pd_idx; | ||
1410 | sector_t new_sector; | ||
1411 | sector_t logical_sector, last_sector; | ||
1412 | struct stripe_head *sh; | ||
1413 | |||
1414 | if (bio_data_dir(bi)==WRITE) { | ||
1415 | disk_stat_inc(mddev->gendisk, writes); | ||
1416 | disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi)); | ||
1417 | } else { | ||
1418 | disk_stat_inc(mddev->gendisk, reads); | ||
1419 | disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bi)); | ||
1420 | } | ||
1421 | |||
1422 | logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); | ||
1423 | last_sector = bi->bi_sector + (bi->bi_size>>9); | ||
1424 | bi->bi_next = NULL; | ||
1425 | bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ | ||
1426 | if ( bio_data_dir(bi) == WRITE ) | ||
1427 | md_write_start(mddev); | ||
1428 | for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { | ||
1429 | DEFINE_WAIT(w); | ||
1430 | |||
1431 | new_sector = raid5_compute_sector(logical_sector, | ||
1432 | raid_disks, data_disks, &dd_idx, &pd_idx, conf); | ||
1433 | |||
1434 | PRINTK("raid5: make_request, sector %llu logical %llu\n", | ||
1435 | (unsigned long long)new_sector, | ||
1436 | (unsigned long long)logical_sector); | ||
1437 | |||
1438 | retry: | ||
1439 | prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); | ||
1440 | sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK)); | ||
1441 | if (sh) { | ||
1442 | if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { | ||
1443 | /* Add failed due to overlap. Flush everything | ||
1444 | * and wait a while | ||
1445 | */ | ||
1446 | raid5_unplug_device(mddev->queue); | ||
1447 | release_stripe(sh); | ||
1448 | schedule(); | ||
1449 | goto retry; | ||
1450 | } | ||
1451 | finish_wait(&conf->wait_for_overlap, &w); | ||
1452 | raid5_plug_device(conf); | ||
1453 | handle_stripe(sh); | ||
1454 | release_stripe(sh); | ||
1455 | |||
1456 | } else { | ||
1457 | /* cannot get stripe for read-ahead, just give up */ | ||
1458 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
1459 | finish_wait(&conf->wait_for_overlap, &w); | ||
1460 | break; | ||
1461 | } | ||
1462 | |||
1463 | } | ||
1464 | spin_lock_irq(&conf->device_lock); | ||
1465 | if (--bi->bi_phys_segments == 0) { | ||
1466 | int bytes = bi->bi_size; | ||
1467 | |||
1468 | if ( bio_data_dir(bi) == WRITE ) | ||
1469 | md_write_end(mddev); | ||
1470 | bi->bi_size = 0; | ||
1471 | bi->bi_end_io(bi, bytes, 0); | ||
1472 | } | ||
1473 | spin_unlock_irq(&conf->device_lock); | ||
1474 | return 0; | ||
1475 | } | ||
1476 | |||
1477 | /* FIXME go_faster isn't used */ | ||
1478 | static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster) | ||
1479 | { | ||
1480 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; | ||
1481 | struct stripe_head *sh; | ||
1482 | int sectors_per_chunk = conf->chunk_size >> 9; | ||
1483 | sector_t x; | ||
1484 | unsigned long stripe; | ||
1485 | int chunk_offset; | ||
1486 | int dd_idx, pd_idx; | ||
1487 | sector_t first_sector; | ||
1488 | int raid_disks = conf->raid_disks; | ||
1489 | int data_disks = raid_disks-1; | ||
1490 | |||
1491 | if (sector_nr >= mddev->size <<1) { | ||
1492 | /* just being told to finish up .. nothing much to do */ | ||
1493 | unplug_slaves(mddev); | ||
1494 | return 0; | ||
1495 | } | ||
1496 | /* if there are one or more failed drives and we are trying | ||
1497 | * to resync, then assert that we are finished, because there is | ||
1498 | * nothing we can do. | ||
1499 | */ | ||
1500 | if (mddev->degraded >= 1 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { | ||
1501 | int rv = (mddev->size << 1) - sector_nr; | ||
1502 | md_done_sync(mddev, rv, 1); | ||
1503 | return rv; | ||
1504 | } | ||
1505 | |||
1506 | x = sector_nr; | ||
1507 | chunk_offset = sector_div(x, sectors_per_chunk); | ||
1508 | stripe = x; | ||
1509 | BUG_ON(x != stripe); | ||
1510 | |||
1511 | first_sector = raid5_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk | ||
1512 | + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf); | ||
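/* Illustrative arithmetic (values not from this file): with 64 KiB chunks,
 * sectors_per_chunk = 128, so sector_nr = 1000 splits into stripe = 7 and
 * chunk_offset = 104; the call above is made chiefly to learn pd_idx, the
 * parity disk of the stripe being resynced.
 */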
1513 | sh = get_active_stripe(conf, sector_nr, pd_idx, 1); | ||
1514 | if (sh == NULL) { | ||
1515 | sh = get_active_stripe(conf, sector_nr, pd_idx, 0); | ||
1516 | /* make sure we don't swamp the stripe cache if someone else | ||
1517 | * is trying to get access | ||
1518 | */ | ||
1519 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
1520 | schedule_timeout(1); | ||
1521 | } | ||
1522 | spin_lock(&sh->lock); | ||
1523 | set_bit(STRIPE_SYNCING, &sh->state); | ||
1524 | clear_bit(STRIPE_INSYNC, &sh->state); | ||
1525 | spin_unlock(&sh->lock); | ||
1526 | |||
1527 | handle_stripe(sh); | ||
1528 | release_stripe(sh); | ||
1529 | |||
1530 | return STRIPE_SECTORS; | ||
1531 | } | ||
1532 | |||
1533 | /* | ||
1534 | * This is our raid5 kernel thread. | ||
1535 | * | ||
1536 | * We scan the hash table for stripes which can be handled now. | ||
1537 | * During the scan, completed stripes are saved for us by the interrupt | ||
1538 | * handler, so that they will not have to wait for our next wakeup. | ||
1539 | */ | ||
1540 | static void raid5d (mddev_t *mddev) | ||
1541 | { | ||
1542 | struct stripe_head *sh; | ||
1543 | raid5_conf_t *conf = mddev_to_conf(mddev); | ||
1544 | int handled; | ||
1545 | |||
1546 | PRINTK("+++ raid5d active\n"); | ||
1547 | |||
1548 | md_check_recovery(mddev); | ||
1549 | md_handle_safemode(mddev); | ||
1550 | |||
1551 | handled = 0; | ||
1552 | spin_lock_irq(&conf->device_lock); | ||
1553 | while (1) { | ||
1554 | struct list_head *first; | ||
1555 | |||
1556 | if (list_empty(&conf->handle_list) && | ||
1557 | atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && | ||
1558 | !blk_queue_plugged(mddev->queue) && | ||
1559 | !list_empty(&conf->delayed_list)) | ||
1560 | raid5_activate_delayed(conf); | ||
1561 | |||
1562 | if (list_empty(&conf->handle_list)) | ||
1563 | break; | ||
1564 | |||
1565 | first = conf->handle_list.next; | ||
1566 | sh = list_entry(first, struct stripe_head, lru); | ||
1567 | |||
1568 | list_del_init(first); | ||
1569 | atomic_inc(&sh->count); | ||
1570 | if (atomic_read(&sh->count)!= 1) | ||
1571 | BUG(); | ||
1572 | spin_unlock_irq(&conf->device_lock); | ||
1573 | |||
1574 | handled++; | ||
1575 | handle_stripe(sh); | ||
1576 | release_stripe(sh); | ||
1577 | |||
1578 | spin_lock_irq(&conf->device_lock); | ||
1579 | } | ||
1580 | PRINTK("%d stripes handled\n", handled); | ||
1581 | |||
1582 | spin_unlock_irq(&conf->device_lock); | ||
1583 | |||
1584 | unplug_slaves(mddev); | ||
1585 | |||
1586 | PRINTK("--- raid5d inactive\n"); | ||
1587 | } | ||
1588 | |||
1589 | static int run (mddev_t *mddev) | ||
1590 | { | ||
1591 | raid5_conf_t *conf; | ||
1592 | int raid_disk, memory; | ||
1593 | mdk_rdev_t *rdev; | ||
1594 | struct disk_info *disk; | ||
1595 | struct list_head *tmp; | ||
1596 | |||
1597 | if (mddev->level != 5 && mddev->level != 4) { | ||
1598 | printk("raid5: %s: raid level not set to 4/5 (%d)\n", mdname(mddev), mddev->level); | ||
1599 | return -EIO; | ||
1600 | } | ||
1601 | |||
1602 | mddev->private = kmalloc (sizeof (raid5_conf_t) | ||
1603 | + mddev->raid_disks * sizeof(struct disk_info), | ||
1604 | GFP_KERNEL); | ||
1605 | if ((conf = mddev->private) == NULL) | ||
1606 | goto abort; | ||
1607 | memset (conf, 0, sizeof (*conf) + mddev->raid_disks * sizeof(struct disk_info) ); | ||
1608 | conf->mddev = mddev; | ||
1609 | |||
1610 | if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) | ||
1611 | goto abort; | ||
1612 | memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); | ||
1613 | |||
1614 | spin_lock_init(&conf->device_lock); | ||
1615 | init_waitqueue_head(&conf->wait_for_stripe); | ||
1616 | init_waitqueue_head(&conf->wait_for_overlap); | ||
1617 | INIT_LIST_HEAD(&conf->handle_list); | ||
1618 | INIT_LIST_HEAD(&conf->delayed_list); | ||
1619 | INIT_LIST_HEAD(&conf->inactive_list); | ||
1620 | atomic_set(&conf->active_stripes, 0); | ||
1621 | atomic_set(&conf->preread_active_stripes, 0); | ||
1622 | |||
1623 | mddev->queue->unplug_fn = raid5_unplug_device; | ||
1624 | mddev->queue->issue_flush_fn = raid5_issue_flush; | ||
1625 | |||
1626 | PRINTK("raid5: run(%s) called.\n", mdname(mddev)); | ||
1627 | |||
1628 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
1629 | raid_disk = rdev->raid_disk; | ||
1630 | if (raid_disk >= mddev->raid_disks | ||
1631 | || raid_disk < 0) | ||
1632 | continue; | ||
1633 | disk = conf->disks + raid_disk; | ||
1634 | |||
1635 | disk->rdev = rdev; | ||
1636 | |||
1637 | if (rdev->in_sync) { | ||
1638 | char b[BDEVNAME_SIZE]; | ||
1639 | printk(KERN_INFO "raid5: device %s operational as raid" | ||
1640 | " disk %d\n", bdevname(rdev->bdev,b), | ||
1641 | raid_disk); | ||
1642 | conf->working_disks++; | ||
1643 | } | ||
1644 | } | ||
1645 | |||
1646 | conf->raid_disks = mddev->raid_disks; | ||
1647 | /* | ||
1648 | * 0 for a fully functional array, 1 for a degraded array. | ||
1649 | */ | ||
1650 | mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks; | ||
1651 | conf->mddev = mddev; | ||
1652 | conf->chunk_size = mddev->chunk_size; | ||
1653 | conf->level = mddev->level; | ||
1654 | conf->algorithm = mddev->layout; | ||
1655 | conf->max_nr_stripes = NR_STRIPES; | ||
1656 | |||
1657 | /* device size must be a multiple of chunk size */ | ||
1658 | mddev->size &= ~(mddev->chunk_size/1024 -1); | ||
1659 | |||
1660 | if (!conf->chunk_size || conf->chunk_size % 4) { | ||
1661 | printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", | ||
1662 | conf->chunk_size, mdname(mddev)); | ||
1663 | goto abort; | ||
1664 | } | ||
1665 | if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { | ||
1666 | printk(KERN_ERR | ||
1667 | "raid5: unsupported parity algorithm %d for %s\n", | ||
1668 | conf->algorithm, mdname(mddev)); | ||
1669 | goto abort; | ||
1670 | } | ||
1671 | if (mddev->degraded > 1) { | ||
1672 | printk(KERN_ERR "raid5: not enough operational devices for %s" | ||
1673 | " (%d/%d failed)\n", | ||
1674 | mdname(mddev), conf->failed_disks, conf->raid_disks); | ||
1675 | goto abort; | ||
1676 | } | ||
1677 | |||
1678 | if (mddev->degraded == 1 && | ||
1679 | mddev->recovery_cp != MaxSector) { | ||
1680 | printk(KERN_ERR | ||
1681 | "raid5: cannot start dirty degraded array for %s\n", | ||
1682 | mdname(mddev)); | ||
1683 | goto abort; | ||
1684 | } | ||
1685 | |||
1686 | { | ||
1687 | mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5"); | ||
1688 | if (!mddev->thread) { | ||
1689 | printk(KERN_ERR | ||
1690 | "raid5: couldn't allocate thread for %s\n", | ||
1691 | mdname(mddev)); | ||
1692 | goto abort; | ||
1693 | } | ||
1694 | } | ||
1695 | memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + | ||
1696 | conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; | ||
1697 | if (grow_stripes(conf, conf->max_nr_stripes)) { | ||
1698 | printk(KERN_ERR | ||
1699 | "raid5: couldn't allocate %dkB for buffers\n", memory); | ||
1700 | shrink_stripes(conf); | ||
1701 | md_unregister_thread(mddev->thread); | ||
1702 | goto abort; | ||
1703 | } else | ||
1704 | printk(KERN_INFO "raid5: allocated %dkB for %s\n", | ||
1705 | memory, mdname(mddev)); | ||
1706 | |||
1707 | if (mddev->degraded == 0) | ||
1708 | printk("raid5: raid level %d set %s active with %d out of %d" | ||
1709 | " devices, algorithm %d\n", conf->level, mdname(mddev), | ||
1710 | mddev->raid_disks-mddev->degraded, mddev->raid_disks, | ||
1711 | conf->algorithm); | ||
1712 | else | ||
1713 | printk(KERN_ALERT "raid5: raid level %d set %s active with %d" | ||
1714 | " out of %d devices, algorithm %d\n", conf->level, | ||
1715 | mdname(mddev), mddev->raid_disks - mddev->degraded, | ||
1716 | mddev->raid_disks, conf->algorithm); | ||
1717 | |||
1718 | print_raid5_conf(conf); | ||
1719 | |||
1720 | /* read-ahead size must cover two whole stripes, which is | ||
1721 | * 2 * (n-1) * chunksize where 'n' is the number of raid devices | ||
1722 | */ | ||
1723 | { | ||
1724 | int stripe = (mddev->raid_disks-1) * mddev->chunk_size | ||
1725 | / PAGE_CACHE_SIZE; | ||
1726 | if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) | ||
1727 | mddev->queue->backing_dev_info.ra_pages = 2 * stripe; | ||
1728 | } | ||
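/* Illustrative numbers (not from this file): with 4 raid devices and a
 * 64 KiB chunk on a machine with 4 KiB pages, stripe = 3 * 65536 / 4096 = 48
 * pages, so ra_pages is raised to at least 96 pages (384 KiB), i.e. two
 * full stripes of read-ahead.
 */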
1729 | |||
1730 | /* Ok, everything is just fine now */ | ||
1731 | mddev->array_size = mddev->size * (mddev->raid_disks - 1); | ||
1732 | return 0; | ||
1733 | abort: | ||
1734 | if (conf) { | ||
1735 | print_raid5_conf(conf); | ||
1736 | if (conf->stripe_hashtbl) | ||
1737 | free_pages((unsigned long) conf->stripe_hashtbl, | ||
1738 | HASH_PAGES_ORDER); | ||
1739 | kfree(conf); | ||
1740 | } | ||
1741 | mddev->private = NULL; | ||
1742 | printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); | ||
1743 | return -EIO; | ||
1744 | } | ||
1745 | |||
1746 | |||
1747 | |||
1748 | static int stop (mddev_t *mddev) | ||
1749 | { | ||
1750 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; | ||
1751 | |||
1752 | md_unregister_thread(mddev->thread); | ||
1753 | mddev->thread = NULL; | ||
1754 | shrink_stripes(conf); | ||
1755 | free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); | ||
1756 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | ||
1757 | kfree(conf); | ||
1758 | mddev->private = NULL; | ||
1759 | return 0; | ||
1760 | } | ||
1761 | |||
1762 | #if RAID5_DEBUG | ||
1763 | static void print_sh (struct stripe_head *sh) | ||
1764 | { | ||
1765 | int i; | ||
1766 | |||
1767 | printk("sh %llu, pd_idx %d, state %ld.\n", | ||
1768 | (unsigned long long)sh->sector, sh->pd_idx, sh->state); | ||
1769 | printk("sh %llu, count %d.\n", | ||
1770 | (unsigned long long)sh->sector, atomic_read(&sh->count)); | ||
1771 | printk("sh %llu, ", (unsigned long long)sh->sector); | ||
1772 | for (i = 0; i < sh->raid_conf->raid_disks; i++) { | ||
1773 | printk("(cache%d: %p %ld) ", | ||
1774 | i, sh->dev[i].page, sh->dev[i].flags); | ||
1775 | } | ||
1776 | printk("\n"); | ||
1777 | } | ||
1778 | |||
1779 | static void printall (raid5_conf_t *conf) | ||
1780 | { | ||
1781 | struct stripe_head *sh; | ||
1782 | int i; | ||
1783 | |||
1784 | spin_lock_irq(&conf->device_lock); | ||
1785 | for (i = 0; i < NR_HASH; i++) { | ||
1786 | sh = conf->stripe_hashtbl[i]; | ||
1787 | for (; sh; sh = sh->hash_next) { | ||
1788 | if (sh->raid_conf != conf) | ||
1789 | continue; | ||
1790 | print_sh(sh); | ||
1791 | } | ||
1792 | } | ||
1793 | spin_unlock_irq(&conf->device_lock); | ||
1794 | } | ||
1795 | #endif | ||
1796 | |||
1797 | static void status (struct seq_file *seq, mddev_t *mddev) | ||
1798 | { | ||
1799 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; | ||
1800 | int i; | ||
1801 | |||
1802 | seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout); | ||
1803 | seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks); | ||
1804 | for (i = 0; i < conf->raid_disks; i++) | ||
1805 | seq_printf (seq, "%s", | ||
1806 | conf->disks[i].rdev && | ||
1807 | conf->disks[i].rdev->in_sync ? "U" : "_"); | ||
1808 | seq_printf (seq, "]"); | ||
1809 | #if RAID5_DEBUG | ||
1810 | #define D(x) \ | ||
1811 | seq_printf (seq, "<"#x":%d>", atomic_read(&conf->x)) | ||
1812 | printall(conf); | ||
1813 | #endif | ||
1814 | } | ||
1815 | |||
1816 | static void print_raid5_conf (raid5_conf_t *conf) | ||
1817 | { | ||
1818 | int i; | ||
1819 | struct disk_info *tmp; | ||
1820 | |||
1821 | printk("RAID5 conf printout:\n"); | ||
1822 | if (!conf) { | ||
1823 | printk("(conf==NULL)\n"); | ||
1824 | return; | ||
1825 | } | ||
1826 | printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks, | ||
1827 | conf->working_disks, conf->failed_disks); | ||
1828 | |||
1829 | for (i = 0; i < conf->raid_disks; i++) { | ||
1830 | char b[BDEVNAME_SIZE]; | ||
1831 | tmp = conf->disks + i; | ||
1832 | if (tmp->rdev) | ||
1833 | printk(" disk %d, o:%d, dev:%s\n", | ||
1834 | i, !tmp->rdev->faulty, | ||
1835 | bdevname(tmp->rdev->bdev,b)); | ||
1836 | } | ||
1837 | } | ||
1838 | |||
1839 | static int raid5_spare_active(mddev_t *mddev) | ||
1840 | { | ||
1841 | int i; | ||
1842 | raid5_conf_t *conf = mddev->private; | ||
1843 | struct disk_info *tmp; | ||
1844 | |||
1845 | for (i = 0; i < conf->raid_disks; i++) { | ||
1846 | tmp = conf->disks + i; | ||
1847 | if (tmp->rdev | ||
1848 | && !tmp->rdev->faulty | ||
1849 | && !tmp->rdev->in_sync) { | ||
1850 | mddev->degraded--; | ||
1851 | conf->failed_disks--; | ||
1852 | conf->working_disks++; | ||
1853 | tmp->rdev->in_sync = 1; | ||
1854 | } | ||
1855 | } | ||
1856 | print_raid5_conf(conf); | ||
1857 | return 0; | ||
1858 | } | ||
1859 | |||
1860 | static int raid5_remove_disk(mddev_t *mddev, int number) | ||
1861 | { | ||
1862 | raid5_conf_t *conf = mddev->private; | ||
1863 | int err = 0; | ||
1864 | mdk_rdev_t *rdev; | ||
1865 | struct disk_info *p = conf->disks + number; | ||
1866 | |||
1867 | print_raid5_conf(conf); | ||
1868 | rdev = p->rdev; | ||
1869 | if (rdev) { | ||
1870 | if (rdev->in_sync || | ||
1871 | atomic_read(&rdev->nr_pending)) { | ||
1872 | err = -EBUSY; | ||
1873 | goto abort; | ||
1874 | } | ||
1875 | p->rdev = NULL; | ||
1876 | synchronize_kernel(); | ||
1877 | if (atomic_read(&rdev->nr_pending)) { | ||
1878 | /* lost the race, try later */ | ||
1879 | err = -EBUSY; | ||
1880 | p->rdev = rdev; | ||
1881 | } | ||
1882 | } | ||
1883 | abort: | ||
1884 | |||
1885 | print_raid5_conf(conf); | ||
1886 | return err; | ||
1887 | } | ||
1888 | |||
1889 | static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | ||
1890 | { | ||
1891 | raid5_conf_t *conf = mddev->private; | ||
1892 | int found = 0; | ||
1893 | int disk; | ||
1894 | struct disk_info *p; | ||
1895 | |||
1896 | if (mddev->degraded > 1) | ||
1897 | /* no point adding a device */ | ||
1898 | return 0; | ||
1899 | |||
1900 | /* | ||
1901 | * find the disk ... | ||
1902 | */ | ||
1903 | for (disk=0; disk < mddev->raid_disks; disk++) | ||
1904 | if ((p=conf->disks + disk)->rdev == NULL) { | ||
1905 | rdev->in_sync = 0; | ||
1906 | rdev->raid_disk = disk; | ||
1907 | found = 1; | ||
1908 | p->rdev = rdev; | ||
1909 | break; | ||
1910 | } | ||
1911 | print_raid5_conf(conf); | ||
1912 | return found; | ||
1913 | } | ||
1914 | |||
1915 | static int raid5_resize(mddev_t *mddev, sector_t sectors) | ||
1916 | { | ||
1917 | /* no resync is happening, and there is enough space | ||
1918 | * on all devices, so we can resize. | ||
1919 | * We need to make sure resync covers any new space. | ||
1920 | * If the array is shrinking we should possibly wait until | ||
1921 | * any io in the removed space completes, but it hardly seems | ||
1922 | * worth it. | ||
1923 | */ | ||
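/* Units (inferred from the conversions below): 'sectors' and set_capacity()
 * work in 512-byte sectors, while mddev->size and mddev->array_size are
 * kept in KiB -- hence the <<1 and >>1 shifts.
 */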
1924 | sectors &= ~((sector_t)mddev->chunk_size/512 - 1); | ||
1925 | mddev->array_size = (sectors * (mddev->raid_disks-1))>>1; | ||
1926 | set_capacity(mddev->gendisk, mddev->array_size << 1); | ||
1927 | mddev->changed = 1; | ||
1928 | if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { | ||
1929 | mddev->recovery_cp = mddev->size << 1; | ||
1930 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
1931 | } | ||
1932 | mddev->size = sectors /2; | ||
1933 | return 0; | ||
1934 | } | ||
1935 | |||
1936 | static mdk_personality_t raid5_personality= | ||
1937 | { | ||
1938 | .name = "raid5", | ||
1939 | .owner = THIS_MODULE, | ||
1940 | .make_request = make_request, | ||
1941 | .run = run, | ||
1942 | .stop = stop, | ||
1943 | .status = status, | ||
1944 | .error_handler = error, | ||
1945 | .hot_add_disk = raid5_add_disk, | ||
1946 | .hot_remove_disk= raid5_remove_disk, | ||
1947 | .spare_active = raid5_spare_active, | ||
1948 | .sync_request = sync_request, | ||
1949 | .resize = raid5_resize, | ||
1950 | }; | ||
1951 | |||
1952 | static int __init raid5_init (void) | ||
1953 | { | ||
1954 | return register_md_personality (RAID5, &raid5_personality); | ||
1955 | } | ||
1956 | |||
1957 | static void raid5_exit (void) | ||
1958 | { | ||
1959 | unregister_md_personality (RAID5); | ||
1960 | } | ||
1961 | |||
1962 | module_init(raid5_init); | ||
1963 | module_exit(raid5_exit); | ||
1964 | MODULE_LICENSE("GPL"); | ||
1965 | MODULE_ALIAS("md-personality-4"); /* RAID5 */ | ||
diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h new file mode 100644 index 000000000000..f80ee6350edf --- /dev/null +++ b/drivers/md/raid6.h | |||
@@ -0,0 +1,135 @@ | |||
1 | /* -*- linux-c -*- ------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2003 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation, Inc., 59 Temple Place Ste 330, | ||
8 | * Boston MA 02111-1307, USA; either version 2 of the License, or | ||
9 | * (at your option) any later version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | #ifndef LINUX_RAID_RAID6_H | ||
14 | #define LINUX_RAID_RAID6_H | ||
15 | |||
16 | #ifdef __KERNEL__ | ||
17 | |||
18 | /* Set to 1 to use kernel-wide empty_zero_page */ | ||
19 | #define RAID6_USE_EMPTY_ZERO_PAGE 0 | ||
20 | |||
21 | #include <linux/module.h> | ||
22 | #include <linux/stddef.h> | ||
23 | #include <linux/compiler.h> | ||
24 | #include <linux/types.h> | ||
25 | #include <linux/kernel.h> | ||
26 | #include <linux/errno.h> | ||
27 | #include <linux/mempool.h> | ||
28 | #include <linux/list.h> | ||
29 | #include <linux/vmalloc.h> | ||
30 | #include <linux/raid/md.h> | ||
31 | #include <linux/raid/raid5.h> | ||
32 | |||
33 | typedef raid5_conf_t raid6_conf_t; /* Same configuration */ | ||
34 | |||
35 | /* Additional compute_parity mode -- updates the parity w/o LOCKING */ | ||
36 | #define UPDATE_PARITY 4 | ||
37 | |||
38 | /* We need a pre-zeroed page... if we don't want to use the kernel-provided | ||
39 | one, define it here */ | ||
40 | #if RAID6_USE_EMPTY_ZERO_PAGE | ||
41 | # define raid6_empty_zero_page empty_zero_page | ||
42 | #else | ||
43 | extern const char raid6_empty_zero_page[PAGE_SIZE]; | ||
44 | #endif | ||
45 | |||
46 | #else /* ! __KERNEL__ */ | ||
47 | /* Used for testing in user space */ | ||
48 | |||
49 | #include <errno.h> | ||
50 | #include <inttypes.h> | ||
51 | #include <limits.h> | ||
52 | #include <stddef.h> | ||
53 | #include <sys/mman.h> | ||
54 | #include <sys/types.h> | ||
55 | |||
56 | /* Not standard, but glibc defines it */ | ||
57 | #define BITS_PER_LONG __WORDSIZE | ||
58 | |||
59 | typedef uint8_t u8; | ||
60 | typedef uint16_t u16; | ||
61 | typedef uint32_t u32; | ||
62 | typedef uint64_t u64; | ||
63 | |||
64 | #ifndef PAGE_SIZE | ||
65 | # define PAGE_SIZE 4096 | ||
66 | #endif | ||
67 | extern const char raid6_empty_zero_page[PAGE_SIZE]; | ||
68 | |||
69 | #define __init | ||
70 | #define __exit | ||
71 | #define __attribute_const__ __attribute__((const)) | ||
72 | |||
73 | #define preempt_enable() | ||
74 | #define preempt_disable() | ||
75 | |||
76 | #endif /* __KERNEL__ */ | ||
77 | |||
78 | /* Routine choices */ | ||
79 | struct raid6_calls { | ||
80 | void (*gen_syndrome)(int, size_t, void **); | ||
81 | int (*valid)(void); /* Returns 1 if this routine set is usable */ | ||
82 | const char *name; /* Name of this routine set */ | ||
83 | int prefer; /* Has special performance attribute */ | ||
84 | }; | ||
85 | |||
86 | /* Selected algorithm */ | ||
87 | extern struct raid6_calls raid6_call; | ||
88 | |||
89 | /* Algorithm list */ | ||
90 | extern const struct raid6_calls * const raid6_algos[]; | ||
91 | int raid6_select_algo(void); | ||
92 | |||
93 | /* Return values from chk_syndrome */ | ||
94 | #define RAID6_OK 0 | ||
95 | #define RAID6_P_BAD 1 | ||
96 | #define RAID6_Q_BAD 2 | ||
97 | #define RAID6_PQ_BAD 3 | ||
98 | |||
99 | /* Galois field tables */ | ||
100 | extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256))); | ||
101 | extern const u8 raid6_gfexp[256] __attribute__((aligned(256))); | ||
102 | extern const u8 raid6_gfinv[256] __attribute__((aligned(256))); | ||
103 | extern const u8 raid6_gfexi[256] __attribute__((aligned(256))); | ||
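These tables give GF(2^8) arithmetic over the RAID-6 generator polynomial 0x11d and are used by the recovery routines declared just below. As a rough illustration of where such tables come from, here is a minimal stand-alone user-space sketch; the helper gf_mul() and the printed check are illustrative assumptions, not part of the kernel sources:

    #include <stdint.h>
    #include <stdio.h>

    /* Multiply two GF(2^8) elements over the RAID-6 polynomial 0x11d
     * (x^8 + x^4 + x^3 + x^2 + 1), one bit of 'b' at a time. */
    static uint8_t gf_mul(uint8_t a, uint8_t b)
    {
            uint8_t r = 0;
            while (b) {
                    if (b & 1)
                            r ^= a;
                    a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
                    b >>= 1;
            }
            return r;
    }

    int main(void)
    {
            uint8_t gfexp[256];     /* gfexp[i] corresponds to {02}^i */
            int i;

            gfexp[0] = 1;
            for (i = 1; i < 256; i++)
                    gfexp[i] = gf_mul(gfexp[i - 1], 2);

            /* {02}^8 wraps around the polynomial; this prints 1d */
            printf("{02}^8 = %02x\n", gfexp[8]);
            return 0;
    }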
104 | |||
105 | /* Recovery routines */ | ||
106 | void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); | ||
107 | void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs); | ||
108 | void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); | ||
109 | |||
110 | /* Some definitions to allow code to be compiled for testing in userspace */ | ||
111 | #ifndef __KERNEL__ | ||
112 | |||
113 | # define jiffies raid6_jiffies() | ||
114 | # define printk printf | ||
115 | # define GFP_KERNEL 0 | ||
116 | # define __get_free_pages(x,y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0)) | ||
117 | # define free_pages(x,y) munmap((void *)(x), (y)*PAGE_SIZE) | ||
118 | |||
119 | static inline void cpu_relax(void) | ||
120 | { | ||
121 | /* Nothing */ | ||
122 | } | ||
123 | |||
124 | #undef HZ | ||
125 | #define HZ 1000 | ||
126 | static inline uint32_t raid6_jiffies(void) | ||
127 | { | ||
128 | struct timeval tv; | ||
129 | gettimeofday(&tv, NULL); | ||
130 | return tv.tv_sec*1000 + tv.tv_usec/1000; | ||
131 | } | ||
132 | |||
133 | #endif /* ! __KERNEL__ */ | ||
134 | |||
135 | #endif /* LINUX_RAID_RAID6_H */ | ||
diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c new file mode 100644 index 000000000000..acf386fc4b4f --- /dev/null +++ b/drivers/md/raid6algos.c | |||
@@ -0,0 +1,153 @@ | |||
1 | /* -*- linux-c -*- ------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2002 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation, Inc., 59 Temple Place Ste 330, | ||
8 | * Boston MA 02111-1307, USA; either version 2 of the License, or | ||
9 | * (at your option) any later version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | /* | ||
14 | * raid6algos.c | ||
15 | * | ||
16 | * Algorithm list and algorithm selection for RAID-6 | ||
17 | */ | ||
18 | |||
19 | #include "raid6.h" | ||
20 | #ifndef __KERNEL__ | ||
21 | #include <sys/mman.h> | ||
22 | #endif | ||
23 | |||
24 | struct raid6_calls raid6_call; | ||
25 | |||
26 | /* Various routine sets */ | ||
27 | extern const struct raid6_calls raid6_intx1; | ||
28 | extern const struct raid6_calls raid6_intx2; | ||
29 | extern const struct raid6_calls raid6_intx4; | ||
30 | extern const struct raid6_calls raid6_intx8; | ||
31 | extern const struct raid6_calls raid6_intx16; | ||
32 | extern const struct raid6_calls raid6_intx32; | ||
33 | extern const struct raid6_calls raid6_mmxx1; | ||
34 | extern const struct raid6_calls raid6_mmxx2; | ||
35 | extern const struct raid6_calls raid6_sse1x1; | ||
36 | extern const struct raid6_calls raid6_sse1x2; | ||
37 | extern const struct raid6_calls raid6_sse2x1; | ||
38 | extern const struct raid6_calls raid6_sse2x2; | ||
39 | extern const struct raid6_calls raid6_sse2x4; | ||
40 | extern const struct raid6_calls raid6_altivec1; | ||
41 | extern const struct raid6_calls raid6_altivec2; | ||
42 | extern const struct raid6_calls raid6_altivec4; | ||
43 | extern const struct raid6_calls raid6_altivec8; | ||
44 | |||
45 | const struct raid6_calls * const raid6_algos[] = { | ||
46 | &raid6_intx1, | ||
47 | &raid6_intx2, | ||
48 | &raid6_intx4, | ||
49 | &raid6_intx8, | ||
50 | #if defined(__ia64__) | ||
51 | &raid6_intx16, | ||
52 | &raid6_intx32, | ||
53 | #endif | ||
54 | #if defined(__i386__) | ||
55 | &raid6_mmxx1, | ||
56 | &raid6_mmxx2, | ||
57 | &raid6_sse1x1, | ||
58 | &raid6_sse1x2, | ||
59 | &raid6_sse2x1, | ||
60 | &raid6_sse2x2, | ||
61 | #endif | ||
62 | #if defined(__x86_64__) | ||
63 | &raid6_sse2x1, | ||
64 | &raid6_sse2x2, | ||
65 | &raid6_sse2x4, | ||
66 | #endif | ||
67 | #ifdef CONFIG_ALTIVEC | ||
68 | &raid6_altivec1, | ||
69 | &raid6_altivec2, | ||
70 | &raid6_altivec4, | ||
71 | &raid6_altivec8, | ||
72 | #endif | ||
73 | NULL | ||
74 | }; | ||
75 | |||
76 | #ifdef __KERNEL__ | ||
77 | #define RAID6_TIME_JIFFIES_LG2 4 | ||
78 | #else | ||
79 | /* Need more time to be stable in userspace */ | ||
80 | #define RAID6_TIME_JIFFIES_LG2 9 | ||
81 | #endif | ||
82 | |||
83 | /* Try to pick the best algorithm */ | ||
84 | /* This code uses the gfmul table as a convenient data set to abuse */ | ||
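/* How the MB/s figure below is computed: each gen_syndrome() call processes
 * disks-2 pages of the gfmul table, i.e. 2^16 bytes of data, and 'perf' such
 * calls complete in (1 << RAID6_TIME_JIFFIES_LG2) jiffies, so
 * (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2) converts the count to MB/s.
 */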
85 | |||
86 | int __init raid6_select_algo(void) | ||
87 | { | ||
88 | const struct raid6_calls * const * algo; | ||
89 | const struct raid6_calls * best; | ||
90 | char *syndromes; | ||
91 | void *dptrs[(65536/PAGE_SIZE)+2]; | ||
92 | int i, disks; | ||
93 | unsigned long perf, bestperf; | ||
94 | int bestprefer; | ||
95 | unsigned long j0, j1; | ||
96 | |||
97 | disks = (65536/PAGE_SIZE)+2; | ||
98 | for ( i = 0 ; i < disks-2 ; i++ ) { | ||
99 | dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i; | ||
100 | } | ||
101 | |||
102 | /* Normal code - use a 2-page allocation to avoid D$ conflict */ | ||
103 | syndromes = (void *) __get_free_pages(GFP_KERNEL, 1); | ||
104 | |||
105 | if ( !syndromes ) { | ||
106 | printk("raid6: Yikes! No memory available.\n"); | ||
107 | return -ENOMEM; | ||
108 | } | ||
109 | |||
110 | dptrs[disks-2] = syndromes; | ||
111 | dptrs[disks-1] = syndromes + PAGE_SIZE; | ||
112 | |||
113 | bestperf = 0; bestprefer = 0; best = NULL; | ||
114 | |||
115 | for ( algo = raid6_algos ; *algo ; algo++ ) { | ||
116 | if ( !(*algo)->valid || (*algo)->valid() ) { | ||
117 | perf = 0; | ||
118 | |||
119 | preempt_disable(); | ||
120 | j0 = jiffies; | ||
121 | while ( (j1 = jiffies) == j0 ) | ||
122 | cpu_relax(); | ||
123 | while ( (jiffies-j1) < (1 << RAID6_TIME_JIFFIES_LG2) ) { | ||
124 | (*algo)->gen_syndrome(disks, PAGE_SIZE, dptrs); | ||
125 | perf++; | ||
126 | } | ||
127 | preempt_enable(); | ||
128 | |||
129 | if ( (*algo)->prefer > bestprefer || | ||
130 | ((*algo)->prefer == bestprefer && | ||
131 | perf > bestperf) ) { | ||
132 | best = *algo; | ||
133 | bestprefer = best->prefer; | ||
134 | bestperf = perf; | ||
135 | } | ||
136 | printk("raid6: %-8s %5ld MB/s\n", (*algo)->name, | ||
137 | (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); | ||
138 | } | ||
139 | } | ||
140 | |||
141 | if ( best ) { | ||
142 | printk("raid6: using algorithm %s (%ld MB/s)\n", | ||
143 | best->name, | ||
144 | (bestperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); | ||
145 | raid6_call = *best; /* only dereference 'best' when an algorithm was found */ | ||
146 | } else | ||
147 | printk("raid6: Yikes! No algorithm found!\n"); | ||
148 | |||
149 | |||
150 | free_pages((unsigned long)syndromes, 1); | ||
151 | |||
152 | return best ? 0 : -EINVAL; | ||
153 | } | ||
diff --git a/drivers/md/raid6altivec.uc b/drivers/md/raid6altivec.uc new file mode 100644 index 000000000000..1de8f030eee0 --- /dev/null +++ b/drivers/md/raid6altivec.uc | |||
@@ -0,0 +1,122 @@ | |||
1 | /* -*- linux-c -*- ------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation, Inc., 59 Temple Place Ste 330, | ||
8 | * Boston MA 02111-1307, USA; either version 2 of the License, or | ||
9 | * (at your option) any later version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | /* | ||
14 | * raid6altivec$#.c | ||
15 | * | ||
16 | * $#-way unrolled Altivec RAID-6 syndrome generation code | ||
17 | * | ||
18 | * This file is postprocessed using unroll.pl | ||
19 | * | ||
20 | * <benh> hpa: in process, | ||
21 | * you can just "steal" the vec unit with enable_kernel_altivec() (but | ||
22 | * bracket this with preempt_disable/enable or in a lock) | ||
23 | */ | ||
24 | |||
25 | #include "raid6.h" | ||
26 | |||
27 | #ifdef CONFIG_ALTIVEC | ||
28 | |||
29 | #include <altivec.h> | ||
30 | #include <asm/system.h> | ||
31 | #include <asm/cputable.h> | ||
32 | |||
33 | /* | ||
34 | * This is the C data type to use | ||
35 | */ | ||
36 | |||
37 | typedef vector unsigned char unative_t; | ||
38 | |||
39 | #define NBYTES(x) ((vector unsigned char) {x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x}) | ||
40 | #define NSIZE sizeof(unative_t) | ||
41 | |||
42 | /* | ||
43 | * The SHLBYTE() operation shifts each byte left by 1, *not* | ||
44 | * rolling over into the next byte | ||
45 | */ | ||
46 | static inline __attribute_const__ unative_t SHLBYTE(unative_t v) | ||
47 | { | ||
48 | return vec_add(v,v); | ||
49 | } | ||
50 | |||
51 | /* | ||
52 | * The MASK() operation returns 0xFF in any byte for which the high | ||
53 | * bit is 1, 0x00 for any byte for which the high bit is 0. | ||
54 | */ | ||
55 | static inline __attribute_const__ unative_t MASK(unative_t v) | ||
56 | { | ||
57 | unative_t zv = NBYTES(0); | ||
58 | |||
59 | /* vec_cmpgt returns a vector bool char; thus the need for the cast */ | ||
60 | return (unative_t)vec_cmpgt(zv, v); | ||
61 | } | ||
62 | |||
63 | |||
64 | /* This is noinline to make damned sure that gcc doesn't move any of the | ||
65 | Altivec code around the enable/disable code */ | ||
66 | static void noinline | ||
67 | raid6_altivec$#_gen_syndrome_real(int disks, size_t bytes, void **ptrs) | ||
68 | { | ||
69 | u8 **dptr = (u8 **)ptrs; | ||
70 | u8 *p, *q; | ||
71 | int d, z, z0; | ||
72 | |||
73 | unative_t wd$$, wq$$, wp$$, w1$$, w2$$; | ||
74 | unative_t x1d = NBYTES(0x1d); | ||
75 | |||
76 | z0 = disks - 3; /* Highest data disk */ | ||
77 | p = dptr[z0+1]; /* XOR parity */ | ||
78 | q = dptr[z0+2]; /* RS syndrome */ | ||
79 | |||
80 | for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { | ||
81 | wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; | ||
82 | for ( z = z0-1 ; z >= 0 ; z-- ) { | ||
83 | wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; | ||
84 | wp$$ = vec_xor(wp$$, wd$$); | ||
85 | w2$$ = MASK(wq$$); | ||
86 | w1$$ = SHLBYTE(wq$$); | ||
87 | w2$$ = vec_and(w2$$, x1d); | ||
88 | w1$$ = vec_xor(w1$$, w2$$); | ||
89 | wq$$ = vec_xor(w1$$, wd$$); | ||
90 | } | ||
91 | *(unative_t *)&p[d+NSIZE*$$] = wp$$; | ||
92 | *(unative_t *)&q[d+NSIZE*$$] = wq$$; | ||
93 | } | ||
94 | } | ||
95 | |||
96 | static void raid6_altivec$#_gen_syndrome(int disks, size_t bytes, void **ptrs) | ||
97 | { | ||
98 | preempt_disable(); | ||
99 | enable_kernel_altivec(); | ||
100 | |||
101 | raid6_altivec$#_gen_syndrome_real(disks, bytes, ptrs); | ||
102 | |||
103 | preempt_enable(); | ||
104 | } | ||
105 | |||
106 | int raid6_have_altivec(void); | ||
107 | #if $# == 1 | ||
108 | int raid6_have_altivec(void) | ||
109 | { | ||
110 | /* This assumes either all CPUs have Altivec or none does */ | ||
111 | return cpu_has_feature(CPU_FTR_ALTIVEC); | ||
112 | } | ||
113 | #endif | ||
114 | |||
115 | const struct raid6_calls raid6_altivec$# = { | ||
116 | raid6_altivec$#_gen_syndrome, | ||
117 | raid6_have_altivec, | ||
118 | "altivecx$#", | ||
119 | 0 | ||
120 | }; | ||
121 | |||
122 | #endif /* CONFIG_ALTIVEC */ | ||
diff --git a/drivers/md/raid6int.uc b/drivers/md/raid6int.uc new file mode 100644 index 000000000000..ad004cee0e26 --- /dev/null +++ b/drivers/md/raid6int.uc | |||
@@ -0,0 +1,117 @@ | |||
1 | /* -*- linux-c -*- ------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation, Inc., 59 Temple Place Ste 330, | ||
8 | * Boston MA 02111-1307, USA; either version 2 of the License, or | ||
9 | * (at your option) any later version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | /* | ||
14 | * raid6int$#.c | ||
15 | * | ||
16 | * $#-way unrolled portable integer math RAID-6 instruction set | ||
17 | * | ||
18 | * This file is postprocessed using unroll.pl | ||
19 | */ | ||
20 | |||
21 | #include "raid6.h" | ||
22 | |||
23 | /* | ||
24 | * This is the C data type to use | ||
25 | */ | ||
26 | |||
27 | /* Change this from BITS_PER_LONG if there is something better... */ | ||
28 | #if BITS_PER_LONG == 64 | ||
29 | # define NBYTES(x) ((x) * 0x0101010101010101UL) | ||
30 | # define NSIZE 8 | ||
31 | # define NSHIFT 3 | ||
32 | # define NSTRING "64" | ||
33 | typedef u64 unative_t; | ||
34 | #else | ||
35 | # define NBYTES(x) ((x) * 0x01010101U) | ||
36 | # define NSIZE 4 | ||
37 | # define NSHIFT 2 | ||
38 | # define NSTRING "32" | ||
39 | typedef u32 unative_t; | ||
40 | #endif | ||
41 | |||
42 | |||
43 | |||
44 | /* | ||
45 | * IA-64 wants insane amounts of unrolling. On other architectures that | ||
46 | * is just a waste of space. | ||
47 | */ | ||
48 | #if ($# <= 8) || defined(__ia64__) | ||
49 | |||
50 | |||
51 | /* | ||
52 | * These sub-operations are separate inlines since they can sometimes be | ||
53 | * specially optimized using architecture-specific hacks. | ||
54 | */ | ||
55 | |||
56 | /* | ||
57 | * The SHLBYTE() operation shifts each byte left by 1, *not* | ||
58 | * rolling over into the next byte | ||
59 | */ | ||
60 | static inline __attribute_const__ unative_t SHLBYTE(unative_t v) | ||
61 | { | ||
62 | unative_t vv; | ||
63 | |||
64 | vv = (v << 1) & NBYTES(0xfe); | ||
65 | return vv; | ||
66 | } | ||
67 | |||
68 | /* | ||
69 | * The MASK() operation returns 0xFF in any byte for which the high | ||
70 | * bit is 1, 0x00 for any byte for which the high bit is 0. | ||
71 | */ | ||
72 | static inline __attribute_const__ unative_t MASK(unative_t v) | ||
73 | { | ||
74 | unative_t vv; | ||
75 | |||
76 | vv = v & NBYTES(0x80); | ||
77 | vv = (vv << 1) - (vv >> 7); /* Overflow on the top bit is OK */ | ||
78 | return vv; | ||
79 | } | ||
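/* Together, SHLBYTE() and MASK() implement a byte-wise multiply-by-{02} in
 * GF(2^8) modulo 0x11d: mul2(v) = SHLBYTE(v) ^ (MASK(v) & NBYTES(0x1d)),
 * which is exactly the w1/w2 sequence in the syndrome loop below.
 */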
80 | |||
81 | |||
82 | static void raid6_int$#_gen_syndrome(int disks, size_t bytes, void **ptrs) | ||
83 | { | ||
84 | u8 **dptr = (u8 **)ptrs; | ||
85 | u8 *p, *q; | ||
86 | int d, z, z0; | ||
87 | |||
88 | unative_t wd$$, wq$$, wp$$, w1$$, w2$$; | ||
89 | |||
90 | z0 = disks - 3; /* Highest data disk */ | ||
91 | p = dptr[z0+1]; /* XOR parity */ | ||
92 | q = dptr[z0+2]; /* RS syndrome */ | ||
93 | |||
94 | for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { | ||
95 | wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; | ||
96 | for ( z = z0-1 ; z >= 0 ; z-- ) { | ||
97 | wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; | ||
98 | wp$$ ^= wd$$; | ||
99 | w2$$ = MASK(wq$$); | ||
100 | w1$$ = SHLBYTE(wq$$); | ||
101 | w2$$ &= NBYTES(0x1d); | ||
102 | w1$$ ^= w2$$; | ||
103 | wq$$ = w1$$ ^ wd$$; | ||
104 | } | ||
105 | *(unative_t *)&p[d+NSIZE*$$] = wp$$; | ||
106 | *(unative_t *)&q[d+NSIZE*$$] = wq$$; | ||
107 | } | ||
108 | } | ||
109 | |||
110 | const struct raid6_calls raid6_intx$# = { | ||
111 | raid6_int$#_gen_syndrome, | ||
112 | NULL, /* always valid */ | ||
113 | "int" NSTRING "x$#", | ||
114 | 0 | ||
115 | }; | ||
116 | |||
117 | #endif | ||
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c new file mode 100644 index 000000000000..7e30ab29691a --- /dev/null +++ b/drivers/md/raid6main.c | |||
@@ -0,0 +1,2136 @@ | |||
1 | /* | ||
2 | * raid6main.c : Multiple Devices driver for Linux | ||
3 | * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman | ||
4 | * Copyright (C) 1999, 2000 Ingo Molnar | ||
5 | * Copyright (C) 2002, 2003 H. Peter Anvin | ||
6 | * | ||
7 | * RAID-6 management functions. This code is derived from raid5.c. | ||
8 | * Last merge from raid5.c bkcvs version 1.79 (kernel 2.6.1). | ||
9 | * | ||
10 | * Thanks to Penguin Computing for making the RAID-6 development possible | ||
11 | * by donating a test server! | ||
12 | * | ||
13 | * This program is free software; you can redistribute it and/or modify | ||
14 | * it under the terms of the GNU General Public License as published by | ||
15 | * the Free Software Foundation; either version 2, or (at your option) | ||
16 | * any later version. | ||
17 | * | ||
18 | * You should have received a copy of the GNU General Public License | ||
19 | * (for example /usr/src/linux/COPYING); if not, write to the Free | ||
20 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
21 | */ | ||
22 | |||
23 | |||
24 | #include <linux/config.h> | ||
25 | #include <linux/module.h> | ||
26 | #include <linux/slab.h> | ||
27 | #include <linux/highmem.h> | ||
28 | #include <linux/bitops.h> | ||
29 | #include <asm/atomic.h> | ||
30 | #include "raid6.h" | ||
31 | |||
32 | /* | ||
33 | * Stripe cache | ||
34 | */ | ||
35 | |||
36 | #define NR_STRIPES 256 | ||
37 | #define STRIPE_SIZE PAGE_SIZE | ||
38 | #define STRIPE_SHIFT (PAGE_SHIFT - 9) | ||
39 | #define STRIPE_SECTORS (STRIPE_SIZE>>9) | ||
40 | #define IO_THRESHOLD 1 | ||
41 | #define HASH_PAGES 1 | ||
42 | #define HASH_PAGES_ORDER 0 | ||
43 | #define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *)) | ||
44 | #define HASH_MASK (NR_HASH - 1) | ||
45 | |||
46 | #define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]) | ||
47 | |||
48 | /* bio's attached to a stripe+device for I/O are linked together in bi_sector | ||
49 | * order without overlap. There may be several bio's per stripe+device, and | ||
50 | * a bio could span several devices. | ||
51 | * When walking this list for a particular stripe+device, we must never proceed | ||
52 | * beyond a bio that extends past this device, as the next bio might no longer | ||
53 | * be valid. | ||
54 | * This macro is used to determine the 'next' bio in the list, given the sector | ||
55 | * of the current stripe+device | ||
56 | */ | ||
57 | #define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL) | ||
58 | /* | ||
59 | * The following can be used to debug the driver | ||
60 | */ | ||
61 | #define RAID6_DEBUG 0 /* Extremely verbose printk */ | ||
62 | #define RAID6_PARANOIA 1 /* Check spinlocks */ | ||
63 | #define RAID6_DUMPSTATE 0 /* Include stripe cache state in /proc/mdstat */ | ||
64 | #if RAID6_PARANOIA && defined(CONFIG_SMP) | ||
65 | # define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock) | ||
66 | #else | ||
67 | # define CHECK_DEVLOCK() | ||
68 | #endif | ||
69 | |||
70 | #define PRINTK(x...) ((void)(RAID6_DEBUG && printk(KERN_DEBUG x))) | ||
71 | #if RAID6_DEBUG | ||
72 | #undef inline | ||
73 | #undef __inline__ | ||
74 | #define inline | ||
75 | #define __inline__ | ||
76 | #endif | ||
77 | |||
78 | #if !RAID6_USE_EMPTY_ZERO_PAGE | ||
79 | /* In .bss so it's zeroed */ | ||
80 | const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); | ||
81 | #endif | ||
82 | |||
83 | static inline int raid6_next_disk(int disk, int raid_disks) | ||
84 | { | ||
85 | disk++; | ||
86 | return (disk < raid_disks) ? disk : 0; | ||
87 | } | ||
88 | |||
89 | static void print_raid6_conf (raid6_conf_t *conf); | ||
90 | |||
91 | static inline void __release_stripe(raid6_conf_t *conf, struct stripe_head *sh) | ||
92 | { | ||
93 | if (atomic_dec_and_test(&sh->count)) { | ||
94 | if (!list_empty(&sh->lru)) | ||
95 | BUG(); | ||
96 | if (atomic_read(&conf->active_stripes)==0) | ||
97 | BUG(); | ||
98 | if (test_bit(STRIPE_HANDLE, &sh->state)) { | ||
99 | if (test_bit(STRIPE_DELAYED, &sh->state)) | ||
100 | list_add_tail(&sh->lru, &conf->delayed_list); | ||
101 | else | ||
102 | list_add_tail(&sh->lru, &conf->handle_list); | ||
103 | md_wakeup_thread(conf->mddev->thread); | ||
104 | } else { | ||
105 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
106 | atomic_dec(&conf->preread_active_stripes); | ||
107 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) | ||
108 | md_wakeup_thread(conf->mddev->thread); | ||
109 | } | ||
110 | list_add_tail(&sh->lru, &conf->inactive_list); | ||
111 | atomic_dec(&conf->active_stripes); | ||
112 | if (!conf->inactive_blocked || | ||
113 | atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4)) | ||
114 | wake_up(&conf->wait_for_stripe); | ||
115 | } | ||
116 | } | ||
117 | } | ||
118 | static void release_stripe(struct stripe_head *sh) | ||
119 | { | ||
120 | raid6_conf_t *conf = sh->raid_conf; | ||
121 | unsigned long flags; | ||
122 | |||
123 | spin_lock_irqsave(&conf->device_lock, flags); | ||
124 | __release_stripe(conf, sh); | ||
125 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
126 | } | ||
127 | |||
128 | static void remove_hash(struct stripe_head *sh) | ||
129 | { | ||
130 | PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector); | ||
131 | |||
132 | if (sh->hash_pprev) { | ||
133 | if (sh->hash_next) | ||
134 | sh->hash_next->hash_pprev = sh->hash_pprev; | ||
135 | *sh->hash_pprev = sh->hash_next; | ||
136 | sh->hash_pprev = NULL; | ||
137 | } | ||
138 | } | ||
139 | |||
140 | static __inline__ void insert_hash(raid6_conf_t *conf, struct stripe_head *sh) | ||
141 | { | ||
142 | struct stripe_head **shp = &stripe_hash(conf, sh->sector); | ||
143 | |||
144 | PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector); | ||
145 | |||
146 | CHECK_DEVLOCK(); | ||
147 | if ((sh->hash_next = *shp) != NULL) | ||
148 | (*shp)->hash_pprev = &sh->hash_next; | ||
149 | *shp = sh; | ||
150 | sh->hash_pprev = shp; | ||
151 | } | ||
152 | |||
153 | |||
154 | /* find an idle stripe, make sure it is unhashed, and return it. */ | ||
155 | static struct stripe_head *get_free_stripe(raid6_conf_t *conf) | ||
156 | { | ||
157 | struct stripe_head *sh = NULL; | ||
158 | struct list_head *first; | ||
159 | |||
160 | CHECK_DEVLOCK(); | ||
161 | if (list_empty(&conf->inactive_list)) | ||
162 | goto out; | ||
163 | first = conf->inactive_list.next; | ||
164 | sh = list_entry(first, struct stripe_head, lru); | ||
165 | list_del_init(first); | ||
166 | remove_hash(sh); | ||
167 | atomic_inc(&conf->active_stripes); | ||
168 | out: | ||
169 | return sh; | ||
170 | } | ||
171 | |||
172 | static void shrink_buffers(struct stripe_head *sh, int num) | ||
173 | { | ||
174 | struct page *p; | ||
175 | int i; | ||
176 | |||
177 | for (i=0; i<num ; i++) { | ||
178 | p = sh->dev[i].page; | ||
179 | if (!p) | ||
180 | continue; | ||
181 | sh->dev[i].page = NULL; | ||
182 | page_cache_release(p); | ||
183 | } | ||
184 | } | ||
185 | |||
186 | static int grow_buffers(struct stripe_head *sh, int num) | ||
187 | { | ||
188 | int i; | ||
189 | |||
190 | for (i=0; i<num; i++) { | ||
191 | struct page *page; | ||
192 | |||
193 | if (!(page = alloc_page(GFP_KERNEL))) { | ||
194 | return 1; | ||
195 | } | ||
196 | sh->dev[i].page = page; | ||
197 | } | ||
198 | return 0; | ||
199 | } | ||
200 | |||
201 | static void raid6_build_block (struct stripe_head *sh, int i); | ||
202 | |||
203 | static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx) | ||
204 | { | ||
205 | raid6_conf_t *conf = sh->raid_conf; | ||
206 | int disks = conf->raid_disks, i; | ||
207 | |||
208 | if (atomic_read(&sh->count) != 0) | ||
209 | BUG(); | ||
210 | if (test_bit(STRIPE_HANDLE, &sh->state)) | ||
211 | BUG(); | ||
212 | |||
213 | CHECK_DEVLOCK(); | ||
214 | PRINTK("init_stripe called, stripe %llu\n", | ||
215 | (unsigned long long)sh->sector); | ||
216 | |||
217 | remove_hash(sh); | ||
218 | |||
219 | sh->sector = sector; | ||
220 | sh->pd_idx = pd_idx; | ||
221 | sh->state = 0; | ||
222 | |||
223 | for (i=disks; i--; ) { | ||
224 | struct r5dev *dev = &sh->dev[i]; | ||
225 | |||
226 | if (dev->toread || dev->towrite || dev->written || | ||
227 | test_bit(R5_LOCKED, &dev->flags)) { | ||
228 | PRINTK("sector=%llx i=%d %p %p %p %d\n", | ||
229 | (unsigned long long)sh->sector, i, dev->toread, | ||
230 | dev->towrite, dev->written, | ||
231 | test_bit(R5_LOCKED, &dev->flags)); | ||
232 | BUG(); | ||
233 | } | ||
234 | dev->flags = 0; | ||
235 | raid6_build_block(sh, i); | ||
236 | } | ||
237 | insert_hash(conf, sh); | ||
238 | } | ||
239 | |||
240 | static struct stripe_head *__find_stripe(raid6_conf_t *conf, sector_t sector) | ||
241 | { | ||
242 | struct stripe_head *sh; | ||
243 | |||
244 | CHECK_DEVLOCK(); | ||
245 | PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector); | ||
246 | for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next) | ||
247 | if (sh->sector == sector) | ||
248 | return sh; | ||
249 | PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector); | ||
250 | return NULL; | ||
251 | } | ||
252 | |||
253 | static void unplug_slaves(mddev_t *mddev); | ||
254 | |||
255 | static struct stripe_head *get_active_stripe(raid6_conf_t *conf, sector_t sector, | ||
256 | int pd_idx, int noblock) | ||
257 | { | ||
258 | struct stripe_head *sh; | ||
259 | |||
260 | PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector); | ||
261 | |||
262 | spin_lock_irq(&conf->device_lock); | ||
263 | |||
264 | do { | ||
265 | sh = __find_stripe(conf, sector); | ||
266 | if (!sh) { | ||
267 | if (!conf->inactive_blocked) | ||
268 | sh = get_free_stripe(conf); | ||
269 | if (noblock && sh == NULL) | ||
270 | break; | ||
271 | if (!sh) { | ||
272 | conf->inactive_blocked = 1; | ||
273 | wait_event_lock_irq(conf->wait_for_stripe, | ||
274 | !list_empty(&conf->inactive_list) && | ||
275 | (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4) | ||
276 | || !conf->inactive_blocked), | ||
277 | conf->device_lock, | ||
278 | unplug_slaves(conf->mddev); | ||
279 | ); | ||
280 | conf->inactive_blocked = 0; | ||
281 | } else | ||
282 | init_stripe(sh, sector, pd_idx); | ||
283 | } else { | ||
284 | if (atomic_read(&sh->count)) { | ||
285 | if (!list_empty(&sh->lru)) | ||
286 | BUG(); | ||
287 | } else { | ||
288 | if (!test_bit(STRIPE_HANDLE, &sh->state)) | ||
289 | atomic_inc(&conf->active_stripes); | ||
290 | if (list_empty(&sh->lru)) | ||
291 | BUG(); | ||
292 | list_del_init(&sh->lru); | ||
293 | } | ||
294 | } | ||
295 | } while (sh == NULL); | ||
296 | |||
297 | if (sh) | ||
298 | atomic_inc(&sh->count); | ||
299 | |||
300 | spin_unlock_irq(&conf->device_lock); | ||
301 | return sh; | ||
302 | } | ||
303 | |||
304 | static int grow_stripes(raid6_conf_t *conf, int num) | ||
305 | { | ||
306 | struct stripe_head *sh; | ||
307 | kmem_cache_t *sc; | ||
308 | int devs = conf->raid_disks; | ||
309 | |||
310 | sprintf(conf->cache_name, "raid6/%s", mdname(conf->mddev)); | ||
311 | |||
312 | sc = kmem_cache_create(conf->cache_name, | ||
313 | sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), | ||
314 | 0, 0, NULL, NULL); | ||
315 | if (!sc) | ||
316 | return 1; | ||
317 | conf->slab_cache = sc; | ||
318 | while (num--) { | ||
319 | sh = kmem_cache_alloc(sc, GFP_KERNEL); | ||
320 | if (!sh) | ||
321 | return 1; | ||
322 | memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev)); | ||
323 | sh->raid_conf = conf; | ||
324 | spin_lock_init(&sh->lock); | ||
325 | |||
326 | if (grow_buffers(sh, conf->raid_disks)) { | ||
327 | shrink_buffers(sh, conf->raid_disks); | ||
328 | kmem_cache_free(sc, sh); | ||
329 | return 1; | ||
330 | } | ||
331 | /* we just created an active stripe so... */ | ||
332 | atomic_set(&sh->count, 1); | ||
333 | atomic_inc(&conf->active_stripes); | ||
334 | INIT_LIST_HEAD(&sh->lru); | ||
335 | release_stripe(sh); | ||
336 | } | ||
337 | return 0; | ||
338 | } | ||
339 | |||
340 | static void shrink_stripes(raid6_conf_t *conf) | ||
341 | { | ||
342 | struct stripe_head *sh; | ||
343 | |||
344 | while (1) { | ||
345 | spin_lock_irq(&conf->device_lock); | ||
346 | sh = get_free_stripe(conf); | ||
347 | spin_unlock_irq(&conf->device_lock); | ||
348 | if (!sh) | ||
349 | break; | ||
350 | if (atomic_read(&sh->count)) | ||
351 | BUG(); | ||
352 | shrink_buffers(sh, conf->raid_disks); | ||
353 | kmem_cache_free(conf->slab_cache, sh); | ||
354 | atomic_dec(&conf->active_stripes); | ||
355 | } | ||
356 | kmem_cache_destroy(conf->slab_cache); | ||
357 | conf->slab_cache = NULL; | ||
358 | } | ||
359 | |||
360 | static int raid6_end_read_request (struct bio * bi, unsigned int bytes_done, | ||
361 | int error) | ||
362 | { | ||
363 | struct stripe_head *sh = bi->bi_private; | ||
364 | raid6_conf_t *conf = sh->raid_conf; | ||
365 | int disks = conf->raid_disks, i; | ||
366 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); | ||
367 | |||
368 | if (bi->bi_size) | ||
369 | return 1; | ||
370 | |||
371 | for (i=0 ; i<disks; i++) | ||
372 | if (bi == &sh->dev[i].req) | ||
373 | break; | ||
374 | |||
375 | PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n", | ||
376 | (unsigned long long)sh->sector, i, atomic_read(&sh->count), | ||
377 | uptodate); | ||
378 | if (i == disks) { | ||
379 | BUG(); | ||
380 | return 0; | ||
381 | } | ||
382 | |||
383 | if (uptodate) { | ||
384 | #if 0 | ||
385 | struct bio *bio; | ||
386 | unsigned long flags; | ||
387 | spin_lock_irqsave(&conf->device_lock, flags); | ||
388 | /* we can return a buffer if we bypassed the cache or | ||
389 | * if the top buffer is not in highmem. If there are | ||
390 | * multiple buffers, leave the extra work to | ||
391 | * handle_stripe | ||
392 | */ | ||
393 | buffer = sh->bh_read[i]; | ||
394 | if (buffer && | ||
395 | (!PageHighMem(buffer->b_page) | ||
396 | || buffer->b_page == bh->b_page ) | ||
397 | ) { | ||
398 | sh->bh_read[i] = buffer->b_reqnext; | ||
399 | buffer->b_reqnext = NULL; | ||
400 | } else | ||
401 | buffer = NULL; | ||
402 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
403 | if (sh->bh_page[i]==bh->b_page) | ||
404 | set_buffer_uptodate(bh); | ||
405 | if (buffer) { | ||
406 | if (buffer->b_page != bh->b_page) | ||
407 | memcpy(buffer->b_data, bh->b_data, bh->b_size); | ||
408 | buffer->b_end_io(buffer, 1); | ||
409 | } | ||
410 | #else | ||
411 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | ||
412 | #endif | ||
413 | } else { | ||
414 | md_error(conf->mddev, conf->disks[i].rdev); | ||
415 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); | ||
416 | } | ||
417 | rdev_dec_pending(conf->disks[i].rdev, conf->mddev); | ||
418 | #if 0 | ||
419 | /* must restore b_page before unlocking buffer... */ | ||
420 | if (sh->bh_page[i] != bh->b_page) { | ||
421 | bh->b_page = sh->bh_page[i]; | ||
422 | bh->b_data = page_address(bh->b_page); | ||
423 | clear_buffer_uptodate(bh); | ||
424 | } | ||
425 | #endif | ||
426 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
427 | set_bit(STRIPE_HANDLE, &sh->state); | ||
428 | release_stripe(sh); | ||
429 | return 0; | ||
430 | } | ||
431 | |||
432 | static int raid6_end_write_request (struct bio *bi, unsigned int bytes_done, | ||
433 | int error) | ||
434 | { | ||
435 | struct stripe_head *sh = bi->bi_private; | ||
436 | raid6_conf_t *conf = sh->raid_conf; | ||
437 | int disks = conf->raid_disks, i; | ||
438 | unsigned long flags; | ||
439 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); | ||
440 | |||
441 | if (bi->bi_size) | ||
442 | return 1; | ||
443 | |||
444 | for (i=0 ; i<disks; i++) | ||
445 | if (bi == &sh->dev[i].req) | ||
446 | break; | ||
447 | |||
448 | PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n", | ||
449 | (unsigned long long)sh->sector, i, atomic_read(&sh->count), | ||
450 | uptodate); | ||
451 | if (i == disks) { | ||
452 | BUG(); | ||
453 | return 0; | ||
454 | } | ||
455 | |||
456 | spin_lock_irqsave(&conf->device_lock, flags); | ||
457 | if (!uptodate) | ||
458 | md_error(conf->mddev, conf->disks[i].rdev); | ||
459 | |||
460 | rdev_dec_pending(conf->disks[i].rdev, conf->mddev); | ||
461 | |||
462 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
463 | set_bit(STRIPE_HANDLE, &sh->state); | ||
464 | __release_stripe(conf, sh); | ||
465 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
466 | return 0; | ||
467 | } | ||
468 | |||
469 | |||
470 | static sector_t compute_blocknr(struct stripe_head *sh, int i); | ||
471 | |||
472 | static void raid6_build_block (struct stripe_head *sh, int i) | ||
473 | { | ||
474 | struct r5dev *dev = &sh->dev[i]; | ||
475 | int pd_idx = sh->pd_idx; | ||
476 | int qd_idx = raid6_next_disk(pd_idx, sh->raid_conf->raid_disks); | ||
477 | |||
478 | bio_init(&dev->req); | ||
479 | dev->req.bi_io_vec = &dev->vec; | ||
480 | dev->req.bi_vcnt++; | ||
481 | dev->req.bi_max_vecs++; | ||
482 | dev->vec.bv_page = dev->page; | ||
483 | dev->vec.bv_len = STRIPE_SIZE; | ||
484 | dev->vec.bv_offset = 0; | ||
485 | |||
486 | dev->req.bi_sector = sh->sector; | ||
487 | dev->req.bi_private = sh; | ||
488 | |||
489 | dev->flags = 0; | ||
490 | if (i != pd_idx && i != qd_idx) | ||
491 | dev->sector = compute_blocknr(sh, i); | ||
492 | } | ||
493 | |||
494 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) | ||
495 | { | ||
496 | char b[BDEVNAME_SIZE]; | ||
497 | raid6_conf_t *conf = (raid6_conf_t *) mddev->private; | ||
498 | PRINTK("raid6: error called\n"); | ||
499 | |||
500 | if (!rdev->faulty) { | ||
501 | mddev->sb_dirty = 1; | ||
502 | if (rdev->in_sync) { | ||
503 | conf->working_disks--; | ||
504 | mddev->degraded++; | ||
505 | conf->failed_disks++; | ||
506 | rdev->in_sync = 0; | ||
507 | /* | ||
508 | * if recovery was running, make sure it aborts. | ||
509 | */ | ||
510 | set_bit(MD_RECOVERY_ERR, &mddev->recovery); | ||
511 | } | ||
512 | rdev->faulty = 1; | ||
513 | printk (KERN_ALERT | ||
514 | "raid6: Disk failure on %s, disabling device." | ||
515 | " Operation continuing on %d devices\n", | ||
516 | bdevname(rdev->bdev,b), conf->working_disks); | ||
517 | } | ||
518 | } | ||
519 | |||
520 | /* | ||
521 | * Input: a 'big' sector number, | ||
522 | * Output: index of the data and parity disk, and the sector # in them. | ||
523 | */ | ||
524 | static sector_t raid6_compute_sector(sector_t r_sector, unsigned int raid_disks, | ||
525 | unsigned int data_disks, unsigned int * dd_idx, | ||
526 | unsigned int * pd_idx, raid6_conf_t *conf) | ||
527 | { | ||
528 | long stripe; | ||
529 | unsigned long chunk_number; | ||
530 | unsigned int chunk_offset; | ||
531 | sector_t new_sector; | ||
532 | int sectors_per_chunk = conf->chunk_size >> 9; | ||
533 | |||
534 | /* First compute the information on this sector */ | ||
535 | |||
536 | /* | ||
537 | * Compute the chunk number and the sector offset inside the chunk | ||
538 | */ | ||
539 | chunk_offset = sector_div(r_sector, sectors_per_chunk); | ||
540 | chunk_number = r_sector; | ||
541 | if ( r_sector != chunk_number ) { | ||
542 | printk(KERN_CRIT "raid6: ERROR: r_sector = %llu, chunk_number = %lu\n", | ||
543 | (unsigned long long)r_sector, (unsigned long)chunk_number); | ||
544 | BUG(); | ||
545 | } | ||
546 | |||
547 | /* | ||
548 | * Compute the stripe number | ||
549 | */ | ||
550 | stripe = chunk_number / data_disks; | ||
551 | |||
552 | /* | ||
553 | * Compute the data disk and parity disk indexes inside the stripe | ||
554 | */ | ||
555 | *dd_idx = chunk_number % data_disks; | ||
556 | |||
557 | /* | ||
558 | * Select the parity disk based on the user selected algorithm. | ||
559 | */ | ||
560 | |||
561 | /**** FIX THIS ****/ | ||
562 | switch (conf->algorithm) { | ||
563 | case ALGORITHM_LEFT_ASYMMETRIC: | ||
564 | *pd_idx = raid_disks - 1 - (stripe % raid_disks); | ||
565 | if (*pd_idx == raid_disks-1) | ||
566 | (*dd_idx)++; /* Q D D D P */ | ||
567 | else if (*dd_idx >= *pd_idx) | ||
568 | (*dd_idx) += 2; /* D D P Q D */ | ||
569 | break; | ||
570 | case ALGORITHM_RIGHT_ASYMMETRIC: | ||
571 | *pd_idx = stripe % raid_disks; | ||
572 | if (*pd_idx == raid_disks-1) | ||
573 | (*dd_idx)++; /* Q D D D P */ | ||
574 | else if (*dd_idx >= *pd_idx) | ||
575 | (*dd_idx) += 2; /* D D P Q D */ | ||
576 | break; | ||
577 | case ALGORITHM_LEFT_SYMMETRIC: | ||
578 | *pd_idx = raid_disks - 1 - (stripe % raid_disks); | ||
579 | *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; | ||
580 | break; | ||
581 | case ALGORITHM_RIGHT_SYMMETRIC: | ||
582 | *pd_idx = stripe % raid_disks; | ||
583 | *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; | ||
584 | break; | ||
585 | default: | ||
586 | printk (KERN_CRIT "raid6: unsupported algorithm %d\n", | ||
587 | conf->algorithm); | ||
588 | } | ||
589 | |||
590 | PRINTK("raid6: chunk_number = %lu, pd_idx = %u, dd_idx = %u\n", | ||
591 | chunk_number, *pd_idx, *dd_idx); | ||
592 | |||
593 | /* | ||
594 | * Finally, compute the new sector number | ||
595 | */ | ||
596 | new_sector = (sector_t) stripe * sectors_per_chunk + chunk_offset; | ||
597 | return new_sector; | ||
598 | } | ||
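For reference, a minimal user-space sketch of the arithmetic the left-symmetric branch above performs (chunk number → stripe → P/Q/data slot → per-device sector). The helper and the geometry in main() are illustrative only: they mirror the ALGORITHM_LEFT_SYMMETRIC case, not the kernel's sector_t/raid6_conf_t interfaces.

```c
#include <stdio.h>

struct map { unsigned dd, pd, qd; unsigned long long dev_sector; };

static struct map map_sector(unsigned long long r_sector,
                             unsigned raid_disks, unsigned sectors_per_chunk)
{
    unsigned data_disks = raid_disks - 2;                       /* P and Q take two slots */
    unsigned long long chunk_number = r_sector / sectors_per_chunk;
    unsigned chunk_offset = (unsigned)(r_sector % sectors_per_chunk);
    unsigned long long stripe = chunk_number / data_disks;
    unsigned dd_idx = (unsigned)(chunk_number % data_disks);
    struct map m;

    m.pd = raid_disks - 1 - (unsigned)(stripe % raid_disks);    /* ALGORITHM_LEFT_SYMMETRIC */
    m.qd = (m.pd + 1) % raid_disks;                             /* Q always follows P (raid6_next_disk) */
    m.dd = (m.pd + 2 + dd_idx) % raid_disks;                    /* data slots rotate after Q */
    m.dev_sector = stripe * sectors_per_chunk + chunk_offset;
    return m;
}

int main(void)
{
    /* example geometry: 6 disks, 64 KiB chunks = 128 sectors per chunk */
    struct map m = map_sector(1000000ULL, 6, 128);
    printf("data disk %u, P %u, Q %u, device sector %llu\n",
           m.dd, m.pd, m.qd, m.dev_sector);
    return 0;
}
```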
599 | |||
600 | |||
601 | static sector_t compute_blocknr(struct stripe_head *sh, int i) | ||
602 | { | ||
603 | raid6_conf_t *conf = sh->raid_conf; | ||
604 | int raid_disks = conf->raid_disks, data_disks = raid_disks - 2; | ||
605 | sector_t new_sector = sh->sector, check; | ||
606 | int sectors_per_chunk = conf->chunk_size >> 9; | ||
607 | sector_t stripe; | ||
608 | int chunk_offset; | ||
609 | int chunk_number, dummy1, dummy2, dd_idx = i; | ||
610 | sector_t r_sector; | ||
611 | int i0 = i; | ||
612 | |||
613 | chunk_offset = sector_div(new_sector, sectors_per_chunk); | ||
614 | stripe = new_sector; | ||
615 | if ( new_sector != stripe ) { | ||
616 | printk(KERN_CRIT "raid6: ERROR: new_sector = %llu, stripe = %lu\n", | ||
617 | (unsigned long long)new_sector, (unsigned long)stripe); | ||
618 | BUG(); | ||
619 | } | ||
620 | |||
621 | switch (conf->algorithm) { | ||
622 | case ALGORITHM_LEFT_ASYMMETRIC: | ||
623 | case ALGORITHM_RIGHT_ASYMMETRIC: | ||
624 | if (sh->pd_idx == raid_disks-1) | ||
625 | i--; /* Q D D D P */ | ||
626 | else if (i > sh->pd_idx) | ||
627 | i -= 2; /* D D P Q D */ | ||
628 | break; | ||
629 | case ALGORITHM_LEFT_SYMMETRIC: | ||
630 | case ALGORITHM_RIGHT_SYMMETRIC: | ||
631 | if (sh->pd_idx == raid_disks-1) | ||
632 | i--; /* Q D D D P */ | ||
633 | else { | ||
634 | /* D D P Q D */ | ||
635 | if (i < sh->pd_idx) | ||
636 | i += raid_disks; | ||
637 | i -= (sh->pd_idx + 2); | ||
638 | } | ||
639 | break; | ||
640 | default: | ||
641 | printk (KERN_CRIT "raid6: unsupported algorithm %d\n", | ||
642 | conf->algorithm); | ||
643 | } | ||
644 | |||
645 | PRINTK("raid6: compute_blocknr: pd_idx = %u, i0 = %u, i = %u\n", sh->pd_idx, i0, i); | ||
646 | |||
647 | chunk_number = stripe * data_disks + i; | ||
648 | r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; | ||
649 | |||
650 | check = raid6_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); | ||
651 | if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { | ||
652 | printk(KERN_CRIT "raid6: compute_blocknr: map not correct\n"); | ||
653 | return 0; | ||
654 | } | ||
655 | return r_sector; | ||
656 | } | ||
657 | |||
658 | |||
659 | |||
660 | /* | ||
661 | * Copy data between a page in the stripe cache, and one or more bion | ||
662 | * The page could align with the middle of the bio, or there could be | ||
663 | * several bion, each with several bio_vecs, which cover part of the page | ||
664 | * Multiple bion are linked together on bi_next. There may be extras | ||
665 | * at the end of this list. We ignore them. | ||
666 | */ | ||
667 | static void copy_data(int frombio, struct bio *bio, | ||
668 | struct page *page, | ||
669 | sector_t sector) | ||
670 | { | ||
671 | char *pa = page_address(page); | ||
672 | struct bio_vec *bvl; | ||
673 | int i; | ||
674 | int page_offset; | ||
675 | |||
676 | if (bio->bi_sector >= sector) | ||
677 | page_offset = (signed)(bio->bi_sector - sector) * 512; | ||
678 | else | ||
679 | page_offset = (signed)(sector - bio->bi_sector) * -512; | ||
680 | bio_for_each_segment(bvl, bio, i) { | ||
681 | int len = bio_iovec_idx(bio,i)->bv_len; | ||
682 | int clen; | ||
683 | int b_offset = 0; | ||
684 | |||
685 | if (page_offset < 0) { | ||
686 | b_offset = -page_offset; | ||
687 | page_offset += b_offset; | ||
688 | len -= b_offset; | ||
689 | } | ||
690 | |||
691 | if (len > 0 && page_offset + len > STRIPE_SIZE) | ||
692 | clen = STRIPE_SIZE - page_offset; | ||
693 | else clen = len; | ||
694 | |||
695 | if (clen > 0) { | ||
696 | char *ba = __bio_kmap_atomic(bio, i, KM_USER0); | ||
697 | if (frombio) | ||
698 | memcpy(pa+page_offset, ba+b_offset, clen); | ||
699 | else | ||
700 | memcpy(ba+b_offset, pa+page_offset, clen); | ||
701 | __bio_kunmap_atomic(ba, KM_USER0); | ||
702 | } | ||
703 | if (clen < len) /* hit end of page */ | ||
704 | break; | ||
705 | page_offset += len; | ||
706 | } | ||
707 | } | ||
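The offset bookkeeping above is easier to see in isolation. Below is an illustrative, non-kernel helper that computes, for one bio segment of `len` bytes beginning at `bio_sector`, how it intersects the stripe-cache page that starts at `page_sector`; the names and the 4 KiB STRIPE_SIZE value are assumptions for the example, not the driver's API.

```c
#define STRIPE_SIZE 4096   /* assumed: one page per device per stripe */

struct clip { int page_off; int bio_off; int bytes; };

static struct clip clip_segment(long long bio_sector, long long page_sector, int len)
{
    struct clip c = { 0, 0, 0 };
    long page_offset = (long)(bio_sector - page_sector) * 512;  /* may be negative */

    if (page_offset < 0) {             /* segment starts before the page: skip the head */
        c.bio_off = (int)-page_offset;
        len -= c.bio_off;
        page_offset = 0;
    }
    if (len > 0 && page_offset + len > STRIPE_SIZE)             /* clip the tail to the page */
        len = (int)(STRIPE_SIZE - page_offset);

    c.page_off = (int)page_offset;
    c.bytes = len > 0 ? len : 0;       /* nothing to copy if the ranges miss entirely */
    return c;
}
```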
708 | |||
709 | #define check_xor() do { \ | ||
710 | if (count == MAX_XOR_BLOCKS) { \ | ||
711 | xor_block(count, STRIPE_SIZE, ptr); \ | ||
712 | count = 1; \ | ||
713 | } \ | ||
714 | } while(0) | ||
715 | |||
716 | /* Compute P and Q syndromes */ | ||
717 | static void compute_parity(struct stripe_head *sh, int method) | ||
718 | { | ||
719 | raid6_conf_t *conf = sh->raid_conf; | ||
720 | int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count; | ||
721 | struct bio *chosen; | ||
722 | /**** FIX THIS: This could be very bad if disks is close to 256 ****/ | ||
723 | void *ptrs[disks]; | ||
724 | |||
725 | qd_idx = raid6_next_disk(pd_idx, disks); | ||
726 | d0_idx = raid6_next_disk(qd_idx, disks); | ||
727 | |||
728 | PRINTK("compute_parity, stripe %llu, method %d\n", | ||
729 | (unsigned long long)sh->sector, method); | ||
730 | |||
731 | switch(method) { | ||
732 | case READ_MODIFY_WRITE: | ||
733 | BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */ | ||
734 | case RECONSTRUCT_WRITE: | ||
735 | for (i= disks; i-- ;) | ||
736 | if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) { | ||
737 | chosen = sh->dev[i].towrite; | ||
738 | sh->dev[i].towrite = NULL; | ||
739 | |||
740 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
741 | wake_up(&conf->wait_for_overlap); | ||
742 | |||
743 | if (sh->dev[i].written) BUG(); | ||
744 | sh->dev[i].written = chosen; | ||
745 | } | ||
746 | break; | ||
747 | case CHECK_PARITY: | ||
748 | BUG(); /* Not implemented yet */ | ||
749 | } | ||
750 | |||
751 | for (i = disks; i--;) | ||
752 | if (sh->dev[i].written) { | ||
753 | sector_t sector = sh->dev[i].sector; | ||
754 | struct bio *wbi = sh->dev[i].written; | ||
755 | while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { | ||
756 | copy_data(1, wbi, sh->dev[i].page, sector); | ||
757 | wbi = r5_next_bio(wbi, sector); | ||
758 | } | ||
759 | |||
760 | set_bit(R5_LOCKED, &sh->dev[i].flags); | ||
761 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | ||
762 | } | ||
763 | |||
764 | // switch(method) { | ||
765 | // case RECONSTRUCT_WRITE: | ||
766 | // case CHECK_PARITY: | ||
767 | // case UPDATE_PARITY: | ||
768 | /* Note that unlike RAID-5, the ordering of the disks matters greatly. */ | ||
769 | /* FIX: Is this ordering of drives even remotely optimal? */ | ||
770 | count = 0; | ||
771 | i = d0_idx; | ||
772 | do { | ||
773 | ptrs[count++] = page_address(sh->dev[i].page); | ||
774 | if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags)) | ||
775 | printk("block %d/%d not uptodate on parity calc\n", i, count); | ||
776 | i = raid6_next_disk(i, disks); | ||
777 | } while ( i != d0_idx ); | ||
778 | // break; | ||
779 | // } | ||
780 | |||
781 | raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs); | ||
782 | |||
783 | switch(method) { | ||
784 | case RECONSTRUCT_WRITE: | ||
785 | set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
786 | set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); | ||
787 | set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); | ||
788 | set_bit(R5_LOCKED, &sh->dev[qd_idx].flags); | ||
789 | break; | ||
790 | case UPDATE_PARITY: | ||
791 | set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
792 | set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); | ||
793 | break; | ||
794 | } | ||
795 | } | ||
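compute_parity() builds `ptrs[]` in ring order starting at d0_idx, so the data pages land in slots 0..disks-3 with P at disks-2 and Q at disks-1 before `raid6_call.gen_syndrome()` runs. As a hedged reference for what that call computes (the real implementations live in the shared raid6 library and are unrolled and vectorized), here is the plain byte-at-a-time definition, assuming the usual GF(2^8) generator g = 2 with polynomial 0x11d:

```c
#include <stddef.h>
#include <stdint.h>

static uint8_t gf_mul2(uint8_t v)              /* multiply by g = 2 in GF(2^8), poly 0x11d */
{
    return (uint8_t)((v << 1) ^ ((v & 0x80) ? 0x1d : 0));
}

/* ptrs[0..disks-3] = data in ring order, ptrs[disks-2] = P, ptrs[disks-1] = Q */
static void gen_syndrome_ref(int disks, size_t bytes, uint8_t **ptrs)
{
    int z0 = disks - 3;                        /* highest-numbered data block */
    uint8_t *p = ptrs[z0 + 1], *q = ptrs[z0 + 2];

    for (size_t d = 0; d < bytes; d++) {
        uint8_t wp = ptrs[z0][d];              /* running XOR parity */
        uint8_t wq = wp;                       /* running Reed-Solomon syndrome */
        for (int z = z0 - 1; z >= 0; z--) {
            uint8_t wd = ptrs[z][d];
            wp ^= wd;
            wq = (uint8_t)(gf_mul2(wq) ^ wd);  /* Horner: Q = sum over z of g^z * D_z */
        }
        p[d] = wp;
        q[d] = wq;
    }
}
```

The Horner recurrence is why the disk ordering matters here, unlike the order-insensitive XOR parity of RAID-5.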
796 | |||
797 | /* Compute one missing block */ | ||
798 | static void compute_block_1(struct stripe_head *sh, int dd_idx) | ||
799 | { | ||
800 | raid6_conf_t *conf = sh->raid_conf; | ||
801 | int i, count, disks = conf->raid_disks; | ||
802 | void *ptr[MAX_XOR_BLOCKS], *p; | ||
803 | int pd_idx = sh->pd_idx; | ||
804 | int qd_idx = raid6_next_disk(pd_idx, disks); | ||
805 | |||
806 | PRINTK("compute_block_1, stripe %llu, idx %d\n", | ||
807 | (unsigned long long)sh->sector, dd_idx); | ||
808 | |||
809 | if ( dd_idx == qd_idx ) { | ||
810 | /* We're actually computing the Q drive */ | ||
811 | compute_parity(sh, UPDATE_PARITY); | ||
812 | } else { | ||
813 | ptr[0] = page_address(sh->dev[dd_idx].page); | ||
814 | memset(ptr[0], 0, STRIPE_SIZE); | ||
815 | count = 1; | ||
816 | for (i = disks ; i--; ) { | ||
817 | if (i == dd_idx || i == qd_idx) | ||
818 | continue; | ||
819 | p = page_address(sh->dev[i].page); | ||
820 | if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) | ||
821 | ptr[count++] = p; | ||
822 | else | ||
823 | printk("compute_block() %d, stripe %llu, %d" | ||
824 | " not present\n", dd_idx, | ||
825 | (unsigned long long)sh->sector, i); | ||
826 | |||
827 | check_xor(); | ||
828 | } | ||
829 | if (count != 1) | ||
830 | xor_block(count, STRIPE_SIZE, ptr); | ||
831 | set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); | ||
832 | } | ||
833 | } | ||
834 | |||
835 | /* Compute two missing blocks */ | ||
836 | static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) | ||
837 | { | ||
838 | raid6_conf_t *conf = sh->raid_conf; | ||
839 | int i, count, disks = conf->raid_disks; | ||
840 | int pd_idx = sh->pd_idx; | ||
841 | int qd_idx = raid6_next_disk(pd_idx, disks); | ||
842 | int d0_idx = raid6_next_disk(qd_idx, disks); | ||
843 | int faila, failb; | ||
844 | |||
845 | /* faila and failb are disk numbers relative to d0_idx */ | ||
846 | /* pd_idx becomes disks-2 and qd_idx becomes disks-1 */ | ||
847 | faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx; | ||
848 | failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx; | ||
849 | |||
850 | BUG_ON(faila == failb); | ||
851 | if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } | ||
852 | |||
853 | PRINTK("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", | ||
854 | (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb); | ||
855 | |||
856 | if ( failb == disks-1 ) { | ||
857 | /* Q disk is one of the missing disks */ | ||
858 | if ( faila == disks-2 ) { | ||
859 | /* Missing P+Q, just recompute */ | ||
860 | compute_parity(sh, UPDATE_PARITY); | ||
861 | return; | ||
862 | } else { | ||
863 | /* We're missing D+Q; recompute D from P */ | ||
864 | compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1); | ||
865 | compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */ | ||
866 | return; | ||
867 | } | ||
868 | } | ||
869 | |||
870 | /* We're missing D+P or D+D; build pointer table */ | ||
871 | { | ||
872 | /**** FIX THIS: This could be very bad if disks is close to 256 ****/ | ||
873 | void *ptrs[disks]; | ||
874 | |||
875 | count = 0; | ||
876 | i = d0_idx; | ||
877 | do { | ||
878 | ptrs[count++] = page_address(sh->dev[i].page); | ||
879 | i = raid6_next_disk(i, disks); | ||
880 | if (i != dd_idx1 && i != dd_idx2 && | ||
881 | !test_bit(R5_UPTODATE, &sh->dev[i].flags)) | ||
882 | printk("compute_2 with missing block %d/%d\n", count, i); | ||
883 | } while ( i != d0_idx ); | ||
884 | |||
885 | if ( failb == disks-2 ) { | ||
886 | /* We're missing D+P. */ | ||
887 | raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs); | ||
888 | } else { | ||
889 | /* We're missing D+D. */ | ||
890 | raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs); | ||
891 | } | ||
892 | |||
893 | /* Both the above update both missing blocks */ | ||
894 | set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); | ||
895 | set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); | ||
896 | } | ||
897 | } | ||
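raid6_datap_recov() and raid6_2data_recov() come from the shared raid6 library; the sketch below only illustrates, one byte at a time, the GF(2^8) identities they rely on, under the same assumed generator g = 2 and polynomial 0x11d as above. The helper names are made up for the example, and the table-driven library code is far faster.

```c
#include <stdint.h>

static uint8_t gf_mul2(uint8_t v)
{
    return (uint8_t)((v << 1) ^ ((v & 0x80) ? 0x1d : 0));
}

static uint8_t gf_mul(uint8_t a, uint8_t b)    /* shift-and-add multiply in GF(2^8) */
{
    uint8_t r = 0;
    while (b) {
        if (b & 1)
            r ^= a;
        a = gf_mul2(a);
        b >>= 1;
    }
    return r;
}

static uint8_t gf_pow2(unsigned e)             /* g^e */
{
    uint8_t v = 1;
    while (e--)
        v = gf_mul2(v);
    return v;
}

static uint8_t gf_inv(uint8_t a)               /* a^254 = a^-1 for a != 0 */
{
    uint8_t r = 1;
    for (int i = 0; i < 254; i++)
        r = gf_mul(r, a);
    return r;
}

/* D+P lost: q is the stored Q, qx the syndrome recomputed with D_x forced to 0.
 * Then Q ^ Qx = g^x * D_x, so D_x = (Q ^ Qx) * g^-x; P is just the XOR of all data. */
static uint8_t recover_datap_byte(uint8_t q, uint8_t qx, unsigned x)
{
    return gf_mul(q ^ qx, gf_inv(gf_pow2(x)));
}

/* D+D lost at slots x < y: pxy = P ^ partial-P, qxy = Q ^ partial-Q, where the
 * partial values are computed with zeroes in the two missing slots. */
static void recover_2data_byte(uint8_t pxy, uint8_t qxy, unsigned x, unsigned y,
                               uint8_t *dx, uint8_t *dy)
{
    uint8_t gyx = gf_pow2(y - x);
    uint8_t denom = gf_inv(gyx ^ 1);                        /* 1 / (g^(y-x) + 1) */
    *dx = gf_mul(gf_mul(pxy, gyx) ^ gf_mul(qxy, gf_inv(gf_pow2(x))), denom);
    *dy = *dx ^ pxy;                                        /* D_x ^ D_y = Pxy */
}
```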
898 | |||
899 | |||
900 | /* | ||
901 | * Each stripe/dev can have one or more bion attached. | ||
902 | * toread/towrite point to the first in a chain. | ||
903 | * The bi_next chain must be in order. | ||
904 | */ | ||
905 | static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) | ||
906 | { | ||
907 | struct bio **bip; | ||
908 | raid6_conf_t *conf = sh->raid_conf; | ||
909 | |||
910 | PRINTK("adding bh b#%llu to stripe s#%llu\n", | ||
911 | (unsigned long long)bi->bi_sector, | ||
912 | (unsigned long long)sh->sector); | ||
913 | |||
914 | |||
915 | spin_lock(&sh->lock); | ||
916 | spin_lock_irq(&conf->device_lock); | ||
917 | if (forwrite) | ||
918 | bip = &sh->dev[dd_idx].towrite; | ||
919 | else | ||
920 | bip = &sh->dev[dd_idx].toread; | ||
921 | while (*bip && (*bip)->bi_sector < bi->bi_sector) { | ||
922 | if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) | ||
923 | goto overlap; | ||
924 | bip = &(*bip)->bi_next; | ||
925 | } | ||
926 | if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) | ||
927 | goto overlap; | ||
928 | |||
929 | if (*bip && bi->bi_next && (*bip) != bi->bi_next) | ||
930 | BUG(); | ||
931 | if (*bip) | ||
932 | bi->bi_next = *bip; | ||
933 | *bip = bi; | ||
934 | bi->bi_phys_segments ++; | ||
935 | spin_unlock_irq(&conf->device_lock); | ||
936 | spin_unlock(&sh->lock); | ||
937 | |||
938 | PRINTK("added bi b#%llu to stripe s#%llu, disk %d.\n", | ||
939 | (unsigned long long)bi->bi_sector, | ||
940 | (unsigned long long)sh->sector, dd_idx); | ||
941 | |||
942 | if (forwrite) { | ||
943 | /* check if page is covered */ | ||
944 | sector_t sector = sh->dev[dd_idx].sector; | ||
945 | for (bi=sh->dev[dd_idx].towrite; | ||
946 | sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && | ||
947 | bi && bi->bi_sector <= sector; | ||
948 | bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { | ||
949 | if (bi->bi_sector + (bi->bi_size>>9) >= sector) | ||
950 | sector = bi->bi_sector + (bi->bi_size>>9); | ||
951 | } | ||
952 | if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) | ||
953 | set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); | ||
954 | } | ||
955 | return 1; | ||
956 | |||
957 | overlap: | ||
958 | set_bit(R5_Overlap, &sh->dev[dd_idx].flags); | ||
959 | spin_unlock_irq(&conf->device_lock); | ||
960 | spin_unlock(&sh->lock); | ||
961 | return 0; | ||
962 | } | ||
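Both `goto overlap` paths above apply the same interval test to the new request's neighbours in the sorted chain: a bio is rejected when its sector range intersects one already queued. A tiny illustration (sector units, names invented for the example):

```c
/* Two requests [s1, s1+n1) and [s2, s2+n2) overlap exactly when each
 * one starts before the other ends. */
static int ranges_overlap(unsigned long long s1, unsigned n1,
                          unsigned long long s2, unsigned n2)
{
    return s1 < s2 + n2 && s2 < s1 + n1;
}
```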
963 | |||
964 | |||
965 | /* | ||
966 | * handle_stripe - do things to a stripe. | ||
967 | * | ||
968 | * We lock the stripe and then examine the state of various bits | ||
969 | * to see what needs to be done. | ||
970 | * Possible results: | ||
971 | * return some read requests which now have data | ||
972 | * return some write requests which are safely on disc | ||
973 | * schedule a read on some buffers | ||
974 | * schedule a write of some buffers | ||
975 | * return confirmation of parity correctness | ||
976 | * | ||
977 | * Parity calculations are done inside the stripe lock | ||
978 | * buffers are taken off read_list or write_list, and bh_cache buffers | ||
979 | * get BH_Lock set before the stripe lock is released. | ||
980 | * | ||
981 | */ | ||
982 | |||
983 | static void handle_stripe(struct stripe_head *sh) | ||
984 | { | ||
985 | raid6_conf_t *conf = sh->raid_conf; | ||
986 | int disks = conf->raid_disks; | ||
987 | struct bio *return_bi= NULL; | ||
988 | struct bio *bi; | ||
989 | int i; | ||
990 | int syncing; | ||
991 | int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; | ||
992 | int non_overwrite = 0; | ||
993 | int failed_num[2] = {0, 0}; | ||
994 | struct r5dev *dev, *pdev, *qdev; | ||
995 | int pd_idx = sh->pd_idx; | ||
996 | int qd_idx = raid6_next_disk(pd_idx, disks); | ||
997 | int p_failed, q_failed; | ||
998 | |||
999 | PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n", | ||
1000 | (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count), | ||
1001 | pd_idx, qd_idx); | ||
1002 | |||
1003 | spin_lock(&sh->lock); | ||
1004 | clear_bit(STRIPE_HANDLE, &sh->state); | ||
1005 | clear_bit(STRIPE_DELAYED, &sh->state); | ||
1006 | |||
1007 | syncing = test_bit(STRIPE_SYNCING, &sh->state); | ||
1008 | /* Now to look around and see what can be done */ | ||
1009 | |||
1010 | for (i=disks; i--; ) { | ||
1011 | mdk_rdev_t *rdev; | ||
1012 | dev = &sh->dev[i]; | ||
1013 | clear_bit(R5_Insync, &dev->flags); | ||
1014 | clear_bit(R5_Syncio, &dev->flags); | ||
1015 | |||
1016 | PRINTK("check %d: state 0x%lx read %p write %p written %p\n", | ||
1017 | i, dev->flags, dev->toread, dev->towrite, dev->written); | ||
1018 | /* maybe we can reply to a read */ | ||
1019 | if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { | ||
1020 | struct bio *rbi, *rbi2; | ||
1021 | PRINTK("Return read for disc %d\n", i); | ||
1022 | spin_lock_irq(&conf->device_lock); | ||
1023 | rbi = dev->toread; | ||
1024 | dev->toread = NULL; | ||
1025 | if (test_and_clear_bit(R5_Overlap, &dev->flags)) | ||
1026 | wake_up(&conf->wait_for_overlap); | ||
1027 | spin_unlock_irq(&conf->device_lock); | ||
1028 | while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { | ||
1029 | copy_data(0, rbi, dev->page, dev->sector); | ||
1030 | rbi2 = r5_next_bio(rbi, dev->sector); | ||
1031 | spin_lock_irq(&conf->device_lock); | ||
1032 | if (--rbi->bi_phys_segments == 0) { | ||
1033 | rbi->bi_next = return_bi; | ||
1034 | return_bi = rbi; | ||
1035 | } | ||
1036 | spin_unlock_irq(&conf->device_lock); | ||
1037 | rbi = rbi2; | ||
1038 | } | ||
1039 | } | ||
1040 | |||
1041 | /* now count some things */ | ||
1042 | if (test_bit(R5_LOCKED, &dev->flags)) locked++; | ||
1043 | if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; | ||
1044 | |||
1045 | |||
1046 | if (dev->toread) to_read++; | ||
1047 | if (dev->towrite) { | ||
1048 | to_write++; | ||
1049 | if (!test_bit(R5_OVERWRITE, &dev->flags)) | ||
1050 | non_overwrite++; | ||
1051 | } | ||
1052 | if (dev->written) written++; | ||
1053 | rdev = conf->disks[i].rdev; /* FIXME: should I be looking at rdev? */ | ||
1054 | if (!rdev || !rdev->in_sync) { | ||
1055 | if ( failed < 2 ) | ||
1056 | failed_num[failed] = i; | ||
1057 | failed++; | ||
1058 | } else | ||
1059 | set_bit(R5_Insync, &dev->flags); | ||
1060 | } | ||
1061 | PRINTK("locked=%d uptodate=%d to_read=%d" | ||
1062 | " to_write=%d failed=%d failed_num=%d,%d\n", | ||
1063 | locked, uptodate, to_read, to_write, failed, | ||
1064 | failed_num[0], failed_num[1]); | ||
1065 | /* check if the array has lost >2 devices and, if so, some requests might | ||
1066 | * need to be failed | ||
1067 | */ | ||
1068 | if (failed > 2 && to_read+to_write+written) { | ||
1069 | spin_lock_irq(&conf->device_lock); | ||
1070 | for (i=disks; i--; ) { | ||
1071 | /* fail all writes first */ | ||
1072 | bi = sh->dev[i].towrite; | ||
1073 | sh->dev[i].towrite = NULL; | ||
1074 | if (bi) to_write--; | ||
1075 | |||
1076 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
1077 | wake_up(&conf->wait_for_overlap); | ||
1078 | |||
1079 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ | ||
1080 | struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); | ||
1081 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
1082 | if (--bi->bi_phys_segments == 0) { | ||
1083 | md_write_end(conf->mddev); | ||
1084 | bi->bi_next = return_bi; | ||
1085 | return_bi = bi; | ||
1086 | } | ||
1087 | bi = nextbi; | ||
1088 | } | ||
1089 | /* and fail all 'written' */ | ||
1090 | bi = sh->dev[i].written; | ||
1091 | sh->dev[i].written = NULL; | ||
1092 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { | ||
1093 | struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); | ||
1094 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
1095 | if (--bi->bi_phys_segments == 0) { | ||
1096 | md_write_end(conf->mddev); | ||
1097 | bi->bi_next = return_bi; | ||
1098 | return_bi = bi; | ||
1099 | } | ||
1100 | bi = bi2; | ||
1101 | } | ||
1102 | |||
1103 | /* fail any reads if this device is non-operational */ | ||
1104 | if (!test_bit(R5_Insync, &sh->dev[i].flags)) { | ||
1105 | bi = sh->dev[i].toread; | ||
1106 | sh->dev[i].toread = NULL; | ||
1107 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
1108 | wake_up(&conf->wait_for_overlap); | ||
1109 | if (bi) to_read--; | ||
1110 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ | ||
1111 | struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); | ||
1112 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
1113 | if (--bi->bi_phys_segments == 0) { | ||
1114 | bi->bi_next = return_bi; | ||
1115 | return_bi = bi; | ||
1116 | } | ||
1117 | bi = nextbi; | ||
1118 | } | ||
1119 | } | ||
1120 | } | ||
1121 | spin_unlock_irq(&conf->device_lock); | ||
1122 | } | ||
1123 | if (failed > 2 && syncing) { | ||
1124 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | ||
1125 | clear_bit(STRIPE_SYNCING, &sh->state); | ||
1126 | syncing = 0; | ||
1127 | } | ||
1128 | |||
1129 | /* | ||
1130 | * might be able to return some write requests if the parity blocks | ||
1131 | * are safe, or on a failed drive | ||
1132 | */ | ||
1133 | pdev = &sh->dev[pd_idx]; | ||
1134 | p_failed = (failed >= 1 && failed_num[0] == pd_idx) | ||
1135 | || (failed >= 2 && failed_num[1] == pd_idx); | ||
1136 | qdev = &sh->dev[qd_idx]; | ||
1137 | q_failed = (failed >= 1 && failed_num[0] == qd_idx) | ||
1138 | || (failed >= 2 && failed_num[1] == qd_idx); | ||
1139 | |||
1140 | if ( written && | ||
1141 | ( p_failed || ((test_bit(R5_Insync, &pdev->flags) | ||
1142 | && !test_bit(R5_LOCKED, &pdev->flags) | ||
1143 | && test_bit(R5_UPTODATE, &pdev->flags))) ) && | ||
1144 | ( q_failed || ((test_bit(R5_Insync, &qdev->flags) | ||
1145 | && !test_bit(R5_LOCKED, &qdev->flags) | ||
1146 | && test_bit(R5_UPTODATE, &qdev->flags))) ) ) { | ||
1147 | /* any written block on an uptodate or failed drive can be | ||
1148 | * returned. Note that if we 'wrote' to a failed drive, | ||
1149 | * it will be UPTODATE, but never LOCKED, so we don't need | ||
1150 | * to test 'failed' directly. | ||
1151 | */ | ||
1152 | for (i=disks; i--; ) | ||
1153 | if (sh->dev[i].written) { | ||
1154 | dev = &sh->dev[i]; | ||
1155 | if (!test_bit(R5_LOCKED, &dev->flags) && | ||
1156 | test_bit(R5_UPTODATE, &dev->flags) ) { | ||
1157 | /* We can return any write requests */ | ||
1158 | struct bio *wbi, *wbi2; | ||
1159 | PRINTK("Return write for stripe %llu disc %d\n", | ||
1160 | (unsigned long long)sh->sector, i); | ||
1161 | spin_lock_irq(&conf->device_lock); | ||
1162 | wbi = dev->written; | ||
1163 | dev->written = NULL; | ||
1164 | while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { | ||
1165 | wbi2 = r5_next_bio(wbi, dev->sector); | ||
1166 | if (--wbi->bi_phys_segments == 0) { | ||
1167 | md_write_end(conf->mddev); | ||
1168 | wbi->bi_next = return_bi; | ||
1169 | return_bi = wbi; | ||
1170 | } | ||
1171 | wbi = wbi2; | ||
1172 | } | ||
1173 | spin_unlock_irq(&conf->device_lock); | ||
1174 | } | ||
1175 | } | ||
1176 | } | ||
1177 | |||
1178 | /* Now we might consider reading some blocks, either to check/generate | ||
1179 | * parity, or to satisfy requests | ||
1180 | * or to load a block that is being partially written. | ||
1181 | */ | ||
1182 | if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) { | ||
1183 | for (i=disks; i--;) { | ||
1184 | dev = &sh->dev[i]; | ||
1185 | if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && | ||
1186 | (dev->toread || | ||
1187 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || | ||
1188 | syncing || | ||
1189 | (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) || | ||
1190 | (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write)) | ||
1191 | ) | ||
1192 | ) { | ||
1193 | /* we would like to get this block, possibly | ||
1194 | * by computing it, but we might not be able to | ||
1195 | */ | ||
1196 | if (uptodate == disks-1) { | ||
1197 | PRINTK("Computing stripe %llu block %d\n", | ||
1198 | (unsigned long long)sh->sector, i); | ||
1199 | compute_block_1(sh, i); | ||
1200 | uptodate++; | ||
1201 | } else if ( uptodate == disks-2 && failed >= 2 ) { | ||
1202 | /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */ | ||
1203 | int other; | ||
1204 | for (other=disks; other--;) { | ||
1205 | if ( other == i ) | ||
1206 | continue; | ||
1207 | if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) ) | ||
1208 | break; | ||
1209 | } | ||
1210 | BUG_ON(other < 0); | ||
1211 | PRINTK("Computing stripe %llu blocks %d,%d\n", | ||
1212 | (unsigned long long)sh->sector, i, other); | ||
1213 | compute_block_2(sh, i, other); | ||
1214 | uptodate += 2; | ||
1215 | } else if (test_bit(R5_Insync, &dev->flags)) { | ||
1216 | set_bit(R5_LOCKED, &dev->flags); | ||
1217 | set_bit(R5_Wantread, &dev->flags); | ||
1218 | #if 0 | ||
1219 | /* if I am just reading this block and we don't have | ||
1220 |  a failed drive or any pending writes, then sidestep the cache */ | ||
1221 | if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext && | ||
1222 | ! syncing && !failed && !to_write) { | ||
1223 | sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page; | ||
1224 | sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data; | ||
1225 | } | ||
1226 | #endif | ||
1227 | locked++; | ||
1228 | PRINTK("Reading block %d (sync=%d)\n", | ||
1229 | i, syncing); | ||
1230 | if (syncing) | ||
1231 | md_sync_acct(conf->disks[i].rdev->bdev, | ||
1232 | STRIPE_SECTORS); | ||
1233 | } | ||
1234 | } | ||
1235 | } | ||
1236 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1237 | } | ||
1238 | |||
1239 | /* now to consider writing and what else, if anything should be read */ | ||
1240 | if (to_write) { | ||
1241 | int rcw=0, must_compute=0; | ||
1242 | for (i=disks ; i--;) { | ||
1243 | dev = &sh->dev[i]; | ||
1244 | /* Would I have to read this buffer for reconstruct_write */ | ||
1245 | if (!test_bit(R5_OVERWRITE, &dev->flags) | ||
1246 | && i != pd_idx && i != qd_idx | ||
1247 | && (!test_bit(R5_LOCKED, &dev->flags) | ||
1248 | #if 0 | ||
1249 | || sh->bh_page[i] != bh->b_page | ||
1250 | #endif | ||
1251 | ) && | ||
1252 | !test_bit(R5_UPTODATE, &dev->flags)) { | ||
1253 | if (test_bit(R5_Insync, &dev->flags)) rcw++; | ||
1254 | else { | ||
1255 | PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags); | ||
1256 | must_compute++; | ||
1257 | } | ||
1258 | } | ||
1259 | } | ||
1260 | PRINTK("for sector %llu, rcw=%d, must_compute=%d\n", | ||
1261 | (unsigned long long)sh->sector, rcw, must_compute); | ||
1262 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1263 | |||
1264 | if (rcw > 0) | ||
1265 | /* want reconstruct write, but need to get some data */ | ||
1266 | for (i=disks; i--;) { | ||
1267 | dev = &sh->dev[i]; | ||
1268 | if (!test_bit(R5_OVERWRITE, &dev->flags) | ||
1269 | && !(failed == 0 && (i == pd_idx || i == qd_idx)) | ||
1270 | && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && | ||
1271 | test_bit(R5_Insync, &dev->flags)) { | ||
1272 | if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
1273 | { | ||
1274 | PRINTK("Read_old stripe %llu block %d for Reconstruct\n", | ||
1275 | (unsigned long long)sh->sector, i); | ||
1276 | set_bit(R5_LOCKED, &dev->flags); | ||
1277 | set_bit(R5_Wantread, &dev->flags); | ||
1278 | locked++; | ||
1279 | } else { | ||
1280 | PRINTK("Request delayed stripe %llu block %d for Reconstruct\n", | ||
1281 | (unsigned long long)sh->sector, i); | ||
1282 | set_bit(STRIPE_DELAYED, &sh->state); | ||
1283 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1284 | } | ||
1285 | } | ||
1286 | } | ||
1287 | /* now if nothing is locked, and if we have enough data, we can start a write request */ | ||
1288 | if (locked == 0 && rcw == 0) { | ||
1289 | if ( must_compute > 0 ) { | ||
1290 | /* We have failed blocks and need to compute them */ | ||
1291 | switch ( failed ) { | ||
1292 | case 0: BUG(); | ||
1293 | case 1: compute_block_1(sh, failed_num[0]); break; | ||
1294 | case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break; | ||
1295 | default: BUG(); /* This request should have been failed? */ | ||
1296 | } | ||
1297 | } | ||
1298 | |||
1299 | PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector); | ||
1300 | compute_parity(sh, RECONSTRUCT_WRITE); | ||
1301 | /* now every locked buffer is ready to be written */ | ||
1302 | for (i=disks; i--;) | ||
1303 | if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { | ||
1304 | PRINTK("Writing stripe %llu block %d\n", | ||
1305 | (unsigned long long)sh->sector, i); | ||
1306 | locked++; | ||
1307 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
1308 | #if 0 /**** FIX: I don't understand the logic here... ****/ | ||
1309 | if (!test_bit(R5_Insync, &sh->dev[i].flags) | ||
1310 | || ((i==pd_idx || i==qd_idx) && failed == 0)) /* FIX? */ | ||
1311 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1312 | #endif | ||
1313 | } | ||
1314 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
1315 | atomic_dec(&conf->preread_active_stripes); | ||
1316 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) | ||
1317 | md_wakeup_thread(conf->mddev->thread); | ||
1318 | } | ||
1319 | } | ||
1320 | } | ||
1321 | |||
1322 | /* maybe we need to check and possibly fix the parity for this stripe | ||
1323 | * Any reads will already have been scheduled, so we just see if enough data | ||
1324 | * is available | ||
1325 | */ | ||
1326 | if (syncing && locked == 0 && | ||
1327 | !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 2) { | ||
1328 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1329 | #if 0 /* RAID-6: Don't support CHECK PARITY yet */ | ||
1330 | if (failed == 0) { | ||
1331 | char *pagea; | ||
1332 | if (uptodate != disks) | ||
1333 | BUG(); | ||
1334 | compute_parity(sh, CHECK_PARITY); | ||
1335 | uptodate--; | ||
1336 | pagea = page_address(sh->dev[pd_idx].page); | ||
1337 | if ((*(u32*)pagea) == 0 && | ||
1338 | !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) { | ||
1339 | /* parity is correct (on disc, not in buffer any more) */ | ||
1340 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1341 | } | ||
1342 | } | ||
1343 | #endif | ||
1344 | if (!test_bit(STRIPE_INSYNC, &sh->state)) { | ||
1345 | int failed_needupdate[2]; | ||
1346 | struct r5dev *adev, *bdev; | ||
1347 | |||
1348 | if ( failed < 1 ) | ||
1349 | failed_num[0] = pd_idx; | ||
1350 | if ( failed < 2 ) | ||
1351 | failed_num[1] = (failed_num[0] == qd_idx) ? pd_idx : qd_idx; | ||
1352 | |||
1353 | failed_needupdate[0] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[0]].flags); | ||
1354 | failed_needupdate[1] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[1]].flags); | ||
1355 | |||
1356 | PRINTK("sync: failed=%d num=%d,%d fnu=%u%u\n", | ||
1357 | failed, failed_num[0], failed_num[1], failed_needupdate[0], failed_needupdate[1]); | ||
1358 | |||
1359 | #if 0 /* RAID-6: This code seems to require that CHECK_PARITY destroys the uptodateness of the parity */ | ||
1360 | /* should be able to compute the missing block(s) and write to spare */ | ||
1361 | if ( failed_needupdate[0] ^ failed_needupdate[1] ) { | ||
1362 | if (uptodate+1 != disks) | ||
1363 | BUG(); | ||
1364 | compute_block_1(sh, failed_needupdate[0] ? failed_num[0] : failed_num[1]); | ||
1365 | uptodate++; | ||
1366 | } else if ( failed_needupdate[0] & failed_needupdate[1] ) { | ||
1367 | if (uptodate+2 != disks) | ||
1368 | BUG(); | ||
1369 | compute_block_2(sh, failed_num[0], failed_num[1]); | ||
1370 | uptodate += 2; | ||
1371 | } | ||
1372 | #else | ||
1373 | compute_block_2(sh, failed_num[0], failed_num[1]); | ||
1374 | uptodate += failed_needupdate[0] + failed_needupdate[1]; | ||
1375 | #endif | ||
1376 | |||
1377 | if (uptodate != disks) | ||
1378 | BUG(); | ||
1379 | |||
1380 | PRINTK("Marking for sync stripe %llu blocks %d,%d\n", | ||
1381 | (unsigned long long)sh->sector, failed_num[0], failed_num[1]); | ||
1382 | |||
1383 | /**** FIX: Should we really do both of these unconditionally? ****/ | ||
1384 | adev = &sh->dev[failed_num[0]]; | ||
1385 | locked += !test_bit(R5_LOCKED, &adev->flags); | ||
1386 | set_bit(R5_LOCKED, &adev->flags); | ||
1387 | set_bit(R5_Wantwrite, &adev->flags); | ||
1388 | bdev = &sh->dev[failed_num[1]]; | ||
1389 | locked += !test_bit(R5_LOCKED, &bdev->flags); | ||
1390 | set_bit(R5_LOCKED, &bdev->flags); | ||
1391 | set_bit(R5_Wantwrite, &bdev->flags); | ||
1392 | |||
1393 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1394 | set_bit(R5_Syncio, &adev->flags); | ||
1395 | set_bit(R5_Syncio, &bdev->flags); | ||
1396 | } | ||
1397 | } | ||
1398 | if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | ||
1399 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); | ||
1400 | clear_bit(STRIPE_SYNCING, &sh->state); | ||
1401 | } | ||
1402 | |||
1403 | spin_unlock(&sh->lock); | ||
1404 | |||
1405 | while ((bi=return_bi)) { | ||
1406 | int bytes = bi->bi_size; | ||
1407 | |||
1408 | return_bi = bi->bi_next; | ||
1409 | bi->bi_next = NULL; | ||
1410 | bi->bi_size = 0; | ||
1411 | bi->bi_end_io(bi, bytes, 0); | ||
1412 | } | ||
1413 | for (i=disks; i-- ;) { | ||
1414 | int rw; | ||
1415 | struct bio *bi; | ||
1416 | mdk_rdev_t *rdev; | ||
1417 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) | ||
1418 | rw = 1; | ||
1419 | else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) | ||
1420 | rw = 0; | ||
1421 | else | ||
1422 | continue; | ||
1423 | |||
1424 | bi = &sh->dev[i].req; | ||
1425 | |||
1426 | bi->bi_rw = rw; | ||
1427 | if (rw) | ||
1428 | bi->bi_end_io = raid6_end_write_request; | ||
1429 | else | ||
1430 | bi->bi_end_io = raid6_end_read_request; | ||
1431 | |||
1432 | rcu_read_lock(); | ||
1433 | rdev = conf->disks[i].rdev; | ||
1434 | if (rdev && rdev->faulty) | ||
1435 | rdev = NULL; | ||
1436 | if (rdev) | ||
1437 | atomic_inc(&rdev->nr_pending); | ||
1438 | rcu_read_unlock(); | ||
1439 | |||
1440 | if (rdev) { | ||
1441 | if (test_bit(R5_Syncio, &sh->dev[i].flags)) | ||
1442 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); | ||
1443 | |||
1444 | bi->bi_bdev = rdev->bdev; | ||
1445 | PRINTK("for %llu schedule op %ld on disc %d\n", | ||
1446 | (unsigned long long)sh->sector, bi->bi_rw, i); | ||
1447 | atomic_inc(&sh->count); | ||
1448 | bi->bi_sector = sh->sector + rdev->data_offset; | ||
1449 | bi->bi_flags = 1 << BIO_UPTODATE; | ||
1450 | bi->bi_vcnt = 1; | ||
1451 | bi->bi_max_vecs = 1; | ||
1452 | bi->bi_idx = 0; | ||
1453 | bi->bi_io_vec = &sh->dev[i].vec; | ||
1454 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; | ||
1455 | bi->bi_io_vec[0].bv_offset = 0; | ||
1456 | bi->bi_size = STRIPE_SIZE; | ||
1457 | bi->bi_next = NULL; | ||
1458 | generic_make_request(bi); | ||
1459 | } else { | ||
1460 | PRINTK("skip op %ld on disc %d for sector %llu\n", | ||
1461 | bi->bi_rw, i, (unsigned long long)sh->sector); | ||
1462 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
1463 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1464 | } | ||
1465 | } | ||
1466 | } | ||
1467 | |||
1468 | static inline void raid6_activate_delayed(raid6_conf_t *conf) | ||
1469 | { | ||
1470 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { | ||
1471 | while (!list_empty(&conf->delayed_list)) { | ||
1472 | struct list_head *l = conf->delayed_list.next; | ||
1473 | struct stripe_head *sh; | ||
1474 | sh = list_entry(l, struct stripe_head, lru); | ||
1475 | list_del_init(l); | ||
1476 | clear_bit(STRIPE_DELAYED, &sh->state); | ||
1477 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
1478 | atomic_inc(&conf->preread_active_stripes); | ||
1479 | list_add_tail(&sh->lru, &conf->handle_list); | ||
1480 | } | ||
1481 | } | ||
1482 | } | ||
1483 | |||
1484 | static void unplug_slaves(mddev_t *mddev) | ||
1485 | { | ||
1486 | raid6_conf_t *conf = mddev_to_conf(mddev); | ||
1487 | int i; | ||
1488 | |||
1489 | rcu_read_lock(); | ||
1490 | for (i=0; i<mddev->raid_disks; i++) { | ||
1491 | mdk_rdev_t *rdev = conf->disks[i].rdev; | ||
1492 | if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { | ||
1493 | request_queue_t *r_queue = bdev_get_queue(rdev->bdev); | ||
1494 | |||
1495 | atomic_inc(&rdev->nr_pending); | ||
1496 | rcu_read_unlock(); | ||
1497 | |||
1498 | if (r_queue->unplug_fn) | ||
1499 | r_queue->unplug_fn(r_queue); | ||
1500 | |||
1501 | rdev_dec_pending(rdev, mddev); | ||
1502 | rcu_read_lock(); | ||
1503 | } | ||
1504 | } | ||
1505 | rcu_read_unlock(); | ||
1506 | } | ||
1507 | |||
1508 | static void raid6_unplug_device(request_queue_t *q) | ||
1509 | { | ||
1510 | mddev_t *mddev = q->queuedata; | ||
1511 | raid6_conf_t *conf = mddev_to_conf(mddev); | ||
1512 | unsigned long flags; | ||
1513 | |||
1514 | spin_lock_irqsave(&conf->device_lock, flags); | ||
1515 | |||
1516 | if (blk_remove_plug(q)) | ||
1517 | raid6_activate_delayed(conf); | ||
1518 | md_wakeup_thread(mddev->thread); | ||
1519 | |||
1520 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
1521 | |||
1522 | unplug_slaves(mddev); | ||
1523 | } | ||
1524 | |||
1525 | static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk, | ||
1526 | sector_t *error_sector) | ||
1527 | { | ||
1528 | mddev_t *mddev = q->queuedata; | ||
1529 | raid6_conf_t *conf = mddev_to_conf(mddev); | ||
1530 | int i, ret = 0; | ||
1531 | |||
1532 | rcu_read_lock(); | ||
1533 | for (i=0; i<mddev->raid_disks && ret == 0; i++) { | ||
1534 | mdk_rdev_t *rdev = conf->disks[i].rdev; | ||
1535 | if (rdev && !rdev->faulty) { | ||
1536 | struct block_device *bdev = rdev->bdev; | ||
1537 | request_queue_t *r_queue = bdev_get_queue(bdev); | ||
1538 | |||
1539 | if (!r_queue->issue_flush_fn) | ||
1540 | ret = -EOPNOTSUPP; | ||
1541 | else { | ||
1542 | atomic_inc(&rdev->nr_pending); | ||
1543 | rcu_read_unlock(); | ||
1544 | ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, | ||
1545 | error_sector); | ||
1546 | rdev_dec_pending(rdev, mddev); | ||
1547 | rcu_read_lock(); | ||
1548 | } | ||
1549 | } | ||
1550 | } | ||
1551 | rcu_read_unlock(); | ||
1552 | return ret; | ||
1553 | } | ||
1554 | |||
1555 | static inline void raid6_plug_device(raid6_conf_t *conf) | ||
1556 | { | ||
1557 | spin_lock_irq(&conf->device_lock); | ||
1558 | blk_plug_device(conf->mddev->queue); | ||
1559 | spin_unlock_irq(&conf->device_lock); | ||
1560 | } | ||
1561 | |||
1562 | static int make_request (request_queue_t *q, struct bio * bi) | ||
1563 | { | ||
1564 | mddev_t *mddev = q->queuedata; | ||
1565 | raid6_conf_t *conf = mddev_to_conf(mddev); | ||
1566 | const unsigned int raid_disks = conf->raid_disks; | ||
1567 | const unsigned int data_disks = raid_disks - 2; | ||
1568 | unsigned int dd_idx, pd_idx; | ||
1569 | sector_t new_sector; | ||
1570 | sector_t logical_sector, last_sector; | ||
1571 | struct stripe_head *sh; | ||
1572 | |||
1573 | if (bio_data_dir(bi)==WRITE) { | ||
1574 | disk_stat_inc(mddev->gendisk, writes); | ||
1575 | disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi)); | ||
1576 | } else { | ||
1577 | disk_stat_inc(mddev->gendisk, reads); | ||
1578 | disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bi)); | ||
1579 | } | ||
1580 | |||
1581 | logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); | ||
1582 | last_sector = bi->bi_sector + (bi->bi_size>>9); | ||
1583 | |||
1584 | bi->bi_next = NULL; | ||
1585 | bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ | ||
1586 | if ( bio_data_dir(bi) == WRITE ) | ||
1587 | md_write_start(mddev); | ||
1588 | for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { | ||
1589 | DEFINE_WAIT(w); | ||
1590 | |||
1591 | new_sector = raid6_compute_sector(logical_sector, | ||
1592 | raid_disks, data_disks, &dd_idx, &pd_idx, conf); | ||
1593 | |||
1594 | PRINTK("raid6: make_request, sector %llu logical %llu\n", | ||
1595 | (unsigned long long)new_sector, | ||
1596 | (unsigned long long)logical_sector); | ||
1597 | |||
1598 | retry: | ||
1599 | prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); | ||
1600 | sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK)); | ||
1601 | if (sh) { | ||
1602 | if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { | ||
1603 | /* Add failed due to overlap. Flush everything | ||
1604 | * and wait a while | ||
1605 | */ | ||
1606 | raid6_unplug_device(mddev->queue); | ||
1607 | release_stripe(sh); | ||
1608 | schedule(); | ||
1609 | goto retry; | ||
1610 | } | ||
1611 | finish_wait(&conf->wait_for_overlap, &w); | ||
1612 | raid6_plug_device(conf); | ||
1613 | handle_stripe(sh); | ||
1614 | release_stripe(sh); | ||
1615 | } else { | ||
1616 | /* cannot get stripe for read-ahead, just give up */ | ||
1617 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
1618 | finish_wait(&conf->wait_for_overlap, &w); | ||
1619 | break; | ||
1620 | } | ||
1621 | |||
1622 | } | ||
1623 | spin_lock_irq(&conf->device_lock); | ||
1624 | if (--bi->bi_phys_segments == 0) { | ||
1625 | int bytes = bi->bi_size; | ||
1626 | |||
1627 | if ( bio_data_dir(bi) == WRITE ) | ||
1628 | md_write_end(mddev); | ||
1629 | bi->bi_size = 0; | ||
1630 | bi->bi_end_io(bi, bytes, 0); | ||
1631 | } | ||
1632 | spin_unlock_irq(&conf->device_lock); | ||
1633 | return 0; | ||
1634 | } | ||
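To illustrate the loop above with assumed numbers (4 KiB pages, so STRIPE_SECTORS = 8): a 16-sector write starting at sector 1003 gives logical_sector = 1003 & ~7 = 1000 and last_sector = 1019, so the loop maps and handles three stripes at sectors 1000, 1008 and 1016. Each successful add_stripe_bio() bumps bi_phys_segments, and the bio completes only when that count drops back to zero, i.e. once the initial reference taken here and every stripe that attached the bio have all released it.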
1635 | |||
1636 | /* FIXME go_faster isn't used */ | ||
1637 | static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster) | ||
1638 | { | ||
1639 | raid6_conf_t *conf = (raid6_conf_t *) mddev->private; | ||
1640 | struct stripe_head *sh; | ||
1641 | int sectors_per_chunk = conf->chunk_size >> 9; | ||
1642 | sector_t x; | ||
1643 | unsigned long stripe; | ||
1644 | int chunk_offset; | ||
1645 | int dd_idx, pd_idx; | ||
1646 | sector_t first_sector; | ||
1647 | int raid_disks = conf->raid_disks; | ||
1648 | int data_disks = raid_disks - 2; | ||
1649 | |||
1650 | if (sector_nr >= mddev->size <<1) { | ||
1651 | /* just being told to finish up .. nothing much to do */ | ||
1652 | unplug_slaves(mddev); | ||
1653 | return 0; | ||
1654 | } | ||
1655 | /* if there are 2 or more failed drives and we are trying | ||
1656 | * to resync, then assert that we are finished, because there is | ||
1657 | * nothing we can do. | ||
1658 | */ | ||
1659 | if (mddev->degraded >= 2 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { | ||
1660 | int rv = (mddev->size << 1) - sector_nr; | ||
1661 | md_done_sync(mddev, rv, 1); | ||
1662 | return rv; | ||
1663 | } | ||
1664 | |||
1665 | x = sector_nr; | ||
1666 | chunk_offset = sector_div(x, sectors_per_chunk); | ||
1667 | stripe = x; | ||
1668 | BUG_ON(x != stripe); | ||
1669 | |||
1670 | first_sector = raid6_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk | ||
1671 | + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf); | ||
1672 | sh = get_active_stripe(conf, sector_nr, pd_idx, 1); | ||
1673 | if (sh == NULL) { | ||
1674 | sh = get_active_stripe(conf, sector_nr, pd_idx, 0); | ||
1675 | /* make sure we don't swamp the stripe cache if someone else | ||
1676 | * is trying to get access | ||
1677 | */ | ||
1678 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
1679 | schedule_timeout(1); | ||
1680 | } | ||
1681 | spin_lock(&sh->lock); | ||
1682 | set_bit(STRIPE_SYNCING, &sh->state); | ||
1683 | clear_bit(STRIPE_INSYNC, &sh->state); | ||
1684 | spin_unlock(&sh->lock); | ||
1685 | |||
1686 | handle_stripe(sh); | ||
1687 | release_stripe(sh); | ||
1688 | |||
1689 | return STRIPE_SECTORS; | ||
1690 | } | ||
1691 | |||
1692 | /* | ||
1693 | * This is our raid6 kernel thread. | ||
1694 | * | ||
1695 | * We scan the hash table for stripes which can be handled now. | ||
1696 | * During the scan, completed stripes are saved for us by the interrupt | ||
1697 | * handler, so that they will not have to wait for our next wakeup. | ||
1698 | */ | ||
1699 | static void raid6d (mddev_t *mddev) | ||
1700 | { | ||
1701 | struct stripe_head *sh; | ||
1702 | raid6_conf_t *conf = mddev_to_conf(mddev); | ||
1703 | int handled; | ||
1704 | |||
1705 | PRINTK("+++ raid6d active\n"); | ||
1706 | |||
1707 | md_check_recovery(mddev); | ||
1708 | md_handle_safemode(mddev); | ||
1709 | |||
1710 | handled = 0; | ||
1711 | spin_lock_irq(&conf->device_lock); | ||
1712 | while (1) { | ||
1713 | struct list_head *first; | ||
1714 | |||
1715 | if (list_empty(&conf->handle_list) && | ||
1716 | atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && | ||
1717 | !blk_queue_plugged(mddev->queue) && | ||
1718 | !list_empty(&conf->delayed_list)) | ||
1719 | raid6_activate_delayed(conf); | ||
1720 | |||
1721 | if (list_empty(&conf->handle_list)) | ||
1722 | break; | ||
1723 | |||
1724 | first = conf->handle_list.next; | ||
1725 | sh = list_entry(first, struct stripe_head, lru); | ||
1726 | |||
1727 | list_del_init(first); | ||
1728 | atomic_inc(&sh->count); | ||
1729 | if (atomic_read(&sh->count)!= 1) | ||
1730 | BUG(); | ||
1731 | spin_unlock_irq(&conf->device_lock); | ||
1732 | |||
1733 | handled++; | ||
1734 | handle_stripe(sh); | ||
1735 | release_stripe(sh); | ||
1736 | |||
1737 | spin_lock_irq(&conf->device_lock); | ||
1738 | } | ||
1739 | PRINTK("%d stripes handled\n", handled); | ||
1740 | |||
1741 | spin_unlock_irq(&conf->device_lock); | ||
1742 | |||
1743 | unplug_slaves(mddev); | ||
1744 | |||
1745 | PRINTK("--- raid6d inactive\n"); | ||
1746 | } | ||
1747 | |||
1748 | static int run (mddev_t *mddev) | ||
1749 | { | ||
1750 | raid6_conf_t *conf; | ||
1751 | int raid_disk, memory; | ||
1752 | mdk_rdev_t *rdev; | ||
1753 | struct disk_info *disk; | ||
1754 | struct list_head *tmp; | ||
1755 | |||
1756 | if (mddev->level != 6) { | ||
1757 | PRINTK("raid6: %s: raid level not set to 6 (%d)\n", mdname(mddev), mddev->level); | ||
1758 | return -EIO; | ||
1759 | } | ||
1760 | |||
1761 | mddev->private = kmalloc (sizeof (raid6_conf_t) | ||
1762 | + mddev->raid_disks * sizeof(struct disk_info), | ||
1763 | GFP_KERNEL); | ||
1764 | if ((conf = mddev->private) == NULL) | ||
1765 | goto abort; | ||
1766 | memset (conf, 0, sizeof (*conf) + mddev->raid_disks * sizeof(struct disk_info) ); | ||
1767 | conf->mddev = mddev; | ||
1768 | |||
1769 | if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) | ||
1770 | goto abort; | ||
1771 | memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); | ||
1772 | |||
1773 | spin_lock_init(&conf->device_lock); | ||
1774 | init_waitqueue_head(&conf->wait_for_stripe); | ||
1775 | init_waitqueue_head(&conf->wait_for_overlap); | ||
1776 | INIT_LIST_HEAD(&conf->handle_list); | ||
1777 | INIT_LIST_HEAD(&conf->delayed_list); | ||
1778 | INIT_LIST_HEAD(&conf->inactive_list); | ||
1779 | atomic_set(&conf->active_stripes, 0); | ||
1780 | atomic_set(&conf->preread_active_stripes, 0); | ||
1781 | |||
1782 | mddev->queue->unplug_fn = raid6_unplug_device; | ||
1783 | mddev->queue->issue_flush_fn = raid6_issue_flush; | ||
1784 | |||
1785 | PRINTK("raid6: run(%s) called.\n", mdname(mddev)); | ||
1786 | |||
1787 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
1788 | raid_disk = rdev->raid_disk; | ||
1789 | if (raid_disk >= mddev->raid_disks | ||
1790 | || raid_disk < 0) | ||
1791 | continue; | ||
1792 | disk = conf->disks + raid_disk; | ||
1793 | |||
1794 | disk->rdev = rdev; | ||
1795 | |||
1796 | if (rdev->in_sync) { | ||
1797 | char b[BDEVNAME_SIZE]; | ||
1798 | printk(KERN_INFO "raid6: device %s operational as raid" | ||
1799 | " disk %d\n", bdevname(rdev->bdev,b), | ||
1800 | raid_disk); | ||
1801 | conf->working_disks++; | ||
1802 | } | ||
1803 | } | ||
1804 | |||
1805 | conf->raid_disks = mddev->raid_disks; | ||
1806 | |||
1807 | /* | ||
1808 | * 0 for a fully functional array, 1 or 2 for a degraded array. | ||
1809 | */ | ||
1810 | mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks; | ||
1811 | conf->mddev = mddev; | ||
1812 | conf->chunk_size = mddev->chunk_size; | ||
1813 | conf->level = mddev->level; | ||
1814 | conf->algorithm = mddev->layout; | ||
1815 | conf->max_nr_stripes = NR_STRIPES; | ||
1816 | |||
1817 | /* device size must be a multiple of chunk size */ | ||
1818 | mddev->size &= ~(mddev->chunk_size/1024 -1); | ||
1819 | |||
1820 | if (conf->raid_disks < 4) { | ||
1821 | printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", | ||
1822 | mdname(mddev), conf->raid_disks); | ||
1823 | goto abort; | ||
1824 | } | ||
1825 | if (!conf->chunk_size || conf->chunk_size % 4) { | ||
1826 | printk(KERN_ERR "raid6: invalid chunk size %d for %s\n", | ||
1827 | conf->chunk_size, mdname(mddev)); | ||
1828 | goto abort; | ||
1829 | } | ||
1830 | if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { | ||
1831 | printk(KERN_ERR | ||
1832 | "raid6: unsupported parity algorithm %d for %s\n", | ||
1833 | conf->algorithm, mdname(mddev)); | ||
1834 | goto abort; | ||
1835 | } | ||
1836 | if (mddev->degraded > 2) { | ||
1837 | printk(KERN_ERR "raid6: not enough operational devices for %s" | ||
1838 | " (%d/%d failed)\n", | ||
1839 | mdname(mddev), conf->failed_disks, conf->raid_disks); | ||
1840 | goto abort; | ||
1841 | } | ||
1842 | |||
1843 | #if 0 /* FIX: For now */ | ||
1844 | if (mddev->degraded > 0 && | ||
1845 | mddev->recovery_cp != MaxSector) { | ||
1846 | printk(KERN_ERR "raid6: cannot start dirty degraded array for %s\n", mdname(mddev)); | ||
1847 | goto abort; | ||
1848 | } | ||
1849 | #endif | ||
1850 | |||
1851 | { | ||
1852 | mddev->thread = md_register_thread(raid6d, mddev, "%s_raid6"); | ||
1853 | if (!mddev->thread) { | ||
1854 | printk(KERN_ERR | ||
1855 | "raid6: couldn't allocate thread for %s\n", | ||
1856 | mdname(mddev)); | ||
1857 | goto abort; | ||
1858 | } | ||
1859 | } | ||
1860 | |||
1861 | memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + | ||
1862 | conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; | ||
1863 | if (grow_stripes(conf, conf->max_nr_stripes)) { | ||
1864 | printk(KERN_ERR | ||
1865 | "raid6: couldn't allocate %dkB for buffers\n", memory); | ||
1866 | shrink_stripes(conf); | ||
1867 | md_unregister_thread(mddev->thread); | ||
1868 | goto abort; | ||
1869 | } else | ||
1870 | printk(KERN_INFO "raid6: allocated %dkB for %s\n", | ||
1871 | memory, mdname(mddev)); | ||
1872 | |||
1873 | if (mddev->degraded == 0) | ||
1874 | printk(KERN_INFO "raid6: raid level %d set %s active with %d out of %d" | ||
1875 | " devices, algorithm %d\n", conf->level, mdname(mddev), | ||
1876 | mddev->raid_disks-mddev->degraded, mddev->raid_disks, | ||
1877 | conf->algorithm); | ||
1878 | else | ||
1879 | printk(KERN_ALERT "raid6: raid level %d set %s active with %d" | ||
1880 | " out of %d devices, algorithm %d\n", conf->level, | ||
1881 | mdname(mddev), mddev->raid_disks - mddev->degraded, | ||
1882 | mddev->raid_disks, conf->algorithm); | ||
1883 | |||
1884 | print_raid6_conf(conf); | ||
1885 | |||
1886 | /* read-ahead size must cover two whole stripes, which is | ||
1887 | * 2 * (n-2) * chunksize where 'n' is the number of raid devices | ||
1888 | */ | ||
1889 | { | ||
1890 | int stripe = (mddev->raid_disks-2) * mddev->chunk_size | ||
1891 | / PAGE_CACHE_SIZE; | ||
1892 | if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) | ||
1893 | mddev->queue->backing_dev_info.ra_pages = 2 * stripe; | ||
1894 | } | ||
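Illustrative arithmetic for the block above (assumed geometry): a 6-device set with 64 KiB chunks and 4 KiB pages gives stripe = (6-2) * 64 KiB / 4 KiB = 64 pages, so ra_pages is raised to at least 128 pages, i.e. 512 KiB of read-ahead covering two full data stripes.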
1895 | |||
1896 | /* Ok, everything is just fine now */ | ||
1897 | mddev->array_size = mddev->size * (mddev->raid_disks - 2); | ||
1898 | return 0; | ||
1899 | abort: | ||
1900 | if (conf) { | ||
1901 | print_raid6_conf(conf); | ||
1902 | if (conf->stripe_hashtbl) | ||
1903 | free_pages((unsigned long) conf->stripe_hashtbl, | ||
1904 | HASH_PAGES_ORDER); | ||
1905 | kfree(conf); | ||
1906 | } | ||
1907 | mddev->private = NULL; | ||
1908 | printk(KERN_ALERT "raid6: failed to run raid set %s\n", mdname(mddev)); | ||
1909 | return -EIO; | ||
1910 | } | ||
1911 | |||
1912 | |||
1913 | |||
1914 | static int stop (mddev_t *mddev) | ||
1915 | { | ||
1916 | raid6_conf_t *conf = (raid6_conf_t *) mddev->private; | ||
1917 | |||
1918 | md_unregister_thread(mddev->thread); | ||
1919 | mddev->thread = NULL; | ||
1920 | shrink_stripes(conf); | ||
1921 | free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); | ||
1922 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | ||
1923 | kfree(conf); | ||
1924 | mddev->private = NULL; | ||
1925 | return 0; | ||
1926 | } | ||
1927 | |||
1928 | #if RAID6_DUMPSTATE | ||
1929 | static void print_sh (struct seq_file *seq, struct stripe_head *sh) | ||
1930 | { | ||
1931 | int i; | ||
1932 | |||
1933 | seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n", | ||
1934 | (unsigned long long)sh->sector, sh->pd_idx, sh->state); | ||
1935 | seq_printf(seq, "sh %llu, count %d.\n", | ||
1936 | (unsigned long long)sh->sector, atomic_read(&sh->count)); | ||
1937 | seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector); | ||
1938 | for (i = 0; i < sh->raid_conf->raid_disks; i++) { | ||
1939 | seq_printf(seq, "(cache%d: %p %ld) ", | ||
1940 | i, sh->dev[i].page, sh->dev[i].flags); | ||
1941 | } | ||
1942 | seq_printf(seq, "\n"); | ||
1943 | } | ||
1944 | |||
1945 | static void printall (struct seq_file *seq, raid6_conf_t *conf) | ||
1946 | { | ||
1947 | struct stripe_head *sh; | ||
1948 | int i; | ||
1949 | |||
1950 | spin_lock_irq(&conf->device_lock); | ||
1951 | for (i = 0; i < NR_HASH; i++) { | ||
1952 | sh = conf->stripe_hashtbl[i]; | ||
1953 | for (; sh; sh = sh->hash_next) { | ||
1954 | if (sh->raid_conf != conf) | ||
1955 | continue; | ||
1956 | print_sh(seq, sh); | ||
1957 | } | ||
1958 | } | ||
1959 | spin_unlock_irq(&conf->device_lock); | ||
1960 | } | ||
1961 | #endif | ||
1962 | |||
1963 | static void status (struct seq_file *seq, mddev_t *mddev) | ||
1964 | { | ||
1965 | raid6_conf_t *conf = (raid6_conf_t *) mddev->private; | ||
1966 | int i; | ||
1967 | |||
1968 | seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout); | ||
1969 | seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks); | ||
1970 | for (i = 0; i < conf->raid_disks; i++) | ||
1971 | seq_printf (seq, "%s", | ||
1972 | conf->disks[i].rdev && | ||
1973 | conf->disks[i].rdev->in_sync ? "U" : "_"); | ||
1974 | seq_printf (seq, "]"); | ||
1975 | #if RAID6_DUMPSTATE | ||
1976 | seq_printf (seq, "\n"); | ||
1977 | printall(seq, conf); | ||
1978 | #endif | ||
1979 | } | ||
1980 | |||
1981 | static void print_raid6_conf (raid6_conf_t *conf) | ||
1982 | { | ||
1983 | int i; | ||
1984 | struct disk_info *tmp; | ||
1985 | |||
1986 | printk("RAID6 conf printout:\n"); | ||
1987 | if (!conf) { | ||
1988 | printk("(conf==NULL)\n"); | ||
1989 | return; | ||
1990 | } | ||
1991 | printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks, | ||
1992 | conf->working_disks, conf->failed_disks); | ||
1993 | |||
1994 | for (i = 0; i < conf->raid_disks; i++) { | ||
1995 | char b[BDEVNAME_SIZE]; | ||
1996 | tmp = conf->disks + i; | ||
1997 | if (tmp->rdev) | ||
1998 | printk(" disk %d, o:%d, dev:%s\n", | ||
1999 | i, !tmp->rdev->faulty, | ||
2000 | bdevname(tmp->rdev->bdev,b)); | ||
2001 | } | ||
2002 | } | ||
2003 | |||
2004 | static int raid6_spare_active(mddev_t *mddev) | ||
2005 | { | ||
2006 | int i; | ||
2007 | raid6_conf_t *conf = mddev->private; | ||
2008 | struct disk_info *tmp; | ||
2009 | |||
2010 | for (i = 0; i < conf->raid_disks; i++) { | ||
2011 | tmp = conf->disks + i; | ||
2012 | if (tmp->rdev | ||
2013 | && !tmp->rdev->faulty | ||
2014 | && !tmp->rdev->in_sync) { | ||
2015 | mddev->degraded--; | ||
2016 | conf->failed_disks--; | ||
2017 | conf->working_disks++; | ||
2018 | tmp->rdev->in_sync = 1; | ||
2019 | } | ||
2020 | } | ||
2021 | print_raid6_conf(conf); | ||
2022 | return 0; | ||
2023 | } | ||
2024 | |||
2025 | static int raid6_remove_disk(mddev_t *mddev, int number) | ||
2026 | { | ||
2027 | raid6_conf_t *conf = mddev->private; | ||
2028 | int err = 0; | ||
2029 | mdk_rdev_t *rdev; | ||
2030 | struct disk_info *p = conf->disks + number; | ||
2031 | |||
2032 | print_raid6_conf(conf); | ||
2033 | rdev = p->rdev; | ||
2034 | if (rdev) { | ||
2035 | if (rdev->in_sync || | ||
2036 | atomic_read(&rdev->nr_pending)) { | ||
2037 | err = -EBUSY; | ||
2038 | goto abort; | ||
2039 | } | ||
2040 | p->rdev = NULL; | ||
2041 | synchronize_kernel(); | ||
2042 | if (atomic_read(&rdev->nr_pending)) { | ||
2043 | /* lost the race, try later */ | ||
2044 | err = -EBUSY; | ||
2045 | p->rdev = rdev; | ||
2046 | } | ||
2047 | } | ||
2048 | |||
2049 | abort: | ||
2050 | |||
2051 | print_raid6_conf(conf); | ||
2052 | return err; | ||
2053 | } | ||
2054 | |||
2055 | static int raid6_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | ||
2056 | { | ||
2057 | raid6_conf_t *conf = mddev->private; | ||
2058 | int found = 0; | ||
2059 | int disk; | ||
2060 | struct disk_info *p; | ||
2061 | |||
2062 | if (mddev->degraded > 2) | ||
2063 | /* no point adding a device */ | ||
2064 | return 0; | ||
2065 | /* | ||
2066 | * find the disk ... | ||
2067 | */ | ||
2068 | for (disk=0; disk < mddev->raid_disks; disk++) | ||
2069 | if ((p=conf->disks + disk)->rdev == NULL) { | ||
2070 | rdev->in_sync = 0; | ||
2071 | rdev->raid_disk = disk; | ||
2072 | found = 1; | ||
2073 | p->rdev = rdev; | ||
2074 | break; | ||
2075 | } | ||
2076 | print_raid6_conf(conf); | ||
2077 | return found; | ||
2078 | } | ||
2079 | |||
2080 | static int raid6_resize(mddev_t *mddev, sector_t sectors) | ||
2081 | { | ||
2082 | /* no resync is happening, and there is enough space | ||
2083 | * on all devices, so we can resize. | ||
2084 | * We need to make sure resync covers any new space. | ||
2085 | * If the array is shrinking we should possibly wait until | ||
2086 | * any io in the removed space completes, but it hardly seems | ||
2087 | * worth it. | ||
2088 | */ | ||
2089 | sectors &= ~((sector_t)mddev->chunk_size/512 - 1); | ||
2090 | mddev->array_size = (sectors * (mddev->raid_disks-2))>>1; | ||
2091 | set_capacity(mddev->gendisk, mddev->array_size << 1); | ||
2092 | mddev->changed = 1; | ||
2093 | if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { | ||
2094 | mddev->recovery_cp = mddev->size << 1; | ||
2095 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
2096 | } | ||
2097 | mddev->size = sectors /2; | ||
2098 | return 0; | ||
2099 | } | ||
2100 | |||
2101 | static mdk_personality_t raid6_personality= | ||
2102 | { | ||
2103 | .name = "raid6", | ||
2104 | .owner = THIS_MODULE, | ||
2105 | .make_request = make_request, | ||
2106 | .run = run, | ||
2107 | .stop = stop, | ||
2108 | .status = status, | ||
2109 | .error_handler = error, | ||
2110 | .hot_add_disk = raid6_add_disk, | ||
2111 | .hot_remove_disk= raid6_remove_disk, | ||
2112 | .spare_active = raid6_spare_active, | ||
2113 | .sync_request = sync_request, | ||
2114 | .resize = raid6_resize, | ||
2115 | }; | ||
2116 | |||
2117 | static int __init raid6_init (void) | ||
2118 | { | ||
2119 | int e; | ||
2120 | |||
2121 | e = raid6_select_algo(); | ||
2122 | if ( e ) | ||
2123 | return e; | ||
2124 | |||
2125 | return register_md_personality (RAID6, &raid6_personality); | ||
2126 | } | ||
2127 | |||
2128 | static void raid6_exit (void) | ||
2129 | { | ||
2130 | unregister_md_personality (RAID6); | ||
2131 | } | ||
2132 | |||
2133 | module_init(raid6_init); | ||
2134 | module_exit(raid6_exit); | ||
2135 | MODULE_LICENSE("GPL"); | ||
2136 | MODULE_ALIAS("md-personality-8"); /* RAID6 */ | ||
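The arithmetic in raid6_resize() above mixes three units: member devices are measured in 512-byte sectors, mddev->array_size is kept in 1K blocks, and set_capacity() wants sectors again. Each member is first rounded down to a whole number of chunks, and only raid_disks - 2 members contribute capacity, since two devices' worth of space holds the P and Q blocks. A minimal user-space sketch of the same calculation; the helper name and plain types are illustrative rather than md driver API, and chunk sizes are assumed to be powers of two, as md requires:

	#include <stdio.h>

	typedef unsigned long long sector_t;

	/* Usable RAID-6 capacity in 1K blocks for one member size. */
	static sector_t raid6_array_kb(sector_t dev_sectors, int raid_disks,
				       unsigned int chunk_bytes)
	{
		sector_t chunk_sectors = chunk_bytes / 512;

		/* Round the member down to a whole number of chunks. */
		dev_sectors &= ~(chunk_sectors - 1);
		/* Two devices' worth of space holds P and Q. */
		return (dev_sectors * (raid_disks - 2)) >> 1;
	}

	int main(void)
	{
		/* Example: six members of 100000000 sectors, 64 KiB chunks. */
		sector_t kb = raid6_array_kb(100000000ULL, 6, 65536);

		printf("array size: %llu KiB = %llu sectors\n", kb, kb << 1);
		return 0;
	}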
diff --git a/drivers/md/raid6mmx.c b/drivers/md/raid6mmx.c new file mode 100644 index 000000000000..359157aaf9e0 --- /dev/null +++ b/drivers/md/raid6mmx.c | |||
@@ -0,0 +1,150 @@ | |||
1 | /* -*- linux-c -*- ------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2002 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | the Free Software Foundation, Inc., 59 Temple Place Ste 330, | ||
8 | Boston MA 02111-1307, USA; either version 2 of the License, or | ||
9 | * (at your option) any later version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | /* | ||
14 | * raid6mmx.c | ||
15 | * | ||
16 | * MMX implementation of RAID-6 syndrome functions | ||
17 | */ | ||
18 | |||
19 | #if defined(__i386__) | ||
20 | |||
21 | #include "raid6.h" | ||
22 | #include "raid6x86.h" | ||
23 | |||
24 | /* Shared with raid6sse1.c */ | ||
25 | const struct raid6_mmx_constants { | ||
26 | u64 x1d; | ||
27 | } raid6_mmx_constants = { | ||
28 | 0x1d1d1d1d1d1d1d1dULL, | ||
29 | }; | ||
30 | |||
31 | static int raid6_have_mmx(void) | ||
32 | { | ||
33 | #ifdef __KERNEL__ | ||
34 | /* Not really "boot_cpu" but "all_cpus" */ | ||
35 | return boot_cpu_has(X86_FEATURE_MMX); | ||
36 | #else | ||
37 | /* User space test code */ | ||
38 | u32 features = cpuid_features(); | ||
39 | return ( (features & (1<<23)) == (1<<23) ); | ||
40 | #endif | ||
41 | } | ||
42 | |||
43 | /* | ||
44 | * Plain MMX implementation | ||
45 | */ | ||
46 | static void raid6_mmx1_gen_syndrome(int disks, size_t bytes, void **ptrs) | ||
47 | { | ||
48 | u8 **dptr = (u8 **)ptrs; | ||
49 | u8 *p, *q; | ||
50 | int d, z, z0; | ||
51 | raid6_mmx_save_t sa; | ||
52 | |||
53 | z0 = disks - 3; /* Highest data disk */ | ||
54 | p = dptr[z0+1]; /* XOR parity */ | ||
55 | q = dptr[z0+2]; /* RS syndrome */ | ||
56 | |||
57 | raid6_before_mmx(&sa); | ||
58 | |||
59 | asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); | ||
60 | asm volatile("pxor %mm5,%mm5"); /* Zero temp */ | ||
61 | |||
62 | for ( d = 0 ; d < bytes ; d += 8 ) { | ||
63 | asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ | ||
64 | asm volatile("movq %mm2,%mm4"); /* Q[0] */ | ||
65 | for ( z = z0-1 ; z >= 0 ; z-- ) { | ||
66 | asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d])); | ||
67 | asm volatile("pcmpgtb %mm4,%mm5"); | ||
68 | asm volatile("paddb %mm4,%mm4"); | ||
69 | asm volatile("pand %mm0,%mm5"); | ||
70 | asm volatile("pxor %mm5,%mm4"); | ||
71 | asm volatile("pxor %mm5,%mm5"); | ||
72 | asm volatile("pxor %mm6,%mm2"); | ||
73 | asm volatile("pxor %mm6,%mm4"); | ||
74 | } | ||
75 | asm volatile("movq %%mm2,%0" : "=m" (p[d])); | ||
76 | asm volatile("pxor %mm2,%mm2"); | ||
77 | asm volatile("movq %%mm4,%0" : "=m" (q[d])); | ||
78 | asm volatile("pxor %mm4,%mm4"); | ||
79 | } | ||
80 | |||
81 | raid6_after_mmx(&sa); | ||
82 | } | ||
83 | |||
84 | const struct raid6_calls raid6_mmxx1 = { | ||
85 | raid6_mmx1_gen_syndrome, | ||
86 | raid6_have_mmx, | ||
87 | "mmxx1", | ||
88 | 0 | ||
89 | }; | ||
90 | |||
91 | /* | ||
92 | * Unrolled-by-2 MMX implementation | ||
93 | */ | ||
94 | static void raid6_mmx2_gen_syndrome(int disks, size_t bytes, void **ptrs) | ||
95 | { | ||
96 | u8 **dptr = (u8 **)ptrs; | ||
97 | u8 *p, *q; | ||
98 | int d, z, z0; | ||
99 | raid6_mmx_save_t sa; | ||
100 | |||
101 | z0 = disks - 3; /* Highest data disk */ | ||
102 | p = dptr[z0+1]; /* XOR parity */ | ||
103 | q = dptr[z0+2]; /* RS syndrome */ | ||
104 | |||
105 | raid6_before_mmx(&sa); | ||
106 | |||
107 | asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); | ||
108 | asm volatile("pxor %mm5,%mm5"); /* Zero temp */ | ||
109 | asm volatile("pxor %mm7,%mm7"); /* Zero temp */ | ||
110 | |||
111 | for ( d = 0 ; d < bytes ; d += 16 ) { | ||
112 | asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ | ||
113 | asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); | ||
114 | asm volatile("movq %mm2,%mm4"); /* Q[0] */ | ||
115 | asm volatile("movq %mm3,%mm6"); /* Q[1] */ | ||
116 | for ( z = z0-1 ; z >= 0 ; z-- ) { | ||
117 | asm volatile("pcmpgtb %mm4,%mm5"); | ||
118 | asm volatile("pcmpgtb %mm6,%mm7"); | ||
119 | asm volatile("paddb %mm4,%mm4"); | ||
120 | asm volatile("paddb %mm6,%mm6"); | ||
121 | asm volatile("pand %mm0,%mm5"); | ||
122 | asm volatile("pand %mm0,%mm7"); | ||
123 | asm volatile("pxor %mm5,%mm4"); | ||
124 | asm volatile("pxor %mm7,%mm6"); | ||
125 | asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d])); | ||
126 | asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8])); | ||
127 | asm volatile("pxor %mm5,%mm2"); | ||
128 | asm volatile("pxor %mm7,%mm3"); | ||
129 | asm volatile("pxor %mm5,%mm4"); | ||
130 | asm volatile("pxor %mm7,%mm6"); | ||
131 | asm volatile("pxor %mm5,%mm5"); | ||
132 | asm volatile("pxor %mm7,%mm7"); | ||
133 | } | ||
134 | asm volatile("movq %%mm2,%0" : "=m" (p[d])); | ||
135 | asm volatile("movq %%mm3,%0" : "=m" (p[d+8])); | ||
136 | asm volatile("movq %%mm4,%0" : "=m" (q[d])); | ||
137 | asm volatile("movq %%mm6,%0" : "=m" (q[d+8])); | ||
138 | } | ||
139 | |||
140 | raid6_after_mmx(&sa); | ||
141 | } | ||
142 | |||
143 | const struct raid6_calls raid6_mmxx2 = { | ||
144 | raid6_mmx2_gen_syndrome, | ||
145 | raid6_have_mmx, | ||
146 | "mmxx2", | ||
147 | 0 | ||
148 | }; | ||
149 | |||
150 | #endif | ||
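The two MMX routines above compute, eight bytes at a time, the same per-byte recurrence as the generic C code: P is a running XOR, while Q is multiplied by 2 in GF(2^8) (reduction constant 0x1d, i.e. polynomial 0x11d) before each lower disk is folded in. The pcmpgtb/paddb/pand/pxor group is that multiply-by-2 done branch-free: pcmpgtb against a zeroed register yields 0xff in every byte whose top bit is set, paddb doubles each byte, and 0x1d is then XORed into exactly the bytes that overflowed. A scalar sketch of the same pass, with illustrative (non-kernel) names:

	#include <stddef.h>
	#include <stdint.h>

	/* GF(2^8) multiply-by-2; the MMX code builds the overflow mask
	 * branch-free with pcmpgtb instead of the conditional here. */
	static inline uint8_t gf2_times2(uint8_t v)
	{
		uint8_t overflow = (v & 0x80) ? 0x1d : 0x00;

		return (uint8_t)(v << 1) ^ overflow;
	}

	/* Same layout as the kernel code: dptr[0..disks-3] are data,
	 * dptr[disks-2] is P, dptr[disks-1] is Q. */
	static void gen_syndrome_scalar(int disks, size_t bytes, uint8_t **dptr)
	{
		int z0 = disks - 3;			/* highest data disk */
		uint8_t *p = dptr[z0 + 1];
		uint8_t *q = dptr[z0 + 2];

		for (size_t d = 0; d < bytes; d++) {
			uint8_t wp = dptr[z0][d];	/* P accumulator */
			uint8_t wq = wp;		/* Q accumulator */

			for (int z = z0 - 1; z >= 0; z--) {
				wq = gf2_times2(wq) ^ dptr[z][d];
				wp ^= dptr[z][d];
			}
			p[d] = wp;
			q[d] = wq;
		}
	}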
diff --git a/drivers/md/raid6recov.c b/drivers/md/raid6recov.c new file mode 100644 index 000000000000..a8c4d9451bd9 --- /dev/null +++ b/drivers/md/raid6recov.c | |||
@@ -0,0 +1,133 @@ | |||
1 | /* -*- linux-c -*- ------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2002 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | the Free Software Foundation, Inc., 59 Temple Place Ste 330, | ||
8 | Boston MA 02111-1307, USA; either version 2 of the License, or | ||
9 | * (at your option) any later version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | /* | ||
14 | * raid6recov.c | ||
15 | * | ||
16 | * RAID-6 data recovery in dual failure mode. In single failure mode, | ||
17 | * use the RAID-5 algorithm (or, in the case of Q failure, just reconstruct | ||
18 | * the syndrome.) | ||
19 | */ | ||
20 | |||
21 | #include "raid6.h" | ||
22 | |||
23 | /* Recover two failed data blocks. */ | ||
24 | void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, | ||
25 | void **ptrs) | ||
26 | { | ||
27 | u8 *p, *q, *dp, *dq; | ||
28 | u8 px, qx, db; | ||
29 | const u8 *pbmul; /* P multiplier table for B data */ | ||
30 | const u8 *qmul; /* Q multiplier table (for both) */ | ||
31 | |||
32 | p = (u8 *)ptrs[disks-2]; | ||
33 | q = (u8 *)ptrs[disks-1]; | ||
34 | |||
35 | /* Compute syndrome with zero for the missing data pages | ||
36 | Use the dead data pages as temporary storage for | ||
37 | delta p and delta q */ | ||
38 | dp = (u8 *)ptrs[faila]; | ||
39 | ptrs[faila] = (void *)raid6_empty_zero_page; | ||
40 | ptrs[disks-2] = dp; | ||
41 | dq = (u8 *)ptrs[failb]; | ||
42 | ptrs[failb] = (void *)raid6_empty_zero_page; | ||
43 | ptrs[disks-1] = dq; | ||
44 | |||
45 | raid6_call.gen_syndrome(disks, bytes, ptrs); | ||
46 | |||
47 | /* Restore pointer table */ | ||
48 | ptrs[faila] = dp; | ||
49 | ptrs[failb] = dq; | ||
50 | ptrs[disks-2] = p; | ||
51 | ptrs[disks-1] = q; | ||
52 | |||
53 | /* Now, pick the proper data tables */ | ||
54 | pbmul = raid6_gfmul[raid6_gfexi[failb-faila]]; | ||
55 | qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]]; | ||
56 | |||
57 | /* Now do it... */ | ||
58 | while ( bytes-- ) { | ||
59 | px = *p ^ *dp; | ||
60 | qx = qmul[*q ^ *dq]; | ||
61 | *dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */ | ||
62 | *dp++ = db ^ px; /* Reconstructed A */ | ||
63 | p++; q++; | ||
64 | } | ||
65 | } | ||
66 | |||
67 | |||
68 | |||
69 | |||
70 | /* Recover failure of one data block plus the P block */ | ||
71 | void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) | ||
72 | { | ||
73 | u8 *p, *q, *dq; | ||
74 | const u8 *qmul; /* Q multiplier table */ | ||
75 | |||
76 | p = (u8 *)ptrs[disks-2]; | ||
77 | q = (u8 *)ptrs[disks-1]; | ||
78 | |||
79 | /* Compute syndrome with zero for the missing data page | ||
80 | Use the dead data page as temporary storage for delta q */ | ||
81 | dq = (u8 *)ptrs[faila]; | ||
82 | ptrs[faila] = (void *)raid6_empty_zero_page; | ||
83 | ptrs[disks-1] = dq; | ||
84 | |||
85 | raid6_call.gen_syndrome(disks, bytes, ptrs); | ||
86 | |||
87 | /* Restore pointer table */ | ||
88 | ptrs[faila] = dq; | ||
89 | ptrs[disks-1] = q; | ||
90 | |||
91 | /* Now, pick the proper data tables */ | ||
92 | qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]]; | ||
93 | |||
94 | /* Now do it... */ | ||
95 | while ( bytes-- ) { | ||
96 | *p++ ^= *dq = qmul[*q ^ *dq]; | ||
97 | q++; dq++; | ||
98 | } | ||
99 | } | ||
100 | |||
101 | |||
102 | #ifndef __KERNEL__ /* Testing only */ | ||
103 | |||
104 | /* Recover two failed blocks. */ | ||
105 | void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs) | ||
106 | { | ||
107 | if ( faila > failb ) { | ||
108 | int tmp = faila; | ||
109 | faila = failb; | ||
110 | failb = tmp; | ||
111 | } | ||
112 | |||
113 | if ( failb == disks-1 ) { | ||
114 | if ( faila == disks-2 ) { | ||
115 | /* P+Q failure. Just rebuild the syndrome. */ | ||
116 | raid6_call.gen_syndrome(disks, bytes, ptrs); | ||
117 | } else { | ||
118 | /* data+Q failure. Reconstruct data from P, | ||
119 | then rebuild syndrome. */ | ||
120 | /* NOT IMPLEMENTED - equivalent to RAID-5 */ | ||
121 | } | ||
122 | } else { | ||
123 | if ( failb == disks-2 ) { | ||
124 | /* data+P failure. */ | ||
125 | raid6_datap_recov(disks, bytes, faila, ptrs); | ||
126 | } else { | ||
127 | /* data+data failure. */ | ||
128 | raid6_2data_recov(disks, bytes, faila, failb, ptrs); | ||
129 | } | ||
130 | } | ||
131 | } | ||
132 | |||
133 | #endif | ||
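Both recovery paths regenerate the syndrome with the failed blocks pointed at raid6_empty_zero_page and compare it against the stored parity. Writing + for GF(2^8) addition (which is XOR) and g = 2 for the generator the syndrome code uses, P = D_0 + ... + D_(n-1) and Q = g^0*D_0 + ... + g^(n-1)*D_(n-1). With D_a and D_b zeroed, the regenerated P' and Q' give

	delta_P = P + P' = D_a + D_b
	delta_Q = Q + Q' = g^a*D_a + g^b*D_b

and solving that two-by-two system over GF(2^8) yields

	D_b = ( delta_P + g^(-a)*delta_Q ) / ( g^(b-a) + 1 )
	D_a = delta_P + D_b

which is what the per-byte loop in raid6_2data_recov() evaluates: qmul multiplies by 1/(g^a + g^b), as its construction from raid6_gfinv and raid6_gfexp shows, and pbmul (built from raid6_gfexi) effectively multiplies by 1/(g^(b-a) + 1), so db = pbmul[px] ^ qx reconstructs D_b and db ^ px reconstructs D_a. In raid6_datap_recov() only one data block is missing, so delta_Q = g^a*D_a; the qmul lookup there multiplies by g^(-a) to recover D_a directly, and XORing the result into P repairs the parity block in the same pass.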
diff --git a/drivers/md/raid6sse1.c b/drivers/md/raid6sse1.c new file mode 100644 index 000000000000..f7e7859f71aa --- /dev/null +++ b/drivers/md/raid6sse1.c | |||
@@ -0,0 +1,171 @@ | |||
1 | /* -*- linux-c -*- ------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2002 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | the Free Software Foundation, Inc., 59 Temple Place Ste 330, | ||
8 | Boston MA 02111-1307, USA; either version 2 of the License, or | ||
9 | * (at your option) any later version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | /* | ||
14 | * raid6sse1.c | ||
15 | * | ||
16 | * SSE-1/MMXEXT implementation of RAID-6 syndrome functions | ||
17 | * | ||
18 | * This is really an MMX implementation, but it requires SSE-1 or | ||
19 | * AMD MMXEXT for prefetch support and a few other features. The | ||
20 | * support for nontemporal memory accesses is enough to make this | ||
21 | * worthwhile as a separate implementation. | ||
22 | */ | ||
23 | |||
24 | #if defined(__i386__) | ||
25 | |||
26 | #include "raid6.h" | ||
27 | #include "raid6x86.h" | ||
28 | |||
29 | /* Defined in raid6mmx.c */ | ||
30 | extern const struct raid6_mmx_constants { | ||
31 | u64 x1d; | ||
32 | } raid6_mmx_constants; | ||
33 | |||
34 | static int raid6_have_sse1_or_mmxext(void) | ||
35 | { | ||
36 | #ifdef __KERNEL__ | ||
37 | /* Not really boot_cpu but "all_cpus" */ | ||
38 | return boot_cpu_has(X86_FEATURE_MMX) && | ||
39 | (boot_cpu_has(X86_FEATURE_XMM) || | ||
40 | boot_cpu_has(X86_FEATURE_MMXEXT)); | ||
41 | #else | ||
42 | /* User space test code - this incorrectly breaks on some Athlons */ | ||
43 | u32 features = cpuid_features(); | ||
44 | return ( (features & (5<<23)) == (5<<23) ); | ||
45 | #endif | ||
46 | } | ||
47 | |||
48 | /* | ||
49 | * Plain SSE1 implementation | ||
50 | */ | ||
51 | static void raid6_sse11_gen_syndrome(int disks, size_t bytes, void **ptrs) | ||
52 | { | ||
53 | u8 **dptr = (u8 **)ptrs; | ||
54 | u8 *p, *q; | ||
55 | int d, z, z0; | ||
56 | raid6_mmx_save_t sa; | ||
57 | |||
58 | z0 = disks - 3; /* Highest data disk */ | ||
59 | p = dptr[z0+1]; /* XOR parity */ | ||
60 | q = dptr[z0+2]; /* RS syndrome */ | ||
61 | |||
62 | /* This is really MMX code, not SSE */ | ||
63 | raid6_before_mmx(&sa); | ||
64 | |||
65 | asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); | ||
66 | asm volatile("pxor %mm5,%mm5"); /* Zero temp */ | ||
67 | |||
68 | for ( d = 0 ; d < bytes ; d += 8 ) { | ||
69 | asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); | ||
70 | asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ | ||
71 | asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); | ||
72 | asm volatile("movq %mm2,%mm4"); /* Q[0] */ | ||
73 | asm volatile("movq %0,%%mm6" : : "m" (dptr[z0-1][d])); | ||
74 | for ( z = z0-2 ; z >= 0 ; z-- ) { | ||
75 | asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); | ||
76 | asm volatile("pcmpgtb %mm4,%mm5"); | ||
77 | asm volatile("paddb %mm4,%mm4"); | ||
78 | asm volatile("pand %mm0,%mm5"); | ||
79 | asm volatile("pxor %mm5,%mm4"); | ||
80 | asm volatile("pxor %mm5,%mm5"); | ||
81 | asm volatile("pxor %mm6,%mm2"); | ||
82 | asm volatile("pxor %mm6,%mm4"); | ||
83 | asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d])); | ||
84 | } | ||
85 | asm volatile("pcmpgtb %mm4,%mm5"); | ||
86 | asm volatile("paddb %mm4,%mm4"); | ||
87 | asm volatile("pand %mm0,%mm5"); | ||
88 | asm volatile("pxor %mm5,%mm4"); | ||
89 | asm volatile("pxor %mm5,%mm5"); | ||
90 | asm volatile("pxor %mm6,%mm2"); | ||
91 | asm volatile("pxor %mm6,%mm4"); | ||
92 | |||
93 | asm volatile("movntq %%mm2,%0" : "=m" (p[d])); | ||
94 | asm volatile("movntq %%mm4,%0" : "=m" (q[d])); | ||
95 | } | ||
96 | |||
97 | raid6_after_mmx(&sa); | ||
98 | asm volatile("sfence" : : : "memory"); | ||
99 | } | ||
100 | |||
101 | const struct raid6_calls raid6_sse1x1 = { | ||
102 | raid6_sse11_gen_syndrome, | ||
103 | raid6_have_sse1_or_mmxext, | ||
104 | "sse1x1", | ||
105 | 1 /* Has cache hints */ | ||
106 | }; | ||
107 | |||
108 | /* | ||
109 | * Unrolled-by-2 SSE1 implementation | ||
110 | */ | ||
111 | static void raid6_sse12_gen_syndrome(int disks, size_t bytes, void **ptrs) | ||
112 | { | ||
113 | u8 **dptr = (u8 **)ptrs; | ||
114 | u8 *p, *q; | ||
115 | int d, z, z0; | ||
116 | raid6_mmx_save_t sa; | ||
117 | |||
118 | z0 = disks - 3; /* Highest data disk */ | ||
119 | p = dptr[z0+1]; /* XOR parity */ | ||
120 | q = dptr[z0+2]; /* RS syndrome */ | ||
121 | |||
122 | raid6_before_mmx(&sa); | ||
123 | |||
124 | asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); | ||
125 | asm volatile("pxor %mm5,%mm5"); /* Zero temp */ | ||
126 | asm volatile("pxor %mm7,%mm7"); /* Zero temp */ | ||
127 | |||
128 | /* We uniformly assume a single prefetch covers at least 16 bytes */ | ||
129 | for ( d = 0 ; d < bytes ; d += 16 ) { | ||
130 | asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); | ||
131 | asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ | ||
132 | asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); /* P[1] */ | ||
133 | asm volatile("movq %mm2,%mm4"); /* Q[0] */ | ||
134 | asm volatile("movq %mm3,%mm6"); /* Q[1] */ | ||
135 | for ( z = z0-1 ; z >= 0 ; z-- ) { | ||
136 | asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); | ||
137 | asm volatile("pcmpgtb %mm4,%mm5"); | ||
138 | asm volatile("pcmpgtb %mm6,%mm7"); | ||
139 | asm volatile("paddb %mm4,%mm4"); | ||
140 | asm volatile("paddb %mm6,%mm6"); | ||
141 | asm volatile("pand %mm0,%mm5"); | ||
142 | asm volatile("pand %mm0,%mm7"); | ||
143 | asm volatile("pxor %mm5,%mm4"); | ||
144 | asm volatile("pxor %mm7,%mm6"); | ||
145 | asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d])); | ||
146 | asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8])); | ||
147 | asm volatile("pxor %mm5,%mm2"); | ||
148 | asm volatile("pxor %mm7,%mm3"); | ||
149 | asm volatile("pxor %mm5,%mm4"); | ||
150 | asm volatile("pxor %mm7,%mm6"); | ||
151 | asm volatile("pxor %mm5,%mm5"); | ||
152 | asm volatile("pxor %mm7,%mm7"); | ||
153 | } | ||
154 | asm volatile("movntq %%mm2,%0" : "=m" (p[d])); | ||
155 | asm volatile("movntq %%mm3,%0" : "=m" (p[d+8])); | ||
156 | asm volatile("movntq %%mm4,%0" : "=m" (q[d])); | ||
157 | asm volatile("movntq %%mm6,%0" : "=m" (q[d+8])); | ||
158 | } | ||
159 | |||
160 | raid6_after_mmx(&sa); | ||
161 | asm volatile("sfence" : :: "memory"); | ||
162 | } | ||
163 | |||
164 | const struct raid6_calls raid6_sse1x2 = { | ||
165 | raid6_sse12_gen_syndrome, | ||
166 | raid6_have_sse1_or_mmxext, | ||
167 | "sse1x2", | ||
168 | 1 /* Has cache hints */ | ||
169 | }; | ||
170 | |||
171 | #endif | ||
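Relative to the MMX versions, the SSE-1/MMXEXT routines above still do 64-bit MMX arithmetic; what they add is prefetchnta, which pulls upcoming data toward the CPU without displacing useful cache lines, and movntq, which writes P and Q with non-temporal stores that bypass the cache, with the trailing sfence making those stores globally visible before the FPU state is restored. The "single prefetch covers at least 16 bytes" comment reflects that the unrolled-by-2 loop issues one prefetchnta per 16-byte iteration and relies on cache lines being at least that large. The same streaming shape, written purely for illustration with GCC's portable prefetch builtin (locality hint 0 roughly corresponds to the non-temporal hint):

	#include <stddef.h>
	#include <stdint.h>

	/* Illustrative only: XOR src into dst while prefetching a little
	 * ahead of the current position with a non-temporal hint. */
	static void xor_streamed(uint8_t *dst, const uint8_t *src, size_t bytes)
	{
		for (size_t d = 0; d < bytes; d += 64) {
			/* Read-only, no temporal locality; prefetching past
			 * the end of the buffer is only a hint, so harmless. */
			__builtin_prefetch(&src[d + 256], 0, 0);

			for (size_t i = 0; i < 64 && d + i < bytes; i++)
				dst[d + i] ^= src[d + i];
		}
	}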
diff --git a/drivers/md/raid6sse2.c b/drivers/md/raid6sse2.c new file mode 100644 index 000000000000..b3aa7fe0877e --- /dev/null +++ b/drivers/md/raid6sse2.c | |||
@@ -0,0 +1,270 @@ | |||
1 | /* -*- linux-c -*- ------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2002 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | the Free Software Foundation, Inc., 59 Temple Place Ste 330, | ||
8 | Boston MA 02111-1307, USA; either version 2 of the License, or | ||
9 | * (at your option) any later version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | /* | ||
14 | * raid6sse2.c | ||
15 | * | ||
16 | * SSE-2 implementation of RAID-6 syndrome functions | ||
17 | * | ||
18 | */ | ||
19 | |||
20 | #if defined(__i386__) || defined(__x86_64__) | ||
21 | |||
22 | #include "raid6.h" | ||
23 | #include "raid6x86.h" | ||
24 | |||
25 | static const struct raid6_sse_constants { | ||
26 | u64 x1d[2]; | ||
27 | } raid6_sse_constants __attribute__((aligned(16))) = { | ||
28 | { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL }, | ||
29 | }; | ||
30 | |||
31 | static int raid6_have_sse2(void) | ||
32 | { | ||
33 | #ifdef __KERNEL__ | ||
34 | /* Not really boot_cpu but "all_cpus" */ | ||
35 | return boot_cpu_has(X86_FEATURE_MMX) && | ||
36 | boot_cpu_has(X86_FEATURE_FXSR) && | ||
37 | boot_cpu_has(X86_FEATURE_XMM) && | ||
38 | boot_cpu_has(X86_FEATURE_XMM2); | ||
39 | #else | ||
40 | /* User space test code */ | ||
41 | u32 features = cpuid_features(); | ||
42 | return ( (features & (15<<23)) == (15<<23) ); | ||
43 | #endif | ||
44 | } | ||
45 | |||
46 | /* | ||
47 | * Plain SSE2 implementation | ||
48 | */ | ||
49 | static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs) | ||
50 | { | ||
51 | u8 **dptr = (u8 **)ptrs; | ||
52 | u8 *p, *q; | ||
53 | int d, z, z0; | ||
54 | raid6_sse_save_t sa; | ||
55 | |||
56 | z0 = disks - 3; /* Highest data disk */ | ||
57 | p = dptr[z0+1]; /* XOR parity */ | ||
58 | q = dptr[z0+2]; /* RS syndrome */ | ||
59 | |||
60 | raid6_before_sse2(&sa); | ||
61 | |||
62 | asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); | ||
63 | asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ | ||
64 | |||
65 | for ( d = 0 ; d < bytes ; d += 16 ) { | ||
66 | asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); | ||
67 | asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */ | ||
68 | asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); | ||
69 | asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */ | ||
70 | asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d])); | ||
71 | for ( z = z0-2 ; z >= 0 ; z-- ) { | ||
72 | asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); | ||
73 | asm volatile("pcmpgtb %xmm4,%xmm5"); | ||
74 | asm volatile("paddb %xmm4,%xmm4"); | ||
75 | asm volatile("pand %xmm0,%xmm5"); | ||
76 | asm volatile("pxor %xmm5,%xmm4"); | ||
77 | asm volatile("pxor %xmm5,%xmm5"); | ||
78 | asm volatile("pxor %xmm6,%xmm2"); | ||
79 | asm volatile("pxor %xmm6,%xmm4"); | ||
80 | asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d])); | ||
81 | } | ||
82 | asm volatile("pcmpgtb %xmm4,%xmm5"); | ||
83 | asm volatile("paddb %xmm4,%xmm4"); | ||
84 | asm volatile("pand %xmm0,%xmm5"); | ||
85 | asm volatile("pxor %xmm5,%xmm4"); | ||
86 | asm volatile("pxor %xmm5,%xmm5"); | ||
87 | asm volatile("pxor %xmm6,%xmm2"); | ||
88 | asm volatile("pxor %xmm6,%xmm4"); | ||
89 | |||
90 | asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); | ||
91 | asm volatile("pxor %xmm2,%xmm2"); | ||
92 | asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); | ||
93 | asm volatile("pxor %xmm4,%xmm4"); | ||
94 | } | ||
95 | |||
96 | raid6_after_sse2(&sa); | ||
97 | asm volatile("sfence" : : : "memory"); | ||
98 | } | ||
99 | |||
100 | const struct raid6_calls raid6_sse2x1 = { | ||
101 | raid6_sse21_gen_syndrome, | ||
102 | raid6_have_sse2, | ||
103 | "sse2x1", | ||
104 | 1 /* Has cache hints */ | ||
105 | }; | ||
106 | |||
107 | /* | ||
108 | * Unrolled-by-2 SSE2 implementation | ||
109 | */ | ||
110 | static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs) | ||
111 | { | ||
112 | u8 **dptr = (u8 **)ptrs; | ||
113 | u8 *p, *q; | ||
114 | int d, z, z0; | ||
115 | raid6_sse_save_t sa; | ||
116 | |||
117 | z0 = disks - 3; /* Highest data disk */ | ||
118 | p = dptr[z0+1]; /* XOR parity */ | ||
119 | q = dptr[z0+2]; /* RS syndrome */ | ||
120 | |||
121 | raid6_before_sse2(&sa); | ||
122 | |||
123 | asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); | ||
124 | asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ | ||
125 | asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */ | ||
126 | |||
127 | /* We uniformly assume a single prefetch covers at least 32 bytes */ | ||
128 | for ( d = 0 ; d < bytes ; d += 32 ) { | ||
129 | asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); | ||
130 | asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */ | ||
131 | asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */ | ||
132 | asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */ | ||
133 | asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */ | ||
134 | for ( z = z0-1 ; z >= 0 ; z-- ) { | ||
135 | asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); | ||
136 | asm volatile("pcmpgtb %xmm4,%xmm5"); | ||
137 | asm volatile("pcmpgtb %xmm6,%xmm7"); | ||
138 | asm volatile("paddb %xmm4,%xmm4"); | ||
139 | asm volatile("paddb %xmm6,%xmm6"); | ||
140 | asm volatile("pand %xmm0,%xmm5"); | ||
141 | asm volatile("pand %xmm0,%xmm7"); | ||
142 | asm volatile("pxor %xmm5,%xmm4"); | ||
143 | asm volatile("pxor %xmm7,%xmm6"); | ||
144 | asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d])); | ||
145 | asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16])); | ||
146 | asm volatile("pxor %xmm5,%xmm2"); | ||
147 | asm volatile("pxor %xmm7,%xmm3"); | ||
148 | asm volatile("pxor %xmm5,%xmm4"); | ||
149 | asm volatile("pxor %xmm7,%xmm6"); | ||
150 | asm volatile("pxor %xmm5,%xmm5"); | ||
151 | asm volatile("pxor %xmm7,%xmm7"); | ||
152 | } | ||
153 | asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); | ||
154 | asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); | ||
155 | asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); | ||
156 | asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); | ||
157 | } | ||
158 | |||
159 | raid6_after_sse2(&sa); | ||
160 | asm volatile("sfence" : : : "memory"); | ||
161 | } | ||
162 | |||
163 | const struct raid6_calls raid6_sse2x2 = { | ||
164 | raid6_sse22_gen_syndrome, | ||
165 | raid6_have_sse2, | ||
166 | "sse2x2", | ||
167 | 1 /* Has cache hints */ | ||
168 | }; | ||
169 | |||
170 | #endif | ||
171 | |||
172 | #ifdef __x86_64__ | ||
173 | |||
174 | /* | ||
175 | * Unrolled-by-4 SSE2 implementation | ||
176 | */ | ||
177 | static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs) | ||
178 | { | ||
179 | u8 **dptr = (u8 **)ptrs; | ||
180 | u8 *p, *q; | ||
181 | int d, z, z0; | ||
182 | raid6_sse16_save_t sa; | ||
183 | |||
184 | z0 = disks - 3; /* Highest data disk */ | ||
185 | p = dptr[z0+1]; /* XOR parity */ | ||
186 | q = dptr[z0+2]; /* RS syndrome */ | ||
187 | |||
188 | raid6_before_sse16(&sa); | ||
189 | |||
190 | asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0])); | ||
191 | asm volatile("pxor %xmm2,%xmm2"); /* P[0] */ | ||
192 | asm volatile("pxor %xmm3,%xmm3"); /* P[1] */ | ||
193 | asm volatile("pxor %xmm4,%xmm4"); /* Q[0] */ | ||
194 | asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ | ||
195 | asm volatile("pxor %xmm6,%xmm6"); /* Q[1] */ | ||
196 | asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */ | ||
197 | asm volatile("pxor %xmm10,%xmm10"); /* P[2] */ | ||
198 | asm volatile("pxor %xmm11,%xmm11"); /* P[3] */ | ||
199 | asm volatile("pxor %xmm12,%xmm12"); /* Q[2] */ | ||
200 | asm volatile("pxor %xmm13,%xmm13"); /* Zero temp */ | ||
201 | asm volatile("pxor %xmm14,%xmm14"); /* Q[3] */ | ||
202 | asm volatile("pxor %xmm15,%xmm15"); /* Zero temp */ | ||
203 | |||
204 | for ( d = 0 ; d < bytes ; d += 64 ) { | ||
205 | for ( z = z0 ; z >= 0 ; z-- ) { | ||
206 | /* The second prefetch seems to improve performance... */ | ||
207 | asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); | ||
208 | asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32])); | ||
209 | asm volatile("pcmpgtb %xmm4,%xmm5"); | ||
210 | asm volatile("pcmpgtb %xmm6,%xmm7"); | ||
211 | asm volatile("pcmpgtb %xmm12,%xmm13"); | ||
212 | asm volatile("pcmpgtb %xmm14,%xmm15"); | ||
213 | asm volatile("paddb %xmm4,%xmm4"); | ||
214 | asm volatile("paddb %xmm6,%xmm6"); | ||
215 | asm volatile("paddb %xmm12,%xmm12"); | ||
216 | asm volatile("paddb %xmm14,%xmm14"); | ||
217 | asm volatile("pand %xmm0,%xmm5"); | ||
218 | asm volatile("pand %xmm0,%xmm7"); | ||
219 | asm volatile("pand %xmm0,%xmm13"); | ||
220 | asm volatile("pand %xmm0,%xmm15"); | ||
221 | asm volatile("pxor %xmm5,%xmm4"); | ||
222 | asm volatile("pxor %xmm7,%xmm6"); | ||
223 | asm volatile("pxor %xmm13,%xmm12"); | ||
224 | asm volatile("pxor %xmm15,%xmm14"); | ||
225 | asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); | ||
226 | asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); | ||
227 | asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32])); | ||
228 | asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48])); | ||
229 | asm volatile("pxor %xmm5,%xmm2"); | ||
230 | asm volatile("pxor %xmm7,%xmm3"); | ||
231 | asm volatile("pxor %xmm13,%xmm10"); | ||
232 | asm volatile("pxor %xmm15,%xmm11"); | ||
233 | asm volatile("pxor %xmm5,%xmm4"); | ||
234 | asm volatile("pxor %xmm7,%xmm6"); | ||
235 | asm volatile("pxor %xmm13,%xmm12"); | ||
236 | asm volatile("pxor %xmm15,%xmm14"); | ||
237 | asm volatile("pxor %xmm5,%xmm5"); | ||
238 | asm volatile("pxor %xmm7,%xmm7"); | ||
239 | asm volatile("pxor %xmm13,%xmm13"); | ||
240 | asm volatile("pxor %xmm15,%xmm15"); | ||
241 | } | ||
242 | asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); | ||
243 | asm volatile("pxor %xmm2,%xmm2"); | ||
244 | asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); | ||
245 | asm volatile("pxor %xmm3,%xmm3"); | ||
246 | asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32])); | ||
247 | asm volatile("pxor %xmm10,%xmm10"); | ||
248 | asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48])); | ||
249 | asm volatile("pxor %xmm11,%xmm11"); | ||
250 | asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); | ||
251 | asm volatile("pxor %xmm4,%xmm4"); | ||
252 | asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); | ||
253 | asm volatile("pxor %xmm6,%xmm6"); | ||
254 | asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32])); | ||
255 | asm volatile("pxor %xmm12,%xmm12"); | ||
256 | asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48])); | ||
257 | asm volatile("pxor %xmm14,%xmm14"); | ||
258 | } | ||
259 | asm volatile("sfence" : : : "memory"); | ||
260 | raid6_after_sse16(&sa); | ||
261 | } | ||
262 | |||
263 | const struct raid6_calls raid6_sse2x4 = { | ||
264 | raid6_sse24_gen_syndrome, | ||
265 | raid6_have_sse2, | ||
266 | "sse2x4", | ||
267 | 1 /* Has cache hints */ | ||
268 | }; | ||
269 | |||
270 | #endif | ||
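The x86-64-only unrolled-by-4 variant above is structured differently from the x1/x2 versions: instead of loading the highest data disk into the P and Q accumulators and peeling it out of the inner loop, it zeroes the accumulators and folds every disk, including the highest, inside the z loop. Four 16-byte lanes (64 bytes per outer iteration), four zero-mask temporaries and the 0x1d constant keep thirteen of the sixteen XMM registers live across the whole loop, which is why this variant is x86-64 only and needs the larger raid6_sse16_save_t save area declared in raid6x86.h.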
diff --git a/drivers/md/raid6test/Makefile b/drivers/md/raid6test/Makefile new file mode 100644 index 000000000000..557806728609 --- /dev/null +++ b/drivers/md/raid6test/Makefile | |||
@@ -0,0 +1,58 @@ | |||
1 | # | ||
2 | # This is a simple Makefile to test some of the RAID-6 code | ||
3 | # from userspace. | ||
4 | # | ||
5 | |||
6 | CC = gcc | ||
7 | OPTFLAGS = -O2 # Adjust as desired | ||
8 | CFLAGS = -I.. -g $(OPTFLAGS) | ||
9 | LD = ld | ||
10 | PERL = perl | ||
11 | |||
12 | .c.o: | ||
13 | $(CC) $(CFLAGS) -c -o $@ $< | ||
14 | |||
15 | %.c: ../%.c | ||
16 | cp -f $< $@ | ||
17 | |||
18 | %.uc: ../%.uc | ||
19 | cp -f $< $@ | ||
20 | |||
21 | all: raid6.o raid6test | ||
22 | |||
23 | raid6.o: raid6int1.o raid6int2.o raid6int4.o raid6int8.o raid6int16.o \ | ||
24 | raid6int32.o \ | ||
25 | raid6mmx.o raid6sse1.o raid6sse2.o \ | ||
26 | raid6recov.o raid6algos.o \ | ||
27 | raid6tables.o | ||
28 | $(LD) -r -o $@ $^ | ||
29 | |||
30 | raid6test: raid6.o test.c | ||
31 | $(CC) $(CFLAGS) -o raid6test $^ | ||
32 | |||
33 | raid6int1.c: raid6int.uc ../unroll.pl | ||
34 | $(PERL) ../unroll.pl 1 < raid6int.uc > $@ | ||
35 | |||
36 | raid6int2.c: raid6int.uc ../unroll.pl | ||
37 | $(PERL) ../unroll.pl 2 < raid6int.uc > $@ | ||
38 | |||
39 | raid6int4.c: raid6int.uc ../unroll.pl | ||
40 | $(PERL) ../unroll.pl 4 < raid6int.uc > $@ | ||
41 | |||
42 | raid6int8.c: raid6int.uc ../unroll.pl | ||
43 | $(PERL) ../unroll.pl 8 < raid6int.uc > $@ | ||
44 | |||
45 | raid6int16.c: raid6int.uc ../unroll.pl | ||
46 | $(PERL) ../unroll.pl 16 < raid6int.uc > $@ | ||
47 | |||
48 | raid6int32.c: raid6int.uc ../unroll.pl | ||
49 | $(PERL) ../unroll.pl 32 < raid6int.uc > $@ | ||
50 | |||
51 | raid6tables.c: mktables | ||
52 | ./mktables > raid6tables.c | ||
53 | |||
54 | clean: | ||
55 | rm -f *.o mktables mktables.c raid6int.uc raid6*.c raid6test | ||
56 | |||
57 | spotless: clean | ||
58 | rm -f *~ | ||
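With mktables.c, raid6int.uc, unroll.pl and the algorithm sources present one directory up, running make here should copy the sources in, build the mktables generator via make's built-in rules, produce the unrolled raid6int1.c through raid6int32.c variants and raid6tables.c, and link the raid6test binary; ./raid6test then exercises every algorithm the build and the host CPU support. make clean removes the copied and generated files, and make spotless additionally removes editor backups.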
diff --git a/drivers/md/raid6test/test.c b/drivers/md/raid6test/test.c new file mode 100644 index 000000000000..0d5cd57accd7 --- /dev/null +++ b/drivers/md/raid6test/test.c | |||
@@ -0,0 +1,103 @@ | |||
1 | /* -*- linux-c -*- ------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2002 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | the Free Software Foundation, Inc., 59 Temple Place Ste 330, | ||
8 | Boston MA 02111-1307, USA; either version 2 of the License, or | ||
9 | * (at your option) any later version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | /* | ||
14 | * raid6test.c | ||
15 | * | ||
16 | * Test RAID-6 recovery with various algorithms | ||
17 | */ | ||
18 | |||
19 | #include <stdlib.h> | ||
20 | #include <stdio.h> | ||
21 | #include <string.h> | ||
22 | #include "raid6.h" | ||
23 | |||
24 | #define NDISKS 16 /* Including P and Q */ | ||
25 | |||
26 | const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); | ||
27 | struct raid6_calls raid6_call; | ||
28 | |||
29 | char *dataptrs[NDISKS]; | ||
30 | char data[NDISKS][PAGE_SIZE]; | ||
31 | char recovi[PAGE_SIZE], recovj[PAGE_SIZE]; | ||
32 | |||
33 | void makedata(void) | ||
34 | { | ||
35 | int i, j; | ||
36 | |||
37 | for ( i = 0 ; i < NDISKS ; i++ ) { | ||
38 | for ( j = 0 ; j < PAGE_SIZE ; j++ ) { | ||
39 | data[i][j] = rand(); | ||
40 | } | ||
41 | dataptrs[i] = data[i]; | ||
42 | } | ||
43 | } | ||
44 | |||
45 | int main(int argc, char *argv[]) | ||
46 | { | ||
47 | const struct raid6_calls * const * algo; | ||
48 | int i, j; | ||
49 | int erra, errb; | ||
50 | |||
51 | makedata(); | ||
52 | |||
53 | for ( algo = raid6_algos ; *algo ; algo++ ) { | ||
54 | if ( !(*algo)->valid || (*algo)->valid() ) { | ||
55 | raid6_call = **algo; | ||
56 | |||
57 | /* Nuke syndromes */ | ||
58 | memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); | ||
59 | |||
60 | /* Generate assumed good syndrome */ | ||
61 | raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, (void **)&dataptrs); | ||
62 | |||
63 | for ( i = 0 ; i < NDISKS-1 ; i++ ) { | ||
64 | for ( j = i+1 ; j < NDISKS ; j++ ) { | ||
65 | memset(recovi, 0xf0, PAGE_SIZE); | ||
66 | memset(recovj, 0xba, PAGE_SIZE); | ||
67 | |||
68 | dataptrs[i] = recovi; | ||
69 | dataptrs[j] = recovj; | ||
70 | |||
71 | raid6_dual_recov(NDISKS, PAGE_SIZE, i, j, (void **)&dataptrs); | ||
72 | |||
73 | erra = memcmp(data[i], recovi, PAGE_SIZE); | ||
74 | errb = memcmp(data[j], recovj, PAGE_SIZE); | ||
75 | |||
76 | if ( i < NDISKS-2 && j == NDISKS-1 ) { | ||
77 | /* We don't implement the DQ failure scenario, since it's | ||
78 | equivalent to a RAID-5 failure (XOR, then recompute Q) */ | ||
79 | } else { | ||
80 | printf("algo=%-8s faila=%3d(%c) failb=%3d(%c) %s\n", | ||
81 | raid6_call.name, | ||
82 | i, (i==NDISKS-2)?'P':'D', | ||
83 | j, (j==NDISKS-1)?'Q':(j==NDISKS-2)?'P':'D', | ||
84 | (!erra && !errb) ? "OK" : | ||
85 | !erra ? "ERRB" : | ||
86 | !errb ? "ERRA" : | ||
87 | "ERRAB"); | ||
88 | } | ||
89 | |||
90 | dataptrs[i] = data[i]; | ||
91 | dataptrs[j] = data[j]; | ||
92 | } | ||
93 | } | ||
94 | } | ||
95 | printf("\n"); | ||
96 | } | ||
97 | |||
98 | printf("\n"); | ||
99 | /* Pick the best algorithm test */ | ||
100 | raid6_select_algo(); | ||
101 | |||
102 | return 0; | ||
103 | } | ||
diff --git a/drivers/md/raid6x86.h b/drivers/md/raid6x86.h new file mode 100644 index 000000000000..4cf20534fe44 --- /dev/null +++ b/drivers/md/raid6x86.h | |||
@@ -0,0 +1,245 @@ | |||
1 | /* ----------------------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | the Free Software Foundation, Inc., 59 Temple Place Ste 330, | ||
8 | Boston MA 02111-1307, USA; either version 2 of the License, or | ||
9 | * (at your option) any later version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | /* | ||
14 | * raid6x86.h | ||
15 | * | ||
16 | * Definitions common to x86 and x86-64 RAID-6 code only | ||
17 | */ | ||
18 | |||
19 | #ifndef LINUX_RAID_RAID6X86_H | ||
20 | #define LINUX_RAID_RAID6X86_H | ||
21 | |||
22 | #if defined(__i386__) || defined(__x86_64__) | ||
23 | |||
24 | #ifdef __x86_64__ | ||
25 | |||
26 | typedef struct { | ||
27 | unsigned int fsave[27]; | ||
28 | unsigned long cr0; | ||
29 | } raid6_mmx_save_t __attribute__((aligned(16))); | ||
30 | |||
31 | /* N.B.: For SSE we only save %xmm0-%xmm7 even for x86-64, since | ||
32 | the code doesn't know about the additional x86-64 registers */ | ||
33 | typedef struct { | ||
34 | unsigned int sarea[8*4+2]; | ||
35 | unsigned long cr0; | ||
36 | } raid6_sse_save_t __attribute__((aligned(16))); | ||
37 | |||
38 | /* This is for x86-64-specific code which uses all 16 XMM registers */ | ||
39 | typedef struct { | ||
40 | unsigned int sarea[16*4+2]; | ||
41 | unsigned long cr0; | ||
42 | } raid6_sse16_save_t __attribute__((aligned(16))); | ||
43 | |||
44 | /* On x86-64 the stack *SHOULD* be 16-byte aligned, but currently this | ||
45 | is buggy in the kernel and it's only 8-byte aligned in places, so | ||
46 | we need to do this anyway. Sigh. */ | ||
47 | #define SAREA(x) ((unsigned int *)((((unsigned long)&(x)->sarea)+15) & ~15)) | ||
48 | |||
49 | #else /* __i386__ */ | ||
50 | |||
51 | typedef struct { | ||
52 | unsigned int fsave[27]; | ||
53 | unsigned long cr0; | ||
54 | } raid6_mmx_save_t; | ||
55 | |||
56 | /* On i386, the stack is only 8-byte aligned, but SSE requires 16-byte | ||
57 | alignment. The +3 is so we have the slack space to manually align | ||
58 | a properly-sized area correctly. */ | ||
59 | typedef struct { | ||
60 | unsigned int sarea[8*4+3]; | ||
61 | unsigned long cr0; | ||
62 | } raid6_sse_save_t; | ||
63 | |||
64 | /* Find the 16-byte aligned save area */ | ||
65 | #define SAREA(x) ((unsigned int *)((((unsigned long)&(x)->sarea)+15) & ~15)) | ||
66 | |||
67 | #endif | ||
68 | |||
69 | #ifdef __KERNEL__ /* Real code */ | ||
70 | |||
71 | /* Note: %cr0 is 32 bits on i386 and 64 bits on x86-64 */ | ||
72 | |||
73 | static inline unsigned long raid6_get_fpu(void) | ||
74 | { | ||
75 | unsigned long cr0; | ||
76 | |||
77 | preempt_disable(); | ||
78 | asm volatile("mov %%cr0,%0 ; clts" : "=r" (cr0)); | ||
79 | return cr0; | ||
80 | } | ||
81 | |||
82 | static inline void raid6_put_fpu(unsigned long cr0) | ||
83 | { | ||
84 | asm volatile("mov %0,%%cr0" : : "r" (cr0)); | ||
85 | preempt_enable(); | ||
86 | } | ||
87 | |||
88 | #else /* Dummy code for user space testing */ | ||
89 | |||
90 | static inline unsigned long raid6_get_fpu(void) | ||
91 | { | ||
92 | return 0xf00ba6; | ||
93 | } | ||
94 | |||
95 | static inline void raid6_put_fpu(unsigned long cr0) | ||
96 | { | ||
97 | (void)cr0; | ||
98 | } | ||
99 | |||
100 | #endif | ||
101 | |||
102 | static inline void raid6_before_mmx(raid6_mmx_save_t *s) | ||
103 | { | ||
104 | s->cr0 = raid6_get_fpu(); | ||
105 | asm volatile("fsave %0 ; fwait" : "=m" (s->fsave[0])); | ||
106 | } | ||
107 | |||
108 | static inline void raid6_after_mmx(raid6_mmx_save_t *s) | ||
109 | { | ||
110 | asm volatile("frstor %0" : : "m" (s->fsave[0])); | ||
111 | raid6_put_fpu(s->cr0); | ||
112 | } | ||
113 | |||
114 | static inline void raid6_before_sse(raid6_sse_save_t *s) | ||
115 | { | ||
116 | unsigned int *rsa = SAREA(s); | ||
117 | |||
118 | s->cr0 = raid6_get_fpu(); | ||
119 | |||
120 | asm volatile("movaps %%xmm0,%0" : "=m" (rsa[0])); | ||
121 | asm volatile("movaps %%xmm1,%0" : "=m" (rsa[4])); | ||
122 | asm volatile("movaps %%xmm2,%0" : "=m" (rsa[8])); | ||
123 | asm volatile("movaps %%xmm3,%0" : "=m" (rsa[12])); | ||
124 | asm volatile("movaps %%xmm4,%0" : "=m" (rsa[16])); | ||
125 | asm volatile("movaps %%xmm5,%0" : "=m" (rsa[20])); | ||
126 | asm volatile("movaps %%xmm6,%0" : "=m" (rsa[24])); | ||
127 | asm volatile("movaps %%xmm7,%0" : "=m" (rsa[28])); | ||
128 | } | ||
129 | |||
130 | static inline void raid6_after_sse(raid6_sse_save_t *s) | ||
131 | { | ||
132 | unsigned int *rsa = SAREA(s); | ||
133 | |||
134 | asm volatile("movaps %0,%%xmm0" : : "m" (rsa[0])); | ||
135 | asm volatile("movaps %0,%%xmm1" : : "m" (rsa[4])); | ||
136 | asm volatile("movaps %0,%%xmm2" : : "m" (rsa[8])); | ||
137 | asm volatile("movaps %0,%%xmm3" : : "m" (rsa[12])); | ||
138 | asm volatile("movaps %0,%%xmm4" : : "m" (rsa[16])); | ||
139 | asm volatile("movaps %0,%%xmm5" : : "m" (rsa[20])); | ||
140 | asm volatile("movaps %0,%%xmm6" : : "m" (rsa[24])); | ||
141 | asm volatile("movaps %0,%%xmm7" : : "m" (rsa[28])); | ||
142 | |||
143 | raid6_put_fpu(s->cr0); | ||
144 | } | ||
145 | |||
146 | static inline void raid6_before_sse2(raid6_sse_save_t *s) | ||
147 | { | ||
148 | unsigned int *rsa = SAREA(s); | ||
149 | |||
150 | s->cr0 = raid6_get_fpu(); | ||
151 | |||
152 | asm volatile("movdqa %%xmm0,%0" : "=m" (rsa[0])); | ||
153 | asm volatile("movdqa %%xmm1,%0" : "=m" (rsa[4])); | ||
154 | asm volatile("movdqa %%xmm2,%0" : "=m" (rsa[8])); | ||
155 | asm volatile("movdqa %%xmm3,%0" : "=m" (rsa[12])); | ||
156 | asm volatile("movdqa %%xmm4,%0" : "=m" (rsa[16])); | ||
157 | asm volatile("movdqa %%xmm5,%0" : "=m" (rsa[20])); | ||
158 | asm volatile("movdqa %%xmm6,%0" : "=m" (rsa[24])); | ||
159 | asm volatile("movdqa %%xmm7,%0" : "=m" (rsa[28])); | ||
160 | } | ||
161 | |||
162 | static inline void raid6_after_sse2(raid6_sse_save_t *s) | ||
163 | { | ||
164 | unsigned int *rsa = SAREA(s); | ||
165 | |||
166 | asm volatile("movdqa %0,%%xmm0" : : "m" (rsa[0])); | ||
167 | asm volatile("movdqa %0,%%xmm1" : : "m" (rsa[4])); | ||
168 | asm volatile("movdqa %0,%%xmm2" : : "m" (rsa[8])); | ||
169 | asm volatile("movdqa %0,%%xmm3" : : "m" (rsa[12])); | ||
170 | asm volatile("movdqa %0,%%xmm4" : : "m" (rsa[16])); | ||
171 | asm volatile("movdqa %0,%%xmm5" : : "m" (rsa[20])); | ||
172 | asm volatile("movdqa %0,%%xmm6" : : "m" (rsa[24])); | ||
173 | asm volatile("movdqa %0,%%xmm7" : : "m" (rsa[28])); | ||
174 | |||
175 | raid6_put_fpu(s->cr0); | ||
176 | } | ||
177 | |||
178 | #ifdef __x86_64__ | ||
179 | |||
180 | static inline void raid6_before_sse16(raid6_sse16_save_t *s) | ||
181 | { | ||
182 | unsigned int *rsa = SAREA(s); | ||
183 | |||
184 | s->cr0 = raid6_get_fpu(); | ||
185 | |||
186 | asm volatile("movdqa %%xmm0,%0" : "=m" (rsa[0])); | ||
187 | asm volatile("movdqa %%xmm1,%0" : "=m" (rsa[4])); | ||
188 | asm volatile("movdqa %%xmm2,%0" : "=m" (rsa[8])); | ||
189 | asm volatile("movdqa %%xmm3,%0" : "=m" (rsa[12])); | ||
190 | asm volatile("movdqa %%xmm4,%0" : "=m" (rsa[16])); | ||
191 | asm volatile("movdqa %%xmm5,%0" : "=m" (rsa[20])); | ||
192 | asm volatile("movdqa %%xmm6,%0" : "=m" (rsa[24])); | ||
193 | asm volatile("movdqa %%xmm7,%0" : "=m" (rsa[28])); | ||
194 | asm volatile("movdqa %%xmm8,%0" : "=m" (rsa[32])); | ||
195 | asm volatile("movdqa %%xmm9,%0" : "=m" (rsa[36])); | ||
196 | asm volatile("movdqa %%xmm10,%0" : "=m" (rsa[40])); | ||
197 | asm volatile("movdqa %%xmm11,%0" : "=m" (rsa[44])); | ||
198 | asm volatile("movdqa %%xmm12,%0" : "=m" (rsa[48])); | ||
199 | asm volatile("movdqa %%xmm13,%0" : "=m" (rsa[52])); | ||
200 | asm volatile("movdqa %%xmm14,%0" : "=m" (rsa[56])); | ||
201 | asm volatile("movdqa %%xmm15,%0" : "=m" (rsa[60])); | ||
202 | } | ||
203 | |||
204 | static inline void raid6_after_sse16(raid6_sse16_save_t *s) | ||
205 | { | ||
206 | unsigned int *rsa = SAREA(s); | ||
207 | |||
208 | asm volatile("movdqa %0,%%xmm0" : : "m" (rsa[0])); | ||
209 | asm volatile("movdqa %0,%%xmm1" : : "m" (rsa[4])); | ||
210 | asm volatile("movdqa %0,%%xmm2" : : "m" (rsa[8])); | ||
211 | asm volatile("movdqa %0,%%xmm3" : : "m" (rsa[12])); | ||
212 | asm volatile("movdqa %0,%%xmm4" : : "m" (rsa[16])); | ||
213 | asm volatile("movdqa %0,%%xmm5" : : "m" (rsa[20])); | ||
214 | asm volatile("movdqa %0,%%xmm6" : : "m" (rsa[24])); | ||
215 | asm volatile("movdqa %0,%%xmm7" : : "m" (rsa[28])); | ||
216 | asm volatile("movdqa %0,%%xmm8" : : "m" (rsa[32])); | ||
217 | asm volatile("movdqa %0,%%xmm9" : : "m" (rsa[36])); | ||
218 | asm volatile("movdqa %0,%%xmm10" : : "m" (rsa[40])); | ||
219 | asm volatile("movdqa %0,%%xmm11" : : "m" (rsa[44])); | ||
220 | asm volatile("movdqa %0,%%xmm12" : : "m" (rsa[48])); | ||
221 | asm volatile("movdqa %0,%%xmm13" : : "m" (rsa[52])); | ||
222 | asm volatile("movdqa %0,%%xmm14" : : "m" (rsa[56])); | ||
223 | asm volatile("movdqa %0,%%xmm15" : : "m" (rsa[60])); | ||
224 | |||
225 | raid6_put_fpu(s->cr0); | ||
226 | } | ||
227 | |||
228 | #endif /* __x86_64__ */ | ||
229 | |||
230 | /* User space test hack */ | ||
231 | #ifndef __KERNEL__ | ||
232 | static inline int cpuid_features(void) | ||
233 | { | ||
234 | u32 eax = 1; | ||
235 | u32 ebx, ecx, edx; | ||
236 | |||
237 | asm volatile("cpuid" : | ||
238 | "+a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)); | ||
239 | |||
240 | return edx; | ||
241 | } | ||
242 | #endif /* ndef __KERNEL__ */ | ||
243 | |||
244 | #endif | ||
245 | #endif | ||
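The before/after helpers above exist because kernel code cannot normally touch the FPU or SSE registers: raid6_get_fpu() disables preemption and executes clts so the first MMX/SSE instruction does not raise a device-not-available (#NM) fault, the fsave/movaps/movdqa blocks preserve whatever lazily-switched user FPU state is still live in the registers, and raid6_put_fpu() writes the saved %cr0 back, restoring the TS bit if it was set, so lazy FPU context switching keeps working afterwards. The intended usage pattern, as followed by the syndrome routines in the files above, looks like this (sketch for illustration only, assuming the raid6.h/raid6x86.h context those files include):

	static void some_simd_worker(void)
	{
		raid6_sse_save_t sa;

		raid6_before_sse2(&sa);	/* preempt off, clts, save %xmm0-7 */

		/*
		 * SIMD work goes here: no sleeping, no preemption point and
		 * no calls that might themselves use the FPU.
		 */

		raid6_after_sse2(&sa);	/* restore %xmm0-7 and %cr0, preempt on */
	}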
diff --git a/drivers/md/unroll.pl b/drivers/md/unroll.pl new file mode 100644 index 000000000000..3acc710a20ea --- /dev/null +++ b/drivers/md/unroll.pl | |||
@@ -0,0 +1,24 @@ | |||
1 | #!/usr/bin/perl | ||
2 | # | ||
3 | # Take a piece of C code and, for each line which contains the sequence $$, | ||
4 | # repeat that line n times with $$ replaced by 0...n-1; the sequence $# is | ||
5 | # replaced by the unrolling factor, and $* by a single $ | ||
6 | # | ||
7 | |||
8 | ($n) = @ARGV; | ||
9 | $n += 0; | ||
10 | |||
11 | while ( defined($line = <STDIN>) ) { | ||
12 | if ( $line =~ /\$\$/ ) { | ||
13 | $rep = $n; | ||
14 | } else { | ||
15 | $rep = 1; | ||
16 | } | ||
17 | for ( $i = 0 ; $i < $rep ; $i++ ) { | ||
18 | $tmp = $line; | ||
19 | $tmp =~ s/\$\$/$i/g; | ||
20 | $tmp =~ s/\$\#/$n/g; | ||
21 | $tmp =~ s/\$\*/\$/g; | ||
22 | print $tmp; | ||
23 | } | ||
24 | } | ||
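For example, with an unrolling factor of 2, a hypothetical template line (not taken from raid6int.uc)

	wp$$ ^= d$$;	/* $#-way unrolled */

is emitted twice as

	wp0 ^= d0;	/* 2-way unrolled */
	wp1 ^= d1;	/* 2-way unrolled */

while lines containing no $$ pass through once, still with $# replaced by 2 and $* collapsed to a literal $. This is how the Makefiles generate raid6int1.c through raid6int32.c from the single raid6int.uc template.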
diff --git a/drivers/md/xor.c b/drivers/md/xor.c new file mode 100644 index 000000000000..324897c4be4e --- /dev/null +++ b/drivers/md/xor.c | |||
@@ -0,0 +1,154 @@ | |||
1 | /* | ||
2 | * xor.c : Multiple Devices driver for Linux | ||
3 | * | ||
4 | * Copyright (C) 1996, 1997, 1998, 1999, 2000, | ||
5 | * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson. | ||
6 | * | ||
7 | * Dispatch optimized RAID-5 checksumming functions. | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License as published by | ||
11 | * the Free Software Foundation; either version 2, or (at your option) | ||
12 | * any later version. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * (for example /usr/src/linux/COPYING); if not, write to the Free | ||
16 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
17 | */ | ||
18 | |||
19 | #define BH_TRACE 0 | ||
20 | #include <linux/module.h> | ||
21 | #include <linux/raid/md.h> | ||
22 | #include <linux/raid/xor.h> | ||
23 | #include <asm/xor.h> | ||
24 | |||
25 | /* The xor routines to use. */ | ||
26 | static struct xor_block_template *active_template; | ||
27 | |||
28 | void | ||
29 | xor_block(unsigned int count, unsigned int bytes, void **ptr) | ||
30 | { | ||
31 | unsigned long *p0, *p1, *p2, *p3, *p4; | ||
32 | |||
33 | p0 = (unsigned long *) ptr[0]; | ||
34 | p1 = (unsigned long *) ptr[1]; | ||
35 | if (count == 2) { | ||
36 | active_template->do_2(bytes, p0, p1); | ||
37 | return; | ||
38 | } | ||
39 | |||
40 | p2 = (unsigned long *) ptr[2]; | ||
41 | if (count == 3) { | ||
42 | active_template->do_3(bytes, p0, p1, p2); | ||
43 | return; | ||
44 | } | ||
45 | |||
46 | p3 = (unsigned long *) ptr[3]; | ||
47 | if (count == 4) { | ||
48 | active_template->do_4(bytes, p0, p1, p2, p3); | ||
49 | return; | ||
50 | } | ||
51 | |||
52 | p4 = (unsigned long *) ptr[4]; | ||
53 | active_template->do_5(bytes, p0, p1, p2, p3, p4); | ||
54 | } | ||
55 | |||
56 | /* Set of all registered templates. */ | ||
57 | static struct xor_block_template *template_list; | ||
58 | |||
59 | #define BENCH_SIZE (PAGE_SIZE) | ||
60 | |||
61 | static void | ||
62 | do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) | ||
63 | { | ||
64 | int speed; | ||
65 | unsigned long now; | ||
66 | int i, count, max; | ||
67 | |||
68 | tmpl->next = template_list; | ||
69 | template_list = tmpl; | ||
70 | |||
71 | /* | ||
72 | * Count the number of XORs done during a whole jiffy, and use | ||
73 | * this to calculate the speed of checksumming.  We use an order-2 | ||
74 | * (four-page) allocation so both benchmark buffers have a fixed L1-cache colour. | ||
75 | */ | ||
76 | max = 0; | ||
77 | for (i = 0; i < 5; i++) { | ||
78 | now = jiffies; | ||
79 | count = 0; | ||
80 | while (jiffies == now) { | ||
81 | mb(); | ||
82 | tmpl->do_2(BENCH_SIZE, b1, b2); | ||
83 | mb(); | ||
84 | count++; | ||
85 | mb(); | ||
86 | } | ||
87 | if (count > max) | ||
88 | max = count; | ||
89 | } | ||
90 | |||
91 | speed = max * (HZ * BENCH_SIZE / 1024); | ||
92 | tmpl->speed = speed; | ||
93 | |||
94 | printk(" %-10s: %5d.%03d MB/sec\n", tmpl->name, | ||
95 | speed / 1000, speed % 1000); | ||
96 | } | ||
97 | |||
98 | static int | ||
99 | calibrate_xor_block(void) | ||
100 | { | ||
101 | void *b1, *b2; | ||
102 | struct xor_block_template *f, *fastest; | ||
103 | |||
104 | b1 = (void *) __get_free_pages(GFP_KERNEL, 2); | ||
105 | if (! b1) { | ||
106 | printk("raid5: Yikes! No memory available.\n"); | ||
107 | return -ENOMEM; | ||
108 | } | ||
109 | b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE; | ||
110 | |||
111 | /* | ||
112 | * If this arch/cpu has a short-circuited selection, don't loop through all | ||
113 | * the possible functions, just test the best one | ||
114 | */ | ||
115 | |||
116 | fastest = NULL; | ||
117 | |||
118 | #ifdef XOR_SELECT_TEMPLATE | ||
119 | fastest = XOR_SELECT_TEMPLATE(fastest); | ||
120 | #endif | ||
121 | |||
122 | #define xor_speed(templ) do_xor_speed((templ), b1, b2) | ||
123 | |||
124 | if (fastest) { | ||
125 | printk(KERN_INFO "raid5: automatically using best checksumming function: %s\n", | ||
126 | fastest->name); | ||
127 | xor_speed(fastest); | ||
128 | } else { | ||
129 | printk(KERN_INFO "raid5: measuring checksumming speed\n"); | ||
130 | XOR_TRY_TEMPLATES; | ||
131 | fastest = template_list; | ||
132 | for (f = fastest; f; f = f->next) | ||
133 | if (f->speed > fastest->speed) | ||
134 | fastest = f; | ||
135 | } | ||
136 | |||
137 | printk("raid5: using function: %s (%d.%03d MB/sec)\n", | ||
138 | fastest->name, fastest->speed / 1000, fastest->speed % 1000); | ||
139 | |||
140 | #undef xor_speed | ||
141 | |||
142 | free_pages((unsigned long)b1, 2); | ||
143 | |||
144 | active_template = fastest; | ||
145 | return 0; | ||
146 | } | ||
147 | |||
148 | static __exit void xor_exit(void) { } | ||
149 | |||
150 | EXPORT_SYMBOL(xor_block); | ||
151 | MODULE_LICENSE("GPL"); | ||
152 | |||
153 | module_init(calibrate_xor_block); | ||
154 | module_exit(xor_exit); | ||
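do_xor_speed() above counts how many BENCH_SIZE (one page) two-buffer XORs fit into a single jiffy, keeps the best of five runs, and converts that count into a rate in 1024-byte KB per second:

	speed = max * (HZ * BENCH_SIZE / 1024)

With HZ = 1000 and 4096-byte pages each counted iteration is worth 4000 KB/s, so a best run of, say, 652 iterations gives speed = 652 * 4000 = 2608000, which the %5d.%03d format prints as "2608.000 MB/sec" (the division is by a decimal thousand). calibrate_xor_block() then either benchmarks only the architecture's preselected template, when XOR_SELECT_TEMPLATE provides one, or benchmarks everything registered by XOR_TRY_TEMPLATES and installs the fastest as active_template for xor_block() to dispatch through.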