Diffstat (limited to 'drivers')
-rw-r--r--  drivers/md/Kconfig            |   31
-rw-r--r--  drivers/md/Makefile           |   16
-rw-r--r--  drivers/md/bitmap.c           |   49
-rw-r--r--  drivers/md/bitmap.h           |  288
-rw-r--r--  drivers/md/faulty.c           |   19
-rw-r--r--  drivers/md/linear.c           |   25
-rw-r--r--  drivers/md/linear.h           |   29
-rw-r--r--  drivers/md/md.c               |  615
-rw-r--r--  drivers/md/md.h               |  436
-rw-r--r--  drivers/md/mktables.c         |   14
-rw-r--r--  drivers/md/multipath.c        |   17
-rw-r--r--  drivers/md/multipath.h        |   40
-rw-r--r--  drivers/md/raid0.c            |   66
-rw-r--r--  drivers/md/raid0.h            |   28
-rw-r--r--  drivers/md/raid1.c            |   35
-rw-r--r--  drivers/md/raid1.h            |  132
-rw-r--r--  drivers/md/raid10.c           |   42
-rw-r--r--  drivers/md/raid10.h           |  121
-rw-r--r--  drivers/md/raid5.c            | 1494
-rw-r--r--  drivers/md/raid5.h            |  474
-rw-r--r--  drivers/md/raid6.h            |  130
-rw-r--r--  drivers/md/raid6algos.c       |   21
-rw-r--r--  drivers/md/raid6altivec.uc    |    4
-rw-r--r--  drivers/md/raid6int.uc        |    4
-rw-r--r--  drivers/md/raid6mmx.c         |    4
-rw-r--r--  drivers/md/raid6recov.c       |   13
-rw-r--r--  drivers/md/raid6sse1.c        |    4
-rw-r--r--  drivers/md/raid6sse2.c        |    4
-rw-r--r--  drivers/md/raid6test/Makefile |    2
-rw-r--r--  drivers/md/raid6test/test.c   |    2
-rw-r--r--  drivers/md/raid6x86.h         |    2
31 files changed, 3324 insertions(+), 837 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 2281b5098e95..36e0675be9f7 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -121,6 +121,7 @@ config MD_RAID10
 config MD_RAID456
 	tristate "RAID-4/RAID-5/RAID-6 mode"
 	depends on BLK_DEV_MD
+	select MD_RAID6_PQ
 	select ASYNC_MEMCPY
 	select ASYNC_XOR
 	---help---
@@ -151,34 +152,8 @@ config MD_RAID456
 
 	  If unsure, say Y.
 
-config MD_RAID5_RESHAPE
-	bool "Support adding drives to a raid-5 array"
-	depends on MD_RAID456
-	default y
-	---help---
-	  A RAID-5 set can be expanded by adding extra drives. This
-	  requires "restriping" the array which means (almost) every
-	  block must be written to a different place.
-
-	  This option allows such restriping to be done while the array
-	  is online.
-
-	  You will need mdadm version 2.4.1 or later to use this
-	  feature safely. During the early stage of reshape there is
-	  a critical section where live data is being over-written. A
-	  crash during this time needs extra care for recovery. The
-	  newer mdadm takes a copy of the data in the critical section
-	  and will restore it, if necessary, after a crash.
-
-	  The mdadm usage is e.g.
-	       mdadm --grow /dev/md1 --raid-disks=6
-	  to grow '/dev/md1' to having 6 disks.
-
-	  Note: The array can only be expanded, not contracted.
-	  There should be enough spares already present to make the new
-	  array workable.
-
-	  If unsure, say Y.
+config MD_RAID6_PQ
+	tristate
 
 config MD_MULTIPATH
 	tristate "Multipath I/O support"
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 72880b7e28d9..45cc5951d928 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -2,20 +2,21 @@
 # Makefile for the kernel software RAID and LVM drivers.
 #
 
-dm-mod-objs	:= dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
+dm-mod-y	+= dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
 		   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o
-dm-multipath-objs := dm-path-selector.o dm-mpath.o
-dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-snap-transient.o \
+dm-multipath-y	+= dm-path-selector.o dm-mpath.o
+dm-snapshot-y	+= dm-snap.o dm-exception-store.o dm-snap-transient.o \
 		    dm-snap-persistent.o
-dm-mirror-objs	:= dm-raid1.o
-md-mod-objs	:= md.o bitmap.o
-raid456-objs	:= raid5.o raid6algos.o raid6recov.o raid6tables.o \
+dm-mirror-y	+= dm-raid1.o
+md-mod-y	+= md.o bitmap.o
+raid456-y	+= raid5.o
+raid6_pq-y	+= raid6algos.o raid6recov.o raid6tables.o \
 		   raid6int1.o raid6int2.o raid6int4.o \
 		   raid6int8.o raid6int16.o raid6int32.o \
 		   raid6altivec1.o raid6altivec2.o raid6altivec4.o \
 		   raid6altivec8.o \
 		   raid6mmx.o raid6sse1.o raid6sse2.o
-hostprogs-y	:= mktables
+hostprogs-y	+= mktables
 
 # Note: link order is important.  All raid personalities
 # and must come before md.o, as they each initialise
@@ -26,6 +27,7 @@ obj-$(CONFIG_MD_LINEAR) += linear.o
 obj-$(CONFIG_MD_RAID0)		+= raid0.o
 obj-$(CONFIG_MD_RAID1)		+= raid1.o
 obj-$(CONFIG_MD_RAID10)		+= raid10.o
+obj-$(CONFIG_MD_RAID6_PQ)	+= raid6_pq.o
 obj-$(CONFIG_MD_RAID456)	+= raid456.o
 obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o
 obj-$(CONFIG_MD_FAULTY)		+= faulty.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 719943763391..f8a9f7ab2cb8 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -16,6 +16,7 @@
  * wait if count gets too high, wake when it drops to half.
  */
 
+#include <linux/blkdev.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
@@ -26,8 +27,8 @@
 #include <linux/file.h>
 #include <linux/mount.h>
 #include <linux/buffer_head.h>
-#include <linux/raid/md.h>
-#include <linux/raid/bitmap.h>
+#include "md.h"
+#include "bitmap.h"
 
 /* debug macros */
 
@@ -111,9 +112,10 @@ static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int creat
 	unsigned char *mappage;
 
 	if (page >= bitmap->pages) {
-		printk(KERN_ALERT
-			"%s: invalid bitmap page request: %lu (> %lu)\n",
-			bmname(bitmap), page, bitmap->pages-1);
+		/* This can happen if bitmap_start_sync goes beyond
+		 * End-of-device while looking for a whole page.
+		 * It is harmless.
+		 */
 		return -EINVAL;
 	}
 
@@ -265,7 +267,6 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
 	list_for_each_continue_rcu(pos, &mddev->disks) {
 		rdev = list_entry(pos, mdk_rdev_t, same_set);
 		if (rdev->raid_disk >= 0 &&
-		    test_bit(In_sync, &rdev->flags) &&
 		    !test_bit(Faulty, &rdev->flags)) {
 			/* this is a usable devices */
 			atomic_inc(&rdev->nr_pending);
@@ -297,7 +298,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 		    + size/512 > 0)
 			/* bitmap runs in to metadata */
 			goto bad_alignment;
-		if (rdev->data_offset + mddev->size*2
+		if (rdev->data_offset + mddev->dev_sectors
 		    > rdev->sb_start + bitmap->offset)
 			/* data runs in to bitmap */
 			goto bad_alignment;
@@ -570,7 +571,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
 		 le32_to_cpu(sb->version) > BITMAP_MAJOR_HI)
 		reason = "unrecognized superblock version";
-	else if (chunksize < PAGE_SIZE)
+	else if (chunksize < 512)
 		reason = "bitmap chunksize too small";
 	else if ((1 << ffz(~chunksize)) != chunksize)
 		reason = "bitmap chunksize not a power of 2";
@@ -1306,6 +1307,9 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
 		PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n",
 		  atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
 	}
+	if (bitmap->mddev->degraded)
+		/* Never clear bits or update events_cleared when degraded */
+		success = 0;
 
 	while (sectors) {
 		int blocks;
@@ -1345,8 +1349,8 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
 	}
 }
 
-int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
-		      int degraded)
+static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
+			       int degraded)
 {
 	bitmap_counter_t *bmc;
 	int rv;
@@ -1374,6 +1378,29 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
 	return rv;
 }
 
+int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
+		      int degraded)
+{
+	/* bitmap_start_sync must always report on multiples of whole
+	 * pages, otherwise resync (which is very PAGE_SIZE based) will
+	 * get confused.
+	 * So call __bitmap_start_sync repeatedly (if needed) until
+	 * At least PAGE_SIZE>>9 blocks are covered.
+	 * Return the 'or' of the result.
+	 */
+	int rv = 0;
+	int blocks1;
+
+	*blocks = 0;
+	while (*blocks < (PAGE_SIZE>>9)) {
+		rv |= __bitmap_start_sync(bitmap, offset,
+					  &blocks1, degraded);
+		offset += blocks1;
+		*blocks += blocks1;
+	}
+	return rv;
+}
+
 void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted)
 {
 	bitmap_counter_t *bmc;
@@ -1443,6 +1470,8 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
 		wait_event(bitmap->mddev->recovery_wait,
 			   atomic_read(&bitmap->mddev->recovery_active) == 0);
 
+	bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync;
+	set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
 	sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1);
 	s = 0;
 	while (s < sector && s < bitmap->mddev->resync_max_sectors) {
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
new file mode 100644
index 000000000000..e98900671ca9
--- /dev/null
+++ b/drivers/md/bitmap.h
@@ -0,0 +1,288 @@
+/*
+ * bitmap.h:  Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
+ *
+ * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
+ */
+#ifndef BITMAP_H
+#define BITMAP_H 1
+
+#define BITMAP_MAJOR_LO 3
+/* version 4 insists the bitmap is in little-endian order
+ * with version 3, it is host-endian which is non-portable
+ */
+#define BITMAP_MAJOR_HI 4
+#define	BITMAP_MAJOR_HOSTENDIAN 3
+
+#define BITMAP_MINOR 39
+
+/*
+ * in-memory bitmap:
+ *
+ * Use 16 bit block counters to track pending writes to each "chunk".
+ * The 2 high order bits are special-purpose, the first is a flag indicating
+ * whether a resync is needed.  The second is a flag indicating whether a
+ * resync is active.
+ * This means that the counter is actually 14 bits:
+ *
+ * +--------+--------+------------------------------------------------+
+ * | resync | resync |               counter                          |
+ * | needed | active |                                                |
+ * |  (0-1) |  (0-1) |              (0-16383)                         |
+ * +--------+--------+------------------------------------------------+
+ *
+ * The "resync needed" bit is set when:
+ *    a '1' bit is read from storage at startup.
+ *    a write request fails on some drives
+ *    a resync is aborted on a chunk with 'resync active' set
+ * It is cleared (and resync-active set) when a resync starts across all drives
+ * of the chunk.
+ *
+ *
+ * The "resync active" bit is set when:
+ *    a resync is started on all drives, and resync_needed is set.
+ *       resync_needed will be cleared (as long as resync_active wasn't already set).
+ * It is cleared when a resync completes.
+ *
+ * The counter counts pending write requests, plus the on-disk bit.
+ * When the counter is '1' and the resync bits are clear, the on-disk
+ * bit can be cleared aswell, thus setting the counter to 0.
+ * When we set a bit, or in the counter (to start a write), if the fields is
+ * 0, we first set the disk bit and set the counter to 1.
+ *
+ * If the counter is 0, the on-disk bit is clear and the stipe is clean
+ * Anything that dirties the stipe pushes the counter to 2 (at least)
+ * and sets the on-disk bit (lazily).
+ * If a periodic sweep find the counter at 2, it is decremented to 1.
+ * If the sweep find the counter at 1, the on-disk bit is cleared and the
+ * counter goes to zero.
+ *
+ * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block
+ * counters as a fallback when "page" memory cannot be allocated:
+ *
+ * Normal case (page memory allocated):
+ *
+ *     page pointer (32-bit)
+ *
+ *     [ ] ------+
+ *               |
+ *               +-------> [   ][   ]..[   ] (4096 byte page == 2048 counters)
+ *                          c1   c2    c2048
+ *
+ * Hijacked case (page memory allocation failed):
+ *
+ *     hijacked page pointer (32-bit)
+ *
+ *     [                  ][                  ] (no page memory allocated)
+ *       counter #1 (16-bit)     counter #2 (16-bit)
+ *
+ */
+
+#ifdef __KERNEL__
+
+#define PAGE_BITS (PAGE_SIZE << 3)
+#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)
+
+typedef __u16 bitmap_counter_t;
+#define COUNTER_BITS 16
+#define COUNTER_BIT_SHIFT 4
+#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8)
+#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3)
+
+#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
+#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
+#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
+#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
+#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
+#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)
+
+/* how many counters per page? */
+#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS)
+/* same, except a shift value for more efficient bitops */
+#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)
+/* same, except a mask value for more efficient bitops */
+#define PAGE_COUNTER_MASK  (PAGE_COUNTER_RATIO - 1)
+
+#define BITMAP_BLOCK_SIZE 512
+#define BITMAP_BLOCK_SHIFT 9
+
+/* how many blocks per chunk? (this is variable) */
+#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT)
+#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
+#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
+
+/* when hijacked, the counters and bits represent even larger "chunks" */
+/* there will be 1024 chunks represented by each counter in the page pointers */
+#define PAGEPTR_BLOCK_RATIO(bitmap) \
+			(CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1)
+#define PAGEPTR_BLOCK_SHIFT(bitmap) \
+			(CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
+#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
+
+/*
+ * on-disk bitmap:
+ *
+ * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
+ * file a page at a time. There's a superblock at the start of the file.
+ */
+
+/* map chunks (bits) to file pages - offset by the size of the superblock */
+#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3))
+
+#endif
+
+/*
+ * bitmap structures:
+ */
+
+#define BITMAP_MAGIC 0x6d746962
+
+/* use these for bitmap->flags and bitmap->sb->state bit-fields */
+enum bitmap_state {
+	BITMAP_STALE  = 0x002,  /* the bitmap file is out of date or had -EIO */
+	BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */
+	BITMAP_HOSTENDIAN = 0x8000,
+};
+
+/* the superblock at the front of the bitmap file -- little endian */
+typedef struct bitmap_super_s {
+	__le32 magic;        /*  0  BITMAP_MAGIC */
+	__le32 version;      /*  4  the bitmap major for now, could change... */
+	__u8  uuid[16];      /*  8  128 bit uuid - must match md device uuid */
+	__le64 events;       /* 24  event counter for the bitmap (1)*/
+	__le64 events_cleared;/*32  event counter when last bit cleared (2) */
+	__le64 sync_size;    /* 40  the size of the md device's sync range(3) */
+	__le32 state;        /* 48  bitmap state information */
+	__le32 chunksize;    /* 52  the bitmap chunk size in bytes */
+	__le32 daemon_sleep; /* 56  seconds between disk flushes */
+	__le32 write_behind; /* 60  number of outstanding write-behind writes */
+
+	__u8  pad[256 - 64]; /* set to zero */
+} bitmap_super_t;
+
+/* notes:
+ * (1) This event counter is updated before the eventcounter in the md superblock
+ *    When a bitmap is loaded, it is only accepted if this event counter is equal
+ *    to, or one greater than, the event counter in the superblock.
+ * (2) This event counter is updated when the other one is *if*and*only*if* the
+ *    array is not degraded.  As bits are not cleared when the array is degraded,
+ *    this represents the last time that any bits were cleared.
+ *    If a device is being added that has an event count with this value or
+ *    higher, it is accepted as conforming to the bitmap.
+ * (3)This is the number of sectors represented by the bitmap, and is the range that
+ *    resync happens across.  For raid1 and raid5/6 it is the size of individual
+ *    devices.  For raid10 it is the size of the array.
+ */
+
+#ifdef __KERNEL__
+
+/* the in-memory bitmap is represented by bitmap_pages */
+struct bitmap_page {
+	/*
+	 * map points to the actual memory page
+	 */
+	char *map;
+	/*
+	 * in emergencies (when map cannot be alloced), hijack the map
+	 * pointer and use it as two counters itself
+	 */
+	unsigned int hijacked:1;
+	/*
+	 * count of dirty bits on the page
+	 */
+	unsigned int  count:31;
+};
+
+/* keep track of bitmap file pages that have pending writes on them */
+struct page_list {
+	struct list_head list;
+	struct page *page;
+};
+
+/* the main bitmap structure - one per mddev */
+struct bitmap {
+	struct bitmap_page *bp;
+	unsigned long pages; /* total number of pages in the bitmap */
+	unsigned long missing_pages; /* number of pages not yet allocated */
+
+	mddev_t *mddev; /* the md device that the bitmap is for */
+
+	int counter_bits; /* how many bits per block counter */
+
+	/* bitmap chunksize -- how much data does each bit represent? */
+	unsigned long chunksize;
+	unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
+	unsigned long chunks; /* total number of data chunks for the array */
+
+	/* We hold a count on the chunk currently being synced, and drop
+	 * it when the last block is started.  If the resync is aborted
+	 * midway, we need to be able to drop that count, so we remember
+	 * the counted chunk..
+	 */
+	unsigned long syncchunk;
+
+	__u64	events_cleared;
+	int need_sync;
+
+	/* bitmap spinlock */
+	spinlock_t lock;
+
+	long offset; /* offset from superblock if file is NULL */
+	struct file *file; /* backing disk file */
+	struct page *sb_page; /* cached copy of the bitmap file superblock */
+	struct page **filemap; /* list of cache pages for the file */
+	unsigned long *filemap_attr; /* attributes associated w/ filemap pages */
+	unsigned long file_pages; /* number of pages in the file */
+	int last_page_size; /* bytes in the last page */
+
+	unsigned long flags;
+
+	int allclean;
+
+	unsigned long max_write_behind; /* write-behind mode */
+	atomic_t behind_writes;
+
+	/*
+	 * the bitmap daemon - periodically wakes up and sweeps the bitmap
+	 * file, cleaning up bits and flushing out pages to disk as necessary
+	 */
+	unsigned long daemon_lastrun; /* jiffies of last run */
+	unsigned long daemon_sleep; /* how many seconds between updates? */
+	unsigned long last_end_sync; /* when we lasted called end_sync to
+				      * update bitmap with resync progress */
+
+	atomic_t pending_writes; /* pending writes to the bitmap file */
+	wait_queue_head_t write_wait;
+	wait_queue_head_t overflow_wait;
+
+};
+
+/* the bitmap API */
+
+/* these are used only by md/bitmap */
+int  bitmap_create(mddev_t *mddev);
+void bitmap_flush(mddev_t *mddev);
+void bitmap_destroy(mddev_t *mddev);
+
+void bitmap_print_sb(struct bitmap *bitmap);
+void bitmap_update_sb(struct bitmap *bitmap);
+
+int  bitmap_setallbits(struct bitmap *bitmap);
+void bitmap_write_all(struct bitmap *bitmap);
+
+void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e);
+
+/* these are exported */
+int bitmap_startwrite(struct bitmap *bitmap, sector_t offset,
+			unsigned long sectors, int behind);
+void bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
+			unsigned long sectors, int success, int behind);
+int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded);
+void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted);
+void bitmap_close_sync(struct bitmap *bitmap);
+void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
+
+void bitmap_unplug(struct bitmap *bitmap);
+void bitmap_daemon_work(struct bitmap *bitmap);
+#endif
+
+#endif
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 86d9adf90e79..8695809b24b0 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -62,7 +62,10 @@
 #define ModeShift	5
 
 #define MaxFault	50
-#include <linux/raid/md.h>
+#include <linux/blkdev.h>
+#include <linux/raid/md_u.h>
+#include "md.h"
+#include <linux/seq_file.h>
 
 
 static void faulty_fail(struct bio *bio, int error)
@@ -280,6 +283,17 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size)
 	return 0;
 }
 
+static sector_t faulty_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+	WARN_ONCE(raid_disks,
+		  "%s does not support generic reshape\n", __func__);
+
+	if (sectors == 0)
+		return mddev->dev_sectors;
+
+	return sectors;
+}
+
 static int run(mddev_t *mddev)
 {
 	mdk_rdev_t *rdev;
@@ -298,7 +312,7 @@ static int run(mddev_t *mddev)
 	list_for_each_entry(rdev, &mddev->disks, same_set)
 		conf->rdev = rdev;
 
-	mddev->array_sectors = mddev->size * 2;
+	md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
 	mddev->private = conf;
 
 	reconfig(mddev, mddev->layout, -1);
@@ -325,6 +339,7 @@ static struct mdk_personality faulty_personality =
 	.stop		= stop,
 	.status		= status,
 	.reconfig	= reconfig,
+	.size		= faulty_size,
 };
 
 static int __init raid_init(void)
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 09658b218474..7a36e38393a1 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -16,7 +16,11 @@
    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 
-#include <linux/raid/linear.h>
+#include <linux/blkdev.h>
+#include <linux/raid/md_u.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "linear.h"
 
 /*
  * find which device holds a particular offset
@@ -97,6 +101,16 @@ static int linear_congested(void *data, int bits)
 	return ret;
 }
 
+static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+	linear_conf_t *conf = mddev_to_conf(mddev);
+
+	WARN_ONCE(sectors || raid_disks,
+		  "%s does not support generic reshape\n", __func__);
+
+	return conf->array_sectors;
+}
+
 static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 {
 	linear_conf_t *conf;
@@ -135,8 +149,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
-		disk->num_sectors = rdev->size * 2;
-		conf->array_sectors += rdev->size * 2;
+		disk->num_sectors = rdev->sectors;
+		conf->array_sectors += rdev->sectors;
 
 		cnt++;
 	}
@@ -249,7 +263,7 @@ static int linear_run (mddev_t *mddev)
 	if (!conf)
 		return 1;
 	mddev->private = conf;
-	mddev->array_sectors = conf->array_sectors;
+	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
 
 	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
 	mddev->queue->unplug_fn = linear_unplug;
@@ -283,7 +297,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
 	newconf->prev = mddev_to_conf(mddev);
 	mddev->private = newconf;
 	mddev->raid_disks++;
-	mddev->array_sectors = newconf->array_sectors;
+	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	return 0;
 }
@@ -381,6 +395,7 @@ static struct mdk_personality linear_personality =
 	.stop		= linear_stop,
 	.status		= linear_status,
 	.hot_add_disk	= linear_add,
+	.size		= linear_size,
 };
 
 static int __init linear_init (void)
diff --git a/drivers/md/linear.h b/drivers/md/linear.h
new file mode 100644
index 000000000000..bf8179587f95
--- /dev/null
+++ b/drivers/md/linear.h
@@ -0,0 +1,29 @@
+#ifndef _LINEAR_H
+#define _LINEAR_H
+
+struct dev_info {
+	mdk_rdev_t	*rdev;
+	sector_t	num_sectors;
+	sector_t	start_sector;
+};
+
+typedef struct dev_info dev_info_t;
+
+struct linear_private_data
+{
+	struct linear_private_data *prev;	/* earlier version */
+	dev_info_t		**hash_table;
+	sector_t		spacing;
+	sector_t		array_sectors;
+	int			sector_shift;	/* shift before dividing
+						 * by spacing
+						 */
+	dev_info_t		disks[0];
+};
+
+
+typedef struct linear_private_data linear_conf_t;
+
+#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private)
+
+#endif
diff --git a/drivers/md/md.c b/drivers/md/md.c
index a307f87eb90e..ed5727c089a9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -33,9 +33,9 @@
 */
 
 #include <linux/kthread.h>
-#include <linux/raid/md.h>
-#include <linux/raid/bitmap.h>
+#include <linux/blkdev.h>
 #include <linux/sysctl.h>
+#include <linux/seq_file.h>
 #include <linux/buffer_head.h> /* for invalidate_bdev */
 #include <linux/poll.h>
 #include <linux/ctype.h>
@@ -45,11 +45,10 @@
 #include <linux/reboot.h>
 #include <linux/file.h>
 #include <linux/delay.h>
-
-#define MAJOR_NR MD_MAJOR
-
-/* 63 partitions with the alternate major number (mdp) */
-#define MdpMinorShift 6
+#include <linux/raid/md_p.h>
+#include <linux/raid/md_u.h>
+#include "md.h"
+#include "bitmap.h"
 
 #define DEBUG 0
 #define dprintk(x...) ((void)(DEBUG && printk(x)))
@@ -202,12 +201,68 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
 	)
 
 
-static int md_fail_request(struct request_queue *q, struct bio *bio)
+/* Rather than calling directly into the personality make_request function,
+ * IO requests come here first so that we can check if the device is
+ * being suspended pending a reconfiguration.
+ * We hold a refcount over the call to ->make_request.  By the time that
+ * call has finished, the bio has been linked into some internal structure
+ * and so is visible to ->quiesce(), so we don't need the refcount any more.
+ */
+static int md_make_request(struct request_queue *q, struct bio *bio)
 {
-	bio_io_error(bio);
-	return 0;
+	mddev_t *mddev = q->queuedata;
+	int rv;
+	if (mddev == NULL || mddev->pers == NULL) {
+		bio_io_error(bio);
+		return 0;
+	}
+	rcu_read_lock();
+	if (mddev->suspended) {
+		DEFINE_WAIT(__wait);
+		for (;;) {
+			prepare_to_wait(&mddev->sb_wait, &__wait,
+					TASK_UNINTERRUPTIBLE);
+			if (!mddev->suspended)
+				break;
+			rcu_read_unlock();
+			schedule();
+			rcu_read_lock();
+		}
+		finish_wait(&mddev->sb_wait, &__wait);
+	}
+	atomic_inc(&mddev->active_io);
+	rcu_read_unlock();
+	rv = mddev->pers->make_request(q, bio);
+	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
+		wake_up(&mddev->sb_wait);
+
+	return rv;
+}
+
+static void mddev_suspend(mddev_t *mddev)
+{
+	BUG_ON(mddev->suspended);
+	mddev->suspended = 1;
+	synchronize_rcu();
+	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
+	mddev->pers->quiesce(mddev, 1);
+	md_unregister_thread(mddev->thread);
+	mddev->thread = NULL;
+	/* we now know that no code is executing in the personality module,
+	 * except possibly the tail end of a ->bi_end_io function, but that
+	 * is certain to complete before the module has a chance to get
+	 * unloaded
+	 */
+}
+
+static void mddev_resume(mddev_t *mddev)
+{
+	mddev->suspended = 0;
+	wake_up(&mddev->sb_wait);
+	mddev->pers->quiesce(mddev, 0);
 }
 
+
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
 	atomic_inc(&mddev->active);
@@ -310,6 +365,7 @@ static mddev_t * mddev_find(dev_t unit)
 		init_timer(&new->safemode_timer);
 		atomic_set(&new->active, 1);
 		atomic_set(&new->openers, 0);
+		atomic_set(&new->active_io, 0);
 		spin_lock_init(&new->write_lock);
 		init_waitqueue_head(&new->sb_wait);
 		init_waitqueue_head(&new->recovery_wait);
@@ -326,6 +382,11 @@ static inline int mddev_lock(mddev_t * mddev)
 	return mutex_lock_interruptible(&mddev->reconfig_mutex);
 }
 
+static inline int mddev_is_locked(mddev_t *mddev)
+{
+	return mutex_is_locked(&mddev->reconfig_mutex);
+}
+
 static inline int mddev_trylock(mddev_t * mddev)
 {
 	return mutex_trylock(&mddev->reconfig_mutex);
@@ -409,7 +470,7 @@ static void free_disk_sb(mdk_rdev_t * rdev)
 		rdev->sb_loaded = 0;
 		rdev->sb_page = NULL;
 		rdev->sb_start = 0;
-		rdev->size = 0;
+		rdev->sectors = 0;
 	}
 }
 
@@ -775,9 +836,9 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 		else
 			ret = 0;
 	}
-	rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2;
+	rdev->sectors = calc_num_sectors(rdev, sb->chunk_size);
 
-	if (rdev->size < sb->size && sb->level > 1)
+	if (rdev->sectors < sb->size * 2 && sb->level > 1)
 		/* "this cannot possibly happen" ... */
 		ret = -EINVAL;
 
@@ -812,7 +873,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->clevel[0] = 0;
 		mddev->layout = sb->layout;
 		mddev->raid_disks = sb->raid_disks;
-		mddev->size = sb->size;
+		mddev->dev_sectors = sb->size * 2;
 		mddev->events = ev1;
 		mddev->bitmap_offset = 0;
 		mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
@@ -926,7 +987,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
 	sb->ctime = mddev->ctime;
 	sb->level = mddev->level;
-	sb->size = mddev->size;
+	sb->size = mddev->dev_sectors / 2;
 	sb->raid_disks = mddev->raid_disks;
 	sb->md_minor = mddev->md_minor;
 	sb->not_persistent = 0;
@@ -1024,7 +1085,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 static unsigned long long
 super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 {
-	if (num_sectors && num_sectors < rdev->mddev->size * 2)
+	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
 		return 0; /* component must fit device */
 	if (rdev->mddev->bitmap_offset)
 		return 0; /* can't move bitmap */
@@ -1180,16 +1241,17 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 			ret = 0;
 	}
 	if (minor_version)
-		rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
+		rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) -
+			le64_to_cpu(sb->data_offset);
 	else
-		rdev->size = rdev->sb_start / 2;
-	if (rdev->size < le64_to_cpu(sb->data_size)/2)
+		rdev->sectors = rdev->sb_start;
+	if (rdev->sectors < le64_to_cpu(sb->data_size))
 		return -EINVAL;
-	rdev->size = le64_to_cpu(sb->data_size)/2;
+	rdev->sectors = le64_to_cpu(sb->data_size);
 	if (le32_to_cpu(sb->chunksize))
-		rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
+		rdev->sectors &= ~((sector_t)le32_to_cpu(sb->chunksize) - 1);
 
-	if (le64_to_cpu(sb->size) > rdev->size*2)
+	if (le64_to_cpu(sb->size) > rdev->sectors)
 		return -EINVAL;
 	return ret;
 }
@@ -1216,7 +1278,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->clevel[0] = 0;
 		mddev->layout = le32_to_cpu(sb->layout);
 		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
-		mddev->size = le64_to_cpu(sb->size)/2;
+		mddev->dev_sectors = le64_to_cpu(sb->size);
 		mddev->events = ev1;
 		mddev->bitmap_offset = 0;
 		mddev->default_bitmap_offset = 1024 >> 9;
@@ -1312,7 +1374,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
 
 	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
-	sb->size = cpu_to_le64(mddev->size<<1);
+	sb->size = cpu_to_le64(mddev->dev_sectors);
 
 	if (mddev->bitmap && mddev->bitmap_file == NULL) {
 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
@@ -1320,10 +1382,15 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	}
 
 	if (rdev->raid_disk >= 0 &&
-	    !test_bit(In_sync, &rdev->flags) &&
-	    rdev->recovery_offset > 0) {
-		sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
-		sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
+	    !test_bit(In_sync, &rdev->flags)) {
+		if (mddev->curr_resync_completed > rdev->recovery_offset)
+			rdev->recovery_offset = mddev->curr_resync_completed;
+		if (rdev->recovery_offset > 0) {
+			sb->feature_map |=
+				cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
+			sb->recovery_offset =
+				cpu_to_le64(rdev->recovery_offset);
+		}
 	}
 
 	if (mddev->reshape_position != MaxSector) {
@@ -1365,7 +1432,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 {
 	struct mdp_superblock_1 *sb;
 	sector_t max_sectors;
-	if (num_sectors && num_sectors < rdev->mddev->size * 2)
+	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
 		return 0; /* component must fit device */
 	if (rdev->sb_start < rdev->data_offset) {
 		/* minor versions 1 and 2; superblock before data */
@@ -1381,7 +1448,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 		sector_t sb_start;
 		sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
 		sb_start &= ~(sector_t)(4*2 - 1);
-		max_sectors = rdev->size * 2 + sb_start - rdev->sb_start;
+		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
 		if (!num_sectors || num_sectors > max_sectors)
 			num_sectors = max_sectors;
 		rdev->sb_start = sb_start;
@@ -1433,6 +1500,38 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
 
 static LIST_HEAD(pending_raid_disks);
 
+static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev)
+{
+	struct mdk_personality *pers = mddev->pers;
+	struct gendisk *disk = mddev->gendisk;
+	struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
+	struct blk_integrity *bi_mddev = blk_get_integrity(disk);
+
+	/* Data integrity passthrough not supported on RAID 4, 5 and 6 */
+	if (pers && pers->level >= 4 && pers->level <= 6)
+		return;
+
+	/* If rdev is integrity capable, register profile for mddev */
+	if (!bi_mddev && bi_rdev) {
+		if (blk_integrity_register(disk, bi_rdev))
+			printk(KERN_ERR "%s: %s Could not register integrity!\n",
+			       __func__, disk->disk_name);
+		else
+			printk(KERN_NOTICE "Enabling data integrity on %s\n",
+			       disk->disk_name);
+		return;
+	}
+
+	/* Check that mddev and rdev have matching profiles */
+	if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) {
+		printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__,
+		       disk->disk_name, rdev->bdev->bd_disk->disk_name);
+		printk(KERN_NOTICE "Disabling data integrity on %s\n",
+		       disk->disk_name);
+		blk_integrity_unregister(disk);
+	}
+}
+
 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 {
 	char b[BDEVNAME_SIZE];
@@ -1449,8 +1548,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 	if (find_rdev(mddev, rdev->bdev->bd_dev))
 		return -EEXIST;
 
-	/* make sure rdev->size exceeds mddev->size */
-	if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
+	/* make sure rdev->sectors exceeds mddev->dev_sectors */
+	if (rdev->sectors && (mddev->dev_sectors == 0 ||
+			rdev->sectors < mddev->dev_sectors)) {
 		if (mddev->pers) {
 			/* Cannot change size, so fail
 			 * If mddev->level <= 0, then we don't care
@@ -1459,7 +1559,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 			if (mddev->level > 0)
 				return -ENOSPC;
 		} else
-			mddev->size = rdev->size;
+			mddev->dev_sectors = rdev->sectors;
 	}
 
 	/* Verify rdev->desc_nr is unique.
@@ -1503,6 +1603,8 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 
 	/* May as well allow recovery to be retried once */
 	mddev->recovery_disabled = 0;
+
+	md_integrity_check(rdev, mddev);
 	return 0;
 
  fail:
@@ -1713,8 +1815,8 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
 static void print_rdev(mdk_rdev_t *rdev, int major_version)
 {
 	char b[BDEVNAME_SIZE];
-	printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
-		bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
+	printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
+		bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors,
 		test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
 		rdev->desc_nr);
 	if (rdev->sb_loaded) {
@@ -2153,7 +2255,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		return -EINVAL;
 	if (rdev->mddev->pers && rdev->raid_disk >= 0)
 		return -EBUSY;
-	if (rdev->size && rdev->mddev->external)
+	if (rdev->sectors && rdev->mddev->external)
 		/* Must set offset before size, so overlap checks
 		 * can be sane */
 		return -EBUSY;
@@ -2167,7 +2269,7 @@ __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
 static ssize_t
 rdev_size_show(mdk_rdev_t *rdev, char *page)
 {
-	return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
+	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
 }
 
 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
@@ -2180,34 +2282,52 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
 		return 1;
 }
 
+static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
+{
+	unsigned long long blocks;
+	sector_t new;
+
+	if (strict_strtoull(buf, 10, &blocks) < 0)
+		return -EINVAL;
+
+	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
+		return -EINVAL; /* sector conversion overflow */
+
+	new = blocks * 2;
+	if (new != blocks * 2)
+		return -EINVAL; /* unsigned long long to sector_t overflow */
+
+	*sectors = new;
+	return 0;
+}
+
 static ssize_t
 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 {
-	unsigned long long size;
-	unsigned long long oldsize = rdev->size;
 	mddev_t *my_mddev = rdev->mddev;
+	sector_t oldsectors = rdev->sectors;
+	sector_t sectors;
 
-	if (strict_strtoull(buf, 10, &size) < 0)
+	if (strict_blocks_to_sectors(buf, &sectors) < 0)
 		return -EINVAL;
 	if (my_mddev->pers && rdev->raid_disk >= 0) {
 		if (my_mddev->persistent) {
-			size = super_types[my_mddev->major_version].
-				rdev_size_change(rdev, size * 2);
-			if (!size)
+			sectors = super_types[my_mddev->major_version].
+				rdev_size_change(rdev, sectors);
+			if (!sectors)
 				return -EBUSY;
-		} else if (!size) {
-			size = (rdev->bdev->bd_inode->i_size >> 10);
-			size -= rdev->data_offset/2;
-		}
+		} else if (!sectors)
+			sectors = (rdev->bdev->bd_inode->i_size >> 9) -
+				rdev->data_offset;
 	}
-	if (size < my_mddev->size)
+	if (sectors < my_mddev->dev_sectors)
 		return -EINVAL; /* component must fit device */
 
-	rdev->size = size;
-	if (size > oldsize && my_mddev->external) {
+	rdev->sectors = sectors;
+	if (sectors > oldsectors && my_mddev->external) {
 		/* need to check that all other rdevs with the same ->bdev
 		 * do not overlap.  We need to unlock the mddev to avoid
-		 * a deadlock.  We have already changed rdev->size, and if
+		 * a deadlock.  We have already changed rdev->sectors, and if
 		 * we have to change it back, we will have the lock again.
 		 */
 		mddev_t *mddev;
@@ -2223,9 +2343,9 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			if (test_bit(AllReserved, &rdev2->flags) ||
 			    (rdev->bdev == rdev2->bdev &&
 			     rdev != rdev2 &&
-			     overlaps(rdev->data_offset, rdev->size * 2,
+			     overlaps(rdev->data_offset, rdev->sectors,
 				      rdev2->data_offset,
-				      rdev2->size * 2))) {
+				      rdev2->sectors))) {
 				overlap = 1;
 				break;
 			}
@@ -2239,11 +2359,11 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		if (overlap) {
 			/* Someone else could have slipped in a size
 			 * change here, but doing so is just silly.
-			 * We put oldsize back because we *know* it is
+			 * We put oldsectors back because we *know* it is
 			 * safe, and trust userspace not to race with
 			 * itself
 			 */
-			rdev->size = oldsize;
+			rdev->sectors = oldsectors;
 			return -EBUSY;
 		}
 	}
@@ -2547,18 +2667,101 @@ level_show(mddev_t *mddev, char *page)
 static ssize_t
 level_store(mddev_t *mddev, const char *buf, size_t len)
 {
+	char level[16];
 	ssize_t rv = len;
-	if (mddev->pers)
+	struct mdk_personality *pers;
+	void *priv;
+
+	if (mddev->pers == NULL) {
+		if (len == 0)
+			return 0;
+		if (len >= sizeof(mddev->clevel))
+			return -ENOSPC;
+		strncpy(mddev->clevel, buf, len);
+		if (mddev->clevel[len-1] == '\n')
+			len--;
+		mddev->clevel[len] = 0;
+		mddev->level = LEVEL_NONE;
+		return rv;
+	}
+
+	/* request to change the personality. Need to ensure:
+	 *  - array is not engaged in resync/recovery/reshape
+	 *  - old personality can be suspended
+	 *  - new personality will access other array.
+	 */
+
+	if (mddev->sync_thread || mddev->reshape_position != MaxSector)
 		return -EBUSY;
-	if (len == 0)
-		return 0;
-	if (len >= sizeof(mddev->clevel))
-		return -ENOSPC;
-	strncpy(mddev->clevel, buf, len);
-	if (mddev->clevel[len-1] == '\n')
+
+	if (!mddev->pers->quiesce) {
+		printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
+		       mdname(mddev), mddev->pers->name);
+		return -EINVAL;
+	}
+
+	/* Now find the new personality */
+	if (len == 0 || len >= sizeof(level))
+		return -EINVAL;
+	strncpy(level, buf, len);
+	if (level[len-1] == '\n')
 		len--;
-	mddev->clevel[len] = 0;
-	mddev->level = LEVEL_NONE;
+	level[len] = 0;
+
+	request_module("md-%s", level);
+	spin_lock(&pers_lock);
+	pers = find_pers(LEVEL_NONE, level);
+	if (!pers || !try_module_get(pers->owner)) {
+		spin_unlock(&pers_lock);
+		printk(KERN_WARNING "md: personality %s not loaded\n", level);
+		return -EINVAL;
+	}
+	spin_unlock(&pers_lock);
+
+	if (pers == mddev->pers) {
+		/* Nothing to do! */
+		module_put(pers->owner);
+		return rv;
+	}
+	if (!pers->takeover) {
+		module_put(pers->owner);
+		printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
+		       mdname(mddev), level);
+		return -EINVAL;
+	}
+
+	/* ->takeover must set new_* and/or delta_disks
+	 * if it succeeds, and may set them when it fails.
+	 */
+	priv = pers->takeover(mddev);
+	if (IS_ERR(priv)) {
+		mddev->new_level = mddev->level;
+		mddev->new_layout = mddev->layout;
+		mddev->new_chunk = mddev->chunk_size;
+		mddev->raid_disks -= mddev->delta_disks;
+		mddev->delta_disks = 0;
+		module_put(pers->owner);
+		printk(KERN_WARNING "md: %s: %s would not accept array\n",
+		       mdname(mddev), level);
+		return PTR_ERR(priv);
+	}
+
+	/* Looks like we have a winner */
+	mddev_suspend(mddev);
+	mddev->pers->stop(mddev);
+	module_put(mddev->pers->owner);
+	mddev->pers = pers;
+	mddev->private = priv;
+	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+	mddev->level = mddev->new_level;
+	mddev->layout = mddev->new_layout;
+	mddev->chunk_size = mddev->new_chunk;
+	mddev->delta_disks = 0;
+	pers->run(mddev);
+	mddev_resume(mddev);
+	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	md_wakeup_thread(mddev->thread);
 	return rv;
 }
 
@@ -2586,12 +2789,18 @@ layout_store(mddev_t *mddev, const char *buf, size_t len)
 	if (!*buf || (*e && *e != '\n'))
 		return -EINVAL;
 
-	if (mddev->pers)
-		return -EBUSY;
-	if (mddev->reshape_position != MaxSector)
+	if (mddev->pers) {
+		int err;
+		if (mddev->pers->reconfig == NULL)
+			return -EBUSY;
+		err = mddev->pers->reconfig(mddev, n, -1);
+		if (err)
+			return err;
+	} else {
 		mddev->new_layout = n;
-	else
-		mddev->layout = n;
+		if (mddev->reshape_position == MaxSector)
+			mddev->layout = n;
+	}
 	return len;
 }
 static struct md_sysfs_entry md_layout =
@@ -2648,19 +2857,24 @@ chunk_size_show(mddev_t *mddev, char *page)
 static ssize_t
 chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
 {
-	/* can only set chunk_size if array is not yet active */
 	char *e;
 	unsigned long n = simple_strtoul(buf, &e, 10);
 
 	if (!*buf || (*e && *e != '\n'))
 		return -EINVAL;
 
-	if (mddev->pers)
-		return -EBUSY;
-	else if (mddev->reshape_position != MaxSector)
+	if (mddev->pers) {
+		int err;
+		if (mddev->pers->reconfig == NULL)
+			return -EBUSY;
+		err = mddev->pers->reconfig(mddev, -1, n);
+		if (err)
+			return err;
+	} else {
 		mddev->new_chunk = n;
-	else
-		mddev->chunk_size = n;
+		if (mddev->reshape_position == MaxSector)
+			mddev->chunk_size = n;
+	}
 	return len;
 }
 static struct md_sysfs_entry md_chunk_size =
@@ -2669,6 +2883,8 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
 static ssize_t
 resync_start_show(mddev_t *mddev, char *page)
 {
+	if (mddev->recovery_cp == MaxSector)
+		return sprintf(page, "none\n");
 	return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
 }
 
@@ -2766,7 +2982,7 @@ array_state_show(mddev_t *mddev, char *page)
 	else {
 		if (list_empty(&mddev->disks) &&
 		    mddev->raid_disks == 0 &&
-		    mddev->size == 0)
+		    mddev->dev_sectors == 0)
 			st = clear;
 		else
 			st = inactive;
@@ -2973,7 +3189,8 @@ __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
 static ssize_t
 size_show(mddev_t *mddev, char *page)
 {
-	return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
+	return sprintf(page, "%llu\n",
+		(unsigned long long)mddev->dev_sectors / 2);
 }
 
 static int update_size(mddev_t *mddev, sector_t num_sectors);
@@ -2985,20 +3202,18 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
 	 * not increase it (except from 0).
 	 * If array is active, we can try an on-line resize
 	 */
-	char *e;
-	int err = 0;
-	unsigned long long size = simple_strtoull(buf, &e, 10);
-	if (!*buf || *buf == '\n' ||
-	    (*e && *e != '\n'))
-		return -EINVAL;
+	sector_t sectors;
+	int err = strict_blocks_to_sectors(buf, &sectors);
 
+	if (err < 0)
+		return err;
 	if (mddev->pers) {
-		err = update_size(mddev, size * 2);
+		err = update_size(mddev, sectors);
 		md_update_sb(mddev, 1);
 	} else {
-		if (mddev->size == 0 ||
-		    mddev->size > size)
-			mddev->size = size;
+		if (mddev->dev_sectors == 0 ||
+		    mddev->dev_sectors > sectors)
+			mddev->dev_sectors = sectors;
 		else
 			err = -ENOSPC;
 	}
@@ -3251,6 +3466,8 @@ static ssize_t
3251sync_speed_show(mddev_t *mddev, char *page) 3466sync_speed_show(mddev_t *mddev, char *page)
3252{ 3467{
3253 unsigned long resync, dt, db; 3468 unsigned long resync, dt, db;
3469 if (mddev->curr_resync == 0)
3470 return sprintf(page, "none\n");
3254 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 3471 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
3255 dt = (jiffies - mddev->resync_mark) / HZ; 3472 dt = (jiffies - mddev->resync_mark) / HZ;
3256 if (!dt) dt++; 3473 if (!dt) dt++;
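As a worked example of the figures involved: resync above is a count of 512-byte sectors, so the rate the rest of this function presumably derives from it (sectors of progress since resync_mark, divided by dt and then by 2) comes out in KiB/s; 20480 sectors of progress over a 10-second window reports 20480 / 10 / 2 = 1024 KB/sec, and the new early return prints "none" instead of a meaningless rate when no resync is running.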
@@ -3263,15 +3480,15 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
3263static ssize_t 3480static ssize_t
3264sync_completed_show(mddev_t *mddev, char *page) 3481sync_completed_show(mddev_t *mddev, char *page)
3265{ 3482{
3266 unsigned long max_blocks, resync; 3483 unsigned long max_sectors, resync;
3267 3484
3268 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3485 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3269 max_blocks = mddev->resync_max_sectors; 3486 max_sectors = mddev->resync_max_sectors;
3270 else 3487 else
3271 max_blocks = mddev->size << 1; 3488 max_sectors = mddev->dev_sectors;
3272 3489
3273 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); 3490 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
3274 return sprintf(page, "%lu / %lu\n", resync, max_blocks); 3491 return sprintf(page, "%lu / %lu\n", resync, max_sectors);
3275} 3492}
3276 3493
3277static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 3494static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
@@ -3431,6 +3648,57 @@ static struct md_sysfs_entry md_reshape_position =
3431__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 3648__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
3432 reshape_position_store); 3649 reshape_position_store);
3433 3650
3651static ssize_t
3652array_size_show(mddev_t *mddev, char *page)
3653{
3654 if (mddev->external_size)
3655 return sprintf(page, "%llu\n",
3656 (unsigned long long)mddev->array_sectors/2);
3657 else
3658 return sprintf(page, "default\n");
3659}
3660
3661static ssize_t
3662array_size_store(mddev_t *mddev, const char *buf, size_t len)
3663{
3664 sector_t sectors;
3665
3666 if (strncmp(buf, "default", 7) == 0) {
3667 if (mddev->pers)
3668 sectors = mddev->pers->size(mddev, 0, 0);
3669 else
3670 sectors = mddev->array_sectors;
3671
3672 mddev->external_size = 0;
3673 } else {
3674 if (strict_blocks_to_sectors(buf, &sectors) < 0)
3675 return -EINVAL;
3676 if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
3677 return -EINVAL;
3678
3679 mddev->external_size = 1;
3680 }
3681
3682 mddev->array_sectors = sectors;
3683 set_capacity(mddev->gendisk, mddev->array_sectors);
3684 if (mddev->pers) {
3685 struct block_device *bdev = bdget_disk(mddev->gendisk, 0);
3686
3687 if (bdev) {
3688 mutex_lock(&bdev->bd_inode->i_mutex);
3689 i_size_write(bdev->bd_inode,
3690 (loff_t)mddev->array_sectors << 9);
3691 mutex_unlock(&bdev->bd_inode->i_mutex);
3692 bdput(bdev);
3693 }
3694 }
3695
3696 return len;
3697}
3698
3699static struct md_sysfs_entry md_array_size =
3700__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
3701 array_size_store);
3434 3702
3435static struct attribute *md_default_attrs[] = { 3703static struct attribute *md_default_attrs[] = {
3436 &md_level.attr, 3704 &md_level.attr,
@@ -3444,6 +3712,7 @@ static struct attribute *md_default_attrs[] = {
3444 &md_safe_delay.attr, 3712 &md_safe_delay.attr,
3445 &md_array_state.attr, 3713 &md_array_state.attr,
3446 &md_reshape_position.attr, 3714 &md_reshape_position.attr,
3715 &md_array_size.attr,
3447 NULL, 3716 NULL,
3448}; 3717};
3449 3718
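The new array_size attribute accepts either a block count (1K units, parsed with strict_blocks_to_sectors) or the literal "default", which hands sizing back to the personality. A small user-space sketch, assuming the usual /sys/block/mdX/md/ location for these attributes and an array larger than the value written:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char *clamp = "1073741824\n";     /* 2^30 1K blocks = 1 TiB */
        int fd = open("/sys/block/md0/md/array_size", O_WRONLY);

        if (fd < 0) {
                perror("array_size");
                return 1;
        }
        if (write(fd, clamp, strlen(clamp)) < 0) /* expose only 1 TiB */
                perror("clamp");
        if (write(fd, "default\n", 8) < 0)       /* restore md's own sizing */
                perror("default");
        close(fd);
        return 0;
}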
@@ -3602,10 +3871,12 @@ static int md_alloc(dev_t dev, char *name)
3602 mddev_put(mddev); 3871 mddev_put(mddev);
3603 return -ENOMEM; 3872 return -ENOMEM;
3604 } 3873 }
3874 mddev->queue->queuedata = mddev;
3875
3605 /* Can be unlocked because the queue is new: no concurrency */ 3876 /* Can be unlocked because the queue is new: no concurrency */
3606 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue); 3877 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
3607 3878
3608 blk_queue_make_request(mddev->queue, md_fail_request); 3879 blk_queue_make_request(mddev->queue, md_make_request);
3609 3880
3610 disk = alloc_disk(1 << shift); 3881 disk = alloc_disk(1 << shift);
3611 if (!disk) { 3882 if (!disk) {
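With queuedata now set as soon as the queue exists, the make_request function can stay pointed at md_make_request for the life of the device instead of being swapped from md_fail_request to the personality's handler in do_md_run (that swap is deleted further down). A rough sketch of such a dispatcher, on the assumption that the real md_make_request handles the new suspended/active_io machinery more carefully:

static int md_make_request(struct request_queue *q, struct bio *bio)
{
        mddev_t *mddev = q->queuedata;
        int rv;

        if (mddev == NULL || mddev->pers == NULL) {
                /* nothing assembled yet: behave like the old md_fail_request */
                bio_io_error(bio);
                return 0;
        }
        atomic_inc(&mddev->active_io);
        rv = mddev->pers->make_request(q, bio);
        if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
                wake_up(&mddev->sb_wait);
        return rv;
}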
@@ -3731,13 +4002,13 @@ static int do_md_run(mddev_t * mddev)
3731 list_for_each_entry(rdev, &mddev->disks, same_set) { 4002 list_for_each_entry(rdev, &mddev->disks, same_set) {
3732 if (test_bit(Faulty, &rdev->flags)) 4003 if (test_bit(Faulty, &rdev->flags))
3733 continue; 4004 continue;
3734 if (rdev->size < chunk_size / 1024) { 4005 if (rdev->sectors < chunk_size / 512) {
3735 printk(KERN_WARNING 4006 printk(KERN_WARNING
3736 "md: Dev %s smaller than chunk_size:" 4007 "md: Dev %s smaller than chunk_size:"
3737 " %lluk < %dk\n", 4008 " %llu < %d\n",
3738 bdevname(rdev->bdev,b), 4009 bdevname(rdev->bdev,b),
3739 (unsigned long long)rdev->size, 4010 (unsigned long long)rdev->sectors,
3740 chunk_size / 1024); 4011 chunk_size / 512);
3741 return -EINVAL; 4012 return -EINVAL;
3742 } 4013 }
3743 } 4014 }
@@ -3761,11 +4032,11 @@ static int do_md_run(mddev_t * mddev)
3761 4032
3762 /* perform some consistency tests on the device. 4033 /* perform some consistency tests on the device.
3763 * We don't want the data to overlap the metadata, 4034 * We don't want the data to overlap the metadata,
3764 * Internal Bitmap issues has handled elsewhere. 4035 * Internal Bitmap issues have been handled elsewhere.
3765 */ 4036 */
3766 if (rdev->data_offset < rdev->sb_start) { 4037 if (rdev->data_offset < rdev->sb_start) {
3767 if (mddev->size && 4038 if (mddev->dev_sectors &&
3768 rdev->data_offset + mddev->size*2 4039 rdev->data_offset + mddev->dev_sectors
3769 > rdev->sb_start) { 4040 > rdev->sb_start) {
3770 printk("md: %s: data overlaps metadata\n", 4041 printk("md: %s: data overlaps metadata\n",
3771 mdname(mddev)); 4042 mdname(mddev));
@@ -3801,9 +4072,16 @@ static int do_md_run(mddev_t * mddev)
3801 } 4072 }
3802 mddev->pers = pers; 4073 mddev->pers = pers;
3803 spin_unlock(&pers_lock); 4074 spin_unlock(&pers_lock);
3804 mddev->level = pers->level; 4075 if (mddev->level != pers->level) {
4076 mddev->level = pers->level;
4077 mddev->new_level = pers->level;
4078 }
3805 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 4079 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3806 4080
4081 if (pers->level >= 4 && pers->level <= 6)
4082 /* Cannot support integrity (yet) */
4083 blk_integrity_unregister(mddev->gendisk);
4084
3807 if (mddev->reshape_position != MaxSector && 4085 if (mddev->reshape_position != MaxSector &&
3808 pers->start_reshape == NULL) { 4086 pers->start_reshape == NULL) {
3809 /* This personality cannot handle reshaping... */ 4087 /* This personality cannot handle reshaping... */
@@ -3843,7 +4121,9 @@ static int do_md_run(mddev_t * mddev)
3843 } 4121 }
3844 4122
3845 mddev->recovery = 0; 4123 mddev->recovery = 0;
3846 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 4124 /* may be over-ridden by personality */
4125 mddev->resync_max_sectors = mddev->dev_sectors;
4126
3847 mddev->barriers_work = 1; 4127 mddev->barriers_work = 1;
3848 mddev->ok_start_degraded = start_dirty_degraded; 4128 mddev->ok_start_degraded = start_dirty_degraded;
3849 4129
@@ -3853,7 +4133,17 @@ static int do_md_run(mddev_t * mddev)
3853 err = mddev->pers->run(mddev); 4133 err = mddev->pers->run(mddev);
3854 if (err) 4134 if (err)
3855 printk(KERN_ERR "md: pers->run() failed ...\n"); 4135 printk(KERN_ERR "md: pers->run() failed ...\n");
3856 else if (mddev->pers->sync_request) { 4136 else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
4137 WARN_ONCE(!mddev->external_size, "%s: default size too small,"
4138 " but 'external_size' not in effect?\n", __func__);
4139 printk(KERN_ERR
4140 "md: invalid array_size %llu > default size %llu\n",
4141 (unsigned long long)mddev->array_sectors / 2,
4142 (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
4143 err = -EINVAL;
4144 mddev->pers->stop(mddev);
4145 }
4146 if (err == 0 && mddev->pers->sync_request) {
3857 err = bitmap_create(mddev); 4147 err = bitmap_create(mddev);
3858 if (err) { 4148 if (err) {
3859 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 4149 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
@@ -3899,16 +4189,6 @@ static int do_md_run(mddev_t * mddev)
3899 4189
3900 set_capacity(disk, mddev->array_sectors); 4190 set_capacity(disk, mddev->array_sectors);
3901 4191
3902 /* If we call blk_queue_make_request here, it will
3903 * re-initialise max_sectors etc which may have been
3904 * refined inside -> run. So just set the bits we need to set.
3905 * Most initialisation happended when we called
3906 * blk_queue_make_request(..., md_fail_request)
3907 * earlier.
3908 */
3909 mddev->queue->queuedata = mddev;
3910 mddev->queue->make_request_fn = mddev->pers->make_request;
3911
3912 /* If there is a partially-recovered drive we need to 4192 /* If there is a partially-recovered drive we need to
3913 * start recovery here. If we leave it to md_check_recovery, 4193 * start recovery here. If we leave it to md_check_recovery,
3914 * it will remove the drives and not do the right thing 4194 * it will remove the drives and not do the right thing
@@ -4038,7 +4318,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4038 md_super_wait(mddev); 4318 md_super_wait(mddev);
4039 if (mddev->ro) 4319 if (mddev->ro)
4040 set_disk_ro(disk, 0); 4320 set_disk_ro(disk, 0);
4041 blk_queue_make_request(mddev->queue, md_fail_request); 4321
4042 mddev->pers->stop(mddev); 4322 mddev->pers->stop(mddev);
4043 mddev->queue->merge_bvec_fn = NULL; 4323 mddev->queue->merge_bvec_fn = NULL;
4044 mddev->queue->unplug_fn = NULL; 4324 mddev->queue->unplug_fn = NULL;
@@ -4095,7 +4375,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4095 export_array(mddev); 4375 export_array(mddev);
4096 4376
4097 mddev->array_sectors = 0; 4377 mddev->array_sectors = 0;
4098 mddev->size = 0; 4378 mddev->external_size = 0;
4379 mddev->dev_sectors = 0;
4099 mddev->raid_disks = 0; 4380 mddev->raid_disks = 0;
4100 mddev->recovery_cp = 0; 4381 mddev->recovery_cp = 0;
4101 mddev->resync_min = 0; 4382 mddev->resync_min = 0;
@@ -4135,6 +4416,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4135 printk(KERN_INFO "md: %s switched to read-only mode.\n", 4416 printk(KERN_INFO "md: %s switched to read-only mode.\n",
4136 mdname(mddev)); 4417 mdname(mddev));
4137 err = 0; 4418 err = 0;
4419 blk_integrity_unregister(disk);
4138 md_new_event(mddev); 4420 md_new_event(mddev);
4139 sysfs_notify_dirent(mddev->sysfs_state); 4421 sysfs_notify_dirent(mddev->sysfs_state);
4140out: 4422out:
@@ -4300,8 +4582,8 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
4300 info.patch_version = MD_PATCHLEVEL_VERSION; 4582 info.patch_version = MD_PATCHLEVEL_VERSION;
4301 info.ctime = mddev->ctime; 4583 info.ctime = mddev->ctime;
4302 info.level = mddev->level; 4584 info.level = mddev->level;
4303 info.size = mddev->size; 4585 info.size = mddev->dev_sectors / 2;
4304 if (info.size != mddev->size) /* overflow */ 4586 if (info.size != mddev->dev_sectors / 2) /* overflow */
4305 info.size = -1; 4587 info.size = -1;
4306 info.nr_disks = nr; 4588 info.nr_disks = nr;
4307 info.raid_disks = mddev->raid_disks; 4589 info.raid_disks = mddev->raid_disks;
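The overflow check matters because mdu_array_info_t.size is a 32-bit int counting 1K blocks: a component-device size of 2 TiB is 2^31 KiB, which no longer fits, so the ioctl reports -1 and callers have to fall back to per-device sizes or the sysfs attributes above.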
@@ -4480,6 +4762,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
4480 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 4762 clear_bit(In_sync, &rdev->flags); /* just to be sure */
4481 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 4763 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
4482 set_bit(WriteMostly, &rdev->flags); 4764 set_bit(WriteMostly, &rdev->flags);
4765 else
4766 clear_bit(WriteMostly, &rdev->flags);
4483 4767
4484 rdev->raid_disk = -1; 4768 rdev->raid_disk = -1;
4485 err = bind_rdev_to_array(rdev, mddev); 4769 err = bind_rdev_to_array(rdev, mddev);
@@ -4543,7 +4827,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
4543 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 4827 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4544 } else 4828 } else
4545 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 4829 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
4546 rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; 4830 rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size);
4547 4831
4548 err = bind_rdev_to_array(rdev, mddev); 4832 err = bind_rdev_to_array(rdev, mddev);
4549 if (err) { 4833 if (err) {
@@ -4613,7 +4897,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
4613 else 4897 else
4614 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 4898 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4615 4899
4616 rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; 4900 rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size);
4617 4901
4618 if (test_bit(Faulty, &rdev->flags)) { 4902 if (test_bit(Faulty, &rdev->flags)) {
4619 printk(KERN_WARNING 4903 printk(KERN_WARNING
@@ -4749,7 +5033,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
4749 5033
4750 mddev->level = info->level; 5034 mddev->level = info->level;
4751 mddev->clevel[0] = 0; 5035 mddev->clevel[0] = 0;
4752 mddev->size = info->size; 5036 mddev->dev_sectors = 2 * (sector_t)info->size;
4753 mddev->raid_disks = info->raid_disks; 5037 mddev->raid_disks = info->raid_disks;
4754 /* don't set md_minor, it is determined by which /dev/md* was 5038 /* don't set md_minor, it is determined by which /dev/md* was
 4755 * opened 5039 * opened
@@ -4788,6 +5072,17 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
4788 return 0; 5072 return 0;
4789} 5073}
4790 5074
5075void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors)
5076{
5077 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
5078
5079 if (mddev->external_size)
5080 return;
5081
5082 mddev->array_sectors = array_sectors;
5083}
5084EXPORT_SYMBOL(md_set_array_sectors);
5085
4791static int update_size(mddev_t *mddev, sector_t num_sectors) 5086static int update_size(mddev_t *mddev, sector_t num_sectors)
4792{ 5087{
4793 mdk_rdev_t *rdev; 5088 mdk_rdev_t *rdev;
@@ -4814,8 +5109,7 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
4814 */ 5109 */
4815 return -EBUSY; 5110 return -EBUSY;
4816 list_for_each_entry(rdev, &mddev->disks, same_set) { 5111 list_for_each_entry(rdev, &mddev->disks, same_set) {
4817 sector_t avail; 5112 sector_t avail = rdev->sectors;
4818 avail = rdev->size * 2;
4819 5113
4820 if (fit && (num_sectors == 0 || num_sectors > avail)) 5114 if (fit && (num_sectors == 0 || num_sectors > avail))
4821 num_sectors = avail; 5115 num_sectors = avail;
@@ -4887,12 +5181,18 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
4887 ) 5181 )
4888 return -EINVAL; 5182 return -EINVAL;
4889 /* Check there is only one change */ 5183 /* Check there is only one change */
4890 if (info->size >= 0 && mddev->size != info->size) cnt++; 5184 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
4891 if (mddev->raid_disks != info->raid_disks) cnt++; 5185 cnt++;
4892 if (mddev->layout != info->layout) cnt++; 5186 if (mddev->raid_disks != info->raid_disks)
4893 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; 5187 cnt++;
4894 if (cnt == 0) return 0; 5188 if (mddev->layout != info->layout)
4895 if (cnt > 1) return -EINVAL; 5189 cnt++;
5190 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
5191 cnt++;
5192 if (cnt == 0)
5193 return 0;
5194 if (cnt > 1)
5195 return -EINVAL;
4896 5196
4897 if (mddev->layout != info->layout) { 5197 if (mddev->layout != info->layout) {
4898 /* Change layout 5198 /* Change layout
@@ -4904,7 +5204,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
4904 else 5204 else
4905 return mddev->pers->reconfig(mddev, info->layout, -1); 5205 return mddev->pers->reconfig(mddev, info->layout, -1);
4906 } 5206 }
4907 if (info->size >= 0 && mddev->size != info->size) 5207 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
4908 rv = update_size(mddev, (sector_t)info->size * 2); 5208 rv = update_size(mddev, (sector_t)info->size * 2);
4909 5209
4910 if (mddev->raid_disks != info->raid_disks) 5210 if (mddev->raid_disks != info->raid_disks)
@@ -5331,6 +5631,8 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
5331 5631
5332void md_unregister_thread(mdk_thread_t *thread) 5632void md_unregister_thread(mdk_thread_t *thread)
5333{ 5633{
5634 if (!thread)
5635 return;
5334 dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 5636 dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
5335 5637
5336 kthread_stop(thread->tsk); 5638 kthread_stop(thread->tsk);
@@ -5404,7 +5706,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
5404 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 5706 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5405 max_blocks = mddev->resync_max_sectors >> 1; 5707 max_blocks = mddev->resync_max_sectors >> 1;
5406 else 5708 else
5407 max_blocks = mddev->size; 5709 max_blocks = mddev->dev_sectors / 2;
5408 5710
5409 /* 5711 /*
5410 * Should not happen. 5712 * Should not happen.
@@ -5537,7 +5839,7 @@ struct mdstat_info {
5537static int md_seq_show(struct seq_file *seq, void *v) 5839static int md_seq_show(struct seq_file *seq, void *v)
5538{ 5840{
5539 mddev_t *mddev = v; 5841 mddev_t *mddev = v;
5540 sector_t size; 5842 sector_t sectors;
5541 mdk_rdev_t *rdev; 5843 mdk_rdev_t *rdev;
5542 struct mdstat_info *mi = seq->private; 5844 struct mdstat_info *mi = seq->private;
5543 struct bitmap *bitmap; 5845 struct bitmap *bitmap;
@@ -5573,7 +5875,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
5573 seq_printf(seq, " %s", mddev->pers->name); 5875 seq_printf(seq, " %s", mddev->pers->name);
5574 } 5876 }
5575 5877
5576 size = 0; 5878 sectors = 0;
5577 list_for_each_entry(rdev, &mddev->disks, same_set) { 5879 list_for_each_entry(rdev, &mddev->disks, same_set) {
5578 char b[BDEVNAME_SIZE]; 5880 char b[BDEVNAME_SIZE];
5579 seq_printf(seq, " %s[%d]", 5881 seq_printf(seq, " %s[%d]",
@@ -5585,7 +5887,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
5585 continue; 5887 continue;
5586 } else if (rdev->raid_disk < 0) 5888 } else if (rdev->raid_disk < 0)
5587 seq_printf(seq, "(S)"); /* spare */ 5889 seq_printf(seq, "(S)"); /* spare */
5588 size += rdev->size; 5890 sectors += rdev->sectors;
5589 } 5891 }
5590 5892
5591 if (!list_empty(&mddev->disks)) { 5893 if (!list_empty(&mddev->disks)) {
@@ -5595,7 +5897,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
5595 mddev->array_sectors / 2); 5897 mddev->array_sectors / 2);
5596 else 5898 else
5597 seq_printf(seq, "\n %llu blocks", 5899 seq_printf(seq, "\n %llu blocks",
5598 (unsigned long long)size); 5900 (unsigned long long)sectors / 2);
5599 } 5901 }
5600 if (mddev->persistent) { 5902 if (mddev->persistent) {
5601 if (mddev->major_version != 0 || 5903 if (mddev->major_version != 0 ||
@@ -5722,19 +6024,19 @@ int unregister_md_personality(struct mdk_personality *p)
5722 return 0; 6024 return 0;
5723} 6025}
5724 6026
5725static int is_mddev_idle(mddev_t *mddev) 6027static int is_mddev_idle(mddev_t *mddev, int init)
5726{ 6028{
5727 mdk_rdev_t * rdev; 6029 mdk_rdev_t * rdev;
5728 int idle; 6030 int idle;
5729 long curr_events; 6031 int curr_events;
5730 6032
5731 idle = 1; 6033 idle = 1;
5732 rcu_read_lock(); 6034 rcu_read_lock();
5733 rdev_for_each_rcu(rdev, mddev) { 6035 rdev_for_each_rcu(rdev, mddev) {
5734 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 6036 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
5735 curr_events = part_stat_read(&disk->part0, sectors[0]) + 6037 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
5736 part_stat_read(&disk->part0, sectors[1]) - 6038 (int)part_stat_read(&disk->part0, sectors[1]) -
5737 atomic_read(&disk->sync_io); 6039 atomic_read(&disk->sync_io);
5738 /* sync IO will cause sync_io to increase before the disk_stats 6040 /* sync IO will cause sync_io to increase before the disk_stats
5739 * as sync_io is counted when a request starts, and 6041 * as sync_io is counted when a request starts, and
5740 * disk_stats is counted when it completes. 6042 * disk_stats is counted when it completes.
@@ -5757,7 +6059,7 @@ static int is_mddev_idle(mddev_t *mddev)
5757 * always make curr_events less than last_events. 6059 * always make curr_events less than last_events.
5758 * 6060 *
5759 */ 6061 */
5760 if (curr_events - rdev->last_events > 4096) { 6062 if (init || curr_events - rdev->last_events > 64) {
5761 rdev->last_events = curr_events; 6063 rdev->last_events = curr_events;
5762 idle = 0; 6064 idle = 0;
5763 } 6065 }
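Narrowing curr_events and last_events to int (and dropping the threshold to 64 sectors) works because the comparison is done on a signed difference, which stays meaningful even after the underlying 32-bit counters wrap. A self-contained illustration of that property:

#include <assert.h>

int main(void)
{
        unsigned int last = 0xffffffe0u;        /* just before a 32-bit wrap */
        unsigned int curr = 0x00000030u;        /* 0x50 sectors later, post-wrap */

        assert(!(curr > last));                 /* unsigned compare: "went backwards" */
        assert((int)(curr - last) == 0x50);     /* signed delta: 80 sectors of progress */
        assert((int)(curr - last) > 64);        /* so the array is not idle */
        return 0;
}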
@@ -5980,10 +6282,10 @@ void md_do_sync(mddev_t *mddev)
5980 j = mddev->recovery_cp; 6282 j = mddev->recovery_cp;
5981 6283
5982 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 6284 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5983 max_sectors = mddev->size << 1; 6285 max_sectors = mddev->dev_sectors;
5984 else { 6286 else {
5985 /* recovery follows the physical size of devices */ 6287 /* recovery follows the physical size of devices */
5986 max_sectors = mddev->size << 1; 6288 max_sectors = mddev->dev_sectors;
5987 j = MaxSector; 6289 j = MaxSector;
5988 list_for_each_entry(rdev, &mddev->disks, same_set) 6290 list_for_each_entry(rdev, &mddev->disks, same_set)
5989 if (rdev->raid_disk >= 0 && 6291 if (rdev->raid_disk >= 0 &&
@@ -6000,7 +6302,7 @@ void md_do_sync(mddev_t *mddev)
6000 "(but not more than %d KB/sec) for %s.\n", 6302 "(but not more than %d KB/sec) for %s.\n",
6001 speed_max(mddev), desc); 6303 speed_max(mddev), desc);
6002 6304
6003 is_mddev_idle(mddev); /* this also initializes IO event counters */ 6305 is_mddev_idle(mddev, 1); /* this initializes IO event counters */
6004 6306
6005 io_sectors = 0; 6307 io_sectors = 0;
6006 for (m = 0; m < SYNC_MARKS; m++) { 6308 for (m = 0; m < SYNC_MARKS; m++) {
@@ -6040,6 +6342,18 @@ void md_do_sync(mddev_t *mddev)
6040 } 6342 }
6041 if (kthread_should_stop()) 6343 if (kthread_should_stop())
6042 goto interrupted; 6344 goto interrupted;
6345
6346 if (mddev->curr_resync > mddev->curr_resync_completed &&
6347 (mddev->curr_resync - mddev->curr_resync_completed)
6348 > (max_sectors >> 4)) {
6349 /* time to update curr_resync_completed */
6350 blk_unplug(mddev->queue);
6351 wait_event(mddev->recovery_wait,
6352 atomic_read(&mddev->recovery_active) == 0);
6353 mddev->curr_resync_completed =
6354 mddev->curr_resync;
6355 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6356 }
6043 sectors = mddev->pers->sync_request(mddev, j, &skipped, 6357 sectors = mddev->pers->sync_request(mddev, j, &skipped,
6044 currspeed < speed_min(mddev)); 6358 currspeed < speed_min(mddev));
6045 if (sectors == 0) { 6359 if (sectors == 0) {
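As a sense of scale for the new checkpointing above: max_sectors >> 4 refreshes curr_resync_completed every 1/16th of the sync range, so a 1 TiB resync (2^31 sectors) waits for outstanding IO and records progress roughly every 2^27 sectors, i.e. every 64 GiB.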
@@ -6102,7 +6416,7 @@ void md_do_sync(mddev_t *mddev)
6102 6416
6103 if (currspeed > speed_min(mddev)) { 6417 if (currspeed > speed_min(mddev)) {
6104 if ((currspeed > speed_max(mddev)) || 6418 if ((currspeed > speed_max(mddev)) ||
6105 !is_mddev_idle(mddev)) { 6419 !is_mddev_idle(mddev, 0)) {
6106 msleep(500); 6420 msleep(500);
6107 goto repeat; 6421 goto repeat;
6108 } 6422 }
@@ -6173,6 +6487,8 @@ static int remove_and_add_spares(mddev_t *mddev)
6173 mdk_rdev_t *rdev; 6487 mdk_rdev_t *rdev;
6174 int spares = 0; 6488 int spares = 0;
6175 6489
6490 mddev->curr_resync_completed = 0;
6491
6176 list_for_each_entry(rdev, &mddev->disks, same_set) 6492 list_for_each_entry(rdev, &mddev->disks, same_set)
6177 if (rdev->raid_disk >= 0 && 6493 if (rdev->raid_disk >= 0 &&
6178 !test_bit(Blocked, &rdev->flags) && 6494 !test_bit(Blocked, &rdev->flags) &&
@@ -6327,6 +6643,9 @@ void md_check_recovery(mddev_t *mddev)
6327 sysfs_notify(&mddev->kobj, NULL, 6643 sysfs_notify(&mddev->kobj, NULL,
6328 "degraded"); 6644 "degraded");
6329 } 6645 }
6646 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
6647 mddev->pers->finish_reshape)
6648 mddev->pers->finish_reshape(mddev);
6330 md_update_sb(mddev, 1); 6649 md_update_sb(mddev, 1);
6331 6650
6332 /* if array is no-longer degraded, then any saved_raid_disk 6651 /* if array is no-longer degraded, then any saved_raid_disk
@@ -6470,13 +6789,13 @@ static void md_geninit(void)
6470 6789
6471static int __init md_init(void) 6790static int __init md_init(void)
6472{ 6791{
6473 if (register_blkdev(MAJOR_NR, "md")) 6792 if (register_blkdev(MD_MAJOR, "md"))
6474 return -1; 6793 return -1;
6475 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 6794 if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
6476 unregister_blkdev(MAJOR_NR, "md"); 6795 unregister_blkdev(MD_MAJOR, "md");
6477 return -1; 6796 return -1;
6478 } 6797 }
6479 blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE, 6798 blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
6480 md_probe, NULL, NULL); 6799 md_probe, NULL, NULL);
6481 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, 6800 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
6482 md_probe, NULL, NULL); 6801 md_probe, NULL, NULL);
@@ -6562,10 +6881,10 @@ static __exit void md_exit(void)
6562 mddev_t *mddev; 6881 mddev_t *mddev;
6563 struct list_head *tmp; 6882 struct list_head *tmp;
6564 6883
6565 blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS); 6884 blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
6566 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); 6885 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
6567 6886
6568 unregister_blkdev(MAJOR_NR,"md"); 6887 unregister_blkdev(MD_MAJOR,"md");
6569 unregister_blkdev(mdp_major, "mdp"); 6888 unregister_blkdev(mdp_major, "mdp");
6570 unregister_reboot_notifier(&md_notifier); 6889 unregister_reboot_notifier(&md_notifier);
6571 unregister_sysctl_table(raid_table_header); 6890 unregister_sysctl_table(raid_table_header);
diff --git a/drivers/md/md.h b/drivers/md/md.h
new file mode 100644
index 000000000000..e9b7f54c24d6
--- /dev/null
+++ b/drivers/md/md.h
@@ -0,0 +1,436 @@
1/*
2 md_k.h : kernel internal structure of the Linux MD driver
3 Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
9
10 You should have received a copy of the GNU General Public License
11 (for example /usr/src/linux/COPYING); if not, write to the Free
12 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
13*/
14
15#ifndef _MD_K_H
16#define _MD_K_H
17
18#ifdef CONFIG_BLOCK
19
20#define MaxSector (~(sector_t)0)
21
22typedef struct mddev_s mddev_t;
23typedef struct mdk_rdev_s mdk_rdev_t;
24
25/*
26 * options passed in raidrun:
27 */
28
29/* Currently this must fit in an 'int' */
30#define MAX_CHUNK_SIZE (1<<30)
31
32/*
33 * MD's 'extended' device
34 */
35struct mdk_rdev_s
36{
37 struct list_head same_set; /* RAID devices within the same set */
38
39 sector_t sectors; /* Device size (in 512bytes sectors) */
40 mddev_t *mddev; /* RAID array if running */
41 int last_events; /* IO event timestamp */
42
43 struct block_device *bdev; /* block device handle */
44
45 struct page *sb_page;
46 int sb_loaded;
47 __u64 sb_events;
48 sector_t data_offset; /* start of data in array */
49 sector_t sb_start; /* offset of the super block (in 512byte sectors) */
50 int sb_size; /* bytes in the superblock */
51 int preferred_minor; /* autorun support */
52
53 struct kobject kobj;
54
55 /* A device can be in one of three states based on two flags:
56 * Not working: faulty==1 in_sync==0
57 * Fully working: faulty==0 in_sync==1
58 * Working, but not
59 * in sync with array
60 * faulty==0 in_sync==0
61 *
62 * It can never have faulty==1, in_sync==1
63 * This reduces the burden of testing multiple flags in many cases
64 */
65
66 unsigned long flags;
67#define Faulty 1 /* device is known to have a fault */
68#define In_sync 2 /* device is in_sync with rest of array */
69#define WriteMostly 4 /* Avoid reading if at all possible */
70#define BarriersNotsupp 5 /* BIO_RW_BARRIER is not supported */
71#define AllReserved 6 /* If whole device is reserved for
72 * one array */
73#define AutoDetected 7 /* added by auto-detect */
 74#define Blocked 8 /* An error occurred on an externally
75 * managed array, don't allow writes
76 * until it is cleared */
77#define StateChanged 9 /* Faulty or Blocked has changed during
78 * interrupt, so it needs to be
79 * notified by the thread */
80 wait_queue_head_t blocked_wait;
81
82 int desc_nr; /* descriptor index in the superblock */
83 int raid_disk; /* role of device in array */
84 int saved_raid_disk; /* role that device used to have in the
85 * array and could again if we did a partial
86 * resync from the bitmap
87 */
88 sector_t recovery_offset;/* If this device has been partially
89 * recovered, this is where we were
90 * up to.
91 */
92
93 atomic_t nr_pending; /* number of pending requests.
94 * only maintained for arrays that
95 * support hot removal
96 */
97 atomic_t read_errors; /* number of consecutive read errors that
98 * we have tried to ignore.
99 */
100 atomic_t corrected_errors; /* number of corrected read errors,
101 * for reporting to userspace and storing
102 * in superblock.
103 */
104 struct work_struct del_work; /* used for delayed sysfs removal */
105
106 struct sysfs_dirent *sysfs_state; /* handle for 'state'
107 * sysfs entry */
108};
109
110struct mddev_s
111{
112 void *private;
113 struct mdk_personality *pers;
114 dev_t unit;
115 int md_minor;
116 struct list_head disks;
117 unsigned long flags;
118#define MD_CHANGE_DEVS 0 /* Some device status has changed */
119#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */
120#define MD_CHANGE_PENDING 2 /* superblock update in progress */
121
122 int suspended;
123 atomic_t active_io;
124 int ro;
125
126 struct gendisk *gendisk;
127
128 struct kobject kobj;
129 int hold_active;
130#define UNTIL_IOCTL 1
131#define UNTIL_STOP 2
132
133 /* Superblock information */
134 int major_version,
135 minor_version,
136 patch_version;
137 int persistent;
138 int external; /* metadata is
139 * managed externally */
140 char metadata_type[17]; /* externally set*/
141 int chunk_size;
142 time_t ctime, utime;
143 int level, layout;
144 char clevel[16];
145 int raid_disks;
146 int max_disks;
147 sector_t dev_sectors; /* used size of
148 * component devices */
149 sector_t array_sectors; /* exported array size */
150 int external_size; /* size managed
151 * externally */
152 __u64 events;
153
154 char uuid[16];
155
156 /* If the array is being reshaped, we need to record the
157 * new shape and an indication of where we are up to.
158 * This is written to the superblock.
159 * If reshape_position is MaxSector, then no reshape is happening (yet).
160 */
161 sector_t reshape_position;
162 int delta_disks, new_level, new_layout, new_chunk;
163
164 struct mdk_thread_s *thread; /* management thread */
165 struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */
166 sector_t curr_resync; /* last block scheduled */
167 /* As resync requests can complete out of order, we cannot easily track
168 * how much resync has been completed. So we occasionally pause until
169 * everything completes, then set curr_resync_completed to curr_resync.
170 * As such it may be well behind the real resync mark, but it is a value
171 * we are certain of.
172 */
173 sector_t curr_resync_completed;
174 unsigned long resync_mark; /* a recent timestamp */
175 sector_t resync_mark_cnt;/* blocks written at resync_mark */
176 sector_t curr_mark_cnt; /* blocks scheduled now */
177
178 sector_t resync_max_sectors; /* may be set by personality */
179
180 sector_t resync_mismatches; /* count of sectors where
181 * parity/replica mismatch found
182 */
183
184 /* allow user-space to request suspension of IO to regions of the array */
185 sector_t suspend_lo;
186 sector_t suspend_hi;
187 /* if zero, use the system-wide default */
188 int sync_speed_min;
189 int sync_speed_max;
190
191 /* resync even though the same disks are shared among md-devices */
192 int parallel_resync;
193
194 int ok_start_degraded;
195 /* recovery/resync flags
196 * NEEDED: we might need to start a resync/recover
197 * RUNNING: a thread is running, or about to be started
198 * SYNC: actually doing a resync, not a recovery
199 * RECOVER: doing recovery, or need to try it.
200 * INTR: resync needs to be aborted for some reason
201 * DONE: thread is done and is waiting to be reaped
202 * REQUEST: user-space has requested a sync (used with SYNC)
 203 * CHECK: user-space request for check-only, no repair
204 * RESHAPE: A reshape is happening
205 *
 206 * If neither SYNC nor RESHAPE is set, then it is a recovery.
207 */
208#define MD_RECOVERY_RUNNING 0
209#define MD_RECOVERY_SYNC 1
210#define MD_RECOVERY_RECOVER 2
211#define MD_RECOVERY_INTR 3
212#define MD_RECOVERY_DONE 4
213#define MD_RECOVERY_NEEDED 5
214#define MD_RECOVERY_REQUESTED 6
215#define MD_RECOVERY_CHECK 7
216#define MD_RECOVERY_RESHAPE 8
217#define MD_RECOVERY_FROZEN 9
218
219 unsigned long recovery;
220 int recovery_disabled; /* if we detect that recovery
221 * will always fail, set this
222 * so we don't loop trying */
223
224 int in_sync; /* know to not need resync */
225 struct mutex reconfig_mutex;
226 atomic_t active; /* general refcount */
227 atomic_t openers; /* number of active opens */
228
229 int changed; /* true if we might need to reread partition info */
230 int degraded; /* whether md should consider
231 * adding a spare
232 */
233 int barriers_work; /* initialised to true, cleared as soon
234 * as a barrier request to slave
235 * fails. Only supported
236 */
237 struct bio *biolist; /* bios that need to be retried
238 * because BIO_RW_BARRIER is not supported
239 */
240
241 atomic_t recovery_active; /* blocks scheduled, but not written */
242 wait_queue_head_t recovery_wait;
243 sector_t recovery_cp;
244 sector_t resync_min; /* user requested sync
245 * starts here */
246 sector_t resync_max; /* resync should pause
247 * when it gets here */
248
249 struct sysfs_dirent *sysfs_state; /* handle for 'array_state'
250 * file in sysfs.
251 */
252 struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */
253
254 struct work_struct del_work; /* used for delayed sysfs removal */
255
256 spinlock_t write_lock;
257 wait_queue_head_t sb_wait; /* for waiting on superblock updates */
258 atomic_t pending_writes; /* number of active superblock writes */
259
260 unsigned int safemode; /* if set, update "clean" superblock
261 * when no writes pending.
262 */
263 unsigned int safemode_delay;
264 struct timer_list safemode_timer;
265 atomic_t writes_pending;
266 struct request_queue *queue; /* for plugging ... */
267
268 atomic_t write_behind; /* outstanding async IO */
269 unsigned int max_write_behind; /* 0 = sync */
270
271 struct bitmap *bitmap; /* the bitmap for the device */
272 struct file *bitmap_file; /* the bitmap file */
273 long bitmap_offset; /* offset from superblock of
274 * start of bitmap. May be
275 * negative, but not '0'
276 */
277 long default_bitmap_offset; /* this is the offset to use when
278 * hot-adding a bitmap. It should
279 * eventually be settable by sysfs.
280 */
281
282 struct list_head all_mddevs;
283};
284
285
286static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev)
287{
288 int faulty = test_bit(Faulty, &rdev->flags);
289 if (atomic_dec_and_test(&rdev->nr_pending) && faulty)
290 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
291}
292
293static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
294{
295 atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
296}
297
298struct mdk_personality
299{
300 char *name;
301 int level;
302 struct list_head list;
303 struct module *owner;
304 int (*make_request)(struct request_queue *q, struct bio *bio);
305 int (*run)(mddev_t *mddev);
306 int (*stop)(mddev_t *mddev);
307 void (*status)(struct seq_file *seq, mddev_t *mddev);
308 /* error_handler must set ->faulty and clear ->in_sync
309 * if appropriate, and should abort recovery if needed
310 */
311 void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev);
312 int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev);
313 int (*hot_remove_disk) (mddev_t *mddev, int number);
314 int (*spare_active) (mddev_t *mddev);
315 sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster);
316 int (*resize) (mddev_t *mddev, sector_t sectors);
317 sector_t (*size) (mddev_t *mddev, sector_t sectors, int raid_disks);
318 int (*check_reshape) (mddev_t *mddev);
319 int (*start_reshape) (mddev_t *mddev);
320 void (*finish_reshape) (mddev_t *mddev);
321 int (*reconfig) (mddev_t *mddev, int layout, int chunk_size);
322 /* quiesce moves between quiescence states
323 * 0 - fully active
324 * 1 - no new requests allowed
325 * others - reserved
326 */
327 void (*quiesce) (mddev_t *mddev, int state);
328 /* takeover is used to transition an array from one
329 * personality to another. The new personality must be able
330 * to handle the data in the current layout.
331 * e.g. 2drive raid1 -> 2drive raid5
332 * ndrive raid5 -> degraded n+1drive raid6 with special layout
333 * If the takeover succeeds, a new 'private' structure is returned.
334 * This needs to be installed and then ->run used to activate the
335 * array.
336 */
337 void *(*takeover) (mddev_t *mddev);
338};
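For orientation, a deliberately bare-bones (and entirely hypothetical) personality only needs a few of these hooks plus registration at module load; every identifier below other than the md symbols is made up for the sketch:

#include <linux/module.h>
#include <linux/blkdev.h>
#include "md.h"

static sector_t demo_size(mddev_t *mddev, sector_t sectors, int raid_disks)
{
        return sectors ? sectors : mddev->dev_sectors;
}

static int demo_make_request(struct request_queue *q, struct bio *bio)
{
        bio_io_error(bio);      /* a real personality would map and resubmit */
        return 0;
}

static int demo_run(mddev_t *mddev)
{
        /* publish the array size through the new helper before returning */
        md_set_array_sectors(mddev, demo_size(mddev, 0, 0));
        return 0;
}

static int demo_stop(mddev_t *mddev)
{
        return 0;
}

static struct mdk_personality demo_personality = {
        .name           = "demo",
        .level          = -20,          /* arbitrary unused level for the sketch */
        .owner          = THIS_MODULE,
        .make_request   = demo_make_request,
        .run            = demo_run,
        .stop           = demo_stop,
        .size           = demo_size,
};

static int __init demo_init(void)
{
        return register_md_personality(&demo_personality);
}

static void __exit demo_exit(void)
{
        unregister_md_personality(&demo_personality);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");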
339
340
341struct md_sysfs_entry {
342 struct attribute attr;
343 ssize_t (*show)(mddev_t *, char *);
344 ssize_t (*store)(mddev_t *, const char *, size_t);
345};
346
347
348static inline char * mdname (mddev_t * mddev)
349{
350 return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
351}
352
353/*
354 * iterates through some rdev ringlist. It's safe to remove the
 355 * current 'rdev'. Don't touch 'tmp' though.
356 */
357#define rdev_for_each_list(rdev, tmp, head) \
358 list_for_each_entry_safe(rdev, tmp, head, same_set)
359
360/*
361 * iterates through the 'same array disks' ringlist
362 */
363#define rdev_for_each(rdev, tmp, mddev) \
364 list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)
365
366#define rdev_for_each_rcu(rdev, mddev) \
367 list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
368
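A typical use of these iterators, sketched here as a helper that adds up the usable capacity of all members (the caller is assumed to hold the mddev lock, and the Faulty test is just illustrative):

static sector_t total_member_sectors(mddev_t *mddev)
{
        mdk_rdev_t *rdev, *tmp;
        sector_t total = 0;

        rdev_for_each(rdev, tmp, mddev) {
                if (test_bit(Faulty, &rdev->flags))
                        continue;               /* skip failed members */
                total += rdev->sectors;
        }
        return total;
}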
369typedef struct mdk_thread_s {
370 void (*run) (mddev_t *mddev);
371 mddev_t *mddev;
372 wait_queue_head_t wqueue;
373 unsigned long flags;
374 struct task_struct *tsk;
375 unsigned long timeout;
376} mdk_thread_t;
377
378#define THREAD_WAKEUP 0
379
380#define __wait_event_lock_irq(wq, condition, lock, cmd) \
381do { \
382 wait_queue_t __wait; \
383 init_waitqueue_entry(&__wait, current); \
384 \
385 add_wait_queue(&wq, &__wait); \
386 for (;;) { \
387 set_current_state(TASK_UNINTERRUPTIBLE); \
388 if (condition) \
389 break; \
390 spin_unlock_irq(&lock); \
391 cmd; \
392 schedule(); \
393 spin_lock_irq(&lock); \
394 } \
395 current->state = TASK_RUNNING; \
396 remove_wait_queue(&wq, &__wait); \
397} while (0)
398
399#define wait_event_lock_irq(wq, condition, lock, cmd) \
400do { \
401 if (condition) \
402 break; \
403 __wait_event_lock_irq(wq, condition, lock, cmd); \
404} while (0)
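These macros let a caller sleep on a condition while the given spinlock is dropped across the schedule, running cmd (typically a queue unplug) first so pending IO can make progress. Roughly how raid1's barrier code uses it, with the conf_t fields from raid1.h further down; treat this as a sketch rather than the exact raid1.c function:

static void demo_wait_barrier(conf_t *conf)
{
        spin_lock_irq(&conf->resync_lock);
        wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
                            conf->resync_lock,
                            raid1_unplug(conf->mddev->queue));
        conf->nr_pending++;             /* count this request against the barrier */
        spin_unlock_irq(&conf->resync_lock);
}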
405
406static inline void safe_put_page(struct page *p)
407{
408 if (p) put_page(p);
409}
410
411#endif /* CONFIG_BLOCK */
412#endif
413
414
415extern int register_md_personality(struct mdk_personality *p);
416extern int unregister_md_personality(struct mdk_personality *p);
417extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
418 mddev_t *mddev, const char *name);
419extern void md_unregister_thread(mdk_thread_t *thread);
420extern void md_wakeup_thread(mdk_thread_t *thread);
421extern void md_check_recovery(mddev_t *mddev);
422extern void md_write_start(mddev_t *mddev, struct bio *bi);
423extern void md_write_end(mddev_t *mddev);
424extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
425extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
426
427extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
428 sector_t sector, int size, struct page *page);
429extern void md_super_wait(mddev_t *mddev);
430extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
431 struct page *page, int rw);
432extern void md_do_sync(mddev_t *mddev);
433extern void md_new_event(mddev_t *mddev);
434extern int md_allow_write(mddev_t *mddev);
435extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
436extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
diff --git a/drivers/md/mktables.c b/drivers/md/mktables.c
index b61d5767aae7..3b1500843bba 100644
--- a/drivers/md/mktables.c
+++ b/drivers/md/mktables.c
@@ -59,7 +59,7 @@ int main(int argc, char *argv[])
59 uint8_t v; 59 uint8_t v;
60 uint8_t exptbl[256], invtbl[256]; 60 uint8_t exptbl[256], invtbl[256];
61 61
62 printf("#include \"raid6.h\"\n"); 62 printf("#include <linux/raid/pq.h>\n");
63 63
64 /* Compute multiplication table */ 64 /* Compute multiplication table */
65 printf("\nconst u8 __attribute__((aligned(256)))\n" 65 printf("\nconst u8 __attribute__((aligned(256)))\n"
@@ -76,6 +76,9 @@ int main(int argc, char *argv[])
76 printf("\t},\n"); 76 printf("\t},\n");
77 } 77 }
78 printf("};\n"); 78 printf("};\n");
79 printf("#ifdef __KERNEL__\n");
80 printf("EXPORT_SYMBOL(raid6_gfmul);\n");
81 printf("#endif\n");
79 82
80 /* Compute power-of-2 table (exponent) */ 83 /* Compute power-of-2 table (exponent) */
81 v = 1; 84 v = 1;
@@ -92,6 +95,9 @@ int main(int argc, char *argv[])
92 } 95 }
93 } 96 }
94 printf("};\n"); 97 printf("};\n");
98 printf("#ifdef __KERNEL__\n");
99 printf("EXPORT_SYMBOL(raid6_gfexp);\n");
100 printf("#endif\n");
95 101
96 /* Compute inverse table x^-1 == x^254 */ 102 /* Compute inverse table x^-1 == x^254 */
97 printf("\nconst u8 __attribute__((aligned(256)))\n" 103 printf("\nconst u8 __attribute__((aligned(256)))\n"
@@ -104,6 +110,9 @@ int main(int argc, char *argv[])
104 } 110 }
105 } 111 }
106 printf("};\n"); 112 printf("};\n");
113 printf("#ifdef __KERNEL__\n");
114 printf("EXPORT_SYMBOL(raid6_gfinv);\n");
115 printf("#endif\n");
107 116
108 /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ 117 /* Compute inv(2^x + 1) (exponent-xor-inverse) table */
109 printf("\nconst u8 __attribute__((aligned(256)))\n" 118 printf("\nconst u8 __attribute__((aligned(256)))\n"
@@ -115,6 +124,9 @@ int main(int argc, char *argv[])
115 (j == 7) ? '\n' : ' '); 124 (j == 7) ? '\n' : ' ');
116 } 125 }
117 printf("};\n"); 126 printf("};\n");
127 printf("#ifdef __KERNEL__\n");
128 printf("EXPORT_SYMBOL(raid6_gfexi);\n");
129 printf("#endif\n");
118 130
119 return 0; 131 return 0;
120} 132}
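The raid6_gfmul table being exported here is simply the full multiplication table of GF(2^8) over the RAID-6 polynomial x^8 + x^4 + x^3 + x^2 + 1 (0x11d), and the generator fills it in with a shift-and-xor multiply along these lines (a sketch equivalent in effect to mktables' own multiply routine):

#include <stdint.h>

static uint8_t gf256_mul(uint8_t a, uint8_t b)
{
        uint8_t v = 0;

        while (b) {
                if (b & 1)
                        v ^= a;
                /* multiply a by x, reducing by 0x11d on overflow */
                a = (a << 1) ^ ((a & 0x80) ? 0x1d : 0);
                b >>= 1;
        }
        return v;
}

/* raid6_gfmul[i][j] == gf256_mul(i, j); raid6_gfexp[i] is 2^i in the same field */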
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index f6d08f241671..41ced0cbe823 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -19,7 +19,11 @@
19 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 */ 20 */
21 21
22#include <linux/raid/multipath.h> 22#include <linux/blkdev.h>
23#include <linux/raid/md_u.h>
24#include <linux/seq_file.h>
25#include "md.h"
26#include "multipath.h"
23 27
24#define MAX_WORK_PER_DISK 128 28#define MAX_WORK_PER_DISK 128
25 29
@@ -402,6 +406,14 @@ static void multipathd (mddev_t *mddev)
402 spin_unlock_irqrestore(&conf->device_lock, flags); 406 spin_unlock_irqrestore(&conf->device_lock, flags);
403} 407}
404 408
409static sector_t multipath_size(mddev_t *mddev, sector_t sectors, int raid_disks)
410{
411 WARN_ONCE(sectors || raid_disks,
412 "%s does not support generic reshape\n", __func__);
413
414 return mddev->dev_sectors;
415}
416
405static int multipath_run (mddev_t *mddev) 417static int multipath_run (mddev_t *mddev)
406{ 418{
407 multipath_conf_t *conf; 419 multipath_conf_t *conf;
@@ -498,7 +510,7 @@ static int multipath_run (mddev_t *mddev)
498 /* 510 /*
499 * Ok, everything is just fine now 511 * Ok, everything is just fine now
500 */ 512 */
501 mddev->array_sectors = mddev->size * 2; 513 md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
502 514
503 mddev->queue->unplug_fn = multipath_unplug; 515 mddev->queue->unplug_fn = multipath_unplug;
504 mddev->queue->backing_dev_info.congested_fn = multipath_congested; 516 mddev->queue->backing_dev_info.congested_fn = multipath_congested;
@@ -543,6 +555,7 @@ static struct mdk_personality multipath_personality =
543 .error_handler = multipath_error, 555 .error_handler = multipath_error,
544 .hot_add_disk = multipath_add_disk, 556 .hot_add_disk = multipath_add_disk,
545 .hot_remove_disk= multipath_remove_disk, 557 .hot_remove_disk= multipath_remove_disk,
558 .size = multipath_size,
546}; 559};
547 560
548static int __init multipath_init (void) 561static int __init multipath_init (void)
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h
new file mode 100644
index 000000000000..6fa70b400cda
--- /dev/null
+++ b/drivers/md/multipath.h
@@ -0,0 +1,40 @@
1#ifndef _MULTIPATH_H
2#define _MULTIPATH_H
3
4struct multipath_info {
5 mdk_rdev_t *rdev;
6};
7
8struct multipath_private_data {
9 mddev_t *mddev;
10 struct multipath_info *multipaths;
11 int raid_disks;
12 int working_disks;
13 spinlock_t device_lock;
14 struct list_head retry_list;
15
16 mempool_t *pool;
17};
18
19typedef struct multipath_private_data multipath_conf_t;
20
21/*
22 * this is the only point in the RAID code where we violate
23 * C type safety. mddev->private is an 'opaque' pointer.
24 */
25#define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private)
26
27/*
28 * this is our 'private' 'collective' MULTIPATH buffer head.
29 * it contains information about what kind of IO operations were started
30 * for this MULTIPATH operation, and about their status:
31 */
32
33struct multipath_bh {
34 mddev_t *mddev;
35 struct bio *master_bio;
36 struct bio bio;
37 int path;
38 struct list_head retry_list;
39};
40#endif
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index c605ba805586..c08d7559be55 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -18,7 +18,10 @@
18 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19*/ 19*/
20 20
21#include <linux/raid/raid0.h> 21#include <linux/blkdev.h>
22#include <linux/seq_file.h>
23#include "md.h"
24#include "raid0.h"
22 25
23static void raid0_unplug(struct request_queue *q) 26static void raid0_unplug(struct request_queue *q)
24{ 27{
@@ -73,16 +76,15 @@ static int create_strip_zones (mddev_t *mddev)
73 list_for_each_entry(rdev2, &mddev->disks, same_set) { 76 list_for_each_entry(rdev2, &mddev->disks, same_set) {
74 printk(KERN_INFO "raid0: comparing %s(%llu)", 77 printk(KERN_INFO "raid0: comparing %s(%llu)",
75 bdevname(rdev1->bdev,b), 78 bdevname(rdev1->bdev,b),
76 (unsigned long long)rdev1->size); 79 (unsigned long long)rdev1->sectors);
77 printk(KERN_INFO " with %s(%llu)\n", 80 printk(KERN_INFO " with %s(%llu)\n",
78 bdevname(rdev2->bdev,b), 81 bdevname(rdev2->bdev,b),
79 (unsigned long long)rdev2->size); 82 (unsigned long long)rdev2->sectors);
80 if (rdev2 == rdev1) { 83 if (rdev2 == rdev1) {
81 printk(KERN_INFO "raid0: END\n"); 84 printk(KERN_INFO "raid0: END\n");
82 break; 85 break;
83 } 86 }
84 if (rdev2->size == rdev1->size) 87 if (rdev2->sectors == rdev1->sectors) {
85 {
86 /* 88 /*
87 * Not unique, don't count it as a new 89 * Not unique, don't count it as a new
88 * group 90 * group
@@ -145,7 +147,7 @@ static int create_strip_zones (mddev_t *mddev)
145 mddev->queue->max_sectors > (PAGE_SIZE>>9)) 147 mddev->queue->max_sectors > (PAGE_SIZE>>9))
146 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 148 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
147 149
148 if (!smallest || (rdev1->size <smallest->size)) 150 if (!smallest || (rdev1->sectors < smallest->sectors))
149 smallest = rdev1; 151 smallest = rdev1;
150 cnt++; 152 cnt++;
151 } 153 }
@@ -155,10 +157,10 @@ static int create_strip_zones (mddev_t *mddev)
155 goto abort; 157 goto abort;
156 } 158 }
157 zone->nb_dev = cnt; 159 zone->nb_dev = cnt;
158 zone->sectors = smallest->size * cnt * 2; 160 zone->sectors = smallest->sectors * cnt;
159 zone->zone_start = 0; 161 zone->zone_start = 0;
160 162
161 current_start = smallest->size * 2; 163 current_start = smallest->sectors;
162 curr_zone_start = zone->sectors; 164 curr_zone_start = zone->sectors;
163 165
164 /* now do the other zones */ 166 /* now do the other zones */
@@ -177,29 +179,29 @@ static int create_strip_zones (mddev_t *mddev)
177 rdev = conf->strip_zone[0].dev[j]; 179 rdev = conf->strip_zone[0].dev[j];
178 printk(KERN_INFO "raid0: checking %s ...", 180 printk(KERN_INFO "raid0: checking %s ...",
179 bdevname(rdev->bdev, b)); 181 bdevname(rdev->bdev, b));
180 if (rdev->size > current_start / 2) { 182 if (rdev->sectors <= current_start) {
181 printk(KERN_INFO " contained as device %d\n",
182 c);
183 zone->dev[c] = rdev;
184 c++;
185 if (!smallest || (rdev->size <smallest->size)) {
186 smallest = rdev;
187 printk(KERN_INFO " (%llu) is smallest!.\n",
188 (unsigned long long)rdev->size);
189 }
190 } else
191 printk(KERN_INFO " nope.\n"); 183 printk(KERN_INFO " nope.\n");
184 continue;
185 }
186 printk(KERN_INFO " contained as device %d\n", c);
187 zone->dev[c] = rdev;
188 c++;
189 if (!smallest || rdev->sectors < smallest->sectors) {
190 smallest = rdev;
191 printk(KERN_INFO " (%llu) is smallest!.\n",
192 (unsigned long long)rdev->sectors);
193 }
192 } 194 }
193 195
194 zone->nb_dev = c; 196 zone->nb_dev = c;
195 zone->sectors = (smallest->size * 2 - current_start) * c; 197 zone->sectors = (smallest->sectors - current_start) * c;
196 printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", 198 printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n",
197 zone->nb_dev, (unsigned long long)zone->sectors); 199 zone->nb_dev, (unsigned long long)zone->sectors);
198 200
199 zone->zone_start = curr_zone_start; 201 zone->zone_start = curr_zone_start;
200 curr_zone_start += zone->sectors; 202 curr_zone_start += zone->sectors;
201 203
202 current_start = smallest->size * 2; 204 current_start = smallest->sectors;
203 printk(KERN_INFO "raid0: current zone start: %llu\n", 205 printk(KERN_INFO "raid0: current zone start: %llu\n",
204 (unsigned long long)current_start); 206 (unsigned long long)current_start);
205 } 207 }
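A worked example of the sector-based zone construction above: with two members of 1000 and 2000 sectors, zone 0 spans both devices and gets smallest->sectors * cnt = 1000 * 2 = 2000 sectors; current_start then becomes 1000, only the larger device reaches past it, so zone 1 holds one device and (2000 - 1000) * 1 = 1000 sectors, and the zone sizes add up to the 3000 sectors that raid0_size() (added below) reports for the whole array.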
@@ -261,12 +263,25 @@ static int raid0_mergeable_bvec(struct request_queue *q,
261 return max; 263 return max;
262} 264}
263 265
266static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
267{
268 sector_t array_sectors = 0;
269 mdk_rdev_t *rdev;
270
271 WARN_ONCE(sectors || raid_disks,
272 "%s does not support generic reshape\n", __func__);
273
274 list_for_each_entry(rdev, &mddev->disks, same_set)
275 array_sectors += rdev->sectors;
276
277 return array_sectors;
278}
279
264static int raid0_run (mddev_t *mddev) 280static int raid0_run (mddev_t *mddev)
265{ 281{
266 unsigned cur=0, i=0, nb_zone; 282 unsigned cur=0, i=0, nb_zone;
267 s64 sectors; 283 s64 sectors;
268 raid0_conf_t *conf; 284 raid0_conf_t *conf;
269 mdk_rdev_t *rdev;
270 285
271 if (mddev->chunk_size == 0) { 286 if (mddev->chunk_size == 0) {
272 printk(KERN_ERR "md/raid0: non-zero chunk size required.\n"); 287 printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
@@ -291,16 +306,14 @@ static int raid0_run (mddev_t *mddev)
291 goto out_free_conf; 306 goto out_free_conf;
292 307
293 /* calculate array device size */ 308 /* calculate array device size */
294 mddev->array_sectors = 0; 309 md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
295 list_for_each_entry(rdev, &mddev->disks, same_set)
296 mddev->array_sectors += rdev->size * 2;
297 310
298 printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", 311 printk(KERN_INFO "raid0 : md_size is %llu sectors.\n",
299 (unsigned long long)mddev->array_sectors); 312 (unsigned long long)mddev->array_sectors);
300 printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n", 313 printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n",
301 (unsigned long long)conf->spacing); 314 (unsigned long long)conf->spacing);
302 { 315 {
303 sector_t s = mddev->array_sectors; 316 sector_t s = raid0_size(mddev, 0, 0);
304 sector_t space = conf->spacing; 317 sector_t space = conf->spacing;
305 int round; 318 int round;
306 conf->sector_shift = 0; 319 conf->sector_shift = 0;
@@ -509,6 +522,7 @@ static struct mdk_personality raid0_personality=
509 .run = raid0_run, 522 .run = raid0_run,
510 .stop = raid0_stop, 523 .stop = raid0_stop,
511 .status = raid0_status, 524 .status = raid0_status,
525 .size = raid0_size,
512}; 526};
513 527
514static int __init raid0_init (void) 528static int __init raid0_init (void)
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
new file mode 100644
index 000000000000..824b12eb1d4f
--- /dev/null
+++ b/drivers/md/raid0.h
@@ -0,0 +1,28 @@
1#ifndef _RAID0_H
2#define _RAID0_H
3
4struct strip_zone
5{
6 sector_t zone_start; /* Zone offset in md_dev (in sectors) */
7 sector_t dev_start; /* Zone offset in real dev (in sectors) */
8 sector_t sectors; /* Zone size in sectors */
9 int nb_dev; /* # of devices attached to the zone */
10 mdk_rdev_t **dev; /* Devices attached to the zone */
11};
12
13struct raid0_private_data
14{
15 struct strip_zone **hash_table; /* Table of indexes into strip_zone */
16 struct strip_zone *strip_zone;
17 mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
18 int nr_strip_zones;
19
20 sector_t spacing;
21 int sector_shift; /* shift this before divide by spacing */
22};
23
24typedef struct raid0_private_data raid0_conf_t;
25
26#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private)
27
28#endif
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e2466425d9ca..b4f4badc0068 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -31,10 +31,13 @@
31 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 31 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
32 */ 32 */
33 33
34#include "dm-bio-list.h"
35#include <linux/delay.h> 34#include <linux/delay.h>
36#include <linux/raid/raid1.h> 35#include <linux/blkdev.h>
37#include <linux/raid/bitmap.h> 36#include <linux/seq_file.h>
37#include "md.h"
38#include "dm-bio-list.h"
39#include "raid1.h"
40#include "bitmap.h"
38 41
39#define DEBUG 0 42#define DEBUG 0
40#if DEBUG 43#if DEBUG
@@ -1723,7 +1726,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1723 return 0; 1726 return 0;
1724 } 1727 }
1725 1728
1726 max_sector = mddev->size << 1; 1729 max_sector = mddev->dev_sectors;
1727 if (sector_nr >= max_sector) { 1730 if (sector_nr >= max_sector) {
1728 /* If we aborted, we need to abort the 1731 /* If we aborted, we need to abort the
1729 * sync on the 'current' bitmap chunk (there will 1732 * sync on the 'current' bitmap chunk (there will
@@ -1919,6 +1922,14 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1919 return nr_sectors; 1922 return nr_sectors;
1920} 1923}
1921 1924
1925static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks)
1926{
1927 if (sectors)
1928 return sectors;
1929
1930 return mddev->dev_sectors;
1931}
1932
1922static int run(mddev_t *mddev) 1933static int run(mddev_t *mddev)
1923{ 1934{
1924 conf_t *conf; 1935 conf_t *conf;
@@ -2048,7 +2059,7 @@ static int run(mddev_t *mddev)
2048 /* 2059 /*
2049 * Ok, everything is just fine now 2060 * Ok, everything is just fine now
2050 */ 2061 */
2051 mddev->array_sectors = mddev->size * 2; 2062 md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
2052 2063
2053 mddev->queue->unplug_fn = raid1_unplug; 2064 mddev->queue->unplug_fn = raid1_unplug;
2054 mddev->queue->backing_dev_info.congested_fn = raid1_congested; 2065 mddev->queue->backing_dev_info.congested_fn = raid1_congested;
@@ -2089,6 +2100,9 @@ static int stop(mddev_t *mddev)
2089 /* need to kick something here to make sure I/O goes? */ 2100 /* need to kick something here to make sure I/O goes? */
2090 } 2101 }
2091 2102
2103 raise_barrier(conf);
2104 lower_barrier(conf);
2105
2092 md_unregister_thread(mddev->thread); 2106 md_unregister_thread(mddev->thread);
2093 mddev->thread = NULL; 2107 mddev->thread = NULL;
2094 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 2108 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
@@ -2110,15 +2124,17 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
2110 * any io in the removed space completes, but it hardly seems 2124 * any io in the removed space completes, but it hardly seems
2111 * worth it. 2125 * worth it.
2112 */ 2126 */
2113 mddev->array_sectors = sectors; 2127 md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0));
2128 if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
2129 return -EINVAL;
2114 set_capacity(mddev->gendisk, mddev->array_sectors); 2130 set_capacity(mddev->gendisk, mddev->array_sectors);
2115 mddev->changed = 1; 2131 mddev->changed = 1;
2116 if (mddev->array_sectors / 2 > mddev->size && 2132 if (sectors > mddev->dev_sectors &&
2117 mddev->recovery_cp == MaxSector) { 2133 mddev->recovery_cp == MaxSector) {
2118 mddev->recovery_cp = mddev->size << 1; 2134 mddev->recovery_cp = mddev->dev_sectors;
2119 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2135 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2120 } 2136 }
2121 mddev->size = mddev->array_sectors / 2; 2137 mddev->dev_sectors = sectors;
2122 mddev->resync_max_sectors = sectors; 2138 mddev->resync_max_sectors = sectors;
2123 return 0; 2139 return 0;
2124} 2140}
@@ -2264,6 +2280,7 @@ static struct mdk_personality raid1_personality =
2264 .spare_active = raid1_spare_active, 2280 .spare_active = raid1_spare_active,
2265 .sync_request = sync_request, 2281 .sync_request = sync_request,
2266 .resize = raid1_resize, 2282 .resize = raid1_resize,
2283 .size = raid1_size,
2267 .check_reshape = raid1_reshape, 2284 .check_reshape = raid1_reshape,
2268 .quiesce = raid1_quiesce, 2285 .quiesce = raid1_quiesce,
2269}; 2286};
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
new file mode 100644
index 000000000000..1620eea3d57c
--- /dev/null
+++ b/drivers/md/raid1.h
@@ -0,0 +1,132 @@
1#ifndef _RAID1_H
2#define _RAID1_H
3
4typedef struct mirror_info mirror_info_t;
5
6struct mirror_info {
7 mdk_rdev_t *rdev;
8 sector_t head_position;
9};
10
11/*
12 * memory pools need a pointer to the mddev, so they can force an unplug
13 * when memory is tight, and a count of the number of drives that the
14 * pool was allocated for, so they know how much to allocate and free.
 15 * mddev->raid_disks cannot be used, as it can change while a pool is active.
 16 * These two values are stored in a kmalloced struct.
17 */
18
19struct pool_info {
20 mddev_t *mddev;
21 int raid_disks;
22};
23
24
25typedef struct r1bio_s r1bio_t;
26
27struct r1_private_data_s {
28 mddev_t *mddev;
29 mirror_info_t *mirrors;
30 int raid_disks;
31 int last_used;
32 sector_t next_seq_sect;
33 spinlock_t device_lock;
34
35 struct list_head retry_list;
36 /* queue pending writes and submit them on unplug */
37 struct bio_list pending_bio_list;
38 /* queue of writes that have been unplugged */
39 struct bio_list flushing_bio_list;
40
41 /* for use when syncing mirrors: */
42
43 spinlock_t resync_lock;
44 int nr_pending;
45 int nr_waiting;
46 int nr_queued;
47 int barrier;
48 sector_t next_resync;
49 int fullsync; /* set to 1 if a full sync is needed,
50 * (fresh device added).
51 * Cleared when a sync completes.
52 */
53
54 wait_queue_head_t wait_barrier;
55
56 struct pool_info *poolinfo;
57
58 struct page *tmppage;
59
60 mempool_t *r1bio_pool;
61 mempool_t *r1buf_pool;
62};
63
64typedef struct r1_private_data_s conf_t;
65
66/*
67 * this is the only point in the RAID code where we violate
68 * C type safety. mddev->private is an 'opaque' pointer.
69 */
70#define mddev_to_conf(mddev) ((conf_t *) mddev->private)
71
72/*
73 * this is our 'private' RAID1 bio.
74 *
75 * it contains information about what kind of IO operations were started
76 * for this RAID1 operation, and about their status:
77 */
78
79struct r1bio_s {
80 atomic_t remaining; /* 'have we finished' count,
81 * used from IRQ handlers
82 */
83 atomic_t behind_remaining; /* number of write-behind ios remaining
84 * in this BehindIO request
85 */
86 sector_t sector;
87 int sectors;
88 unsigned long state;
89 mddev_t *mddev;
90 /*
91 * original bio going to /dev/mdx
92 */
93 struct bio *master_bio;
94 /*
95 * if the IO is in READ direction, then this is where we read
96 */
97 int read_disk;
98
99 struct list_head retry_list;
100 struct bitmap_update *bitmap_update;
101 /*
102 * if the IO is in WRITE direction, then multiple bios are used.
103 * We choose the number when they are allocated.
104 */
105 struct bio *bios[0];
 106 /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced */
107};
108
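
The trailing bios[0] member means an r1bio must be allocated with extra room for one bio pointer per mirror, using the raid_disks count kept in pool_info. A minimal standalone userspace sketch of that allocation pattern follows; demo_r1bio, fake_bio and the field set are illustrative stand-ins, not the kernel structures or API:

	#include <stdio.h>
	#include <stdlib.h>

	struct fake_bio { int id; };		/* stand-in for struct bio */

	struct demo_pool_info {			/* mirrors the role of struct pool_info */
		int raid_disks;
	};

	struct demo_r1bio {
		long sector;
		int sectors;
		struct fake_bio *bios[];	/* one slot per mirror, sized at alloc time */
	};

	static struct demo_r1bio *alloc_r1bio(struct demo_pool_info *pi)
	{
		/* the bios[] slots live contiguously after the header,
		 * which is why no field may be added behind the array */
		return calloc(1, sizeof(struct demo_r1bio) +
				 pi->raid_disks * sizeof(struct fake_bio *));
	}

	int main(void)
	{
		struct demo_pool_info pi = { .raid_disks = 3 };
		struct demo_r1bio *r1 = alloc_r1bio(&pi);

		printf("r1bio with %d bio slots allocated\n", pi.raid_disks);
		free(r1);
		return 0;
	}
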
109/* when we get a read error on a read-only array, we redirect to another
110 * device without failing the first device, or trying to over-write to
111 * correct the read error. To keep track of bad blocks on a per-bio
112 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
113 */
114#define IO_BLOCKED ((struct bio*)1)
115
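
The IO_BLOCKED sentinel is a non-NULL pointer value stored in a bios[] slot to mean "do not retry this device for this request" without marking the device failed; it is compared against, never dereferenced. A tiny standalone sketch of the idea, with made-up names:

	#include <stdio.h>

	struct fake_bio { int dummy; };			/* stand-in for struct bio */
	#define DEMO_IO_BLOCKED ((struct fake_bio *)1)	/* sentinel, never dereferenced */

	int main(void)
	{
		struct fake_bio *bios[4] = { NULL };
		int d;

		bios[2] = DEMO_IO_BLOCKED;	/* read error was seen on device 2 */
		for (d = 0; d < 4; d++)
			printf("device %d: %s\n", d,
			       bios[d] == DEMO_IO_BLOCKED ? "blocked" : "usable");
		return 0;
	}
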
116/* bits for r1bio.state */
117#define R1BIO_Uptodate 0
118#define R1BIO_IsSync 1
119#define R1BIO_Degraded 2
120#define R1BIO_BehindIO 3
121#define R1BIO_Barrier 4
122#define R1BIO_BarrierRetry 5
123/* For write-behind requests, we call bi_end_io when
124 * the last non-write-behind device completes, provided
125 * at least one write was successful.  Otherwise we call it
126 * as soon as any write-behind write succeeds; if every write
127 * fails, we call it with failure when the last write completes.
128 * Record that bi_end_io was called with this flag...
129 */
130#define R1BIO_Returned 6
131
132#endif
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7301631abe04..e293d92641ac 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -18,10 +18,13 @@
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */ 19 */
20 20
21#include "dm-bio-list.h"
22#include <linux/delay.h> 21#include <linux/delay.h>
23#include <linux/raid/raid10.h> 22#include <linux/blkdev.h>
24#include <linux/raid/bitmap.h> 23#include <linux/seq_file.h>
24#include "md.h"
25#include "dm-bio-list.h"
26#include "raid10.h"
27#include "bitmap.h"
25 28
26/* 29/*
27 * RAID10 provides a combination of RAID0 and RAID1 functionality. 30 * RAID10 provides a combination of RAID0 and RAID1 functionality.
@@ -1695,7 +1698,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1695 return 0; 1698 return 0;
1696 1699
1697 skipped: 1700 skipped:
1698 max_sector = mddev->size << 1; 1701 max_sector = mddev->dev_sectors;
1699 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 1702 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1700 max_sector = mddev->resync_max_sectors; 1703 max_sector = mddev->resync_max_sectors;
1701 if (sector_nr >= max_sector) { 1704 if (sector_nr >= max_sector) {
@@ -2020,6 +2023,25 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
2020 goto skipped; 2023 goto skipped;
2021} 2024}
2022 2025
2026static sector_t
2027raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
2028{
2029 sector_t size;
2030 conf_t *conf = mddev_to_conf(mddev);
2031
2032 if (!raid_disks)
2033 raid_disks = mddev->raid_disks;
2034 if (!sectors)
2035 sectors = mddev->dev_sectors;
2036
2037 size = sectors >> conf->chunk_shift;
2038 sector_div(size, conf->far_copies);
2039 size = size * raid_disks;
2040 sector_div(size, conf->near_copies);
2041
2042 return size << conf->chunk_shift;
2043}
2044
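
The arithmetic in raid10_size() can be checked with a small standalone userspace sketch; the device size, chunk shift and near/far geometry below are made-up example values, not derived from a real array:

	#include <stdio.h>
	#include <stdint.h>

	/* re-implements the raid10_size() arithmetic shown above */
	static uint64_t raid10_capacity(uint64_t dev_sectors, int raid_disks,
					int near_copies, int far_copies,
					int chunk_shift)
	{
		uint64_t size = dev_sectors >> chunk_shift;	/* chunks per device */
		size /= far_copies;
		size *= raid_disks;
		size /= near_copies;
		return size << chunk_shift;			/* back to sectors */
	}

	int main(void)
	{
		/* 4 members of 1 TiB, 64 KiB chunks (shift 7),
		 * near_copies=2, far_copies=1 -> 2 TiB usable */
		uint64_t sectors = raid10_capacity(2147483648ULL, 4, 2, 1, 7);
		printf("array capacity: %llu sectors\n",
		       (unsigned long long)sectors);
		return 0;
	}
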
2023static int run(mddev_t *mddev) 2045static int run(mddev_t *mddev)
2024{ 2046{
2025 conf_t *conf; 2047 conf_t *conf;
@@ -2076,7 +2098,7 @@ static int run(mddev_t *mddev)
2076 conf->far_offset = fo; 2098 conf->far_offset = fo;
2077 conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; 2099 conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1;
2078 conf->chunk_shift = ffz(~mddev->chunk_size) - 9; 2100 conf->chunk_shift = ffz(~mddev->chunk_size) - 9;
2079 size = mddev->size >> (conf->chunk_shift-1); 2101 size = mddev->dev_sectors >> conf->chunk_shift;
2080 sector_div(size, fc); 2102 sector_div(size, fc);
2081 size = size * conf->raid_disks; 2103 size = size * conf->raid_disks;
2082 sector_div(size, nc); 2104 sector_div(size, nc);
@@ -2089,7 +2111,7 @@ static int run(mddev_t *mddev)
2089 */ 2111 */
2090 stride += conf->raid_disks - 1; 2112 stride += conf->raid_disks - 1;
2091 sector_div(stride, conf->raid_disks); 2113 sector_div(stride, conf->raid_disks);
2092 mddev->size = stride << (conf->chunk_shift-1); 2114 mddev->dev_sectors = stride << conf->chunk_shift;
2093 2115
2094 if (fo) 2116 if (fo)
2095 stride = 1; 2117 stride = 1;
@@ -2171,8 +2193,8 @@ static int run(mddev_t *mddev)
2171 /* 2193 /*
2172 * Ok, everything is just fine now 2194 * Ok, everything is just fine now
2173 */ 2195 */
2174 mddev->array_sectors = size << conf->chunk_shift; 2196 md_set_array_sectors(mddev, raid10_size(mddev, 0, 0));
2175 mddev->resync_max_sectors = size << conf->chunk_shift; 2197 mddev->resync_max_sectors = raid10_size(mddev, 0, 0);
2176 2198
2177 mddev->queue->unplug_fn = raid10_unplug; 2199 mddev->queue->unplug_fn = raid10_unplug;
2178 mddev->queue->backing_dev_info.congested_fn = raid10_congested; 2200 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
@@ -2208,6 +2230,9 @@ static int stop(mddev_t *mddev)
2208{ 2230{
2209 conf_t *conf = mddev_to_conf(mddev); 2231 conf_t *conf = mddev_to_conf(mddev);
2210 2232
2233 raise_barrier(conf, 0);
2234 lower_barrier(conf);
2235
2211 md_unregister_thread(mddev->thread); 2236 md_unregister_thread(mddev->thread);
2212 mddev->thread = NULL; 2237 mddev->thread = NULL;
2213 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 2238 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
@@ -2255,6 +2280,7 @@ static struct mdk_personality raid10_personality =
2255 .spare_active = raid10_spare_active, 2280 .spare_active = raid10_spare_active,
2256 .sync_request = sync_request, 2281 .sync_request = sync_request,
2257 .quiesce = raid10_quiesce, 2282 .quiesce = raid10_quiesce,
2283 .size = raid10_size,
2258}; 2284};
2259 2285
2260static int __init raid_init(void) 2286static int __init raid_init(void)
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
new file mode 100644
index 000000000000..244dbe507a54
--- /dev/null
+++ b/drivers/md/raid10.h
@@ -0,0 +1,121 @@
1#ifndef _RAID10_H
2#define _RAID10_H
3
4typedef struct mirror_info mirror_info_t;
5
6struct mirror_info {
7 mdk_rdev_t *rdev;
8 sector_t head_position;
9};
10
11typedef struct r10bio_s r10bio_t;
12
13struct r10_private_data_s {
14 mddev_t *mddev;
15 mirror_info_t *mirrors;
16 int raid_disks;
17 spinlock_t device_lock;
18
19 /* geometry */
 20 int near_copies; /* number of copies laid out raid0 style */
 21 int far_copies; /* number of copies laid out
22 * at large strides across drives
23 */
24 int far_offset; /* far_copies are offset by 1 stripe
25 * instead of many
26 */
27 int copies; /* near_copies * far_copies.
28 * must be <= raid_disks
29 */
30 sector_t stride; /* distance between far copies.
31 * This is size / far_copies unless
32 * far_offset, in which case it is
33 * 1 stripe.
34 */
35
36 int chunk_shift; /* shift from chunks to sectors */
37 sector_t chunk_mask;
38
39 struct list_head retry_list;
40 /* queue pending writes and submit them on unplug */
41 struct bio_list pending_bio_list;
42
43
44 spinlock_t resync_lock;
45 int nr_pending;
46 int nr_waiting;
47 int nr_queued;
48 int barrier;
49 sector_t next_resync;
50 int fullsync; /* set to 1 if a full sync is needed,
51 * (fresh device added).
52 * Cleared when a sync completes.
53 */
54
55 wait_queue_head_t wait_barrier;
56
57 mempool_t *r10bio_pool;
58 mempool_t *r10buf_pool;
59 struct page *tmppage;
60};
61
62typedef struct r10_private_data_s conf_t;
63
64/*
65 * this is the only point in the RAID code where we violate
66 * C type safety. mddev->private is an 'opaque' pointer.
67 */
68#define mddev_to_conf(mddev) ((conf_t *) mddev->private)
69
70/*
71 * this is our 'private' RAID10 bio.
72 *
73 * it contains information about what kind of IO operations were started
74 * for this RAID10 operation, and about their status:
75 */
76
77struct r10bio_s {
78 atomic_t remaining; /* 'have we finished' count,
79 * used from IRQ handlers
80 */
81 sector_t sector; /* virtual sector number */
82 int sectors;
83 unsigned long state;
84 mddev_t *mddev;
85 /*
86 * original bio going to /dev/mdx
87 */
88 struct bio *master_bio;
89 /*
90 * if the IO is in READ direction, then this is where we read
91 */
92 int read_slot;
93
94 struct list_head retry_list;
95 /*
96 * if the IO is in WRITE direction, then multiple bios are used,
97 * one for each copy.
98 * When resyncing we also use one for each copy.
99 * When reconstructing, we use 2 bios, one for read, one for write.
100 * We choose the number when they are allocated.
101 */
102 struct {
103 struct bio *bio;
104 sector_t addr;
105 int devnum;
106 } devs[0];
107};
108
109/* when we get a read error on a read-only array, we redirect to another
110 * device without failing the first device, or trying to over-write to
111 * correct the read error. To keep track of bad blocks on a per-bio
112 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
113 */
114#define IO_BLOCKED ((struct bio*)1)
115
116/* bits for r10bio.state */
117#define R10BIO_Uptodate 0
118#define R10BIO_IsSync 1
119#define R10BIO_IsRecover 2
120#define R10BIO_Degraded 3
121#endif
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a5ba080d303b..3bbc6d647044 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -43,11 +43,14 @@
43 * miss any bits. 43 * miss any bits.
44 */ 44 */
45 45
46#include <linux/blkdev.h>
46#include <linux/kthread.h> 47#include <linux/kthread.h>
47#include "raid6.h" 48#include <linux/raid/pq.h>
48
49#include <linux/raid/bitmap.h>
50#include <linux/async_tx.h> 49#include <linux/async_tx.h>
50#include <linux/seq_file.h>
51#include "md.h"
52#include "raid5.h"
53#include "bitmap.h"
51 54
52/* 55/*
53 * Stripe cache 56 * Stripe cache
@@ -91,11 +94,6 @@
91 94
92#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) 95#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
93 96
94#if !RAID6_USE_EMPTY_ZERO_PAGE
95/* In .bss so it's zeroed */
96const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
97#endif
98
99/* 97/*
100 * We maintain a biased count of active stripes in the bottom 16 bits of 98 * We maintain a biased count of active stripes in the bottom 16 bits of
101 * bi_phys_segments, and a count of processed stripes in the upper 16 bits 99 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
@@ -130,12 +128,42 @@ static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
130 bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16); 128 bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16);
131} 129}
132 130
131/* Find first data disk in a raid6 stripe */
132static inline int raid6_d0(struct stripe_head *sh)
133{
134 if (sh->ddf_layout)
135 /* ddf always start from first device */
136 return 0;
137 /* md starts just after Q block */
138 if (sh->qd_idx == sh->disks - 1)
139 return 0;
140 else
141 return sh->qd_idx + 1;
142}
133static inline int raid6_next_disk(int disk, int raid_disks) 143static inline int raid6_next_disk(int disk, int raid_disks)
134{ 144{
135 disk++; 145 disk++;
136 return (disk < raid_disks) ? disk : 0; 146 return (disk < raid_disks) ? disk : 0;
137} 147}
138 148
 149/* When walking through the disks in a raid5, starting at raid6_d0,
 150 * we need to map each disk to a 'slot', where the data disks are slot
 151 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 152 * is raid_disks-1. This helper does that mapping.
 153 */
154static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
155 int *count, int syndrome_disks)
156{
157 int slot;
158
159 if (idx == sh->pd_idx)
160 return syndrome_disks;
161 if (idx == sh->qd_idx)
162 return syndrome_disks + 1;
163 slot = (*count)++;
164 return slot;
165}
166
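
A standalone userspace sketch of the walk from raid6_d0() through the disks, mapping each device index to its syndrome slot; the 6-disk geometry and the pd_idx/qd_idx values are example inputs, and raid6_next_disk is inlined as (i + 1) modulo disks:

	#include <stdio.h>

	struct demo_stripe { int disks, pd_idx, qd_idx, ddf_layout; };

	static int demo_d0(struct demo_stripe *sh)
	{
		if (sh->ddf_layout)
			return 0;			/* ddf starts at device 0 */
		return (sh->qd_idx == sh->disks - 1) ? 0 : sh->qd_idx + 1;
	}

	static int demo_idx_to_slot(int idx, struct demo_stripe *sh,
				    int *count, int syndrome_disks)
	{
		if (idx == sh->pd_idx)
			return syndrome_disks;		/* P slot */
		if (idx == sh->qd_idx)
			return syndrome_disks + 1;	/* Q slot */
		return (*count)++;			/* next data slot */
	}

	int main(void)
	{
		struct demo_stripe sh = { .disks = 6, .pd_idx = 2, .qd_idx = 3 };
		int syndrome_disks = sh.disks - 2, count = 0;
		int i = demo_d0(&sh);

		do {
			printf("device %d -> slot %d\n", i,
			       demo_idx_to_slot(i, &sh, &count, syndrome_disks));
			i = (i + 1 < sh.disks) ? i + 1 : 0;
		} while (i != demo_d0(&sh));
		return 0;
	}
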
139static void return_io(struct bio *return_bi) 167static void return_io(struct bio *return_bi)
140{ 168{
141 struct bio *bi = return_bi; 169 struct bio *bi = return_bi;
@@ -193,6 +221,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
193 } 221 }
194 } 222 }
195} 223}
224
196static void release_stripe(struct stripe_head *sh) 225static void release_stripe(struct stripe_head *sh)
197{ 226{
198 raid5_conf_t *conf = sh->raid_conf; 227 raid5_conf_t *conf = sh->raid_conf;
@@ -270,9 +299,11 @@ static int grow_buffers(struct stripe_head *sh, int num)
270 return 0; 299 return 0;
271} 300}
272 301
273static void raid5_build_block(struct stripe_head *sh, int i); 302static void raid5_build_block(struct stripe_head *sh, int i, int previous);
303static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
304 struct stripe_head *sh);
274 305
275static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks) 306static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
276{ 307{
277 raid5_conf_t *conf = sh->raid_conf; 308 raid5_conf_t *conf = sh->raid_conf;
278 int i; 309 int i;
@@ -287,11 +318,12 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
287 318
288 remove_hash(sh); 319 remove_hash(sh);
289 320
321 sh->generation = conf->generation - previous;
322 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
290 sh->sector = sector; 323 sh->sector = sector;
291 sh->pd_idx = pd_idx; 324 stripe_set_idx(sector, conf, previous, sh);
292 sh->state = 0; 325 sh->state = 0;
293 326
294 sh->disks = disks;
295 327
296 for (i = sh->disks; i--; ) { 328 for (i = sh->disks; i--; ) {
297 struct r5dev *dev = &sh->dev[i]; 329 struct r5dev *dev = &sh->dev[i];
@@ -305,12 +337,13 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
305 BUG(); 337 BUG();
306 } 338 }
307 dev->flags = 0; 339 dev->flags = 0;
308 raid5_build_block(sh, i); 340 raid5_build_block(sh, i, previous);
309 } 341 }
310 insert_hash(conf, sh); 342 insert_hash(conf, sh);
311} 343}
312 344
313static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks) 345static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
346 short generation)
314{ 347{
315 struct stripe_head *sh; 348 struct stripe_head *sh;
316 struct hlist_node *hn; 349 struct hlist_node *hn;
@@ -318,7 +351,7 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in
318 CHECK_DEVLOCK(); 351 CHECK_DEVLOCK();
319 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 352 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
320 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) 353 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
321 if (sh->sector == sector && sh->disks == disks) 354 if (sh->sector == sector && sh->generation == generation)
322 return sh; 355 return sh;
323 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 356 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
324 return NULL; 357 return NULL;
@@ -327,8 +360,9 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in
327static void unplug_slaves(mddev_t *mddev); 360static void unplug_slaves(mddev_t *mddev);
328static void raid5_unplug_device(struct request_queue *q); 361static void raid5_unplug_device(struct request_queue *q);
329 362
330static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks, 363static struct stripe_head *
331 int pd_idx, int noblock) 364get_active_stripe(raid5_conf_t *conf, sector_t sector,
365 int previous, int noblock)
332{ 366{
333 struct stripe_head *sh; 367 struct stripe_head *sh;
334 368
@@ -340,7 +374,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
340 wait_event_lock_irq(conf->wait_for_stripe, 374 wait_event_lock_irq(conf->wait_for_stripe,
341 conf->quiesce == 0, 375 conf->quiesce == 0,
342 conf->device_lock, /* nothing */); 376 conf->device_lock, /* nothing */);
343 sh = __find_stripe(conf, sector, disks); 377 sh = __find_stripe(conf, sector, conf->generation - previous);
344 if (!sh) { 378 if (!sh) {
345 if (!conf->inactive_blocked) 379 if (!conf->inactive_blocked)
346 sh = get_free_stripe(conf); 380 sh = get_free_stripe(conf);
@@ -358,10 +392,11 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
358 ); 392 );
359 conf->inactive_blocked = 0; 393 conf->inactive_blocked = 0;
360 } else 394 } else
361 init_stripe(sh, sector, pd_idx, disks); 395 init_stripe(sh, sector, previous);
362 } else { 396 } else {
363 if (atomic_read(&sh->count)) { 397 if (atomic_read(&sh->count)) {
364 BUG_ON(!list_empty(&sh->lru)); 398 BUG_ON(!list_empty(&sh->lru)
399 && !test_bit(STRIPE_EXPANDING, &sh->state));
365 } else { 400 } else {
366 if (!test_bit(STRIPE_HANDLE, &sh->state)) 401 if (!test_bit(STRIPE_HANDLE, &sh->state))
367 atomic_inc(&conf->active_stripes); 402 atomic_inc(&conf->active_stripes);
@@ -895,8 +930,10 @@ static int grow_stripes(raid5_conf_t *conf, int num)
895 struct kmem_cache *sc; 930 struct kmem_cache *sc;
896 int devs = conf->raid_disks; 931 int devs = conf->raid_disks;
897 932
898 sprintf(conf->cache_name[0], "raid5-%s", mdname(conf->mddev)); 933 sprintf(conf->cache_name[0],
899 sprintf(conf->cache_name[1], "raid5-%s-alt", mdname(conf->mddev)); 934 "raid%d-%s", conf->level, mdname(conf->mddev));
935 sprintf(conf->cache_name[1],
936 "raid%d-%s-alt", conf->level, mdname(conf->mddev));
900 conf->active_name = 0; 937 conf->active_name = 0;
901 sc = kmem_cache_create(conf->cache_name[conf->active_name], 938 sc = kmem_cache_create(conf->cache_name[conf->active_name],
902 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 939 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
@@ -911,7 +948,6 @@ static int grow_stripes(raid5_conf_t *conf, int num)
911 return 0; 948 return 0;
912} 949}
913 950
914#ifdef CONFIG_MD_RAID5_RESHAPE
915static int resize_stripes(raid5_conf_t *conf, int newsize) 951static int resize_stripes(raid5_conf_t *conf, int newsize)
916{ 952{
917 /* Make all the stripes able to hold 'newsize' devices. 953 /* Make all the stripes able to hold 'newsize' devices.
@@ -1036,7 +1072,6 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
1036 conf->pool_size = newsize; 1072 conf->pool_size = newsize;
1037 return err; 1073 return err;
1038} 1074}
1039#endif
1040 1075
1041static int drop_one_stripe(raid5_conf_t *conf) 1076static int drop_one_stripe(raid5_conf_t *conf)
1042{ 1077{
@@ -1066,7 +1101,7 @@ static void shrink_stripes(raid5_conf_t *conf)
1066 1101
1067static void raid5_end_read_request(struct bio * bi, int error) 1102static void raid5_end_read_request(struct bio * bi, int error)
1068{ 1103{
1069 struct stripe_head *sh = bi->bi_private; 1104 struct stripe_head *sh = bi->bi_private;
1070 raid5_conf_t *conf = sh->raid_conf; 1105 raid5_conf_t *conf = sh->raid_conf;
1071 int disks = sh->disks, i; 1106 int disks = sh->disks, i;
1072 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1107 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1148,7 +1183,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1148 1183
1149static void raid5_end_write_request(struct bio *bi, int error) 1184static void raid5_end_write_request(struct bio *bi, int error)
1150{ 1185{
1151 struct stripe_head *sh = bi->bi_private; 1186 struct stripe_head *sh = bi->bi_private;
1152 raid5_conf_t *conf = sh->raid_conf; 1187 raid5_conf_t *conf = sh->raid_conf;
1153 int disks = sh->disks, i; 1188 int disks = sh->disks, i;
1154 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1189 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1176,9 +1211,9 @@ static void raid5_end_write_request(struct bio *bi, int error)
1176} 1211}
1177 1212
1178 1213
1179static sector_t compute_blocknr(struct stripe_head *sh, int i); 1214static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
1180 1215
1181static void raid5_build_block(struct stripe_head *sh, int i) 1216static void raid5_build_block(struct stripe_head *sh, int i, int previous)
1182{ 1217{
1183 struct r5dev *dev = &sh->dev[i]; 1218 struct r5dev *dev = &sh->dev[i];
1184 1219
@@ -1194,7 +1229,7 @@ static void raid5_build_block(struct stripe_head *sh, int i)
1194 dev->req.bi_private = sh; 1229 dev->req.bi_private = sh;
1195 1230
1196 dev->flags = 0; 1231 dev->flags = 0;
1197 dev->sector = compute_blocknr(sh, i); 1232 dev->sector = compute_blocknr(sh, i, previous);
1198} 1233}
1199 1234
1200static void error(mddev_t *mddev, mdk_rdev_t *rdev) 1235static void error(mddev_t *mddev, mdk_rdev_t *rdev)
@@ -1227,15 +1262,23 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1227 * Input: a 'big' sector number, 1262 * Input: a 'big' sector number,
1228 * Output: index of the data and parity disk, and the sector # in them. 1263 * Output: index of the data and parity disk, and the sector # in them.
1229 */ 1264 */
1230static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, 1265static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1231 unsigned int data_disks, unsigned int * dd_idx, 1266 int previous, int *dd_idx,
1232 unsigned int * pd_idx, raid5_conf_t *conf) 1267 struct stripe_head *sh)
1233{ 1268{
1234 long stripe; 1269 long stripe;
1235 unsigned long chunk_number; 1270 unsigned long chunk_number;
1236 unsigned int chunk_offset; 1271 unsigned int chunk_offset;
1272 int pd_idx, qd_idx;
1273 int ddf_layout = 0;
1237 sector_t new_sector; 1274 sector_t new_sector;
1238 int sectors_per_chunk = conf->chunk_size >> 9; 1275 int algorithm = previous ? conf->prev_algo
1276 : conf->algorithm;
1277 int sectors_per_chunk = previous ? (conf->prev_chunk >> 9)
1278 : (conf->chunk_size >> 9);
1279 int raid_disks = previous ? conf->previous_raid_disks
1280 : conf->raid_disks;
1281 int data_disks = raid_disks - conf->max_degraded;
1239 1282
1240 /* First compute the information on this sector */ 1283 /* First compute the information on this sector */
1241 1284
@@ -1259,68 +1302,170 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
1259 /* 1302 /*
1260 * Select the parity disk based on the user selected algorithm. 1303 * Select the parity disk based on the user selected algorithm.
1261 */ 1304 */
1305 pd_idx = qd_idx = ~0;
1262 switch(conf->level) { 1306 switch(conf->level) {
1263 case 4: 1307 case 4:
1264 *pd_idx = data_disks; 1308 pd_idx = data_disks;
1265 break; 1309 break;
1266 case 5: 1310 case 5:
1267 switch (conf->algorithm) { 1311 switch (algorithm) {
1268 case ALGORITHM_LEFT_ASYMMETRIC: 1312 case ALGORITHM_LEFT_ASYMMETRIC:
1269 *pd_idx = data_disks - stripe % raid_disks; 1313 pd_idx = data_disks - stripe % raid_disks;
1270 if (*dd_idx >= *pd_idx) 1314 if (*dd_idx >= pd_idx)
1271 (*dd_idx)++; 1315 (*dd_idx)++;
1272 break; 1316 break;
1273 case ALGORITHM_RIGHT_ASYMMETRIC: 1317 case ALGORITHM_RIGHT_ASYMMETRIC:
1274 *pd_idx = stripe % raid_disks; 1318 pd_idx = stripe % raid_disks;
1275 if (*dd_idx >= *pd_idx) 1319 if (*dd_idx >= pd_idx)
1276 (*dd_idx)++; 1320 (*dd_idx)++;
1277 break; 1321 break;
1278 case ALGORITHM_LEFT_SYMMETRIC: 1322 case ALGORITHM_LEFT_SYMMETRIC:
1279 *pd_idx = data_disks - stripe % raid_disks; 1323 pd_idx = data_disks - stripe % raid_disks;
1280 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; 1324 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1281 break; 1325 break;
1282 case ALGORITHM_RIGHT_SYMMETRIC: 1326 case ALGORITHM_RIGHT_SYMMETRIC:
1283 *pd_idx = stripe % raid_disks; 1327 pd_idx = stripe % raid_disks;
1284 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; 1328 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1329 break;
1330 case ALGORITHM_PARITY_0:
1331 pd_idx = 0;
1332 (*dd_idx)++;
1333 break;
1334 case ALGORITHM_PARITY_N:
1335 pd_idx = data_disks;
1285 break; 1336 break;
1286 default: 1337 default:
1287 printk(KERN_ERR "raid5: unsupported algorithm %d\n", 1338 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
1288 conf->algorithm); 1339 algorithm);
1340 BUG();
1289 } 1341 }
1290 break; 1342 break;
1291 case 6: 1343 case 6:
1292 1344
1293 /**** FIX THIS ****/ 1345 switch (algorithm) {
1294 switch (conf->algorithm) {
1295 case ALGORITHM_LEFT_ASYMMETRIC: 1346 case ALGORITHM_LEFT_ASYMMETRIC:
1296 *pd_idx = raid_disks - 1 - (stripe % raid_disks); 1347 pd_idx = raid_disks - 1 - (stripe % raid_disks);
1297 if (*pd_idx == raid_disks-1) 1348 qd_idx = pd_idx + 1;
1298 (*dd_idx)++; /* Q D D D P */ 1349 if (pd_idx == raid_disks-1) {
1299 else if (*dd_idx >= *pd_idx) 1350 (*dd_idx)++; /* Q D D D P */
1351 qd_idx = 0;
1352 } else if (*dd_idx >= pd_idx)
1300 (*dd_idx) += 2; /* D D P Q D */ 1353 (*dd_idx) += 2; /* D D P Q D */
1301 break; 1354 break;
1302 case ALGORITHM_RIGHT_ASYMMETRIC: 1355 case ALGORITHM_RIGHT_ASYMMETRIC:
1303 *pd_idx = stripe % raid_disks; 1356 pd_idx = stripe % raid_disks;
1304 if (*pd_idx == raid_disks-1) 1357 qd_idx = pd_idx + 1;
1305 (*dd_idx)++; /* Q D D D P */ 1358 if (pd_idx == raid_disks-1) {
1306 else if (*dd_idx >= *pd_idx) 1359 (*dd_idx)++; /* Q D D D P */
1360 qd_idx = 0;
1361 } else if (*dd_idx >= pd_idx)
1307 (*dd_idx) += 2; /* D D P Q D */ 1362 (*dd_idx) += 2; /* D D P Q D */
1308 break; 1363 break;
1309 case ALGORITHM_LEFT_SYMMETRIC: 1364 case ALGORITHM_LEFT_SYMMETRIC:
1310 *pd_idx = raid_disks - 1 - (stripe % raid_disks); 1365 pd_idx = raid_disks - 1 - (stripe % raid_disks);
1311 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; 1366 qd_idx = (pd_idx + 1) % raid_disks;
1367 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
1312 break; 1368 break;
1313 case ALGORITHM_RIGHT_SYMMETRIC: 1369 case ALGORITHM_RIGHT_SYMMETRIC:
1314 *pd_idx = stripe % raid_disks; 1370 pd_idx = stripe % raid_disks;
1315 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; 1371 qd_idx = (pd_idx + 1) % raid_disks;
1372 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
1373 break;
1374
1375 case ALGORITHM_PARITY_0:
1376 pd_idx = 0;
1377 qd_idx = 1;
1378 (*dd_idx) += 2;
1379 break;
1380 case ALGORITHM_PARITY_N:
1381 pd_idx = data_disks;
1382 qd_idx = data_disks + 1;
1316 break; 1383 break;
1384
1385 case ALGORITHM_ROTATING_ZERO_RESTART:
 1386 /* Exactly the same as RIGHT_ASYMMETRIC, but the order
 1387 * of blocks for computing Q is different.
1388 */
1389 pd_idx = stripe % raid_disks;
1390 qd_idx = pd_idx + 1;
1391 if (pd_idx == raid_disks-1) {
1392 (*dd_idx)++; /* Q D D D P */
1393 qd_idx = 0;
1394 } else if (*dd_idx >= pd_idx)
1395 (*dd_idx) += 2; /* D D P Q D */
1396 ddf_layout = 1;
1397 break;
1398
1399 case ALGORITHM_ROTATING_N_RESTART:
 1400 /* Same as left_asymmetric, but first stripe is
1401 * D D D P Q rather than
1402 * Q D D D P
1403 */
1404 pd_idx = raid_disks - 1 - ((stripe + 1) % raid_disks);
1405 qd_idx = pd_idx + 1;
1406 if (pd_idx == raid_disks-1) {
1407 (*dd_idx)++; /* Q D D D P */
1408 qd_idx = 0;
1409 } else if (*dd_idx >= pd_idx)
1410 (*dd_idx) += 2; /* D D P Q D */
1411 ddf_layout = 1;
1412 break;
1413
1414 case ALGORITHM_ROTATING_N_CONTINUE:
1415 /* Same as left_symmetric but Q is before P */
1416 pd_idx = raid_disks - 1 - (stripe % raid_disks);
1417 qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
1418 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1419 ddf_layout = 1;
1420 break;
1421
1422 case ALGORITHM_LEFT_ASYMMETRIC_6:
1423 /* RAID5 left_asymmetric, with Q on last device */
1424 pd_idx = data_disks - stripe % (raid_disks-1);
1425 if (*dd_idx >= pd_idx)
1426 (*dd_idx)++;
1427 qd_idx = raid_disks - 1;
1428 break;
1429
1430 case ALGORITHM_RIGHT_ASYMMETRIC_6:
1431 pd_idx = stripe % (raid_disks-1);
1432 if (*dd_idx >= pd_idx)
1433 (*dd_idx)++;
1434 qd_idx = raid_disks - 1;
1435 break;
1436
1437 case ALGORITHM_LEFT_SYMMETRIC_6:
1438 pd_idx = data_disks - stripe % (raid_disks-1);
1439 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
1440 qd_idx = raid_disks - 1;
1441 break;
1442
1443 case ALGORITHM_RIGHT_SYMMETRIC_6:
1444 pd_idx = stripe % (raid_disks-1);
1445 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
1446 qd_idx = raid_disks - 1;
1447 break;
1448
1449 case ALGORITHM_PARITY_0_6:
1450 pd_idx = 0;
1451 (*dd_idx)++;
1452 qd_idx = raid_disks - 1;
1453 break;
1454
1455
1317 default: 1456 default:
1318 printk(KERN_CRIT "raid6: unsupported algorithm %d\n", 1457 printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
1319 conf->algorithm); 1458 algorithm);
1459 BUG();
1320 } 1460 }
1321 break; 1461 break;
1322 } 1462 }
1323 1463
1464 if (sh) {
1465 sh->pd_idx = pd_idx;
1466 sh->qd_idx = qd_idx;
1467 sh->ddf_layout = ddf_layout;
1468 }
1324 /* 1469 /*
1325 * Finally, compute the new sector number 1470 * Finally, compute the new sector number
1326 */ 1471 */
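
As a sanity check of the RAID5 placement rules in the switch above, here is a standalone userspace sketch of ALGORITHM_LEFT_SYMMETRIC for an assumed 5-disk array, printing where the parity block and the rotated data blocks land on each stripe:

	#include <stdio.h>

	int main(void)
	{
		int raid_disks = 5, data_disks = raid_disks - 1;
		long stripe;

		for (stripe = 0; stripe < 5; stripe++) {
			/* pd_idx and dd_idx follow the LEFT_SYMMETRIC case above */
			int pd_idx = data_disks - stripe % raid_disks;
			int dd;

			printf("stripe %ld: parity on device %d, data devices:",
			       stripe, pd_idx);
			for (dd = 0; dd < data_disks; dd++)
				printf(" %d", (pd_idx + 1 + dd) % raid_disks);
			printf("\n");
		}
		return 0;
	}
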
@@ -1329,17 +1474,21 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
1329} 1474}
1330 1475
1331 1476
1332static sector_t compute_blocknr(struct stripe_head *sh, int i) 1477static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
1333{ 1478{
1334 raid5_conf_t *conf = sh->raid_conf; 1479 raid5_conf_t *conf = sh->raid_conf;
1335 int raid_disks = sh->disks; 1480 int raid_disks = sh->disks;
1336 int data_disks = raid_disks - conf->max_degraded; 1481 int data_disks = raid_disks - conf->max_degraded;
1337 sector_t new_sector = sh->sector, check; 1482 sector_t new_sector = sh->sector, check;
1338 int sectors_per_chunk = conf->chunk_size >> 9; 1483 int sectors_per_chunk = previous ? (conf->prev_chunk >> 9)
1484 : (conf->chunk_size >> 9);
1485 int algorithm = previous ? conf->prev_algo
1486 : conf->algorithm;
1339 sector_t stripe; 1487 sector_t stripe;
1340 int chunk_offset; 1488 int chunk_offset;
1341 int chunk_number, dummy1, dummy2, dd_idx = i; 1489 int chunk_number, dummy1, dd_idx = i;
1342 sector_t r_sector; 1490 sector_t r_sector;
1491 struct stripe_head sh2;
1343 1492
1344 1493
1345 chunk_offset = sector_div(new_sector, sectors_per_chunk); 1494 chunk_offset = sector_div(new_sector, sectors_per_chunk);
@@ -1351,7 +1500,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
1351 switch(conf->level) { 1500 switch(conf->level) {
1352 case 4: break; 1501 case 4: break;
1353 case 5: 1502 case 5:
1354 switch (conf->algorithm) { 1503 switch (algorithm) {
1355 case ALGORITHM_LEFT_ASYMMETRIC: 1504 case ALGORITHM_LEFT_ASYMMETRIC:
1356 case ALGORITHM_RIGHT_ASYMMETRIC: 1505 case ALGORITHM_RIGHT_ASYMMETRIC:
1357 if (i > sh->pd_idx) 1506 if (i > sh->pd_idx)
@@ -1363,19 +1512,27 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
1363 i += raid_disks; 1512 i += raid_disks;
1364 i -= (sh->pd_idx + 1); 1513 i -= (sh->pd_idx + 1);
1365 break; 1514 break;
1515 case ALGORITHM_PARITY_0:
1516 i -= 1;
1517 break;
1518 case ALGORITHM_PARITY_N:
1519 break;
1366 default: 1520 default:
1367 printk(KERN_ERR "raid5: unsupported algorithm %d\n", 1521 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
1368 conf->algorithm); 1522 algorithm);
1523 BUG();
1369 } 1524 }
1370 break; 1525 break;
1371 case 6: 1526 case 6:
1372 if (i == raid6_next_disk(sh->pd_idx, raid_disks)) 1527 if (i == sh->qd_idx)
1373 return 0; /* It is the Q disk */ 1528 return 0; /* It is the Q disk */
1374 switch (conf->algorithm) { 1529 switch (algorithm) {
1375 case ALGORITHM_LEFT_ASYMMETRIC: 1530 case ALGORITHM_LEFT_ASYMMETRIC:
1376 case ALGORITHM_RIGHT_ASYMMETRIC: 1531 case ALGORITHM_RIGHT_ASYMMETRIC:
1377 if (sh->pd_idx == raid_disks-1) 1532 case ALGORITHM_ROTATING_ZERO_RESTART:
1378 i--; /* Q D D D P */ 1533 case ALGORITHM_ROTATING_N_RESTART:
1534 if (sh->pd_idx == raid_disks-1)
1535 i--; /* Q D D D P */
1379 else if (i > sh->pd_idx) 1536 else if (i > sh->pd_idx)
1380 i -= 2; /* D D P Q D */ 1537 i -= 2; /* D D P Q D */
1381 break; 1538 break;
@@ -1390,9 +1547,35 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
1390 i -= (sh->pd_idx + 2); 1547 i -= (sh->pd_idx + 2);
1391 } 1548 }
1392 break; 1549 break;
1550 case ALGORITHM_PARITY_0:
1551 i -= 2;
1552 break;
1553 case ALGORITHM_PARITY_N:
1554 break;
1555 case ALGORITHM_ROTATING_N_CONTINUE:
1556 if (sh->pd_idx == 0)
1557 i--; /* P D D D Q */
1558 else if (i > sh->pd_idx)
1559 i -= 2; /* D D Q P D */
1560 break;
1561 case ALGORITHM_LEFT_ASYMMETRIC_6:
1562 case ALGORITHM_RIGHT_ASYMMETRIC_6:
1563 if (i > sh->pd_idx)
1564 i--;
1565 break;
1566 case ALGORITHM_LEFT_SYMMETRIC_6:
1567 case ALGORITHM_RIGHT_SYMMETRIC_6:
1568 if (i < sh->pd_idx)
1569 i += data_disks + 1;
1570 i -= (sh->pd_idx + 1);
1571 break;
1572 case ALGORITHM_PARITY_0_6:
1573 i -= 1;
1574 break;
1393 default: 1575 default:
1394 printk(KERN_CRIT "raid6: unsupported algorithm %d\n", 1576 printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
1395 conf->algorithm); 1577 algorithm);
1578 BUG();
1396 } 1579 }
1397 break; 1580 break;
1398 } 1581 }
@@ -1400,8 +1583,10 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
1400 chunk_number = stripe * data_disks + i; 1583 chunk_number = stripe * data_disks + i;
1401 r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; 1584 r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
1402 1585
1403 check = raid5_compute_sector(r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); 1586 check = raid5_compute_sector(conf, r_sector,
1404 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { 1587 previous, &dummy1, &sh2);
1588 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
1589 || sh2.qd_idx != sh->qd_idx) {
1405 printk(KERN_ERR "compute_blocknr: map not correct\n"); 1590 printk(KERN_ERR "compute_blocknr: map not correct\n");
1406 return 0; 1591 return 0;
1407 } 1592 }
@@ -1468,14 +1653,16 @@ static void copy_data(int frombio, struct bio *bio,
1468 1653
1469static void compute_parity6(struct stripe_head *sh, int method) 1654static void compute_parity6(struct stripe_head *sh, int method)
1470{ 1655{
1471 raid6_conf_t *conf = sh->raid_conf; 1656 raid5_conf_t *conf = sh->raid_conf;
1472 int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count; 1657 int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
1658 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1473 struct bio *chosen; 1659 struct bio *chosen;
1474 /**** FIX THIS: This could be very bad if disks is close to 256 ****/ 1660 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1475 void *ptrs[disks]; 1661 void *ptrs[syndrome_disks+2];
1476 1662
1477 qd_idx = raid6_next_disk(pd_idx, disks); 1663 pd_idx = sh->pd_idx;
1478 d0_idx = raid6_next_disk(qd_idx, disks); 1664 qd_idx = sh->qd_idx;
1665 d0_idx = raid6_d0(sh);
1479 1666
1480 pr_debug("compute_parity, stripe %llu, method %d\n", 1667 pr_debug("compute_parity, stripe %llu, method %d\n",
1481 (unsigned long long)sh->sector, method); 1668 (unsigned long long)sh->sector, method);
@@ -1513,24 +1700,29 @@ static void compute_parity6(struct stripe_head *sh, int method)
1513 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1700 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1514 } 1701 }
1515 1702
1516// switch(method) { 1703 /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/
1517// case RECONSTRUCT_WRITE: 1704
1518// case CHECK_PARITY: 1705 for (i = 0; i < disks; i++)
1519// case UPDATE_PARITY: 1706 ptrs[i] = (void *)raid6_empty_zero_page;
1520 /* Note that unlike RAID-5, the ordering of the disks matters greatly. */ 1707
1521 /* FIX: Is this ordering of drives even remotely optimal? */ 1708 count = 0;
1522 count = 0; 1709 i = d0_idx;
1523 i = d0_idx; 1710 do {
1524 do { 1711 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1525 ptrs[count++] = page_address(sh->dev[i].page); 1712
1526 if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags)) 1713 ptrs[slot] = page_address(sh->dev[i].page);
1527 printk("block %d/%d not uptodate on parity calc\n", i,count); 1714 if (slot < syndrome_disks &&
1528 i = raid6_next_disk(i, disks); 1715 !test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
1529 } while ( i != d0_idx ); 1716 printk(KERN_ERR "block %d/%d not uptodate "
1530// break; 1717 "on parity calc\n", i, count);
1531// } 1718 BUG();
1532 1719 }
1533 raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs); 1720
1721 i = raid6_next_disk(i, disks);
1722 } while (i != d0_idx);
1723 BUG_ON(count != syndrome_disks);
1724
1725 raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs);
1534 1726
1535 switch(method) { 1727 switch(method) {
1536 case RECONSTRUCT_WRITE: 1728 case RECONSTRUCT_WRITE:
@@ -1552,8 +1744,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1552{ 1744{
1553 int i, count, disks = sh->disks; 1745 int i, count, disks = sh->disks;
1554 void *ptr[MAX_XOR_BLOCKS], *dest, *p; 1746 void *ptr[MAX_XOR_BLOCKS], *dest, *p;
1555 int pd_idx = sh->pd_idx; 1747 int qd_idx = sh->qd_idx;
1556 int qd_idx = raid6_next_disk(pd_idx, disks);
1557 1748
1558 pr_debug("compute_block_1, stripe %llu, idx %d\n", 1749 pr_debug("compute_block_1, stripe %llu, idx %d\n",
1559 (unsigned long long)sh->sector, dd_idx); 1750 (unsigned long long)sh->sector, dd_idx);
@@ -1589,63 +1780,65 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1589static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) 1780static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1590{ 1781{
1591 int i, count, disks = sh->disks; 1782 int i, count, disks = sh->disks;
1592 int pd_idx = sh->pd_idx; 1783 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1593 int qd_idx = raid6_next_disk(pd_idx, disks); 1784 int d0_idx = raid6_d0(sh);
1594 int d0_idx = raid6_next_disk(qd_idx, disks); 1785 int faila = -1, failb = -1;
1595 int faila, failb; 1786 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1787 void *ptrs[syndrome_disks+2];
1596 1788
1597 /* faila and failb are disk numbers relative to d0_idx */ 1789 for (i = 0; i < disks ; i++)
1598 /* pd_idx become disks-2 and qd_idx become disks-1 */ 1790 ptrs[i] = (void *)raid6_empty_zero_page;
1599 faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx; 1791 count = 0;
1600 failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx; 1792 i = d0_idx;
1793 do {
1794 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1795
1796 ptrs[slot] = page_address(sh->dev[i].page);
1797
1798 if (i == dd_idx1)
1799 faila = slot;
1800 if (i == dd_idx2)
1801 failb = slot;
1802 i = raid6_next_disk(i, disks);
1803 } while (i != d0_idx);
1804 BUG_ON(count != syndrome_disks);
1601 1805
1602 BUG_ON(faila == failb); 1806 BUG_ON(faila == failb);
1603 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } 1807 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
1604 1808
1605 pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", 1809 pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
1606 (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb); 1810 (unsigned long long)sh->sector, dd_idx1, dd_idx2,
1811 faila, failb);
1607 1812
1608 if ( failb == disks-1 ) { 1813 if (failb == syndrome_disks+1) {
1609 /* Q disk is one of the missing disks */ 1814 /* Q disk is one of the missing disks */
1610 if ( faila == disks-2 ) { 1815 if (faila == syndrome_disks) {
1611 /* Missing P+Q, just recompute */ 1816 /* Missing P+Q, just recompute */
1612 compute_parity6(sh, UPDATE_PARITY); 1817 compute_parity6(sh, UPDATE_PARITY);
1613 return; 1818 return;
1614 } else { 1819 } else {
1615 /* We're missing D+Q; recompute D from P */ 1820 /* We're missing D+Q; recompute D from P */
1616 compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0); 1821 compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ?
1822 dd_idx2 : dd_idx1),
1823 0);
1617 compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */ 1824 compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
1618 return; 1825 return;
1619 } 1826 }
1620 } 1827 }
1621 1828
1622 /* We're missing D+P or D+D; build pointer table */ 1829 /* We're missing D+P or D+D; */
1623 { 1830 if (failb == syndrome_disks) {
1624 /**** FIX THIS: This could be very bad if disks is close to 256 ****/ 1831 /* We're missing D+P. */
1625 void *ptrs[disks]; 1832 raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs);
1626 1833 } else {
1627 count = 0; 1834 /* We're missing D+D. */
1628 i = d0_idx; 1835 raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb,
1629 do { 1836 ptrs);
1630 ptrs[count++] = page_address(sh->dev[i].page);
1631 i = raid6_next_disk(i, disks);
1632 if (i != dd_idx1 && i != dd_idx2 &&
1633 !test_bit(R5_UPTODATE, &sh->dev[i].flags))
1634 printk("compute_2 with missing block %d/%d\n", count, i);
1635 } while ( i != d0_idx );
1636
1637 if ( failb == disks-2 ) {
1638 /* We're missing D+P. */
1639 raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
1640 } else {
1641 /* We're missing D+D. */
1642 raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
1643 }
1644
1645 /* Both the above update both missing blocks */
1646 set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
1647 set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
1648 } 1837 }
1838
1839 /* Both the above update both missing blocks */
1840 set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
1841 set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
1649} 1842}
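
The failure classification in compute_block_2() boils down to comparing the two failed slot numbers against syndrome_disks (P slot) and syndrome_disks+1 (Q slot). A standalone userspace sketch of that decision, with example slot numbers as inputs:

	#include <stdio.h>

	/* faila < failb are syndrome-slot numbers of the failed blocks */
	static const char *raid6_recovery_path(int faila, int failb,
					       int syndrome_disks)
	{
		if (failb == syndrome_disks + 1) {	/* Q is one of the failures */
			if (faila == syndrome_disks)
				return "P+Q missing: recompute parity";
			return "D+Q missing: rebuild D from P, then recompute Q";
		}
		if (failb == syndrome_disks)
			return "D+P missing: raid6_datap_recov()";
		return "D+D missing: raid6_2data_recov()";
	}

	int main(void)
	{
		int syndrome_disks = 4;		/* e.g. a 6-disk raid6 */

		printf("%s\n", raid6_recovery_path(1, 3, syndrome_disks));
		printf("%s\n", raid6_recovery_path(2, 4, syndrome_disks));
		printf("%s\n", raid6_recovery_path(0, 5, syndrome_disks));
		return 0;
	}
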
1650 1843
1651static void 1844static void
@@ -1800,17 +1993,21 @@ static int page_is_zero(struct page *p)
1800 memcmp(a, a+4, STRIPE_SIZE-4)==0); 1993 memcmp(a, a+4, STRIPE_SIZE-4)==0);
1801} 1994}
1802 1995
1803static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) 1996static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
1997 struct stripe_head *sh)
1804{ 1998{
1805 int sectors_per_chunk = conf->chunk_size >> 9; 1999 int sectors_per_chunk =
1806 int pd_idx, dd_idx; 2000 previous ? (conf->prev_chunk >> 9)
2001 : (conf->chunk_size >> 9);
2002 int dd_idx;
1807 int chunk_offset = sector_div(stripe, sectors_per_chunk); 2003 int chunk_offset = sector_div(stripe, sectors_per_chunk);
2004 int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
1808 2005
1809 raid5_compute_sector(stripe * (disks - conf->max_degraded) 2006 raid5_compute_sector(conf,
2007 stripe * (disks - conf->max_degraded)
1810 *sectors_per_chunk + chunk_offset, 2008 *sectors_per_chunk + chunk_offset,
1811 disks, disks - conf->max_degraded, 2009 previous,
1812 &dd_idx, &pd_idx, conf); 2010 &dd_idx, sh);
1813 return pd_idx;
1814} 2011}
1815 2012
1816static void 2013static void
@@ -2181,7 +2378,7 @@ static void handle_stripe_dirtying6(raid5_conf_t *conf,
2181 struct r6_state *r6s, int disks) 2378 struct r6_state *r6s, int disks)
2182{ 2379{
2183 int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; 2380 int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i;
2184 int qd_idx = r6s->qd_idx; 2381 int qd_idx = sh->qd_idx;
2185 for (i = disks; i--; ) { 2382 for (i = disks; i--; ) {
2186 struct r5dev *dev = &sh->dev[i]; 2383 struct r5dev *dev = &sh->dev[i];
2187 /* Would I have to read this buffer for reconstruct_write */ 2384 /* Would I have to read this buffer for reconstruct_write */
@@ -2371,7 +2568,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2371 int update_p = 0, update_q = 0; 2568 int update_p = 0, update_q = 0;
2372 struct r5dev *dev; 2569 struct r5dev *dev;
2373 int pd_idx = sh->pd_idx; 2570 int pd_idx = sh->pd_idx;
2374 int qd_idx = r6s->qd_idx; 2571 int qd_idx = sh->qd_idx;
2375 2572
2376 set_bit(STRIPE_HANDLE, &sh->state); 2573 set_bit(STRIPE_HANDLE, &sh->state);
2377 2574
@@ -2467,17 +2664,14 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2467 struct dma_async_tx_descriptor *tx = NULL; 2664 struct dma_async_tx_descriptor *tx = NULL;
2468 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2665 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2469 for (i = 0; i < sh->disks; i++) 2666 for (i = 0; i < sh->disks; i++)
2470 if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) { 2667 if (i != sh->pd_idx && i != sh->qd_idx) {
2471 int dd_idx, pd_idx, j; 2668 int dd_idx, j;
2472 struct stripe_head *sh2; 2669 struct stripe_head *sh2;
2473 2670
2474 sector_t bn = compute_blocknr(sh, i); 2671 sector_t bn = compute_blocknr(sh, i, 1);
2475 sector_t s = raid5_compute_sector(bn, conf->raid_disks, 2672 sector_t s = raid5_compute_sector(conf, bn, 0,
2476 conf->raid_disks - 2673 &dd_idx, NULL);
2477 conf->max_degraded, &dd_idx, 2674 sh2 = get_active_stripe(conf, s, 0, 1);
2478 &pd_idx, conf);
2479 sh2 = get_active_stripe(conf, s, conf->raid_disks,
2480 pd_idx, 1);
2481 if (sh2 == NULL) 2675 if (sh2 == NULL)
2482 /* so far only the early blocks of this stripe 2676 /* so far only the early blocks of this stripe
2483 * have been requested. When later blocks 2677 * have been requested. When later blocks
@@ -2500,8 +2694,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2500 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 2694 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
2501 for (j = 0; j < conf->raid_disks; j++) 2695 for (j = 0; j < conf->raid_disks; j++)
2502 if (j != sh2->pd_idx && 2696 if (j != sh2->pd_idx &&
2503 (!r6s || j != raid6_next_disk(sh2->pd_idx, 2697 (!r6s || j != sh2->qd_idx) &&
2504 sh2->disks)) &&
2505 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 2698 !test_bit(R5_Expanded, &sh2->dev[j].flags))
2506 break; 2699 break;
2507 if (j == conf->raid_disks) { 2700 if (j == conf->raid_disks) {
@@ -2750,6 +2943,23 @@ static bool handle_stripe5(struct stripe_head *sh)
2750 2943
2751 /* Finish reconstruct operations initiated by the expansion process */ 2944 /* Finish reconstruct operations initiated by the expansion process */
2752 if (sh->reconstruct_state == reconstruct_state_result) { 2945 if (sh->reconstruct_state == reconstruct_state_result) {
2946 struct stripe_head *sh2
2947 = get_active_stripe(conf, sh->sector, 1, 1);
2948 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
2949 /* sh cannot be written until sh2 has been read.
2950 * so arrange for sh to be delayed a little
2951 */
2952 set_bit(STRIPE_DELAYED, &sh->state);
2953 set_bit(STRIPE_HANDLE, &sh->state);
2954 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
2955 &sh2->state))
2956 atomic_inc(&conf->preread_active_stripes);
2957 release_stripe(sh2);
2958 goto unlock;
2959 }
2960 if (sh2)
2961 release_stripe(sh2);
2962
2753 sh->reconstruct_state = reconstruct_state_idle; 2963 sh->reconstruct_state = reconstruct_state_idle;
2754 clear_bit(STRIPE_EXPANDING, &sh->state); 2964 clear_bit(STRIPE_EXPANDING, &sh->state);
2755 for (i = conf->raid_disks; i--; ) { 2965 for (i = conf->raid_disks; i--; ) {
@@ -2763,8 +2973,7 @@ static bool handle_stripe5(struct stripe_head *sh)
2763 !sh->reconstruct_state) { 2973 !sh->reconstruct_state) {
2764 /* Need to write out all blocks after computing parity */ 2974 /* Need to write out all blocks after computing parity */
2765 sh->disks = conf->raid_disks; 2975 sh->disks = conf->raid_disks;
2766 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 2976 stripe_set_idx(sh->sector, conf, 0, sh);
2767 conf->raid_disks);
2768 schedule_reconstruction5(sh, &s, 1, 1); 2977 schedule_reconstruction5(sh, &s, 1, 1);
2769 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 2978 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
2770 clear_bit(STRIPE_EXPAND_READY, &sh->state); 2979 clear_bit(STRIPE_EXPAND_READY, &sh->state);
@@ -2796,20 +3005,19 @@ static bool handle_stripe5(struct stripe_head *sh)
2796 3005
2797static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) 3006static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2798{ 3007{
2799 raid6_conf_t *conf = sh->raid_conf; 3008 raid5_conf_t *conf = sh->raid_conf;
2800 int disks = sh->disks; 3009 int disks = sh->disks;
2801 struct bio *return_bi = NULL; 3010 struct bio *return_bi = NULL;
2802 int i, pd_idx = sh->pd_idx; 3011 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx;
2803 struct stripe_head_state s; 3012 struct stripe_head_state s;
2804 struct r6_state r6s; 3013 struct r6_state r6s;
2805 struct r5dev *dev, *pdev, *qdev; 3014 struct r5dev *dev, *pdev, *qdev;
2806 mdk_rdev_t *blocked_rdev = NULL; 3015 mdk_rdev_t *blocked_rdev = NULL;
2807 3016
2808 r6s.qd_idx = raid6_next_disk(pd_idx, disks);
2809 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3017 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
2810 "pd_idx=%d, qd_idx=%d\n", 3018 "pd_idx=%d, qd_idx=%d\n",
2811 (unsigned long long)sh->sector, sh->state, 3019 (unsigned long long)sh->sector, sh->state,
2812 atomic_read(&sh->count), pd_idx, r6s.qd_idx); 3020 atomic_read(&sh->count), pd_idx, qd_idx);
2813 memset(&s, 0, sizeof(s)); 3021 memset(&s, 0, sizeof(s));
2814 3022
2815 spin_lock(&sh->lock); 3023 spin_lock(&sh->lock);
@@ -2920,9 +3128,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2920 pdev = &sh->dev[pd_idx]; 3128 pdev = &sh->dev[pd_idx];
2921 r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) 3129 r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx)
2922 || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); 3130 || (s.failed >= 2 && r6s.failed_num[1] == pd_idx);
2923 qdev = &sh->dev[r6s.qd_idx]; 3131 qdev = &sh->dev[qd_idx];
2924 r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx) 3132 r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx)
2925 || (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx); 3133 || (s.failed >= 2 && r6s.failed_num[1] == qd_idx);
2926 3134
2927 if ( s.written && 3135 if ( s.written &&
2928 ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3136 ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
@@ -2980,10 +3188,26 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2980 } 3188 }
2981 3189
2982 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { 3190 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
3191 struct stripe_head *sh2
3192 = get_active_stripe(conf, sh->sector, 1, 1);
3193 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
3194 /* sh cannot be written until sh2 has been read.
3195 * so arrange for sh to be delayed a little
3196 */
3197 set_bit(STRIPE_DELAYED, &sh->state);
3198 set_bit(STRIPE_HANDLE, &sh->state);
3199 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
3200 &sh2->state))
3201 atomic_inc(&conf->preread_active_stripes);
3202 release_stripe(sh2);
3203 goto unlock;
3204 }
3205 if (sh2)
3206 release_stripe(sh2);
3207
2983 /* Need to write out all blocks after computing P&Q */ 3208 /* Need to write out all blocks after computing P&Q */
2984 sh->disks = conf->raid_disks; 3209 sh->disks = conf->raid_disks;
2985 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 3210 stripe_set_idx(sh->sector, conf, 0, sh);
2986 conf->raid_disks);
2987 compute_parity6(sh, RECONSTRUCT_WRITE); 3211 compute_parity6(sh, RECONSTRUCT_WRITE);
2988 for (i = conf->raid_disks ; i-- ; ) { 3212 for (i = conf->raid_disks ; i-- ; ) {
2989 set_bit(R5_LOCKED, &sh->dev[i].flags); 3213 set_bit(R5_LOCKED, &sh->dev[i].flags);
@@ -3134,6 +3358,8 @@ static int raid5_mergeable_bvec(struct request_queue *q,
3134 if ((bvm->bi_rw & 1) == WRITE) 3358 if ((bvm->bi_rw & 1) == WRITE)
3135 return biovec->bv_len; /* always allow writes to be mergeable */ 3359 return biovec->bv_len; /* always allow writes to be mergeable */
3136 3360
3361 if (mddev->new_chunk < mddev->chunk_size)
3362 chunk_sectors = mddev->new_chunk >> 9;
3137 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 3363 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
3138 if (max < 0) max = 0; 3364 if (max < 0) max = 0;
3139 if (max <= biovec->bv_len && bio_sectors == 0) 3365 if (max <= biovec->bv_len && bio_sectors == 0)
@@ -3149,6 +3375,8 @@ static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
3149 unsigned int chunk_sectors = mddev->chunk_size >> 9; 3375 unsigned int chunk_sectors = mddev->chunk_size >> 9;
3150 unsigned int bio_sectors = bio->bi_size >> 9; 3376 unsigned int bio_sectors = bio->bi_size >> 9;
3151 3377
3378 if (mddev->new_chunk < mddev->chunk_size)
3379 chunk_sectors = mddev->new_chunk >> 9;
3152 return chunk_sectors >= 3380 return chunk_sectors >=
3153 ((sector & (chunk_sectors - 1)) + bio_sectors); 3381 ((sector & (chunk_sectors - 1)) + bio_sectors);
3154} 3382}
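
The boundary test relies on chunk_sectors being a power of two, so the offset within the chunk can be taken with a mask. A standalone userspace sketch with made-up request sizes:

	#include <stdio.h>
	#include <stdint.h>

	/* a request fits in one chunk iff its offset within the chunk plus
	 * its length does not exceed the chunk size */
	static int fits_in_chunk(uint64_t sector, unsigned bio_sectors,
				 unsigned chunk_sectors)
	{
		return chunk_sectors >=
		       ((sector & (chunk_sectors - 1)) + bio_sectors);
	}

	int main(void)
	{
		/* 64 KiB chunks = 128 sectors */
		printf("%d\n", fits_in_chunk(120, 8, 128));	/* 1: ends at the boundary */
		printf("%d\n", fits_in_chunk(120, 16, 128));	/* 0: crosses into next chunk */
		return 0;
	}
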
@@ -3255,9 +3483,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
3255{ 3483{
3256 mddev_t *mddev = q->queuedata; 3484 mddev_t *mddev = q->queuedata;
3257 raid5_conf_t *conf = mddev_to_conf(mddev); 3485 raid5_conf_t *conf = mddev_to_conf(mddev);
3258 const unsigned int raid_disks = conf->raid_disks; 3486 unsigned int dd_idx;
3259 const unsigned int data_disks = raid_disks - conf->max_degraded;
3260 unsigned int dd_idx, pd_idx;
3261 struct bio* align_bi; 3487 struct bio* align_bi;
3262 mdk_rdev_t *rdev; 3488 mdk_rdev_t *rdev;
3263 3489
@@ -3266,7 +3492,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
3266 return 0; 3492 return 0;
3267 } 3493 }
3268 /* 3494 /*
3269 * use bio_clone to make a copy of the bio 3495 * use bio_clone to make a copy of the bio
3270 */ 3496 */
3271 align_bi = bio_clone(raid_bio, GFP_NOIO); 3497 align_bi = bio_clone(raid_bio, GFP_NOIO);
3272 if (!align_bi) 3498 if (!align_bi)
@@ -3280,12 +3506,9 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
3280 /* 3506 /*
3281 * compute position 3507 * compute position
3282 */ 3508 */
3283 align_bi->bi_sector = raid5_compute_sector(raid_bio->bi_sector, 3509 align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector,
3284 raid_disks, 3510 0,
3285 data_disks, 3511 &dd_idx, NULL);
3286 &dd_idx,
3287 &pd_idx,
3288 conf);
3289 3512
3290 rcu_read_lock(); 3513 rcu_read_lock();
3291 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3514 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
@@ -3377,7 +3600,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
3377{ 3600{
3378 mddev_t *mddev = q->queuedata; 3601 mddev_t *mddev = q->queuedata;
3379 raid5_conf_t *conf = mddev_to_conf(mddev); 3602 raid5_conf_t *conf = mddev_to_conf(mddev);
3380 unsigned int dd_idx, pd_idx; 3603 int dd_idx;
3381 sector_t new_sector; 3604 sector_t new_sector;
3382 sector_t logical_sector, last_sector; 3605 sector_t logical_sector, last_sector;
3383 struct stripe_head *sh; 3606 struct stripe_head *sh;
@@ -3400,7 +3623,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
3400 if (rw == READ && 3623 if (rw == READ &&
3401 mddev->reshape_position == MaxSector && 3624 mddev->reshape_position == MaxSector &&
3402 chunk_aligned_read(q,bi)) 3625 chunk_aligned_read(q,bi))
3403 return 0; 3626 return 0;
3404 3627
3405 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 3628 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
3406 last_sector = bi->bi_sector + (bi->bi_size>>9); 3629 last_sector = bi->bi_sector + (bi->bi_size>>9);
@@ -3410,26 +3633,31 @@ static int make_request(struct request_queue *q, struct bio * bi)
3410 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 3633 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
3411 DEFINE_WAIT(w); 3634 DEFINE_WAIT(w);
3412 int disks, data_disks; 3635 int disks, data_disks;
3636 int previous;
3413 3637
3414 retry: 3638 retry:
3639 previous = 0;
3640 disks = conf->raid_disks;
3415 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 3641 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
3416 if (likely(conf->expand_progress == MaxSector)) 3642 if (unlikely(conf->reshape_progress != MaxSector)) {
3417 disks = conf->raid_disks; 3643 /* spinlock is needed as reshape_progress may be
3418 else {
3419 /* spinlock is needed as expand_progress may be
3420 * 64bit on a 32bit platform, and so it might be 3644 * 64bit on a 32bit platform, and so it might be
3421 * possible to see a half-updated value 3645 * possible to see a half-updated value
3422 * Of course expand_progress could change after 3646 * Of course reshape_progress could change after
3423 * the lock is dropped, so once we get a reference 3647 * the lock is dropped, so once we get a reference
3424 * to the stripe that we think it is, we will have 3648 * to the stripe that we think it is, we will have
3425 * to check again. 3649 * to check again.
3426 */ 3650 */
3427 spin_lock_irq(&conf->device_lock); 3651 spin_lock_irq(&conf->device_lock);
3428 disks = conf->raid_disks; 3652 if (mddev->delta_disks < 0
3429 if (logical_sector >= conf->expand_progress) 3653 ? logical_sector < conf->reshape_progress
3654 : logical_sector >= conf->reshape_progress) {
3430 disks = conf->previous_raid_disks; 3655 disks = conf->previous_raid_disks;
3431 else { 3656 previous = 1;
3432 if (logical_sector >= conf->expand_lo) { 3657 } else {
3658 if (mddev->delta_disks < 0
3659 ? logical_sector < conf->reshape_safe
3660 : logical_sector >= conf->reshape_safe) {
3433 spin_unlock_irq(&conf->device_lock); 3661 spin_unlock_irq(&conf->device_lock);
3434 schedule(); 3662 schedule();
3435 goto retry; 3663 goto retry;
@@ -3439,15 +3667,17 @@ static int make_request(struct request_queue *q, struct bio * bi)
3439 } 3667 }
3440 data_disks = disks - conf->max_degraded; 3668 data_disks = disks - conf->max_degraded;
3441 3669
3442 new_sector = raid5_compute_sector(logical_sector, disks, data_disks, 3670 new_sector = raid5_compute_sector(conf, logical_sector,
3443 &dd_idx, &pd_idx, conf); 3671 previous,
3672 &dd_idx, NULL);
3444 pr_debug("raid5: make_request, sector %llu logical %llu\n", 3673 pr_debug("raid5: make_request, sector %llu logical %llu\n",
3445 (unsigned long long)new_sector, 3674 (unsigned long long)new_sector,
3446 (unsigned long long)logical_sector); 3675 (unsigned long long)logical_sector);
3447 3676
3448 sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK)); 3677 sh = get_active_stripe(conf, new_sector, previous,
3678 (bi->bi_rw&RWA_MASK));
3449 if (sh) { 3679 if (sh) {
3450 if (unlikely(conf->expand_progress != MaxSector)) { 3680 if (unlikely(previous)) {
3451 /* expansion might have moved on while waiting for a 3681 /* expansion might have moved on while waiting for a
3452 * stripe, so we must do the range check again. 3682 * stripe, so we must do the range check again.
3453 * Expansion could still move past after this 3683 * Expansion could still move past after this
@@ -3458,8 +3688,9 @@ static int make_request(struct request_queue *q, struct bio * bi)
3458 */ 3688 */
3459 int must_retry = 0; 3689 int must_retry = 0;
3460 spin_lock_irq(&conf->device_lock); 3690 spin_lock_irq(&conf->device_lock);
3461 if (logical_sector < conf->expand_progress && 3691 if (mddev->delta_disks < 0
3462 disks == conf->previous_raid_disks) 3692 ? logical_sector >= conf->reshape_progress
3693 : logical_sector < conf->reshape_progress)
3463 /* mismatch, need to try again */ 3694 /* mismatch, need to try again */
3464 must_retry = 1; 3695 must_retry = 1;
3465 spin_unlock_irq(&conf->device_lock); 3696 spin_unlock_irq(&conf->device_lock);
@@ -3514,6 +3745,8 @@ static int make_request(struct request_queue *q, struct bio * bi)
3514 return 0; 3745 return 0;
3515} 3746}
3516 3747
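
In the make_request() hunks above, whether a request must still be mapped with the pre-reshape geometry depends on the direction of the reshape: when disks are being added the conversion sweeps forward from sector 0, when disks are being removed it sweeps backwards from the end of the array. A standalone sketch of just that comparison (names invented for illustration):

    #include <stdio.h>

    typedef unsigned long long sector_t;

    /* Growing array (delta_disks > 0): sectors at or beyond reshape_progress
     * have not been converted yet.  Shrinking array (delta_disks < 0): the
     * sweep runs downwards, so sectors below reshape_progress are still old. */
    static int use_previous_geometry(int delta_disks, sector_t logical_sector,
                                     sector_t reshape_progress)
    {
        return delta_disks < 0
            ? logical_sector < reshape_progress
            : logical_sector >= reshape_progress;
    }

    int main(void)
    {
        printf("%d\n", use_previous_geometry(2, 512, 1024));   /* 0: already reshaped */
        printf("%d\n", use_previous_geometry(2, 4096, 1024));  /* 1: still old layout */
        printf("%d\n", use_previous_geometry(-1, 512, 4096));  /* 1: shrink sweep hasn't reached it */
        return 0;
    }
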
3748static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks);
3749
3517static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped) 3750static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped)
3518{ 3751{
3519 /* reshaping is quite different to recovery/resync so it is 3752 /* reshaping is quite different to recovery/resync so it is
@@ -3527,61 +3760,118 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3527 */ 3760 */
3528 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 3761 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
3529 struct stripe_head *sh; 3762 struct stripe_head *sh;
3530 int pd_idx;
3531 sector_t first_sector, last_sector; 3763 sector_t first_sector, last_sector;
3532 int raid_disks = conf->previous_raid_disks; 3764 int raid_disks = conf->previous_raid_disks;
3533 int data_disks = raid_disks - conf->max_degraded; 3765 int data_disks = raid_disks - conf->max_degraded;
3534 int new_data_disks = conf->raid_disks - conf->max_degraded; 3766 int new_data_disks = conf->raid_disks - conf->max_degraded;
3535 int i; 3767 int i;
3536 int dd_idx; 3768 int dd_idx;
3537 sector_t writepos, safepos, gap; 3769 sector_t writepos, readpos, safepos;
3538 3770 sector_t stripe_addr;
3539 if (sector_nr == 0 && 3771 int reshape_sectors;
3540 conf->expand_progress != 0) { 3772 struct list_head stripes;
3541 /* restarting in the middle, skip the initial sectors */ 3773
3542 sector_nr = conf->expand_progress; 3774 if (sector_nr == 0) {
3775 /* If restarting in the middle, skip the initial sectors */
3776 if (mddev->delta_disks < 0 &&
3777 conf->reshape_progress < raid5_size(mddev, 0, 0)) {
3778 sector_nr = raid5_size(mddev, 0, 0)
3779 - conf->reshape_progress;
3780 } else if (mddev->delta_disks > 0 &&
3781 conf->reshape_progress > 0)
3782 sector_nr = conf->reshape_progress;
3543 sector_div(sector_nr, new_data_disks); 3783 sector_div(sector_nr, new_data_disks);
3544 *skipped = 1; 3784 if (sector_nr) {
3545 return sector_nr; 3785 *skipped = 1;
3786 return sector_nr;
3787 }
3546 } 3788 }
3547 3789
3790 /* We need to process a full chunk at a time.
3791 * If old and new chunk sizes differ, we need to process the
3792 * largest of these
3793 */
3794 if (mddev->new_chunk > mddev->chunk_size)
3795 reshape_sectors = mddev->new_chunk / 512;
3796 else
3797 reshape_sectors = mddev->chunk_size / 512;
3798
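
reshape_sectors is the number of sectors handled on each device per pass, and it has to cover a whole chunk of whichever layout uses the larger chunk; a quick worked example for a 64KiB-to-256KiB chunk change (values invented):

    #include <stdio.h>

    int main(void)
    {
        unsigned int chunk_size = 64 * 1024;    /* old chunk, bytes */
        unsigned int new_chunk  = 256 * 1024;   /* new chunk, bytes */

        /* one pass must cover a full chunk in both layouts */
        unsigned int reshape_sectors =
            (new_chunk > chunk_size ? new_chunk : chunk_size) / 512;

        printf("reshape_sectors = %u\n", reshape_sectors);  /* 512 sectors = 256KiB */
        return 0;
    }
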
3548 /* we update the metadata when there is more than 3Meg 3799 /* we update the metadata when there is more than 3Meg
3549 * in the block range (that is rather arbitrary, should 3800 * in the block range (that is rather arbitrary, should
3550 * probably be time based) or when the data about to be 3801 * probably be time based) or when the data about to be
3551 * copied would over-write the source of the data at 3802 * copied would over-write the source of the data at
3552 * the front of the range. 3803 * the front of the range.
3553 * i.e. one new_stripe forward from expand_progress new_maps 3804 * i.e. one new_stripe along from reshape_progress new_maps
3554 * to after where expand_lo old_maps to 3805 * to after where reshape_safe old_maps to
3555 */ 3806 */
3556 writepos = conf->expand_progress + 3807 writepos = conf->reshape_progress;
3557 conf->chunk_size/512*(new_data_disks);
3558 sector_div(writepos, new_data_disks); 3808 sector_div(writepos, new_data_disks);
3559 safepos = conf->expand_lo; 3809 readpos = conf->reshape_progress;
3810 sector_div(readpos, data_disks);
3811 safepos = conf->reshape_safe;
3560 sector_div(safepos, data_disks); 3812 sector_div(safepos, data_disks);
3561 gap = conf->expand_progress - conf->expand_lo; 3813 if (mddev->delta_disks < 0) {
3814 writepos -= reshape_sectors;
3815 readpos += reshape_sectors;
3816 safepos += reshape_sectors;
3817 } else {
3818 writepos += reshape_sectors;
3819 readpos -= reshape_sectors;
3820 safepos -= reshape_sectors;
3821 }
3562 3822
3563 if (writepos >= safepos || 3823 /* 'writepos' is the most advanced device address we might write.
3564 gap > (new_data_disks)*3000*2 /*3Meg*/) { 3824 * 'readpos' is the least advanced device address we might read.
3825 * 'safepos' is the least address recorded in the metadata as having
3826 * been reshaped.
3827 * If 'readpos' is behind 'writepos', then there is no way that we can
3828 * ensure safety in the face of a crash - that must be done by userspace
3829 * making a backup of the data. So in that case there is no particular
3830 * rush to update metadata.
3831 * Otherwise if 'safepos' is behind 'writepos', then we really need to
3832 * update the metadata to advance 'safepos' to match 'readpos' so that
3833 * we can be safe in the event of a crash.
3834 * So we insist on updating metadata if safepos is behind writepos and
3835 * readpos is beyond writepos.
3836 * In any case, update the metadata every 10 seconds.
3837 * Maybe that number should be configurable, but I'm not sure it is
3838 * worth it.... maybe it could be a multiple of safemode_delay???
3839 */
3840 if ((mddev->delta_disks < 0
3841 ? (safepos > writepos && readpos < writepos)
3842 : (safepos < writepos && readpos > writepos)) ||
3843 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
3565 /* Cannot proceed until we've updated the superblock... */ 3844 /* Cannot proceed until we've updated the superblock... */
3566 wait_event(conf->wait_for_overlap, 3845 wait_event(conf->wait_for_overlap,
3567 atomic_read(&conf->reshape_stripes)==0); 3846 atomic_read(&conf->reshape_stripes)==0);
3568 mddev->reshape_position = conf->expand_progress; 3847 mddev->reshape_position = conf->reshape_progress;
3848 conf->reshape_checkpoint = jiffies;
3569 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3849 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3570 md_wakeup_thread(mddev->thread); 3850 md_wakeup_thread(mddev->thread);
3571 wait_event(mddev->sb_wait, mddev->flags == 0 || 3851 wait_event(mddev->sb_wait, mddev->flags == 0 ||
3572 kthread_should_stop()); 3852 kthread_should_stop());
3573 spin_lock_irq(&conf->device_lock); 3853 spin_lock_irq(&conf->device_lock);
3574 conf->expand_lo = mddev->reshape_position; 3854 conf->reshape_safe = mddev->reshape_position;
3575 spin_unlock_irq(&conf->device_lock); 3855 spin_unlock_irq(&conf->device_lock);
3576 wake_up(&conf->wait_for_overlap); 3856 wake_up(&conf->wait_for_overlap);
3577 } 3857 }
3578 3858
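
The condition above encodes the reasoning in the comment: a checkpoint is forced when the recorded safe position lags behind the write position while the read position is still ahead of it (so a crash would otherwise lose data the kernel could have protected), or simply when ten seconds have passed. A standalone sketch of that decision, with elapsed seconds standing in for the jiffies arithmetic (an assumption of this sketch):

    #include <stdio.h>

    typedef unsigned long long sector_t;

    static int need_checkpoint(int delta_disks,
                               sector_t writepos, sector_t readpos,
                               sector_t safepos, unsigned elapsed_sec)
    {
        int unsafe = delta_disks < 0
            ? (safepos > writepos && readpos < writepos)
            : (safepos < writepos && readpos > writepos);
        return unsafe || elapsed_sec > 10;
    }

    int main(void)
    {
        /* growing: reads ahead of writes, but metadata lags -> checkpoint */
        printf("%d\n", need_checkpoint(1, 2048, 2560, 1024, 3));   /* 1 */
        /* reads behind writes: userspace backup covers this, no rush */
        printf("%d\n", need_checkpoint(1, 2048, 1536, 1024, 3));   /* 0 */
        /* ...unless the 10 second timer expires */
        printf("%d\n", need_checkpoint(1, 2048, 1536, 1024, 11));  /* 1 */
        return 0;
    }
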
3579 for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { 3859 if (mddev->delta_disks < 0) {
3860 BUG_ON(conf->reshape_progress == 0);
3861 stripe_addr = writepos;
3862 BUG_ON((mddev->dev_sectors &
3863 ~((sector_t)reshape_sectors - 1))
3864 - reshape_sectors - stripe_addr
3865 != sector_nr);
3866 } else {
3867 BUG_ON(writepos != sector_nr + reshape_sectors);
3868 stripe_addr = sector_nr;
3869 }
3870 INIT_LIST_HEAD(&stripes);
3871 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
3580 int j; 3872 int j;
3581 int skipped = 0; 3873 int skipped = 0;
3582 pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks); 3874 sh = get_active_stripe(conf, stripe_addr+i, 0, 0);
3583 sh = get_active_stripe(conf, sector_nr+i,
3584 conf->raid_disks, pd_idx, 0);
3585 set_bit(STRIPE_EXPANDING, &sh->state); 3875 set_bit(STRIPE_EXPANDING, &sh->state);
3586 atomic_inc(&conf->reshape_stripes); 3876 atomic_inc(&conf->reshape_stripes);
3587 /* If any of this stripe is beyond the end of the old 3877 /* If any of this stripe is beyond the end of the old
@@ -3592,10 +3882,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3592 if (j == sh->pd_idx) 3882 if (j == sh->pd_idx)
3593 continue; 3883 continue;
3594 if (conf->level == 6 && 3884 if (conf->level == 6 &&
3595 j == raid6_next_disk(sh->pd_idx, sh->disks)) 3885 j == sh->qd_idx)
3596 continue; 3886 continue;
3597 s = compute_blocknr(sh, j); 3887 s = compute_blocknr(sh, j, 0);
3598 if (s < mddev->array_sectors) { 3888 if (s < raid5_size(mddev, 0, 0)) {
3599 skipped = 1; 3889 skipped = 1;
3600 continue; 3890 continue;
3601 } 3891 }
@@ -3607,10 +3897,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3607 set_bit(STRIPE_EXPAND_READY, &sh->state); 3897 set_bit(STRIPE_EXPAND_READY, &sh->state);
3608 set_bit(STRIPE_HANDLE, &sh->state); 3898 set_bit(STRIPE_HANDLE, &sh->state);
3609 } 3899 }
3610 release_stripe(sh); 3900 list_add(&sh->lru, &stripes);
3611 } 3901 }
3612 spin_lock_irq(&conf->device_lock); 3902 spin_lock_irq(&conf->device_lock);
3613 conf->expand_progress = (sector_nr + i) * new_data_disks; 3903 if (mddev->delta_disks < 0)
3904 conf->reshape_progress -= reshape_sectors * new_data_disks;
3905 else
3906 conf->reshape_progress += reshape_sectors * new_data_disks;
3614 spin_unlock_irq(&conf->device_lock); 3907 spin_unlock_irq(&conf->device_lock);
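
conf->reshape_progress is kept as a logical array address, while the loop just covered reshape_sectors on every device, so each pass moves it by reshape_sectors times the number of data disks in the new layout (downwards when the array is shrinking). Worked arithmetic for a 5-data-disk target layout (numbers invented):

    #include <stdio.h>

    int main(void)
    {
        unsigned long long reshape_progress = 0;  /* array address, sectors */
        unsigned int reshape_sectors = 128;       /* per-device sectors per pass */
        int new_data_disks = 5;                   /* data disks in the new layout */

        /* one pass converts reshape_sectors on each data disk */
        reshape_progress += (unsigned long long)reshape_sectors * new_data_disks;

        printf("%llu\n", reshape_progress);       /* 640 array sectors */
        return 0;
    }
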
3615 /* Ok, those stripes are ready. We can start scheduling 3908 /* Ok, those stripes are ready. We can start scheduling
3616 * reads on the source stripes. 3909 * reads on the source stripes.
@@ -3618,46 +3911,50 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3618 * block on the destination stripes. 3911 * block on the destination stripes.
3619 */ 3912 */
3620 first_sector = 3913 first_sector =
3621 raid5_compute_sector(sector_nr*(new_data_disks), 3914 raid5_compute_sector(conf, stripe_addr*(new_data_disks),
3622 raid_disks, data_disks, 3915 1, &dd_idx, NULL);
3623 &dd_idx, &pd_idx, conf);
3624 last_sector = 3916 last_sector =
3625 raid5_compute_sector((sector_nr+conf->chunk_size/512) 3917 raid5_compute_sector(conf, ((stripe_addr+conf->chunk_size/512)
3626 *(new_data_disks) -1, 3918 *(new_data_disks) - 1),
3627 raid_disks, data_disks, 3919 1, &dd_idx, NULL);
3628 &dd_idx, &pd_idx, conf); 3920 if (last_sector >= mddev->dev_sectors)
3629 if (last_sector >= (mddev->size<<1)) 3921 last_sector = mddev->dev_sectors - 1;
3630 last_sector = (mddev->size<<1)-1;
3631 while (first_sector <= last_sector) { 3922 while (first_sector <= last_sector) {
3632 pd_idx = stripe_to_pdidx(first_sector, conf, 3923 sh = get_active_stripe(conf, first_sector, 1, 0);
3633 conf->previous_raid_disks);
3634 sh = get_active_stripe(conf, first_sector,
3635 conf->previous_raid_disks, pd_idx, 0);
3636 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3924 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3637 set_bit(STRIPE_HANDLE, &sh->state); 3925 set_bit(STRIPE_HANDLE, &sh->state);
3638 release_stripe(sh); 3926 release_stripe(sh);
3639 first_sector += STRIPE_SECTORS; 3927 first_sector += STRIPE_SECTORS;
3640 } 3928 }
3929 /* Now that the sources are clearly marked, we can release
3930 * the destination stripes
3931 */
3932 while (!list_empty(&stripes)) {
3933 sh = list_entry(stripes.next, struct stripe_head, lru);
3934 list_del_init(&sh->lru);
3935 release_stripe(sh);
3936 }
3641 /* If this takes us to the resync_max point where we have to pause, 3937 /* If this takes us to the resync_max point where we have to pause,
3642 * then we need to write out the superblock. 3938 * then we need to write out the superblock.
3643 */ 3939 */
3644 sector_nr += conf->chunk_size>>9; 3940 sector_nr += reshape_sectors;
3645 if (sector_nr >= mddev->resync_max) { 3941 if (sector_nr >= mddev->resync_max) {
3646 /* Cannot proceed until we've updated the superblock... */ 3942 /* Cannot proceed until we've updated the superblock... */
3647 wait_event(conf->wait_for_overlap, 3943 wait_event(conf->wait_for_overlap,
3648 atomic_read(&conf->reshape_stripes) == 0); 3944 atomic_read(&conf->reshape_stripes) == 0);
3649 mddev->reshape_position = conf->expand_progress; 3945 mddev->reshape_position = conf->reshape_progress;
3946 conf->reshape_checkpoint = jiffies;
3650 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3947 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3651 md_wakeup_thread(mddev->thread); 3948 md_wakeup_thread(mddev->thread);
3652 wait_event(mddev->sb_wait, 3949 wait_event(mddev->sb_wait,
3653 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 3950 !test_bit(MD_CHANGE_DEVS, &mddev->flags)
3654 || kthread_should_stop()); 3951 || kthread_should_stop());
3655 spin_lock_irq(&conf->device_lock); 3952 spin_lock_irq(&conf->device_lock);
3656 conf->expand_lo = mddev->reshape_position; 3953 conf->reshape_safe = mddev->reshape_position;
3657 spin_unlock_irq(&conf->device_lock); 3954 spin_unlock_irq(&conf->device_lock);
3658 wake_up(&conf->wait_for_overlap); 3955 wake_up(&conf->wait_for_overlap);
3659 } 3956 }
3660 return conf->chunk_size>>9; 3957 return reshape_sectors;
3661} 3958}
3662 3959
3663/* FIXME go_faster isn't used */ 3960/* FIXME go_faster isn't used */
@@ -3665,9 +3962,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
3665{ 3962{
3666 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 3963 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
3667 struct stripe_head *sh; 3964 struct stripe_head *sh;
3668 int pd_idx; 3965 sector_t max_sector = mddev->dev_sectors;
3669 int raid_disks = conf->raid_disks;
3670 sector_t max_sector = mddev->size << 1;
3671 int sync_blocks; 3966 int sync_blocks;
3672 int still_degraded = 0; 3967 int still_degraded = 0;
3673 int i; 3968 int i;
@@ -3675,6 +3970,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
3675 if (sector_nr >= max_sector) { 3970 if (sector_nr >= max_sector) {
3676 /* just being told to finish up .. nothing much to do */ 3971 /* just being told to finish up .. nothing much to do */
3677 unplug_slaves(mddev); 3972 unplug_slaves(mddev);
3973
3678 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 3974 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
3679 end_reshape(conf); 3975 end_reshape(conf);
3680 return 0; 3976 return 0;
@@ -3705,7 +4001,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
3705 */ 4001 */
3706 if (mddev->degraded >= conf->max_degraded && 4002 if (mddev->degraded >= conf->max_degraded &&
3707 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 4003 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3708 sector_t rv = (mddev->size << 1) - sector_nr; 4004 sector_t rv = mddev->dev_sectors - sector_nr;
3709 *skipped = 1; 4005 *skipped = 1;
3710 return rv; 4006 return rv;
3711 } 4007 }
@@ -3721,10 +4017,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
3721 4017
3722 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 4018 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
3723 4019
3724 pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks); 4020 sh = get_active_stripe(conf, sector_nr, 0, 1);
3725 sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1);
3726 if (sh == NULL) { 4021 if (sh == NULL) {
3727 sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0); 4022 sh = get_active_stripe(conf, sector_nr, 0, 0);
3728 /* make sure we don't swamp the stripe cache if someone else 4023 /* make sure we don't swamp the stripe cache if someone else
3729 * is trying to get access 4024 * is trying to get access
3730 */ 4025 */
@@ -3766,19 +4061,15 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
3766 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 4061 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector.
3767 */ 4062 */
3768 struct stripe_head *sh; 4063 struct stripe_head *sh;
3769 int dd_idx, pd_idx; 4064 int dd_idx;
3770 sector_t sector, logical_sector, last_sector; 4065 sector_t sector, logical_sector, last_sector;
3771 int scnt = 0; 4066 int scnt = 0;
3772 int remaining; 4067 int remaining;
3773 int handled = 0; 4068 int handled = 0;
3774 4069
3775 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4070 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
3776 sector = raid5_compute_sector( logical_sector, 4071 sector = raid5_compute_sector(conf, logical_sector,
3777 conf->raid_disks, 4072 0, &dd_idx, NULL);
3778 conf->raid_disks - conf->max_degraded,
3779 &dd_idx,
3780 &pd_idx,
3781 conf);
3782 last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); 4073 last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
3783 4074
3784 for (; logical_sector < last_sector; 4075 for (; logical_sector < last_sector;
@@ -3790,7 +4081,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
3790 /* already done this stripe */ 4081 /* already done this stripe */
3791 continue; 4082 continue;
3792 4083
3793 sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1); 4084 sh = get_active_stripe(conf, sector, 0, 1);
3794 4085
3795 if (!sh) { 4086 if (!sh) {
3796 /* failed to get a stripe - must wait */ 4087 /* failed to get a stripe - must wait */
@@ -3992,89 +4283,69 @@ static struct attribute_group raid5_attrs_group = {
3992 .attrs = raid5_attrs, 4283 .attrs = raid5_attrs,
3993}; 4284};
3994 4285
3995static int run(mddev_t *mddev) 4286static sector_t
4287raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
4288{
4289 raid5_conf_t *conf = mddev_to_conf(mddev);
4290
4291 if (!sectors)
4292 sectors = mddev->dev_sectors;
4293 if (!raid_disks) {
4294 /* size is defined by the smallest of previous and new size */
4295 if (conf->raid_disks < conf->previous_raid_disks)
4296 raid_disks = conf->raid_disks;
4297 else
4298 raid_disks = conf->previous_raid_disks;
4299 }
4300
4301 sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
4302 sectors &= ~((sector_t)mddev->new_chunk/512 - 1);
4303 return sectors * (raid_disks - conf->max_degraded);
4304}
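
raid5_size() rounds the per-device size down so that it is a multiple of both the current and the pending chunk size, then multiplies by the number of data disks (raid_disks minus 1 for raid4/5, minus 2 for raid6). A userspace rendition with the raid_disks selection left to the caller (values invented):

    #include <stdio.h>

    typedef unsigned long long sector_t;

    static sector_t array_size(sector_t dev_sectors, int raid_disks,
                               int max_degraded,
                               unsigned chunk_size, unsigned new_chunk)
    {
        dev_sectors &= ~((sector_t)chunk_size / 512 - 1);   /* old chunk multiple */
        dev_sectors &= ~((sector_t)new_chunk / 512 - 1);    /* new chunk multiple */
        return dev_sectors * (raid_disks - max_degraded);
    }

    int main(void)
    {
        /* 1000000-sector members, 6-disk raid6, 64KiB old / 256KiB new chunk */
        printf("%llu\n", array_size(1000000, 6, 2, 65536, 262144)); /* 3999744 */
        return 0;
    }
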
4305
4306static raid5_conf_t *setup_conf(mddev_t *mddev)
3996{ 4307{
3997 raid5_conf_t *conf; 4308 raid5_conf_t *conf;
3998 int raid_disk, memory; 4309 int raid_disk, memory;
3999 mdk_rdev_t *rdev; 4310 mdk_rdev_t *rdev;
4000 struct disk_info *disk; 4311 struct disk_info *disk;
4001 int working_disks = 0;
4002 4312
4003 if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) { 4313 if (mddev->new_level != 5
4314 && mddev->new_level != 4
4315 && mddev->new_level != 6) {
4004 printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", 4316 printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
4005 mdname(mddev), mddev->level); 4317 mdname(mddev), mddev->new_level);
4006 return -EIO; 4318 return ERR_PTR(-EIO);
4007 } 4319 }
4008 4320 if ((mddev->new_level == 5
4009 if (mddev->chunk_size < PAGE_SIZE) { 4321 && !algorithm_valid_raid5(mddev->new_layout)) ||
4010 printk(KERN_ERR "md/raid5: chunk_size must be at least " 4322 (mddev->new_level == 6
4011 "PAGE_SIZE but %d < %ld\n", 4323 && !algorithm_valid_raid6(mddev->new_layout))) {
4012 mddev->chunk_size, PAGE_SIZE); 4324 printk(KERN_ERR "raid5: %s: layout %d not supported\n",
4013 return -EINVAL; 4325 mdname(mddev), mddev->new_layout);
4326 return ERR_PTR(-EIO);
4014 } 4327 }
4015 4328 if (mddev->new_level == 6 && mddev->raid_disks < 4) {
4016 if (mddev->reshape_position != MaxSector) { 4329 printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
4017 /* Check that we can continue the reshape. 4330 mdname(mddev), mddev->raid_disks);
4018 * Currently only disks can change, it must 4331 return ERR_PTR(-EINVAL);
4019 * increase, and we must be past the point where
4020 * a stripe over-writes itself
4021 */
4022 sector_t here_new, here_old;
4023 int old_disks;
4024 int max_degraded = (mddev->level == 5 ? 1 : 2);
4025
4026 if (mddev->new_level != mddev->level ||
4027 mddev->new_layout != mddev->layout ||
4028 mddev->new_chunk != mddev->chunk_size) {
4029 printk(KERN_ERR "raid5: %s: unsupported reshape "
4030 "required - aborting.\n",
4031 mdname(mddev));
4032 return -EINVAL;
4033 }
4034 if (mddev->delta_disks <= 0) {
4035 printk(KERN_ERR "raid5: %s: unsupported reshape "
4036 "(reduce disks) required - aborting.\n",
4037 mdname(mddev));
4038 return -EINVAL;
4039 }
4040 old_disks = mddev->raid_disks - mddev->delta_disks;
4041 /* reshape_position must be on a new-stripe boundary, and one
4042 * further up in new geometry must map after here in old
4043 * geometry.
4044 */
4045 here_new = mddev->reshape_position;
4046 if (sector_div(here_new, (mddev->chunk_size>>9)*
4047 (mddev->raid_disks - max_degraded))) {
4048 printk(KERN_ERR "raid5: reshape_position not "
4049 "on a stripe boundary\n");
4050 return -EINVAL;
4051 }
4052 /* here_new is the stripe we will write to */
4053 here_old = mddev->reshape_position;
4054 sector_div(here_old, (mddev->chunk_size>>9)*
4055 (old_disks-max_degraded));
4056 /* here_old is the first stripe that we might need to read
4057 * from */
4058 if (here_new >= here_old) {
4059 /* Reading from the same stripe as writing to - bad */
4060 printk(KERN_ERR "raid5: reshape_position too early for "
4061 "auto-recovery - aborting.\n");
4062 return -EINVAL;
4063 }
4064 printk(KERN_INFO "raid5: reshape will continue\n");
4065 /* OK, we should be able to continue; */
4066 } 4332 }
4067 4333
4334 if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE) {
4335 printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
4336 mddev->new_chunk, mdname(mddev));
4337 return ERR_PTR(-EINVAL);
4338 }
4068 4339
4069 mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL); 4340 conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL);
4070 if ((conf = mddev->private) == NULL) 4341 if (conf == NULL)
4071 goto abort; 4342 goto abort;
4072 if (mddev->reshape_position == MaxSector) { 4343
4073 conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks; 4344 conf->raid_disks = mddev->raid_disks;
4074 } else { 4345 if (mddev->reshape_position == MaxSector)
4075 conf->raid_disks = mddev->raid_disks; 4346 conf->previous_raid_disks = mddev->raid_disks;
4347 else
4076 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 4348 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
4077 }
4078 4349
4079 conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info), 4350 conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info),
4080 GFP_KERNEL); 4351 GFP_KERNEL);
@@ -4086,13 +4357,12 @@ static int run(mddev_t *mddev)
4086 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 4357 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
4087 goto abort; 4358 goto abort;
4088 4359
4089 if (mddev->level == 6) { 4360 if (mddev->new_level == 6) {
4090 conf->spare_page = alloc_page(GFP_KERNEL); 4361 conf->spare_page = alloc_page(GFP_KERNEL);
4091 if (!conf->spare_page) 4362 if (!conf->spare_page)
4092 goto abort; 4363 goto abort;
4093 } 4364 }
4094 spin_lock_init(&conf->device_lock); 4365 spin_lock_init(&conf->device_lock);
4095 mddev->queue->queue_lock = &conf->device_lock;
4096 init_waitqueue_head(&conf->wait_for_stripe); 4366 init_waitqueue_head(&conf->wait_for_stripe);
4097 init_waitqueue_head(&conf->wait_for_overlap); 4367 init_waitqueue_head(&conf->wait_for_overlap);
4098 INIT_LIST_HEAD(&conf->handle_list); 4368 INIT_LIST_HEAD(&conf->handle_list);
@@ -4121,47 +4391,134 @@ static int run(mddev_t *mddev)
4121 printk(KERN_INFO "raid5: device %s operational as raid" 4391 printk(KERN_INFO "raid5: device %s operational as raid"
4122 " disk %d\n", bdevname(rdev->bdev,b), 4392 " disk %d\n", bdevname(rdev->bdev,b),
4123 raid_disk); 4393 raid_disk);
4124 working_disks++;
4125 } else 4394 } else
4126 /* Cannot rely on bitmap to complete recovery */ 4395 /* Cannot rely on bitmap to complete recovery */
4127 conf->fullsync = 1; 4396 conf->fullsync = 1;
4128 } 4397 }
4129 4398
4130 /* 4399 conf->chunk_size = mddev->new_chunk;
4131 * 0 for a fully functional array, 1 or 2 for a degraded array. 4400 conf->level = mddev->new_level;
4132 */
4133 mddev->degraded = conf->raid_disks - working_disks;
4134 conf->mddev = mddev;
4135 conf->chunk_size = mddev->chunk_size;
4136 conf->level = mddev->level;
4137 if (conf->level == 6) 4401 if (conf->level == 6)
4138 conf->max_degraded = 2; 4402 conf->max_degraded = 2;
4139 else 4403 else
4140 conf->max_degraded = 1; 4404 conf->max_degraded = 1;
4141 conf->algorithm = mddev->layout; 4405 conf->algorithm = mddev->new_layout;
4142 conf->max_nr_stripes = NR_STRIPES; 4406 conf->max_nr_stripes = NR_STRIPES;
4143 conf->expand_progress = mddev->reshape_position; 4407 conf->reshape_progress = mddev->reshape_position;
4144 4408 if (conf->reshape_progress != MaxSector) {
4145 /* device size must be a multiple of chunk size */ 4409 conf->prev_chunk = mddev->chunk_size;
4146 mddev->size &= ~(mddev->chunk_size/1024 -1); 4410 conf->prev_algo = mddev->layout;
4147 mddev->resync_max_sectors = mddev->size << 1; 4411 }
4148 4412
4149 if (conf->level == 6 && conf->raid_disks < 4) { 4413 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
4150 printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", 4414 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
4151 mdname(mddev), conf->raid_disks); 4415 if (grow_stripes(conf, conf->max_nr_stripes)) {
4416 printk(KERN_ERR
4417 "raid5: couldn't allocate %dkB for buffers\n", memory);
4152 goto abort; 4418 goto abort;
4153 } 4419 } else
4154 if (!conf->chunk_size || conf->chunk_size % 4) { 4420 printk(KERN_INFO "raid5: allocated %dkB for %s\n",
4155 printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", 4421 memory, mdname(mddev));
4156 conf->chunk_size, mdname(mddev)); 4422
4423 conf->thread = md_register_thread(raid5d, mddev, "%s_raid5");
4424 if (!conf->thread) {
4425 printk(KERN_ERR
4426 "raid5: couldn't allocate thread for %s\n",
4427 mdname(mddev));
4157 goto abort; 4428 goto abort;
4158 } 4429 }
4159 if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { 4430
4160 printk(KERN_ERR 4431 return conf;
4161 "raid5: unsupported parity algorithm %d for %s\n", 4432
4162 conf->algorithm, mdname(mddev)); 4433 abort:
4163 goto abort; 4434 if (conf) {
4435 shrink_stripes(conf);
4436 safe_put_page(conf->spare_page);
4437 kfree(conf->disks);
4438 kfree(conf->stripe_hashtbl);
4439 kfree(conf);
4440 return ERR_PTR(-EIO);
4441 } else
4442 return ERR_PTR(-ENOMEM);
4443}
4444
4445static int run(mddev_t *mddev)
4446{
4447 raid5_conf_t *conf;
4448 int working_disks = 0;
4449 mdk_rdev_t *rdev;
4450
4451 if (mddev->reshape_position != MaxSector) {
4452 /* Check that we can continue the reshape.
4453 * Currently only disks can change, it must
4454 * increase, and we must be past the point where
4455 * a stripe over-writes itself
4456 */
4457 sector_t here_new, here_old;
4458 int old_disks;
4459 int max_degraded = (mddev->level == 6 ? 2 : 1);
4460
4461 if (mddev->new_level != mddev->level) {
4462 printk(KERN_ERR "raid5: %s: unsupported reshape "
4463 "required - aborting.\n",
4464 mdname(mddev));
4465 return -EINVAL;
4466 }
4467 old_disks = mddev->raid_disks - mddev->delta_disks;
4468 /* reshape_position must be on a new-stripe boundary, and one
4469 * further up in new geometry must map after here in old
4470 * geometry.
4471 */
4472 here_new = mddev->reshape_position;
4473 if (sector_div(here_new, (mddev->new_chunk>>9)*
4474 (mddev->raid_disks - max_degraded))) {
4475 printk(KERN_ERR "raid5: reshape_position not "
4476 "on a stripe boundary\n");
4477 return -EINVAL;
4478 }
4479 /* here_new is the stripe we will write to */
4480 here_old = mddev->reshape_position;
4481 sector_div(here_old, (mddev->chunk_size>>9)*
4482 (old_disks-max_degraded));
4483 /* here_old is the first stripe that we might need to read
4484 * from */
4485 if (here_new >= here_old) {
4486 /* Reading from the same stripe as writing to - bad */
4487 printk(KERN_ERR "raid5: reshape_position too early for "
4488 "auto-recovery - aborting.\n");
4489 return -EINVAL;
4490 }
4491 printk(KERN_INFO "raid5: reshape will continue\n");
4492 /* OK, we should be able to continue; */
4493 } else {
4494 BUG_ON(mddev->level != mddev->new_level);
4495 BUG_ON(mddev->layout != mddev->new_layout);
4496 BUG_ON(mddev->chunk_size != mddev->new_chunk);
4497 BUG_ON(mddev->delta_disks != 0);
4164 } 4498 }
4499
4500 if (mddev->private == NULL)
4501 conf = setup_conf(mddev);
4502 else
4503 conf = mddev->private;
4504
4505 if (IS_ERR(conf))
4506 return PTR_ERR(conf);
4507
4508 mddev->thread = conf->thread;
4509 conf->thread = NULL;
4510 mddev->private = conf;
4511
4512 /*
4513 * 0 for a fully functional array, 1 or 2 for a degraded array.
4514 */
4515 list_for_each_entry(rdev, &mddev->disks, same_set)
4516 if (rdev->raid_disk >= 0 &&
4517 test_bit(In_sync, &rdev->flags))
4518 working_disks++;
4519
4520 mddev->degraded = conf->raid_disks - working_disks;
4521
4165 if (mddev->degraded > conf->max_degraded) { 4522 if (mddev->degraded > conf->max_degraded) {
4166 printk(KERN_ERR "raid5: not enough operational devices for %s" 4523 printk(KERN_ERR "raid5: not enough operational devices for %s"
4167 " (%d/%d failed)\n", 4524 " (%d/%d failed)\n",
@@ -4169,6 +4526,10 @@ static int run(mddev_t *mddev)
4169 goto abort; 4526 goto abort;
4170 } 4527 }
4171 4528
4529 /* device size must be a multiple of chunk size */
4530 mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1);
4531 mddev->resync_max_sectors = mddev->dev_sectors;
4532
4172 if (mddev->degraded > 0 && 4533 if (mddev->degraded > 0 &&
4173 mddev->recovery_cp != MaxSector) { 4534 mddev->recovery_cp != MaxSector) {
4174 if (mddev->ok_start_degraded) 4535 if (mddev->ok_start_degraded)
@@ -4184,43 +4545,22 @@ static int run(mddev_t *mddev)
4184 } 4545 }
4185 } 4546 }
4186 4547
4187 {
4188 mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5");
4189 if (!mddev->thread) {
4190 printk(KERN_ERR
4191 "raid5: couldn't allocate thread for %s\n",
4192 mdname(mddev));
4193 goto abort;
4194 }
4195 }
4196 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
4197 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
4198 if (grow_stripes(conf, conf->max_nr_stripes)) {
4199 printk(KERN_ERR
4200 "raid5: couldn't allocate %dkB for buffers\n", memory);
4201 shrink_stripes(conf);
4202 md_unregister_thread(mddev->thread);
4203 goto abort;
4204 } else
4205 printk(KERN_INFO "raid5: allocated %dkB for %s\n",
4206 memory, mdname(mddev));
4207
4208 if (mddev->degraded == 0) 4548 if (mddev->degraded == 0)
4209 printk("raid5: raid level %d set %s active with %d out of %d" 4549 printk("raid5: raid level %d set %s active with %d out of %d"
4210 " devices, algorithm %d\n", conf->level, mdname(mddev), 4550 " devices, algorithm %d\n", conf->level, mdname(mddev),
4211 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 4551 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
4212 conf->algorithm); 4552 mddev->new_layout);
4213 else 4553 else
4214 printk(KERN_ALERT "raid5: raid level %d set %s active with %d" 4554 printk(KERN_ALERT "raid5: raid level %d set %s active with %d"
4215 " out of %d devices, algorithm %d\n", conf->level, 4555 " out of %d devices, algorithm %d\n", conf->level,
4216 mdname(mddev), mddev->raid_disks - mddev->degraded, 4556 mdname(mddev), mddev->raid_disks - mddev->degraded,
4217 mddev->raid_disks, conf->algorithm); 4557 mddev->raid_disks, mddev->new_layout);
4218 4558
4219 print_raid5_conf(conf); 4559 print_raid5_conf(conf);
4220 4560
4221 if (conf->expand_progress != MaxSector) { 4561 if (conf->reshape_progress != MaxSector) {
4222 printk("...ok start reshape thread\n"); 4562 printk("...ok start reshape thread\n");
4223 conf->expand_lo = conf->expand_progress; 4563 conf->reshape_safe = conf->reshape_progress;
4224 atomic_set(&conf->reshape_stripes, 0); 4564 atomic_set(&conf->reshape_stripes, 0);
4225 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4565 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4226 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4566 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -4247,18 +4587,22 @@ static int run(mddev_t *mddev)
4247 "raid5: failed to create sysfs attributes for %s\n", 4587 "raid5: failed to create sysfs attributes for %s\n",
4248 mdname(mddev)); 4588 mdname(mddev));
4249 4589
4590 mddev->queue->queue_lock = &conf->device_lock;
4591
4250 mddev->queue->unplug_fn = raid5_unplug_device; 4592 mddev->queue->unplug_fn = raid5_unplug_device;
4251 mddev->queue->backing_dev_info.congested_data = mddev; 4593 mddev->queue->backing_dev_info.congested_data = mddev;
4252 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 4594 mddev->queue->backing_dev_info.congested_fn = raid5_congested;
4253 4595
4254 mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks - 4596 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
4255 conf->max_degraded);
4256 4597
4257 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 4598 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
4258 4599
4259 return 0; 4600 return 0;
4260abort: 4601abort:
4602 md_unregister_thread(mddev->thread);
4603 mddev->thread = NULL;
4261 if (conf) { 4604 if (conf) {
4605 shrink_stripes(conf);
4262 print_raid5_conf(conf); 4606 print_raid5_conf(conf);
4263 safe_put_page(conf->spare_page); 4607 safe_put_page(conf->spare_page);
4264 kfree(conf->disks); 4608 kfree(conf->disks);
@@ -4396,6 +4740,10 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
4396 print_raid5_conf(conf); 4740 print_raid5_conf(conf);
4397 rdev = p->rdev; 4741 rdev = p->rdev;
4398 if (rdev) { 4742 if (rdev) {
4743 if (number >= conf->raid_disks &&
4744 conf->reshape_progress == MaxSector)
4745 clear_bit(In_sync, &rdev->flags);
4746
4399 if (test_bit(In_sync, &rdev->flags) || 4747 if (test_bit(In_sync, &rdev->flags) ||
4400 atomic_read(&rdev->nr_pending)) { 4748 atomic_read(&rdev->nr_pending)) {
4401 err = -EBUSY; 4749 err = -EBUSY;
@@ -4405,7 +4753,8 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
4405 * isn't possible. 4753 * isn't possible.
4406 */ 4754 */
4407 if (!test_bit(Faulty, &rdev->flags) && 4755 if (!test_bit(Faulty, &rdev->flags) &&
4408 mddev->degraded <= conf->max_degraded) { 4756 mddev->degraded <= conf->max_degraded &&
4757 number < conf->raid_disks) {
4409 err = -EBUSY; 4758 err = -EBUSY;
4410 goto abort; 4759 goto abort;
4411 } 4760 }
@@ -4472,36 +4821,48 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
4472 * any io in the removed space completes, but it hardly seems 4821 * any io in the removed space completes, but it hardly seems
4473 * worth it. 4822 * worth it.
4474 */ 4823 */
4475 raid5_conf_t *conf = mddev_to_conf(mddev);
4476
4477 sectors &= ~((sector_t)mddev->chunk_size/512 - 1); 4824 sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
4478 mddev->array_sectors = sectors * (mddev->raid_disks 4825 md_set_array_sectors(mddev, raid5_size(mddev, sectors,
4479 - conf->max_degraded); 4826 mddev->raid_disks));
4827 if (mddev->array_sectors >
4828 raid5_size(mddev, sectors, mddev->raid_disks))
4829 return -EINVAL;
4480 set_capacity(mddev->gendisk, mddev->array_sectors); 4830 set_capacity(mddev->gendisk, mddev->array_sectors);
4481 mddev->changed = 1; 4831 mddev->changed = 1;
4482 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { 4832 if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
4483 mddev->recovery_cp = mddev->size << 1; 4833 mddev->recovery_cp = mddev->dev_sectors;
4484 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4834 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4485 } 4835 }
4486 mddev->size = sectors /2; 4836 mddev->dev_sectors = sectors;
4487 mddev->resync_max_sectors = sectors; 4837 mddev->resync_max_sectors = sectors;
4488 return 0; 4838 return 0;
4489} 4839}
4490 4840
4491#ifdef CONFIG_MD_RAID5_RESHAPE
4492static int raid5_check_reshape(mddev_t *mddev) 4841static int raid5_check_reshape(mddev_t *mddev)
4493{ 4842{
4494 raid5_conf_t *conf = mddev_to_conf(mddev); 4843 raid5_conf_t *conf = mddev_to_conf(mddev);
4495 int err;
4496 4844
4497 if (mddev->delta_disks < 0 || 4845 if (mddev->delta_disks == 0 &&
4498 mddev->new_level != mddev->level) 4846 mddev->new_layout == mddev->layout &&
4499 return -EINVAL; /* Cannot shrink array or change level yet */ 4847 mddev->new_chunk == mddev->chunk_size)
4500 if (mddev->delta_disks == 0) 4848 return -EINVAL; /* nothing to do */
4501 return 0; /* nothing to do */
4502 if (mddev->bitmap) 4849 if (mddev->bitmap)
4503 /* Cannot grow a bitmap yet */ 4850 /* Cannot grow a bitmap yet */
4504 return -EBUSY; 4851 return -EBUSY;
4852 if (mddev->degraded > conf->max_degraded)
4853 return -EINVAL;
4854 if (mddev->delta_disks < 0) {
4855 /* We might be able to shrink, but the devices must
4856 * be made bigger first.
4857 * For raid6, 4 is the minimum size.
4858 * Otherwise 2 is the minimum
4859 */
4860 int min = 2;
4861 if (mddev->level == 6)
4862 min = 4;
4863 if (mddev->raid_disks + mddev->delta_disks < min)
4864 return -EINVAL;
4865 }
4505 4866
4506 /* Can only proceed if there are plenty of stripe_heads. 4867 /* Can only proceed if there are plenty of stripe_heads.
4507 * We need a minimum of one full stripe, and for sensible progress 4868 * We need a minimum of one full stripe, and for sensible progress
@@ -4514,18 +4875,12 @@ static int raid5_check_reshape(mddev_t *mddev)
4514 if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes || 4875 if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes ||
4515 (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { 4876 (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
4516 printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", 4877 printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n",
4517 (mddev->chunk_size / STRIPE_SIZE)*4); 4878 (max(mddev->chunk_size, mddev->new_chunk)
4879 / STRIPE_SIZE)*4);
4518 return -ENOSPC; 4880 return -ENOSPC;
4519 } 4881 }
4520 4882
4521 err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks); 4883 return resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
4522 if (err)
4523 return err;
4524
4525 if (mddev->degraded > conf->max_degraded)
4526 return -EINVAL;
4527 /* looks like we might be able to manage this */
4528 return 0;
4529} 4884}
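
The shrink branch added above only checks that enough members remain after the change; a standalone sketch of that rule (4 devices minimum for raid6, 2 otherwise):

    #include <stdio.h>

    static int shrink_allowed(int level, int raid_disks, int delta_disks)
    {
        int min = (level == 6) ? 4 : 2;
        return raid_disks + delta_disks >= min;
    }

    int main(void)
    {
        printf("%d\n", shrink_allowed(6, 5, -1)); /* 1: four members left */
        printf("%d\n", shrink_allowed(6, 5, -2)); /* 0: raid6 needs at least four */
        printf("%d\n", shrink_allowed(5, 3, -1)); /* 1: two members left */
        return 0;
    }
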
4530 4885
4531static int raid5_start_reshape(mddev_t *mddev) 4886static int raid5_start_reshape(mddev_t *mddev)
@@ -4550,12 +4905,31 @@ static int raid5_start_reshape(mddev_t *mddev)
4550 */ 4905 */
4551 return -EINVAL; 4906 return -EINVAL;
4552 4907
4908 /* Refuse to reduce size of the array. Any reductions in
4909 * array size must be through explicit setting of array_size
4910 * attribute.
4911 */
4912 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
4913 < mddev->array_sectors) {
4914 printk(KERN_ERR "md: %s: array size must be reduced "
4915 "before number of disks\n", mdname(mddev));
4916 return -EINVAL;
4917 }
4918
4553 atomic_set(&conf->reshape_stripes, 0); 4919 atomic_set(&conf->reshape_stripes, 0);
4554 spin_lock_irq(&conf->device_lock); 4920 spin_lock_irq(&conf->device_lock);
4555 conf->previous_raid_disks = conf->raid_disks; 4921 conf->previous_raid_disks = conf->raid_disks;
4556 conf->raid_disks += mddev->delta_disks; 4922 conf->raid_disks += mddev->delta_disks;
4557 conf->expand_progress = 0; 4923 conf->prev_chunk = conf->chunk_size;
4558 conf->expand_lo = 0; 4924 conf->chunk_size = mddev->new_chunk;
4925 conf->prev_algo = conf->algorithm;
4926 conf->algorithm = mddev->new_layout;
4927 if (mddev->delta_disks < 0)
4928 conf->reshape_progress = raid5_size(mddev, 0, 0);
4929 else
4930 conf->reshape_progress = 0;
4931 conf->reshape_safe = conf->reshape_progress;
4932 conf->generation++;
4559 spin_unlock_irq(&conf->device_lock); 4933 spin_unlock_irq(&conf->device_lock);
4560 4934
4561 /* Add some new drives, as many as will fit. 4935 /* Add some new drives, as many as will fit.
@@ -4580,9 +4954,12 @@ static int raid5_start_reshape(mddev_t *mddev)
4580 break; 4954 break;
4581 } 4955 }
4582 4956
4583 spin_lock_irqsave(&conf->device_lock, flags); 4957 if (mddev->delta_disks > 0) {
4584 mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices; 4958 spin_lock_irqsave(&conf->device_lock, flags);
4585 spin_unlock_irqrestore(&conf->device_lock, flags); 4959 mddev->degraded = (conf->raid_disks - conf->previous_raid_disks)
4960 - added_devices;
4961 spin_unlock_irqrestore(&conf->device_lock, flags);
4962 }
4586 mddev->raid_disks = conf->raid_disks; 4963 mddev->raid_disks = conf->raid_disks;
4587 mddev->reshape_position = 0; 4964 mddev->reshape_position = 0;
4588 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4965 set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -4597,52 +4974,86 @@ static int raid5_start_reshape(mddev_t *mddev)
4597 mddev->recovery = 0; 4974 mddev->recovery = 0;
4598 spin_lock_irq(&conf->device_lock); 4975 spin_lock_irq(&conf->device_lock);
4599 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 4976 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
4600 conf->expand_progress = MaxSector; 4977 conf->reshape_progress = MaxSector;
4601 spin_unlock_irq(&conf->device_lock); 4978 spin_unlock_irq(&conf->device_lock);
4602 return -EAGAIN; 4979 return -EAGAIN;
4603 } 4980 }
4981 conf->reshape_checkpoint = jiffies;
4604 md_wakeup_thread(mddev->sync_thread); 4982 md_wakeup_thread(mddev->sync_thread);
4605 md_new_event(mddev); 4983 md_new_event(mddev);
4606 return 0; 4984 return 0;
4607} 4985}
4608#endif
4609 4986
4987/* This is called from the reshape thread and should make any
4988 * changes needed in 'conf'
4989 */
4610static void end_reshape(raid5_conf_t *conf) 4990static void end_reshape(raid5_conf_t *conf)
4611{ 4991{
4612 struct block_device *bdev;
4613 4992
4614 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 4993 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
4615 conf->mddev->array_sectors = 2 * conf->mddev->size * 4994
4616 (conf->raid_disks - conf->max_degraded);
4617 set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors);
4618 conf->mddev->changed = 1;
4619
4620 bdev = bdget_disk(conf->mddev->gendisk, 0);
4621 if (bdev) {
4622 mutex_lock(&bdev->bd_inode->i_mutex);
4623 i_size_write(bdev->bd_inode,
4624 (loff_t)conf->mddev->array_sectors << 9);
4625 mutex_unlock(&bdev->bd_inode->i_mutex);
4626 bdput(bdev);
4627 }
4628 spin_lock_irq(&conf->device_lock); 4995 spin_lock_irq(&conf->device_lock);
4629 conf->expand_progress = MaxSector; 4996 conf->previous_raid_disks = conf->raid_disks;
4997 conf->reshape_progress = MaxSector;
4630 spin_unlock_irq(&conf->device_lock); 4998 spin_unlock_irq(&conf->device_lock);
4631 conf->mddev->reshape_position = MaxSector; 4999 wake_up(&conf->wait_for_overlap);
4632 5000
4633 /* read-ahead size must cover two whole stripes, which is 5001 /* read-ahead size must cover two whole stripes, which is
4634 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 5002 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
4635 */ 5003 */
4636 { 5004 {
4637 int data_disks = conf->previous_raid_disks - conf->max_degraded; 5005 int data_disks = conf->raid_disks - conf->max_degraded;
4638 int stripe = data_disks * 5006 int stripe = data_disks * (conf->chunk_size
4639 (conf->mddev->chunk_size / PAGE_SIZE); 5007 / PAGE_SIZE);
4640 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5008 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4641 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5009 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4642 } 5010 }
4643 } 5011 }
4644} 5012}
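
The read-ahead adjustment at the end of end_reshape() sizes the queue for two full stripes of the new geometry; a worked example for an 8-disk raid6 with a 256KiB chunk (page size 4096 assumed for the example):

    #include <stdio.h>

    int main(void)
    {
        int raid_disks = 8, max_degraded = 2;          /* raid6 */
        int data_disks = raid_disks - max_degraded;
        unsigned chunk_size = 256 * 1024;              /* bytes */
        unsigned page_size  = 4096;

        unsigned stripe_pages = data_disks * (chunk_size / page_size);
        unsigned ra_pages = 2 * stripe_pages;          /* two whole stripes */

        printf("%u pages (%u KiB)\n", ra_pages, ra_pages * page_size / 1024);
        return 0;
    }
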
4645 5013
5014/* This is called from the raid5d thread with mddev_lock held.
5015 * It makes config changes to the device.
5016 */
5017static void raid5_finish_reshape(mddev_t *mddev)
5018{
5019 struct block_device *bdev;
5020 raid5_conf_t *conf = mddev_to_conf(mddev);
5021
5022 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5023
5024 if (mddev->delta_disks > 0) {
5025 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
5026 set_capacity(mddev->gendisk, mddev->array_sectors);
5027 mddev->changed = 1;
5028
5029 bdev = bdget_disk(mddev->gendisk, 0);
5030 if (bdev) {
5031 mutex_lock(&bdev->bd_inode->i_mutex);
5032 i_size_write(bdev->bd_inode,
5033 (loff_t)mddev->array_sectors << 9);
5034 mutex_unlock(&bdev->bd_inode->i_mutex);
5035 bdput(bdev);
5036 }
5037 } else {
5038 int d;
5039 mddev->degraded = conf->raid_disks;
5040 for (d = 0; d < conf->raid_disks ; d++)
5041 if (conf->disks[d].rdev &&
5042 test_bit(In_sync,
5043 &conf->disks[d].rdev->flags))
5044 mddev->degraded--;
5045 for (d = conf->raid_disks ;
5046 d < conf->raid_disks - mddev->delta_disks;
5047 d++)
5048 raid5_remove_disk(mddev, d);
5049 }
5050 mddev->layout = conf->algorithm;
5051 mddev->chunk_size = conf->chunk_size;
5052 mddev->reshape_position = MaxSector;
5053 mddev->delta_disks = 0;
5054 }
5055}
5056
4646static void raid5_quiesce(mddev_t *mddev, int state) 5057static void raid5_quiesce(mddev_t *mddev, int state)
4647{ 5058{
4648 raid5_conf_t *conf = mddev_to_conf(mddev); 5059 raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -4672,6 +5083,212 @@ static void raid5_quiesce(mddev_t *mddev, int state)
4672 } 5083 }
4673} 5084}
4674 5085
5086
5087static void *raid5_takeover_raid1(mddev_t *mddev)
5088{
5089 int chunksect;
5090
5091 if (mddev->raid_disks != 2 ||
5092 mddev->degraded > 1)
5093 return ERR_PTR(-EINVAL);
5094
5095 /* Should check if there are write-behind devices? */
5096
5097 chunksect = 64*2; /* 64K by default */
5098
5099 /* The array must be an exact multiple of chunksize */
5100 while (chunksect && (mddev->array_sectors & (chunksect-1)))
5101 chunksect >>= 1;
5102
5103 if ((chunksect<<9) < STRIPE_SIZE)
5104 /* array size does not allow a suitable chunk size */
5105 return ERR_PTR(-EINVAL);
5106
5107 mddev->new_level = 5;
5108 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
5109 mddev->new_chunk = chunksect << 9;
5110
5111 return setup_conf(mddev);
5112}
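
The loop above starts from a 64KiB chunk and halves it until the raid1 array size is an exact multiple; if the result drops below STRIPE_SIZE the takeover is refused. A standalone sketch of the search (sector counts invented):

    #include <stdio.h>

    typedef unsigned long long sector_t;

    static int pick_chunksect(sector_t array_sectors)
    {
        int chunksect = 64 * 2;                         /* 64KiB in 512-byte sectors */
        while (chunksect && (array_sectors & (chunksect - 1)))
            chunksect >>= 1;                            /* halve until it divides evenly */
        return chunksect;
    }

    int main(void)
    {
        printf("%d\n", pick_chunksect(1048576));        /* 128: 64KiB fits exactly */
        printf("%d\n", pick_chunksect(1048576 + 32));   /* 32: falls back to 16KiB */
        return 0;
    }
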
5113
5114static void *raid5_takeover_raid6(mddev_t *mddev)
5115{
5116 int new_layout;
5117
5118 switch (mddev->layout) {
5119 case ALGORITHM_LEFT_ASYMMETRIC_6:
5120 new_layout = ALGORITHM_LEFT_ASYMMETRIC;
5121 break;
5122 case ALGORITHM_RIGHT_ASYMMETRIC_6:
5123 new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
5124 break;
5125 case ALGORITHM_LEFT_SYMMETRIC_6:
5126 new_layout = ALGORITHM_LEFT_SYMMETRIC;
5127 break;
5128 case ALGORITHM_RIGHT_SYMMETRIC_6:
5129 new_layout = ALGORITHM_RIGHT_SYMMETRIC;
5130 break;
5131 case ALGORITHM_PARITY_0_6:
5132 new_layout = ALGORITHM_PARITY_0;
5133 break;
5134 case ALGORITHM_PARITY_N:
5135 new_layout = ALGORITHM_PARITY_N;
5136 break;
5137 default:
5138 return ERR_PTR(-EINVAL);
5139 }
5140 mddev->new_level = 5;
5141 mddev->new_layout = new_layout;
5142 mddev->delta_disks = -1;
5143 mddev->raid_disks -= 1;
5144 return setup_conf(mddev);
5145}
5146
5147
5148static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
5149{
5150 /* For a 2-drive array, the layout and chunk size can be changed
5151 * immediately as no restriping is needed.
5152 * For larger arrays we record the new value - after validation
5153 * to be used by a reshape pass.
5154 */
5155 raid5_conf_t *conf = mddev_to_conf(mddev);
5156
5157 if (new_layout >= 0 && !algorithm_valid_raid5(new_layout))
5158 return -EINVAL;
5159 if (new_chunk > 0) {
5160 if (new_chunk & (new_chunk-1))
5161 /* not a power of 2 */
5162 return -EINVAL;
5163 if (new_chunk < PAGE_SIZE)
5164 return -EINVAL;
5165 if (mddev->array_sectors & ((new_chunk>>9)-1))
5166 /* not factor of array size */
5167 return -EINVAL;
5168 }
5169
5170 /* They look valid */
5171
5172 if (mddev->raid_disks == 2) {
5173
5174 if (new_layout >= 0) {
5175 conf->algorithm = new_layout;
5176 mddev->layout = mddev->new_layout = new_layout;
5177 }
5178 if (new_chunk > 0) {
5179 conf->chunk_size = new_chunk;
5180 mddev->chunk_size = mddev->new_chunk = new_chunk;
5181 }
5182 set_bit(MD_CHANGE_DEVS, &mddev->flags);
5183 md_wakeup_thread(mddev->thread);
5184 } else {
5185 if (new_layout >= 0)
5186 mddev->new_layout = new_layout;
5187 if (new_chunk > 0)
5188 mddev->new_chunk = new_chunk;
5189 }
5190 return 0;
5191}
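
Both reconfig helpers apply the same three checks to a requested chunk size: power of two, no smaller than a page, and an exact divisor of the array size. A standalone sketch of that validation (page size fixed at 4096 for the example):

    #include <stdio.h>

    typedef unsigned long long sector_t;

    #define EXAMPLE_PAGE_SIZE 4096

    static int chunk_ok(int new_chunk, sector_t array_sectors)
    {
        if (new_chunk & (new_chunk - 1))
            return 0;                          /* not a power of two */
        if (new_chunk < EXAMPLE_PAGE_SIZE)
            return 0;                          /* smaller than a page */
        if (array_sectors & ((new_chunk >> 9) - 1))
            return 0;                          /* not a factor of the array size */
        return 1;
    }

    int main(void)
    {
        printf("%d\n", chunk_ok(131072, 1048576));     /* 1: 128KiB divides the array */
        printf("%d\n", chunk_ok(96 * 1024, 1048576));  /* 0: 96KiB is not a power of two */
        return 0;
    }
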
5192
5193static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
5194{
5195 if (new_layout >= 0 && !algorithm_valid_raid6(new_layout))
5196 return -EINVAL;
5197 if (new_chunk > 0) {
5198 if (new_chunk & (new_chunk-1))
5199 /* not a power of 2 */
5200 return -EINVAL;
5201 if (new_chunk < PAGE_SIZE)
5202 return -EINVAL;
5203 if (mddev->array_sectors & ((new_chunk>>9)-1))
5204 /* not factor of array size */
5205 return -EINVAL;
5206 }
5207
5208 /* They look valid */
5209
5210 if (new_layout >= 0)
5211 mddev->new_layout = new_layout;
5212 if (new_chunk > 0)
5213 mddev->new_chunk = new_chunk;
5214
5215 return 0;
5216}
5217
5218static void *raid5_takeover(mddev_t *mddev)
5219{
5220 /* raid5 can take over:
5221 * raid0 - if all devices are the same - make it a raid4 layout
5222 * raid1 - if there are two drives. We need to know the chunk size
5223 * raid4 - trivial - just use a raid4 layout.
5224 * raid6 - Providing it is a *_6 layout
5225 *
5226 * For now, just do raid1
5227 */
5228
5229 if (mddev->level == 1)
5230 return raid5_takeover_raid1(mddev);
5231 if (mddev->level == 4) {
5232 mddev->new_layout = ALGORITHM_PARITY_N;
5233 mddev->new_level = 5;
5234 return setup_conf(mddev);
5235 }
5236 if (mddev->level == 6)
5237 return raid5_takeover_raid6(mddev);
5238
5239 return ERR_PTR(-EINVAL);
5240}
5241
5242
5243static struct mdk_personality raid5_personality;
5244
5245static void *raid6_takeover(mddev_t *mddev)
5246{
5247 /* Currently can only take over a raid5. We map the
5248 * personality to an equivalent raid6 personality
5249 * with the Q block at the end.
5250 */
5251 int new_layout;
5252
5253 if (mddev->pers != &raid5_personality)
5254 return ERR_PTR(-EINVAL);
5255 if (mddev->degraded > 1)
5256 return ERR_PTR(-EINVAL);
5257 if (mddev->raid_disks > 253)
5258 return ERR_PTR(-EINVAL);
5259 if (mddev->raid_disks < 3)
5260 return ERR_PTR(-EINVAL);
5261
5262 switch (mddev->layout) {
5263 case ALGORITHM_LEFT_ASYMMETRIC:
5264 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
5265 break;
5266 case ALGORITHM_RIGHT_ASYMMETRIC:
5267 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
5268 break;
5269 case ALGORITHM_LEFT_SYMMETRIC:
5270 new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
5271 break;
5272 case ALGORITHM_RIGHT_SYMMETRIC:
5273 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
5274 break;
5275 case ALGORITHM_PARITY_0:
5276 new_layout = ALGORITHM_PARITY_0_6;
5277 break;
5278 case ALGORITHM_PARITY_N:
5279 new_layout = ALGORITHM_PARITY_N;
5280 break;
5281 default:
5282 return ERR_PTR(-EINVAL);
5283 }
5284 mddev->new_level = 6;
5285 mddev->new_layout = new_layout;
5286 mddev->delta_disks = 1;
5287 mddev->raid_disks += 1;
5288 return setup_conf(mddev);
5289}
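
The *_6 layouts used above are the raid5 rotations re-expressed as raid6 with Q pinned to the last device, which is what lets a raid5 array become raid6 by adding exactly one disk and without moving any existing data or parity. A minimal illustration of the bookkeeping for one case (the numeric layout values are only illustrative stand-ins for the ALGORITHM_* constants):

    #include <stdio.h>

    enum { LEFT_SYMMETRIC = 2, LEFT_SYMMETRIC_6 = 18 };   /* illustrative values */

    int main(void)
    {
        int raid_disks = 4, layout = LEFT_SYMMETRIC;      /* existing raid5 */
        int new_layout = LEFT_SYMMETRIC_6;                /* same rotation, Q on last disk */
        int delta_disks = 1;                              /* the new Q device */

        printf("raid5(layout %d, %d disks) -> raid6(layout %d, %d disks)\n",
               layout, raid_disks, new_layout, raid_disks + delta_disks);
        return 0;
    }
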
5290
5291
4675static struct mdk_personality raid6_personality = 5292static struct mdk_personality raid6_personality =
4676{ 5293{
4677 .name = "raid6", 5294 .name = "raid6",
@@ -4687,11 +5304,13 @@ static struct mdk_personality raid6_personality =
4687 .spare_active = raid5_spare_active, 5304 .spare_active = raid5_spare_active,
4688 .sync_request = sync_request, 5305 .sync_request = sync_request,
4689 .resize = raid5_resize, 5306 .resize = raid5_resize,
4690#ifdef CONFIG_MD_RAID5_RESHAPE 5307 .size = raid5_size,
4691 .check_reshape = raid5_check_reshape, 5308 .check_reshape = raid5_check_reshape,
4692 .start_reshape = raid5_start_reshape, 5309 .start_reshape = raid5_start_reshape,
4693#endif 5310 .finish_reshape = raid5_finish_reshape,
4694 .quiesce = raid5_quiesce, 5311 .quiesce = raid5_quiesce,
5312 .takeover = raid6_takeover,
5313 .reconfig = raid6_reconfig,
4695}; 5314};
4696static struct mdk_personality raid5_personality = 5315static struct mdk_personality raid5_personality =
4697{ 5316{
@@ -4708,11 +5327,13 @@ static struct mdk_personality raid5_personality =
4708 .spare_active = raid5_spare_active, 5327 .spare_active = raid5_spare_active,
4709 .sync_request = sync_request, 5328 .sync_request = sync_request,
4710 .resize = raid5_resize, 5329 .resize = raid5_resize,
4711#ifdef CONFIG_MD_RAID5_RESHAPE 5330 .size = raid5_size,
4712 .check_reshape = raid5_check_reshape, 5331 .check_reshape = raid5_check_reshape,
4713 .start_reshape = raid5_start_reshape, 5332 .start_reshape = raid5_start_reshape,
4714#endif 5333 .finish_reshape = raid5_finish_reshape,
4715 .quiesce = raid5_quiesce, 5334 .quiesce = raid5_quiesce,
5335 .takeover = raid5_takeover,
5336 .reconfig = raid5_reconfig,
4716}; 5337};
4717 5338
4718static struct mdk_personality raid4_personality = 5339static struct mdk_personality raid4_personality =
@@ -4730,20 +5351,15 @@ static struct mdk_personality raid4_personality =
4730 .spare_active = raid5_spare_active, 5351 .spare_active = raid5_spare_active,
4731 .sync_request = sync_request, 5352 .sync_request = sync_request,
4732 .resize = raid5_resize, 5353 .resize = raid5_resize,
4733#ifdef CONFIG_MD_RAID5_RESHAPE 5354 .size = raid5_size,
4734 .check_reshape = raid5_check_reshape, 5355 .check_reshape = raid5_check_reshape,
4735 .start_reshape = raid5_start_reshape, 5356 .start_reshape = raid5_start_reshape,
4736#endif 5357 .finish_reshape = raid5_finish_reshape,
4737 .quiesce = raid5_quiesce, 5358 .quiesce = raid5_quiesce,
4738}; 5359};
4739 5360
4740static int __init raid5_init(void) 5361static int __init raid5_init(void)
4741{ 5362{
4742 int e;
4743
4744 e = raid6_select_algo();
4745 if ( e )
4746 return e;
4747 register_md_personality(&raid6_personality); 5363 register_md_personality(&raid6_personality);
4748 register_md_personality(&raid5_personality); 5364 register_md_personality(&raid5_personality);
4749 register_md_personality(&raid4_personality); 5365 register_md_personality(&raid4_personality);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
new file mode 100644
index 000000000000..52ba99954dec
--- /dev/null
+++ b/drivers/md/raid5.h
@@ -0,0 +1,474 @@
1#ifndef _RAID5_H
2#define _RAID5_H
3
4#include <linux/raid/xor.h>
5
6/*
7 *
8 * Each stripe contains one buffer per disc. Each buffer can be in
9 * one of a number of states stored in "flags". Changes between
10 * these states happen *almost* exclusively under a per-stripe
11 * spinlock. Some very specific changes can happen in bi_end_io, and
12 * these are not protected by the spin lock.
13 *
14 * The flag bits that are used to represent these states are:
15 * R5_UPTODATE and R5_LOCKED
16 *
17 * State Empty == !UPTODATE, !LOCK
18 * We have no data, and there is no active request
19 * State Want == !UPTODATE, LOCK
20 * A read request is being submitted for this block
21 * State Dirty == UPTODATE, LOCK
22 * Some new data is in this buffer, and it is being written out
23 * State Clean == UPTODATE, !LOCK
24 * We have valid data which is the same as on disc
25 *
26 * The possible state transitions are:
27 *
28 * Empty -> Want - on read or write to get old data for parity calc
29 * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE)
30 * Empty -> Clean - on compute_block when computing a block for failed drive
31 * Want -> Empty - on failed read
32 * Want -> Clean - on successful completion of read request
33 * Dirty -> Clean - on successful completion of write request
34 * Dirty -> Clean - on failed write
35 * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW)
36 *
 37 * The Want->Empty, Want->Clean and Dirty->Clean transitions
38 * all happen in b_end_io at interrupt time.
39 * Each sets the Uptodate bit before releasing the Lock bit.
40 * This leaves one multi-stage transition:
41 * Want->Dirty->Clean
42 * This is safe because thinking that a Clean buffer is actually dirty
43 * will at worst delay some action, and the stripe will be scheduled
44 * for attention after the transition is complete.
45 *
46 * There is one possibility that is not covered by these states. That
47 * is if one drive has failed and there is a spare being rebuilt. We
48 * can't distinguish between a clean block that has been generated
49 * from parity calculations, and a clean block that has been
50 * successfully written to the spare ( or to parity when resyncing).
 51 * To distinguish these states we have a stripe bit STRIPE_INSYNC that
52 * is set whenever a write is scheduled to the spare, or to the parity
53 * disc if there is no spare. A sync request clears this bit, and
54 * when we find it set with no buffers locked, we know the sync is
55 * complete.
56 *
57 * Buffers for the md device that arrive via make_request are attached
58 * to the appropriate stripe in one of two lists linked on b_reqnext.
59 * One list (bh_read) for read requests, one (bh_write) for write.
60 * There should never be more than one buffer on the two lists
 61 * together, but this is not guaranteed, so we allow for more.
62 *
63 * If a buffer is on the read list when the associated cache buffer is
 64 * Uptodate, the data is copied into the read buffer and its b_end_io
65 * routine is called. This may happen in the end_request routine only
66 * if the buffer has just successfully been read. end_request should
67 * remove the buffers from the list and then set the Uptodate bit on
68 * the buffer. Other threads may do this only if they first check
69 * that the Uptodate bit is set. Once they have checked that they may
70 * take buffers off the read queue.
71 *
72 * When a buffer on the write list is committed for write it is copied
73 * into the cache buffer, which is then marked dirty, and moved onto a
74 * third list, the written list (bh_written). Once both the parity
75 * block and the cached buffer are successfully written, any buffer on
76 * a written list can be returned with b_end_io.
77 *
78 * The write list and read list both act as fifos. The read list is
79 * protected by the device_lock. The write and written lists are
80 * protected by the stripe lock. The device_lock, which can be
 81 * claimed while the stripe lock is held, is only for list
82 * manipulations and will only be held for a very short time. It can
83 * be claimed from interrupts.
84 *
85 *
86 * Stripes in the stripe cache can be on one of two lists (or on
87 * neither). The "inactive_list" contains stripes which are not
88 * currently being used for any request. They can freely be reused
89 * for another stripe. The "handle_list" contains stripes that need
90 * to be handled in some way. Both of these are fifo queues. Each
91 * stripe is also (potentially) linked to a hash bucket in the hash
92 * table so that it can be found by sector number. Stripes that are
93 * not hashed must be on the inactive_list, and will normally be at
94 * the front. All stripes start life this way.
95 *
96 * The inactive_list, handle_list and hash bucket lists are all protected by the
97 * device_lock.
98 * - stripes on the inactive_list never have their stripe_lock held.
99 * - stripes have a reference counter. If count==0, they are on a list.
100 * - If a stripe might need handling, STRIPE_HANDLE is set.
101 * - When refcount reaches zero, then if STRIPE_HANDLE it is put on
102 * handle_list else inactive_list
103 *
104 * This, combined with the fact that STRIPE_HANDLE is only ever
105 * cleared while a stripe has a non-zero count means that if the
106 * refcount is 0 and STRIPE_HANDLE is set, then it is on the
 107 * handle_list and if refcount is 0 and STRIPE_HANDLE is not set, then
108 * the stripe is on inactive_list.
109 *
110 * The possible transitions are:
111 * activate an unhashed/inactive stripe (get_active_stripe())
112 * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev
113 * activate a hashed, possibly active stripe (get_active_stripe())
114 * lockdev check-hash if(!cnt++)unlink-stripe unlockdev
115 * attach a request to an active stripe (add_stripe_bh())
116 * lockdev attach-buffer unlockdev
117 * handle a stripe (handle_stripe())
118 * lockstripe clrSTRIPE_HANDLE ...
119 * (lockdev check-buffers unlockdev) ..
120 * change-state ..
121 * record io/ops needed unlockstripe schedule io/ops
122 * release an active stripe (release_stripe())
123 * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
124 *
 125 * The refcount counts each thread that has activated the stripe,
126 * plus raid5d if it is handling it, plus one for each active request
127 * on a cached buffer, and plus one if the stripe is undergoing stripe
128 * operations.
129 *
 130 * Stripe operations are performed outside the stripe lock.
 131 * The stripe operations are:
132 * -copying data between the stripe cache and user application buffers
133 * -computing blocks to save a disk access, or to recover a missing block
134 * -updating the parity on a write operation (reconstruct write and
135 * read-modify-write)
136 * -checking parity correctness
137 * -running i/o to disk
138 * These operations are carried out by raid5_run_ops which uses the async_tx
139 * api to (optionally) offload operations to dedicated hardware engines.
140 * When requesting an operation handle_stripe sets the pending bit for the
141 * operation and increments the count. raid5_run_ops is then run whenever
142 * the count is non-zero.
143 * There are some critical dependencies between the operations that prevent some
144 * from being requested while another is in flight.
145 * 1/ Parity check operations destroy the in cache version of the parity block,
146 * so we prevent parity dependent operations like writes and compute_blocks
147 * from starting while a check is in progress. Some dma engines can perform
 148 * the check without damaging the parity block; in these cases the parity
149 * block is re-marked up to date (assuming the check was successful) and is
150 * not re-read from disk.
151 * 2/ When a write operation is requested we immediately lock the affected
152 * blocks, and mark them as not up to date. This causes new read requests
153 * to be held off, as well as parity checks and compute block operations.
154 * 3/ Once a compute block operation has been requested handle_stripe treats
 155 * that block as if it is up to date. raid5_run_ops guarantees that any
156 * operation that is dependent on the compute block result is initiated after
157 * the compute block completes.
158 */
159
160/*
161 * Operations state - intermediate states that are visible outside of sh->lock
162 * In general _idle indicates nothing is running, _run indicates a data
163 * processing operation is active, and _result means the data processing result
 164 * is stable and can be acted upon. Simple operations like biofill and
 165 * compute, which only have an _idle and a _run state, are indicated with
 166 * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN).
167 */
168/**
169 * enum check_states - handles syncing / repairing a stripe
170 * @check_state_idle - check operations are quiesced
171 * @check_state_run - check operation is running
 172 * @check_state_check_result - set outside lock when check result is valid
173 * @check_state_compute_run - check failed and we are repairing
174 * @check_state_compute_result - set outside lock when compute result is valid
175 */
176enum check_states {
177 check_state_idle = 0,
178 check_state_run, /* parity check */
179 check_state_check_result,
180 check_state_compute_run, /* parity repair */
181 check_state_compute_result,
182};
183
184/**
185 * enum reconstruct_states - handles writing or expanding a stripe
186 */
187enum reconstruct_states {
188 reconstruct_state_idle = 0,
189 reconstruct_state_prexor_drain_run, /* prexor-write */
190 reconstruct_state_drain_run, /* write */
191 reconstruct_state_run, /* expand */
192 reconstruct_state_prexor_drain_result,
193 reconstruct_state_drain_result,
194 reconstruct_state_result,
195};
196
197struct stripe_head {
198 struct hlist_node hash;
199 struct list_head lru; /* inactive_list or handle_list */
200 struct raid5_private_data *raid_conf;
201 short generation; /* increments with every
202 * reshape */
203 sector_t sector; /* sector of this row */
204 short pd_idx; /* parity disk index */
205 short qd_idx; /* 'Q' disk index for raid6 */
206 short ddf_layout;/* use DDF ordering to calculate Q */
207 unsigned long state; /* state flags */
208 atomic_t count; /* nr of active thread/requests */
209 spinlock_t lock;
210 int bm_seq; /* sequence number for bitmap flushes */
211 int disks; /* disks in stripe */
212 enum check_states check_state;
213 enum reconstruct_states reconstruct_state;
214 /* stripe_operations
215 * @target - STRIPE_OP_COMPUTE_BLK target
216 */
217 struct stripe_operations {
218 int target;
219 u32 zero_sum_result;
220 } ops;
221 struct r5dev {
222 struct bio req;
223 struct bio_vec vec;
224 struct page *page;
225 struct bio *toread, *read, *towrite, *written;
226 sector_t sector; /* sector of this page */
227 unsigned long flags;
 228 } dev[1]; /* allocated with extra space depending on RAID geometry */
229};
230
231/* stripe_head_state - collects and tracks the dynamic state of a stripe_head
232 * for handle_stripe. It is only valid under spin_lock(sh->lock);
233 */
234struct stripe_head_state {
235 int syncing, expanding, expanded;
236 int locked, uptodate, to_read, to_write, failed, written;
237 int to_fill, compute, req_compute, non_overwrite;
238 int failed_num;
239 unsigned long ops_request;
240};
241
242/* r6_state - extra state data only relevant to r6 */
243struct r6_state {
244 int p_failed, q_failed, failed_num[2];
245};
246
247/* Flags */
248#define R5_UPTODATE 0 /* page contains current data */
249#define R5_LOCKED 1 /* IO has been submitted on "req" */
250#define R5_OVERWRITE 2 /* towrite covers whole page */
251/* and some that are internal to handle_stripe */
252#define R5_Insync 3 /* rdev && rdev->in_sync at start */
253#define R5_Wantread 4 /* want to schedule a read */
254#define R5_Wantwrite 5
255#define R5_Overlap 7 /* There is a pending overlapping request on this block */
256#define R5_ReadError 8 /* seen a read error here recently */
257#define R5_ReWrite 9 /* have tried to over-write the readerror */
258
259#define R5_Expanded 10 /* This block now has post-expand data */
260#define R5_Wantcompute 11 /* compute_block in progress treat as
261 * uptodate
262 */
263#define R5_Wantfill 12 /* dev->toread contains a bio that needs
264 * filling
265 */
266#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
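/* Editorial illustration, not part of this patch: the four buffer states
 * described in the header comment (Empty/Want/Dirty/Clean) are fully
 * determined by the R5_UPTODATE and R5_LOCKED bits defined above. A
 * hypothetical helper that names the state of a dev->flags word:
 */
static inline const char *r5_buffer_state(unsigned long flags)
{
	if (test_bit(R5_LOCKED, &flags))
		return test_bit(R5_UPTODATE, &flags) ? "Dirty" : "Want";
	return test_bit(R5_UPTODATE, &flags) ? "Clean" : "Empty";
}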
267/*
268 * Write method
269 */
270#define RECONSTRUCT_WRITE 1
271#define READ_MODIFY_WRITE 2
272/* not a write method, but a compute_parity mode */
273#define CHECK_PARITY 3
274/* Additional compute_parity mode -- updates the parity w/o LOCKING */
275#define UPDATE_PARITY 4
276
277/*
278 * Stripe state
279 */
280#define STRIPE_HANDLE 2
281#define STRIPE_SYNCING 3
282#define STRIPE_INSYNC 4
283#define STRIPE_PREREAD_ACTIVE 5
284#define STRIPE_DELAYED 6
285#define STRIPE_DEGRADED 7
286#define STRIPE_BIT_DELAY 8
287#define STRIPE_EXPANDING 9
288#define STRIPE_EXPAND_SOURCE 10
289#define STRIPE_EXPAND_READY 11
290#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */
291#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */
292#define STRIPE_BIOFILL_RUN 14
293#define STRIPE_COMPUTE_RUN 15
294/*
295 * Operation request flags
296 */
297#define STRIPE_OP_BIOFILL 0
298#define STRIPE_OP_COMPUTE_BLK 1
299#define STRIPE_OP_PREXOR 2
300#define STRIPE_OP_BIODRAIN 3
301#define STRIPE_OP_POSTXOR 4
302#define STRIPE_OP_CHECK 5
303
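/* Editorial illustration, not part of this patch: the release rule from
 * the header comment -- when the last reference to a stripe is dropped
 * it goes to the handle list if STRIPE_HANDLE is set, otherwise back to
 * the inactive list. Locking and wakeups of the real release path are
 * omitted; the list heads live in the conf structure defined below.
 */
static void release_stripe_sketch(struct stripe_head *sh,
				  struct list_head *handle_list,
				  struct list_head *inactive_list)
{
	if (atomic_dec_and_test(&sh->count)) {
		if (test_bit(STRIPE_HANDLE, &sh->state))
			list_add_tail(&sh->lru, handle_list);
		else
			list_add_tail(&sh->lru, inactive_list);
	}
}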
304/*
305 * Plugging:
306 *
307 * To improve write throughput, we need to delay the handling of some
308 * stripes until there has been a chance that several write requests
309 * for the one stripe have all been collected.
310 * In particular, any write request that would require pre-reading
311 * is put on a "delayed" queue until there are no stripes currently
312 * in a pre-read phase. Further, if the "delayed" queue is empty when
313 * a stripe is put on it then we "plug" the queue and do not process it
 314 * until an unplug call is made (that is, when unplug_io_fn() is called).
315 *
316 * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add
317 * it to the count of prereading stripes.
318 * When write is initiated, or the stripe refcnt == 0 (just in case) we
319 * clear the PREREAD_ACTIVE flag and decrement the count
320 * Whenever the 'handle' queue is empty and the device is not plugged, we
 321 * move any stripes from delayed to handle and clear the DELAYED flag and set
322 * PREREAD_ACTIVE.
323 * In stripe_handle, if we find pre-reading is necessary, we do it if
324 * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
 325 * HANDLE gets cleared if stripe_handle leaves nothing locked.
326 */
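/* Editorial illustration, not part of this patch: the delayed-to-handle
 * promotion described in the comment above, written as a hypothetical
 * helper. The real driver does this under conf->device_lock; the list
 * heads and the preread counter are fields of the conf structure
 * defined below.
 */
static void activate_delayed_sketch(struct list_head *handle_list,
				    struct list_head *delayed_list,
				    atomic_t *preread_active_stripes)
{
	if (!list_empty(handle_list))
		return;			/* only when nothing awaits handling */
	while (!list_empty(delayed_list)) {
		struct stripe_head *sh = list_entry(delayed_list->next,
						    struct stripe_head, lru);
		clear_bit(STRIPE_DELAYED, &sh->state);
		if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			atomic_inc(preread_active_stripes);
		list_move_tail(&sh->lru, handle_list);
	}
}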
327
328
329struct disk_info {
330 mdk_rdev_t *rdev;
331};
332
333struct raid5_private_data {
334 struct hlist_head *stripe_hashtbl;
335 mddev_t *mddev;
336 struct disk_info *spare;
337 int chunk_size, level, algorithm;
338 int max_degraded;
339 int raid_disks;
340 int max_nr_stripes;
341
342 /* reshape_progress is the leading edge of a 'reshape'
343 * It has value MaxSector when no reshape is happening
344 * If delta_disks < 0, it is the last sector we started work on,
345 * else is it the next sector to work on.
346 */
347 sector_t reshape_progress;
348 /* reshape_safe is the trailing edge of a reshape. We know that
349 * before (or after) this address, all reshape has completed.
350 */
351 sector_t reshape_safe;
352 int previous_raid_disks;
353 int prev_chunk, prev_algo;
354 short generation; /* increments with every reshape */
355 unsigned long reshape_checkpoint; /* Time we last updated
356 * metadata */
357
358 struct list_head handle_list; /* stripes needing handling */
359 struct list_head hold_list; /* preread ready stripes */
360 struct list_head delayed_list; /* stripes that have plugged requests */
 361 struct list_head bitmap_list; /* stripes delayed awaiting a bitmap update */
362 struct bio *retry_read_aligned; /* currently retrying aligned bios */
363 struct bio *retry_read_aligned_list; /* aligned bios retry list */
364 atomic_t preread_active_stripes; /* stripes with scheduled io */
365 atomic_t active_aligned_reads;
366 atomic_t pending_full_writes; /* full write backlog */
367 int bypass_count; /* bypassed prereads */
368 int bypass_threshold; /* preread nice */
369 struct list_head *last_hold; /* detect hold_list promotions */
370
371 atomic_t reshape_stripes; /* stripes with pending writes for reshape */
372 /* unfortunately we need two cache names as we temporarily have
373 * two caches.
374 */
375 int active_name;
376 char cache_name[2][20];
377 struct kmem_cache *slab_cache; /* for allocating stripes */
378
379 int seq_flush, seq_write;
380 int quiesce;
381
382 int fullsync; /* set to 1 if a full sync is needed,
383 * (fresh device added).
384 * Cleared when a sync completes.
385 */
386
387 struct page *spare_page; /* Used when checking P/Q in raid6 */
388
389 /*
390 * Free stripes pool
391 */
392 atomic_t active_stripes;
393 struct list_head inactive_list;
394 wait_queue_head_t wait_for_stripe;
395 wait_queue_head_t wait_for_overlap;
396 int inactive_blocked; /* release of inactive stripes blocked,
397 * waiting for 25% to be free
398 */
399 int pool_size; /* number of disks in stripeheads in pool */
400 spinlock_t device_lock;
401 struct disk_info *disks;
402
403 /* When taking over an array from a different personality, we store
404 * the new thread here until we fully activate the array.
405 */
406 struct mdk_thread_s *thread;
407};
408
409typedef struct raid5_private_data raid5_conf_t;
410
411#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
412
413/*
414 * Our supported algorithms
415 */
416#define ALGORITHM_LEFT_ASYMMETRIC 0 /* Rotating Parity N with Data Restart */
417#define ALGORITHM_RIGHT_ASYMMETRIC 1 /* Rotating Parity 0 with Data Restart */
418#define ALGORITHM_LEFT_SYMMETRIC 2 /* Rotating Parity N with Data Continuation */
419#define ALGORITHM_RIGHT_SYMMETRIC 3 /* Rotating Parity 0 with Data Continuation */
420
421/* Define non-rotating (raid4) algorithms. These allow
422 * conversion of raid4 to raid5.
423 */
424#define ALGORITHM_PARITY_0 4 /* P or P,Q are initial devices */
425#define ALGORITHM_PARITY_N 5 /* P or P,Q are final devices. */
426
427/* DDF RAID6 layouts differ from md/raid6 layouts in two ways.
428 * Firstly, the exact positioning of the parity block is slightly
429 * different between the 'LEFT_*' modes of md and the "_N_*" modes
430 * of DDF.
 431 * Secondly, the order of data blocks over which the Q syndrome is computed
432 * is different.
433 * Consequently we have different layouts for DDF/raid6 than md/raid6.
434 * These layouts are from the DDFv1.2 spec.
435 * Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but
436 * leaves RLQ=3 as 'Vendor Specific'
437 */
438
439#define ALGORITHM_ROTATING_ZERO_RESTART 8 /* DDF PRL=6 RLQ=1 */
440#define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */
 441#define ALGORITHM_ROTATING_N_CONTINUE 10 /* DDF PRL=6 RLQ=3 */
442
443
444/* For every RAID5 algorithm we define a RAID6 algorithm
445 * with exactly the same layout for data and parity, and
446 * with the Q block always on the last device (N-1).
447 * This allows trivial conversion from RAID5 to RAID6
448 */
449#define ALGORITHM_LEFT_ASYMMETRIC_6 16
450#define ALGORITHM_RIGHT_ASYMMETRIC_6 17
451#define ALGORITHM_LEFT_SYMMETRIC_6 18
452#define ALGORITHM_RIGHT_SYMMETRIC_6 19
453#define ALGORITHM_PARITY_0_6 20
454#define ALGORITHM_PARITY_N_6 ALGORITHM_PARITY_N
455
456static inline int algorithm_valid_raid5(int layout)
457{
458 return (layout >= 0) &&
459 (layout <= 5);
460}
461static inline int algorithm_valid_raid6(int layout)
462{
463 return (layout >= 0 && layout <= 5)
464 ||
465 (layout == 8 || layout == 10)
466 ||
467 (layout >= 16 && layout <= 20);
468}
469
470static inline int algorithm_is_DDF(int layout)
471{
472 return layout >= 8 && layout <= 10;
473}
474#endif
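The *_6 layout constants are chosen so that raid6_takeover() can convert a raid5 layout with simple arithmetic. A hypothetical helper, not part of the patch, that captures the relation for any layout accepted by algorithm_valid_raid5():

static inline int raid5_layout_to_raid6(int layout)
{
	/* layouts 0..4 gain the Q-at-the-end variant at +16;
	 * ALGORITHM_PARITY_N (5) keeps its value at both levels */
	return layout == ALGORITHM_PARITY_N ? ALGORITHM_PARITY_N : layout + 16;
}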
diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h
deleted file mode 100644
index 98dcde88470e..000000000000
--- a/drivers/md/raid6.h
+++ /dev/null
@@ -1,130 +0,0 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright 2003 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
8 * Bostom MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13#ifndef LINUX_RAID_RAID6_H
14#define LINUX_RAID_RAID6_H
15
16#ifdef __KERNEL__
17
18/* Set to 1 to use kernel-wide empty_zero_page */
19#define RAID6_USE_EMPTY_ZERO_PAGE 0
20
21#include <linux/raid/md.h>
22#include <linux/raid/raid5.h>
23
24typedef raid5_conf_t raid6_conf_t; /* Same configuration */
25
26/* Additional compute_parity mode -- updates the parity w/o LOCKING */
27#define UPDATE_PARITY 4
28
29/* We need a pre-zeroed page... if we don't want to use the kernel-provided
30 one define it here */
31#if RAID6_USE_EMPTY_ZERO_PAGE
32# define raid6_empty_zero_page empty_zero_page
33#else
34extern const char raid6_empty_zero_page[PAGE_SIZE];
35#endif
36
37#else /* ! __KERNEL__ */
38/* Used for testing in user space */
39
40#include <errno.h>
41#include <inttypes.h>
42#include <limits.h>
43#include <stddef.h>
44#include <sys/mman.h>
45#include <sys/types.h>
46
47/* Not standard, but glibc defines it */
48#define BITS_PER_LONG __WORDSIZE
49
50typedef uint8_t u8;
51typedef uint16_t u16;
52typedef uint32_t u32;
53typedef uint64_t u64;
54
55#ifndef PAGE_SIZE
56# define PAGE_SIZE 4096
57#endif
58extern const char raid6_empty_zero_page[PAGE_SIZE];
59
60#define __init
61#define __exit
62#define __attribute_const__ __attribute__((const))
63#define noinline __attribute__((noinline))
64
65#define preempt_enable()
66#define preempt_disable()
67#define cpu_has_feature(x) 1
68#define enable_kernel_altivec()
69#define disable_kernel_altivec()
70
71#endif /* __KERNEL__ */
72
73/* Routine choices */
74struct raid6_calls {
75 void (*gen_syndrome)(int, size_t, void **);
76 int (*valid)(void); /* Returns 1 if this routine set is usable */
77 const char *name; /* Name of this routine set */
78 int prefer; /* Has special performance attribute */
79};
80
81/* Selected algorithm */
82extern struct raid6_calls raid6_call;
83
84/* Algorithm list */
85extern const struct raid6_calls * const raid6_algos[];
86int raid6_select_algo(void);
87
88/* Return values from chk_syndrome */
89#define RAID6_OK 0
90#define RAID6_P_BAD 1
91#define RAID6_Q_BAD 2
92#define RAID6_PQ_BAD 3
93
94/* Galois field tables */
95extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256)));
96extern const u8 raid6_gfexp[256] __attribute__((aligned(256)));
97extern const u8 raid6_gfinv[256] __attribute__((aligned(256)));
98extern const u8 raid6_gfexi[256] __attribute__((aligned(256)));
99
100/* Recovery routines */
101void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, void **ptrs);
102void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs);
103void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs);
104
105/* Some definitions to allow code to be compiled for testing in userspace */
106#ifndef __KERNEL__
107
108# define jiffies raid6_jiffies()
109# define printk printf
110# define GFP_KERNEL 0
111# define __get_free_pages(x,y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0))
112# define free_pages(x,y) munmap((void *)(x), (y)*PAGE_SIZE)
113
114static inline void cpu_relax(void)
115{
116 /* Nothing */
117}
118
119#undef HZ
120#define HZ 1000
121static inline uint32_t raid6_jiffies(void)
122{
123 struct timeval tv;
124 gettimeofday(&tv, NULL);
125 return tv.tv_sec*1000 + tv.tv_usec/1000;
126}
127
128#endif /* ! __KERNEL__ */
129
130#endif /* LINUX_RAID_RAID6_H */
diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c
index 21987e3dbe6c..866215ac7f25 100644
--- a/drivers/md/raid6algos.c
+++ b/drivers/md/raid6algos.c
@@ -5,7 +5,7 @@
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330, 7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
8 * Bostom MA 02111-1307, USA; either version 2 of the License, or 8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference. 9 * (at your option) any later version; incorporated herein by reference.
10 * 10 *
11 * ----------------------------------------------------------------------- */ 11 * ----------------------------------------------------------------------- */
@@ -16,13 +16,20 @@
16 * Algorithm list and algorithm selection for RAID-6 16 * Algorithm list and algorithm selection for RAID-6
17 */ 17 */
18 18
19#include "raid6.h" 19#include <linux/raid/pq.h>
20#ifndef __KERNEL__ 20#ifndef __KERNEL__
21#include <sys/mman.h> 21#include <sys/mman.h>
22#include <stdio.h> 22#include <stdio.h>
23#else
24#if !RAID6_USE_EMPTY_ZERO_PAGE
25/* In .bss so it's zeroed */
26const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
27EXPORT_SYMBOL(raid6_empty_zero_page);
28#endif
23#endif 29#endif
24 30
25struct raid6_calls raid6_call; 31struct raid6_calls raid6_call;
32EXPORT_SYMBOL_GPL(raid6_call);
26 33
27/* Various routine sets */ 34/* Various routine sets */
28extern const struct raid6_calls raid6_intx1; 35extern const struct raid6_calls raid6_intx1;
@@ -79,6 +86,7 @@ const struct raid6_calls * const raid6_algos[] = {
79#else 86#else
80/* Need more time to be stable in userspace */ 87/* Need more time to be stable in userspace */
81#define RAID6_TIME_JIFFIES_LG2 9 88#define RAID6_TIME_JIFFIES_LG2 9
89#define time_before(x, y) ((x) < (y))
82#endif 90#endif
83 91
84/* Try to pick the best algorithm */ 92/* Try to pick the best algorithm */
@@ -152,3 +160,12 @@ int __init raid6_select_algo(void)
152 160
153 return best ? 0 : -EINVAL; 161 return best ? 0 : -EINVAL;
154} 162}
163
164static void raid6_exit(void)
165{
166 do { } while (0);
167}
168
169subsys_initcall(raid6_select_algo);
170module_exit(raid6_exit);
171MODULE_LICENSE("GPL");
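For the userspace (raid6test) build, the raid6_jiffies() shim together with the time_before() definition added above is enough to benchmark one routine set. A rough sketch only, assuming the caller has already set up the disks count and the dptrs data/parity pointers:

static int time_one_algo(const struct raid6_calls *algo, int disks, void **dptrs)
{
	uint32_t start;
	int iterations = 0;

	if (algo->valid && !algo->valid())
		return -1;		/* routine set unusable on this host */
	start = jiffies;		/* raid6_jiffies() in userspace */
	while (time_before(jiffies, start + (1 << RAID6_TIME_JIFFIES_LG2))) {
		algo->gen_syndrome(disks, PAGE_SIZE, dptrs);
		iterations++;
	}
	return iterations;		/* more iterations means a faster routine */
}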
diff --git a/drivers/md/raid6altivec.uc b/drivers/md/raid6altivec.uc
index b9afd35b8812..699dfeee4944 100644
--- a/drivers/md/raid6altivec.uc
+++ b/drivers/md/raid6altivec.uc
@@ -5,7 +5,7 @@
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330, 7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
8 * Bostom MA 02111-1307, USA; either version 2 of the License, or 8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference. 9 * (at your option) any later version; incorporated herein by reference.
10 * 10 *
11 * ----------------------------------------------------------------------- */ 11 * ----------------------------------------------------------------------- */
@@ -22,7 +22,7 @@
22 * bracked this with preempt_disable/enable or in a lock) 22 * bracked this with preempt_disable/enable or in a lock)
23 */ 23 */
24 24
25#include "raid6.h" 25#include <linux/raid/pq.h>
26 26
27#ifdef CONFIG_ALTIVEC 27#ifdef CONFIG_ALTIVEC
28 28
diff --git a/drivers/md/raid6int.uc b/drivers/md/raid6int.uc
index ad004cee0e26..f9bf9cba357f 100644
--- a/drivers/md/raid6int.uc
+++ b/drivers/md/raid6int.uc
@@ -5,7 +5,7 @@
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330, 7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
8 * Bostom MA 02111-1307, USA; either version 2 of the License, or 8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference. 9 * (at your option) any later version; incorporated herein by reference.
10 * 10 *
11 * ----------------------------------------------------------------------- */ 11 * ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
18 * This file is postprocessed using unroll.pl 18 * This file is postprocessed using unroll.pl
19 */ 19 */
20 20
21#include "raid6.h" 21#include <linux/raid/pq.h>
22 22
23/* 23/*
24 * This is the C data type to use 24 * This is the C data type to use
diff --git a/drivers/md/raid6mmx.c b/drivers/md/raid6mmx.c
index d4e4a1bd70ad..e7f6c13132bf 100644
--- a/drivers/md/raid6mmx.c
+++ b/drivers/md/raid6mmx.c
@@ -5,7 +5,7 @@
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330, 7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
8 * Bostom MA 02111-1307, USA; either version 2 of the License, or 8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference. 9 * (at your option) any later version; incorporated herein by reference.
10 * 10 *
11 * ----------------------------------------------------------------------- */ 11 * ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
18 18
19#if defined(__i386__) && !defined(__arch_um__) 19#if defined(__i386__) && !defined(__arch_um__)
20 20
21#include "raid6.h" 21#include <linux/raid/pq.h>
22#include "raid6x86.h" 22#include "raid6x86.h"
23 23
24/* Shared with raid6sse1.c */ 24/* Shared with raid6sse1.c */
diff --git a/drivers/md/raid6recov.c b/drivers/md/raid6recov.c
index a8c4d9451bd9..2609f00e0d61 100644
--- a/drivers/md/raid6recov.c
+++ b/drivers/md/raid6recov.c
@@ -5,7 +5,7 @@
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330, 7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
8 * Bostom MA 02111-1307, USA; either version 2 of the License, or 8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference. 9 * (at your option) any later version; incorporated herein by reference.
10 * 10 *
11 * ----------------------------------------------------------------------- */ 11 * ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
18 * the syndrome.) 18 * the syndrome.)
19 */ 19 */
20 20
21#include "raid6.h" 21#include <linux/raid/pq.h>
22 22
23/* Recover two failed data blocks. */ 23/* Recover two failed data blocks. */
24void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, 24void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
@@ -63,9 +63,7 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
63 p++; q++; 63 p++; q++;
64 } 64 }
65} 65}
66 66EXPORT_SYMBOL_GPL(raid6_2data_recov);
67
68
69 67
70/* Recover failure of one data block plus the P block */ 68/* Recover failure of one data block plus the P block */
71void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) 69void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
@@ -97,9 +95,10 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
97 q++; dq++; 95 q++; dq++;
98 } 96 }
99} 97}
98EXPORT_SYMBOL_GPL(raid6_datap_recov);
100 99
101 100#ifndef __KERNEL__
102#ifndef __KERNEL__ /* Testing only */ 101/* Testing only */
103 102
104/* Recover two failed blocks. */ 103/* Recover two failed blocks. */
105void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs) 104void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs)
diff --git a/drivers/md/raid6sse1.c b/drivers/md/raid6sse1.c
index 0666237276ff..b274dd5eab8f 100644
--- a/drivers/md/raid6sse1.c
+++ b/drivers/md/raid6sse1.c
@@ -5,7 +5,7 @@
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330, 7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
8 * Bostom MA 02111-1307, USA; either version 2 of the License, or 8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference. 9 * (at your option) any later version; incorporated herein by reference.
10 * 10 *
11 * ----------------------------------------------------------------------- */ 11 * ----------------------------------------------------------------------- */
@@ -23,7 +23,7 @@
23 23
24#if defined(__i386__) && !defined(__arch_um__) 24#if defined(__i386__) && !defined(__arch_um__)
25 25
26#include "raid6.h" 26#include <linux/raid/pq.h>
27#include "raid6x86.h" 27#include "raid6x86.h"
28 28
29/* Defined in raid6mmx.c */ 29/* Defined in raid6mmx.c */
diff --git a/drivers/md/raid6sse2.c b/drivers/md/raid6sse2.c
index b034ad868039..6ed6c6c0389f 100644
--- a/drivers/md/raid6sse2.c
+++ b/drivers/md/raid6sse2.c
@@ -5,7 +5,7 @@
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330, 7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
8 * Bostom MA 02111-1307, USA; either version 2 of the License, or 8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference. 9 * (at your option) any later version; incorporated herein by reference.
10 * 10 *
11 * ----------------------------------------------------------------------- */ 11 * ----------------------------------------------------------------------- */
@@ -19,7 +19,7 @@
19 19
20#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) 20#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
21 21
22#include "raid6.h" 22#include <linux/raid/pq.h>
23#include "raid6x86.h" 23#include "raid6x86.h"
24 24
25static const struct raid6_sse_constants { 25static const struct raid6_sse_constants {
diff --git a/drivers/md/raid6test/Makefile b/drivers/md/raid6test/Makefile
index 78e0396adf2a..58ffdf4f5161 100644
--- a/drivers/md/raid6test/Makefile
+++ b/drivers/md/raid6test/Makefile
@@ -5,7 +5,7 @@
5 5
6CC = gcc 6CC = gcc
7OPTFLAGS = -O2 # Adjust as desired 7OPTFLAGS = -O2 # Adjust as desired
8CFLAGS = -I.. -g $(OPTFLAGS) 8CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS)
9LD = ld 9LD = ld
10PERL = perl 10PERL = perl
11AR = ar 11AR = ar
diff --git a/drivers/md/raid6test/test.c b/drivers/md/raid6test/test.c
index 559cc41b2585..7a930318b17d 100644
--- a/drivers/md/raid6test/test.c
+++ b/drivers/md/raid6test/test.c
@@ -17,7 +17,7 @@
17#include <stdlib.h> 17#include <stdlib.h>
18#include <stdio.h> 18#include <stdio.h>
19#include <string.h> 19#include <string.h>
20#include "raid6.h" 20#include <linux/raid/pq.h>
21 21
22#define NDISKS 16 /* Including P and Q */ 22#define NDISKS 16 /* Including P and Q */
23 23
diff --git a/drivers/md/raid6x86.h b/drivers/md/raid6x86.h
index 99fea7a70ca7..4c22c1568558 100644
--- a/drivers/md/raid6x86.h
+++ b/drivers/md/raid6x86.h
@@ -5,7 +5,7 @@
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330, 7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
8 * Bostom MA 02111-1307, USA; either version 2 of the License, or 8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference. 9 * (at your option) any later version; incorporated herein by reference.
10 * 10 *
11 * ----------------------------------------------------------------------- */ 11 * ----------------------------------------------------------------------- */