Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig                46
-rw-r--r--  drivers/md/Makefile                5
-rw-r--r--  drivers/md/bitmap.c              484
-rw-r--r--  drivers/md/dm-crypt.c            194
-rw-r--r--  drivers/md/dm-emc.c               40
-rw-r--r--  drivers/md/dm-exception-store.c   67
-rw-r--r--  drivers/md/dm-ioctl.c            139
-rw-r--r--  drivers/md/dm-linear.c             8
-rw-r--r--  drivers/md/dm-log.c              157
-rw-r--r--  drivers/md/dm-mpath.c             46
-rw-r--r--  drivers/md/dm-raid1.c            101
-rw-r--r--  drivers/md/dm-round-robin.c        6
-rw-r--r--  drivers/md/dm-snap.c              17
-rw-r--r--  drivers/md/dm-stripe.c            25
-rw-r--r--  drivers/md/dm-table.c             57
-rw-r--r--  drivers/md/dm-target.c             2
-rw-r--r--  drivers/md/dm-zero.c               8
-rw-r--r--  drivers/md/dm.c                  186
-rw-r--r--  drivers/md/dm.h                   81
-rw-r--r--  drivers/md/kcopyd.c                5
-rw-r--r--  drivers/md/linear.c               80
-rw-r--r--  drivers/md/md.c                  754
-rw-r--r--  drivers/md/raid1.c               104
-rw-r--r--  drivers/md/raid10.c               81
-rw-r--r--  drivers/md/raid5.c              1390
-rw-r--r--  drivers/md/raid6main.c          2427
26 files changed, 2758 insertions, 3752 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index ac25a48362ac..bf869ed03eed 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -90,7 +90,7 @@ config MD_RAID10
 	depends on BLK_DEV_MD && EXPERIMENTAL
 	---help---
 	  RAID-10 provides a combination of striping (RAID-0) and
-	  mirroring (RAID-1) with easier configuration and more flexable
+	  mirroring (RAID-1) with easier configuration and more flexible
 	  layout.
 	  Unlike RAID-0, but like RAID-1, RAID-10 requires all devices to
 	  be the same size (or at least, only as much as the smallest device
@@ -104,8 +104,8 @@ config MD_RAID10
 
 	  If unsure, say Y.
 
-config MD_RAID5
-	tristate "RAID-4/RAID-5 mode"
+config MD_RAID456
+	tristate "RAID-4/RAID-5/RAID-6 mode"
 	depends on BLK_DEV_MD
 	---help---
 	  A RAID-5 set of N drives with a capacity of C MB per drive provides
@@ -116,20 +116,28 @@ config MD_RAID5
 	  while a RAID-5 set distributes the parity across the drives in one
 	  of the available parity distribution methods.
 
+	  A RAID-6 set of N drives with a capacity of C MB per drive
+	  provides the capacity of C * (N - 2) MB, and protects
+	  against a failure of any two drives. For a given sector
+	  (row) number, (N - 2) drives contain data sectors, and two
+	  drives contains two independent redundancy syndromes. Like
+	  RAID-5, RAID-6 distributes the syndromes across the drives
+	  in one of the available parity distribution methods.
+
 	  Information about Software RAID on Linux is contained in the
 	  Software-RAID mini-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>. There you will also
 	  learn where to get the supporting user space utilities raidtools.
 
-	  If you want to use such a RAID-4/RAID-5 set, say Y. To
+	  If you want to use such a RAID-4/RAID-5/RAID-6 set, say Y. To
 	  compile this code as a module, choose M here: the module
-	  will be called raid5.
+	  will be called raid456.
 
 	  If unsure, say Y.
 
 config MD_RAID5_RESHAPE
 	bool "Support adding drives to a raid-5 array (experimental)"
-	depends on MD_RAID5 && EXPERIMENTAL
+	depends on MD_RAID456 && EXPERIMENTAL
 	---help---
 	  A RAID-5 set can be expanded by adding extra drives. This
 	  requires "restriping" the array which means (almost) every
@@ -139,7 +147,7 @@ config MD_RAID5_RESHAPE
 	  is online. However it is still EXPERIMENTAL code. It should
 	  work, but please be sure that you have backups.
 
-	  You will need mdadm verion 2.4.1 or later to use this
+	  You will need mdadm version 2.4.1 or later to use this
 	  feature safely. During the early stage of reshape there is
 	  a critical section where live data is being over-written. A
 	  crash during this time needs extra care for recovery. The
@@ -154,28 +162,6 @@ config MD_RAID5_RESHAPE
 	  There should be enough spares already present to make the new
 	  array workable.
 
-config MD_RAID6
-	tristate "RAID-6 mode"
-	depends on BLK_DEV_MD
-	---help---
-	  A RAID-6 set of N drives with a capacity of C MB per drive
-	  provides the capacity of C * (N - 2) MB, and protects
-	  against a failure of any two drives. For a given sector
-	  (row) number, (N - 2) drives contain data sectors, and two
-	  drives contains two independent redundancy syndromes. Like
-	  RAID-5, RAID-6 distributes the syndromes across the drives
-	  in one of the available parity distribution methods.
-
-	  RAID-6 requires mdadm-1.5.0 or later, available at:
-
-	  ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/
-
-	  If you want to use such a RAID-6 set, say Y. To compile
-	  this code as a module, choose M here: the module will be
-	  called raid6.
-
-	  If unsure, say Y.
-
 config MD_MULTIPATH
 	tristate "Multipath I/O support"
 	depends on BLK_DEV_MD
@@ -235,7 +221,7 @@ config DM_SNAPSHOT
 	tristate "Snapshot target (EXPERIMENTAL)"
 	depends on BLK_DEV_DM && EXPERIMENTAL
 	---help---
-	  Allow volume managers to take writeable snapshots of a device.
+	  Allow volume managers to take writable snapshots of a device.
 
 config DM_MIRROR
 	tristate "Mirror target (EXPERIMENTAL)"
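As a quick aside on the capacity rule quoted in the merged help text above, a minimal user-space sketch (illustrative only, not part of the patch): a RAID-5 set gives up one drive's worth of space for parity, a RAID-6 set gives up two for its P and Q syndromes.

    #include <stdio.h>

    /* Usable capacity, in MB, for n equally sized drives of cap_mb each. */
    static unsigned long raid5_capacity(unsigned int n, unsigned long cap_mb)
    {
            return cap_mb * (n - 1);        /* one drive's worth of parity */
    }

    static unsigned long raid6_capacity(unsigned int n, unsigned long cap_mb)
    {
            return cap_mb * (n - 2);        /* two independent syndromes (P and Q) */
    }

    int main(void)
    {
            /* e.g. six 500000 MB drives */
            printf("RAID-5: %lu MB\n", raid5_capacity(6, 500000));  /* 2500000 */
            printf("RAID-6: %lu MB\n", raid6_capacity(6, 500000));  /* 2000000 */
            return 0;
    }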
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index d3efedf6a6ad..34957a68d921 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -8,7 +8,7 @@ dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o
 dm-snapshot-objs := dm-snap.o dm-exception-store.o
 dm-mirror-objs   := dm-log.o dm-raid1.o
 md-mod-objs      := md.o bitmap.o
-raid6-objs	:= raid6main.o raid6algos.o raid6recov.o raid6tables.o \
+raid456-objs	:= raid5.o raid6algos.o raid6recov.o raid6tables.o \
		   raid6int1.o raid6int2.o raid6int4.o \
		   raid6int8.o raid6int16.o raid6int32.o \
		   raid6altivec1.o raid6altivec2.o raid6altivec4.o \
@@ -25,8 +25,7 @@ obj-$(CONFIG_MD_LINEAR) += linear.o
 obj-$(CONFIG_MD_RAID0)		+= raid0.o
 obj-$(CONFIG_MD_RAID1)		+= raid1.o
 obj-$(CONFIG_MD_RAID10)		+= raid10.o
-obj-$(CONFIG_MD_RAID5)		+= raid5.o xor.o
-obj-$(CONFIG_MD_RAID6)		+= raid6.o xor.o
+obj-$(CONFIG_MD_RAID456)	+= raid456.o xor.o
 obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o
 obj-$(CONFIG_MD_FAULTY)		+= faulty.o
 obj-$(CONFIG_BLK_DEV_MD)	+= md-mod.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index f8ffaee20ff8..ecc56765d949 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -7,7 +7,6 @@
  * additions, Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.:
  * - added disk storage for bitmap
  * - changes to allow various bitmap chunk sizes
- * - added bitmap daemon (to asynchronously clear bitmap bits from disk)
  */
 
 /*
@@ -15,16 +14,12 @@
  *
  * flush after percent set rather than just time based. (maybe both).
  * wait if count gets too high, wake when it drops to half.
- * allow bitmap to be mirrored with superblock (before or after...)
- * allow hot-add to re-instate a current device.
- * allow hot-add of bitmap after quiessing device
  */
 
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/init.h>
-#include <linux/config.h>
 #include <linux/timer.h>
 #include <linux/sched.h>
 #include <linux/list.h>
@@ -73,24 +68,6 @@ static inline char * bmname(struct bitmap *bitmap)
 
 
 /*
- * test if the bitmap is active
- */
-int bitmap_active(struct bitmap *bitmap)
-{
-	unsigned long flags;
-	int res = 0;
-
-	if (!bitmap)
-		return res;
-	spin_lock_irqsave(&bitmap->lock, flags);
-	res = bitmap->flags & BITMAP_ACTIVE;
-	spin_unlock_irqrestore(&bitmap->lock, flags);
-	return res;
-}
-
-#define WRITE_POOL_SIZE 256
-
-/*
  * just a placeholder - calls kmalloc for bitmap pages
  */
 static unsigned char *bitmap_alloc_page(struct bitmap *bitmap)
@@ -269,6 +246,8 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
 
 	if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) {
 		page->index = index;
+		attach_page_buffers(page, NULL); /* so that free_buffer will
+						  * quietly no-op */
 		return page;
 	}
 }
@@ -300,77 +279,132 @@ static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wai
  */
 static int write_page(struct bitmap *bitmap, struct page *page, int wait)
 {
-	int ret = -ENOMEM;
+	struct buffer_head *bh;
 
 	if (bitmap->file == NULL)
 		return write_sb_page(bitmap->mddev, bitmap->offset, page, wait);
 
-	flush_dcache_page(page); /* make sure visible to anyone reading the file */
+	bh = page_buffers(page);
 
-	if (wait)
-		lock_page(page);
-	else {
-		if (TestSetPageLocked(page))
-			return -EAGAIN; /* already locked */
-		if (PageWriteback(page)) {
-			unlock_page(page);
-			return -EAGAIN;
-		}
+	while (bh && bh->b_blocknr) {
+		atomic_inc(&bitmap->pending_writes);
+		set_buffer_locked(bh);
+		set_buffer_mapped(bh);
+		submit_bh(WRITE, bh);
+		bh = bh->b_this_page;
 	}
 
-	ret = page->mapping->a_ops->prepare_write(bitmap->file, page, 0, PAGE_SIZE);
-	if (!ret)
-		ret = page->mapping->a_ops->commit_write(bitmap->file, page, 0,
-			PAGE_SIZE);
-	if (ret) {
-		unlock_page(page);
-		return ret;
+	if (wait) {
+		wait_event(bitmap->write_wait,
+			   atomic_read(&bitmap->pending_writes)==0);
+		return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0;
 	}
+	return 0;
+}
 
-	set_page_dirty(page); /* force it to be written out */
-
-	if (!wait) {
-		/* add to list to be waited for by daemon */
-		struct page_list *item = mempool_alloc(bitmap->write_pool, GFP_NOIO);
-		item->page = page;
-		get_page(page);
-		spin_lock(&bitmap->write_lock);
-		list_add(&item->list, &bitmap->complete_pages);
-		spin_unlock(&bitmap->write_lock);
-		md_wakeup_thread(bitmap->writeback_daemon);
+static void end_bitmap_write(struct buffer_head *bh, int uptodate)
+{
+	struct bitmap *bitmap = bh->b_private;
+	unsigned long flags;
+
+	if (!uptodate) {
+		spin_lock_irqsave(&bitmap->lock, flags);
+		bitmap->flags |= BITMAP_WRITE_ERROR;
+		spin_unlock_irqrestore(&bitmap->lock, flags);
+	}
+	if (atomic_dec_and_test(&bitmap->pending_writes))
+		wake_up(&bitmap->write_wait);
+}
+
+/* copied from buffer.c */
+static void
+__clear_page_buffers(struct page *page)
+{
+	ClearPagePrivate(page);
+	set_page_private(page, 0);
+	page_cache_release(page);
+}
+static void free_buffers(struct page *page)
+{
+	struct buffer_head *bh = page_buffers(page);
+
+	while (bh) {
+		struct buffer_head *next = bh->b_this_page;
+		free_buffer_head(bh);
+		bh = next;
 	}
-	return write_one_page(page, wait);
+	__clear_page_buffers(page);
+	put_page(page);
 }
 
-/* read a page from a file, pinning it into cache, and return bytes_read */
+/* read a page from a file.
+ * We both read the page, and attach buffers to the page to record the
+ * address of each block (using bmap).  These addresses will be used
+ * to write the block later, completely bypassing the filesystem.
+ * This usage is similar to how swap files are handled, and allows us
+ * to write to a file with no concerns of memory allocation failing.
+ */
 static struct page *read_page(struct file *file, unsigned long index,
-			unsigned long *bytes_read)
+			      struct bitmap *bitmap,
+			      unsigned long count)
 {
-	struct inode *inode = file->f_mapping->host;
 	struct page *page = NULL;
-	loff_t isize = i_size_read(inode);
-	unsigned long end_index = isize >> PAGE_SHIFT;
+	struct inode *inode = file->f_dentry->d_inode;
+	struct buffer_head *bh;
+	sector_t block;
 
 	PRINTK("read bitmap file (%dB @ %Lu)\n", (int)PAGE_SIZE,
 			(unsigned long long)index << PAGE_SHIFT);
 
-	page = read_cache_page(inode->i_mapping, index,
-			(filler_t *)inode->i_mapping->a_ops->readpage, file);
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		page = ERR_PTR(-ENOMEM);
 	if (IS_ERR(page))
 		goto out;
-	wait_on_page_locked(page);
-	if (!PageUptodate(page) || PageError(page)) {
+
+	bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0);
+	if (!bh) {
 		put_page(page);
-		page = ERR_PTR(-EIO);
+		page = ERR_PTR(-ENOMEM);
 		goto out;
 	}
+	attach_page_buffers(page, bh);
+	block = index << (PAGE_SHIFT - inode->i_blkbits);
+	while (bh) {
+		if (count == 0)
+			bh->b_blocknr = 0;
+		else {
+			bh->b_blocknr = bmap(inode, block);
+			if (bh->b_blocknr == 0) {
+				/* Cannot use this file! */
+				free_buffers(page);
+				page = ERR_PTR(-EINVAL);
+				goto out;
+			}
+			bh->b_bdev = inode->i_sb->s_bdev;
+			if (count < (1<<inode->i_blkbits))
+				count = 0;
+			else
+				count -= (1<<inode->i_blkbits);
+
+			bh->b_end_io = end_bitmap_write;
+			bh->b_private = bitmap;
+			atomic_inc(&bitmap->pending_writes);
+			set_buffer_locked(bh);
+			set_buffer_mapped(bh);
+			submit_bh(READ, bh);
+		}
+		block++;
+		bh = bh->b_this_page;
+	}
+	page->index = index;
 
-	if (index > end_index) /* we have read beyond EOF */
-		*bytes_read = 0;
-	else if (index == end_index) /* possible short read */
-		*bytes_read = isize & ~PAGE_MASK;
-	else
-		*bytes_read = PAGE_SIZE; /* got a full page */
+	wait_event(bitmap->write_wait,
+		   atomic_read(&bitmap->pending_writes)==0);
+	if (bitmap->flags & BITMAP_WRITE_ERROR) {
+		free_buffers(page);
+		page = ERR_PTR(-EIO);
+	}
 out:
 	if (IS_ERR(page))
 		printk(KERN_ALERT "md: bitmap read error: (%dB @ %Lu): %ld\n",
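The rewritten read_page() above records, via bmap(), the device block backing each filesystem block of the bitmap file, so later writes can go straight to the block device. A rough user-space analogue of that mapping (a hedged sketch only; it uses the FIBMAP ioctl, which needs root and is not what the kernel code calls, but it exposes the same logical-to-physical translation):

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/stat.h>
    #include <linux/fs.h>               /* FIBMAP, FIGETBSZ */

    int main(int argc, char **argv)
    {
            int fd, bsz, i;
            struct stat st;

            if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0)
                    return 1;
            fstat(fd, &st);
            ioctl(fd, FIGETBSZ, &bsz);  /* filesystem block size */

            for (i = 0; (long long)i * bsz < st.st_size; i++) {
                    int blk = i;        /* logical block in, physical block out */
                    if (ioctl(fd, FIBMAP, &blk) < 0)
                            return 1;
                    printf("logical %d -> device block %d\n", i, blk);
            }
            close(fd);
            return 0;
    }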
@@ -441,16 +475,14 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	char *reason = NULL;
 	bitmap_super_t *sb;
 	unsigned long chunksize, daemon_sleep, write_behind;
-	unsigned long bytes_read;
 	unsigned long long events;
 	int err = -EINVAL;
 
 	/* page 0 is the superblock, read it... */
 	if (bitmap->file)
-		bitmap->sb_page = read_page(bitmap->file, 0, &bytes_read);
+		bitmap->sb_page = read_page(bitmap->file, 0, bitmap, PAGE_SIZE);
 	else {
 		bitmap->sb_page = read_sb_page(bitmap->mddev, bitmap->offset, 0);
-		bytes_read = PAGE_SIZE;
 	}
 	if (IS_ERR(bitmap->sb_page)) {
 		err = PTR_ERR(bitmap->sb_page);
@@ -460,13 +492,6 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 
 	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
 
-	if (bytes_read < sizeof(*sb)) { /* short read */
-		printk(KERN_INFO "%s: bitmap file superblock truncated\n",
-			bmname(bitmap));
-		err = -ENOSPC;
-		goto out;
-	}
-
 	chunksize = le32_to_cpu(sb->chunksize);
 	daemon_sleep = le32_to_cpu(sb->daemon_sleep);
 	write_behind = le32_to_cpu(sb->write_behind);
@@ -550,7 +575,6 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
 		spin_unlock_irqrestore(&bitmap->lock, flags);
 		return;
 	}
-	get_page(bitmap->sb_page);
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
 	switch (op) {
@@ -561,7 +585,6 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
 		default: BUG();
 	}
 	kunmap_atomic(sb, KM_USER0);
-	put_page(bitmap->sb_page);
 }
 
 /*
@@ -614,48 +637,17 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
 
 	while (pages--)
 		if (map[pages]->index != 0) /* 0 is sb_page, release it below */
-			put_page(map[pages]);
+			free_buffers(map[pages]);
 	kfree(map);
 	kfree(attr);
 
-	safe_put_page(sb_page);
-}
-
-static void bitmap_stop_daemon(struct bitmap *bitmap);
-
-/* dequeue the next item in a page list -- don't call from irq context */
-static struct page_list *dequeue_page(struct bitmap *bitmap)
-{
-	struct page_list *item = NULL;
-	struct list_head *head = &bitmap->complete_pages;
-
-	spin_lock(&bitmap->write_lock);
-	if (list_empty(head))
-		goto out;
-	item = list_entry(head->prev, struct page_list, list);
-	list_del(head->prev);
-out:
-	spin_unlock(&bitmap->write_lock);
-	return item;
-}
-
-static void drain_write_queues(struct bitmap *bitmap)
-{
-	struct page_list *item;
-
-	while ((item = dequeue_page(bitmap))) {
-		/* don't bother to wait */
-		put_page(item->page);
-		mempool_free(item, bitmap->write_pool);
-	}
-
-	wake_up(&bitmap->write_wait);
+	if (sb_page)
+		free_buffers(sb_page);
 }
 
 static void bitmap_file_put(struct bitmap *bitmap)
 {
 	struct file *file;
-	struct inode *inode;
 	unsigned long flags;
 
 	spin_lock_irqsave(&bitmap->lock, flags);
@@ -663,17 +655,14 @@ static void bitmap_file_put(struct bitmap *bitmap)
 	bitmap->file = NULL;
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 
-	bitmap_stop_daemon(bitmap);
-
-	drain_write_queues(bitmap);
-
+	if (file)
+		wait_event(bitmap->write_wait,
+			   atomic_read(&bitmap->pending_writes)==0);
 	bitmap_file_unmap(bitmap);
 
 	if (file) {
-		inode = file->f_mapping->host;
-		spin_lock(&inode->i_lock);
-		atomic_set(&inode->i_writecount, 1); /* allow writes again */
-		spin_unlock(&inode->i_lock);
+		struct inode *inode = file->f_dentry->d_inode;
+		invalidate_inode_pages(inode->i_mapping);
 		fput(file);
 	}
 }
@@ -708,26 +697,27 @@ static void bitmap_file_kick(struct bitmap *bitmap)
 }
 
 enum bitmap_page_attr {
-	BITMAP_PAGE_DIRTY = 1, // there are set bits that need to be synced
-	BITMAP_PAGE_CLEAN = 2, // there are bits that might need to be cleared
-	BITMAP_PAGE_NEEDWRITE=4, // there are cleared bits that need to be synced
+	BITMAP_PAGE_DIRTY = 0, // there are set bits that need to be synced
+	BITMAP_PAGE_CLEAN = 1, // there are bits that might need to be cleared
+	BITMAP_PAGE_NEEDWRITE=2, // there are cleared bits that need to be synced
 };
 
 static inline void set_page_attr(struct bitmap *bitmap, struct page *page,
 				enum bitmap_page_attr attr)
 {
-	bitmap->filemap_attr[page->index] |= attr;
+	__set_bit((page->index<<2) + attr, bitmap->filemap_attr);
 }
 
 static inline void clear_page_attr(struct bitmap *bitmap, struct page *page,
 				enum bitmap_page_attr attr)
 {
-	bitmap->filemap_attr[page->index] &= ~attr;
+	__clear_bit((page->index<<2) + attr, bitmap->filemap_attr);
}
 
-static inline unsigned long get_page_attr(struct bitmap *bitmap, struct page *page)
+static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page,
+					   enum bitmap_page_attr attr)
 {
-	return bitmap->filemap_attr[page->index];
+	return test_bit((page->index<<2) + attr, bitmap->filemap_attr);
 }
 
 /*
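With the enum values above renumbered 0/1/2, each bitmap file page now owns a group of four consecutive bits in the filemap_attr bit array, addressed as (page->index << 2) + attr. A standalone sketch of that addressing (plain C bit operations standing in for the kernel's __set_bit/test_bit helpers, illustrative only):

    #include <limits.h>
    #include <stdio.h>

    #define BITS_PER_LONG   (sizeof(unsigned long) * CHAR_BIT)

    /* four attribute bits per page, packed into an unsigned long array */
    static void set_attr(unsigned long *attrs, unsigned long page, int attr)
    {
            unsigned long bit = (page << 2) + attr;
            attrs[bit / BITS_PER_LONG] |= 1UL << (bit % BITS_PER_LONG);
    }

    static int test_attr(unsigned long *attrs, unsigned long page, int attr)
    {
            unsigned long bit = (page << 2) + attr;
            return (attrs[bit / BITS_PER_LONG] >> (bit % BITS_PER_LONG)) & 1;
    }

    int main(void)
    {
            unsigned long attrs[4] = { 0 };

            set_attr(attrs, 17, 2);     /* mark page 17 as NEEDWRITE (= 2) */
            printf("%d %d\n", test_attr(attrs, 17, 2), test_attr(attrs, 17, 0));
            return 0;
    }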
@@ -751,11 +741,6 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 	page = filemap_get_page(bitmap, chunk);
 	bit = file_page_offset(chunk);
 
-
-	/* make sure the page stays cached until it gets written out */
-	if (! (get_page_attr(bitmap, page) & BITMAP_PAGE_DIRTY))
-		get_page(page);
-
 	/* set the bit */
 	kaddr = kmap_atomic(page, KM_USER0);
 	if (bitmap->flags & BITMAP_HOSTENDIAN)
@@ -775,7 +760,8 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
  * sync the dirty pages of the bitmap file to disk */
 int bitmap_unplug(struct bitmap *bitmap)
 {
-	unsigned long i, attr, flags;
+	unsigned long i, flags;
+	int dirty, need_write;
 	struct page *page;
 	int wait = 0;
 	int err;
@@ -792,35 +778,26 @@ int bitmap_unplug(struct bitmap *bitmap)
 			return 0;
 		}
 		page = bitmap->filemap[i];
-		attr = get_page_attr(bitmap, page);
+		dirty = test_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
+		need_write = test_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
 		clear_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
 		clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
-		if ((attr & BITMAP_PAGE_DIRTY))
+		if (dirty)
 			wait = 1;
 		spin_unlock_irqrestore(&bitmap->lock, flags);
 
-		if (attr & (BITMAP_PAGE_DIRTY | BITMAP_PAGE_NEEDWRITE)) {
+		if (dirty | need_write)
 			err = write_page(bitmap, page, 0);
-			if (err == -EAGAIN) {
-				if (attr & BITMAP_PAGE_DIRTY)
-					err = write_page(bitmap, page, 1);
-				else
-					err = 0;
-			}
-			if (err)
-				return 1;
-		}
 	}
 	if (wait) { /* if any writes were performed, we need to wait on them */
-		if (bitmap->file) {
-			spin_lock_irq(&bitmap->write_lock);
-			wait_event_lock_irq(bitmap->write_wait,
-				list_empty(&bitmap->complete_pages), bitmap->write_lock,
-				wake_up_process(bitmap->writeback_daemon->tsk));
-			spin_unlock_irq(&bitmap->write_lock);
-		} else
+		if (bitmap->file)
+			wait_event(bitmap->write_wait,
+				   atomic_read(&bitmap->pending_writes)==0);
+		else
 			md_super_wait(bitmap->mddev);
 	}
+	if (bitmap->flags & BITMAP_WRITE_ERROR)
+		bitmap_file_kick(bitmap);
 	return 0;
 }
 
@@ -842,7 +819,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 	struct page *page = NULL, *oldpage = NULL;
 	unsigned long num_pages, bit_cnt = 0;
 	struct file *file;
-	unsigned long bytes, offset, dummy;
+	unsigned long bytes, offset;
 	int outofdate;
 	int ret = -ENOSPC;
 	void *paddr;
@@ -879,7 +856,12 @@
 	if (!bitmap->filemap)
 		goto out;
 
-	bitmap->filemap_attr = kzalloc(sizeof(long) * num_pages, GFP_KERNEL);
+	/* We need 4 bits per page, rounded up to a multiple of sizeof(unsigned long) */
+	bitmap->filemap_attr = kzalloc(
+		(((num_pages*4/8)+sizeof(unsigned long)-1)
+		 /sizeof(unsigned long))
+		*sizeof(unsigned long),
+		GFP_KERNEL);
 	if (!bitmap->filemap_attr)
 		goto out;
 
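The kzalloc() size in the hunk above follows the same layout: num_pages * 4 bits, converted to bytes and rounded up to a whole number of unsigned longs so the bit helpers never touch memory past the allocation. A small sketch of the arithmetic (values assume 8-byte longs):

    #include <stdio.h>

    /* bytes needed for 4 attribute bits per page, rounded up to whole longs */
    static unsigned long filemap_attr_bytes(unsigned long num_pages)
    {
            unsigned long bytes = num_pages * 4 / 8;   /* 4 bits = half a byte per page */

            return (bytes + sizeof(unsigned long) - 1)
                    / sizeof(unsigned long) * sizeof(unsigned long);
    }

    int main(void)
    {
            /* 100 pages -> 56 bytes, 1000 pages -> 504 bytes (with 8-byte longs) */
            printf("%lu %lu\n", filemap_attr_bytes(100), filemap_attr_bytes(1000));
            return 0;
    }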
@@ -890,7 +872,12 @@
 		index = file_page_index(i);
 		bit = file_page_offset(i);
 		if (index != oldindex) { /* this is a new page, read it in */
+			int count;
 			/* unmap the old page, we're done with it */
+			if (index == num_pages-1)
+				count = bytes - index * PAGE_SIZE;
+			else
+				count = PAGE_SIZE;
 			if (index == 0) {
 				/*
 				 * if we're here then the superblock page
@@ -900,7 +887,7 @@
 				page = bitmap->sb_page;
 				offset = sizeof(bitmap_super_t);
 			} else if (file) {
-				page = read_page(file, index, &dummy);
+				page = read_page(file, index, bitmap, count);
 				offset = 0;
 			} else {
 				page = read_sb_page(bitmap->mddev, bitmap->offset, index);
@@ -971,12 +958,11 @@ void bitmap_write_all(struct bitmap *bitmap)
 	/* We don't actually write all bitmap blocks here,
 	 * just flag them as needing to be written
 	 */
+	int i;
 
-	unsigned long chunks = bitmap->chunks;
-	unsigned long bytes = (chunks+7)/8 + sizeof(bitmap_super_t);
-	unsigned long num_pages = (bytes + PAGE_SIZE-1) / PAGE_SIZE;
-	while (num_pages--)
-		bitmap->filemap_attr[num_pages] |= BITMAP_PAGE_NEEDWRITE;
+	for (i=0; i < bitmap->file_pages; i++)
+		set_page_attr(bitmap, bitmap->filemap[i],
+			      BITMAP_PAGE_NEEDWRITE);
 }
 
 
@@ -1007,7 +993,6 @@ int bitmap_daemon_work(struct bitmap *bitmap)
 	struct page *page = NULL, *lastpage = NULL;
 	int err = 0;
 	int blocks;
-	int attr;
 	void *paddr;
 
 	if (bitmap == NULL)
@@ -1029,43 +1014,34 @@
 
 		if (page != lastpage) {
 			/* skip this page unless it's marked as needing cleaning */
-			if (!((attr=get_page_attr(bitmap, page)) & BITMAP_PAGE_CLEAN)) {
-				if (attr & BITMAP_PAGE_NEEDWRITE) {
-					get_page(page);
+			if (!test_page_attr(bitmap, page, BITMAP_PAGE_CLEAN)) {
+				int need_write = test_page_attr(bitmap, page,
+								BITMAP_PAGE_NEEDWRITE);
+				if (need_write)
 					clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
-				}
+
 				spin_unlock_irqrestore(&bitmap->lock, flags);
-				if (attr & BITMAP_PAGE_NEEDWRITE) {
+				if (need_write) {
 					switch (write_page(bitmap, page, 0)) {
-					case -EAGAIN:
-						set_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
-						break;
 					case 0:
 						break;
 					default:
 						bitmap_file_kick(bitmap);
 					}
-					put_page(page);
 				}
 				continue;
 			}
 
 			/* grab the new page, sync and release the old */
-			get_page(page);
 			if (lastpage != NULL) {
-				if (get_page_attr(bitmap, lastpage) & BITMAP_PAGE_NEEDWRITE) {
+				if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) {
 					clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
 					spin_unlock_irqrestore(&bitmap->lock, flags);
 					err = write_page(bitmap, lastpage, 0);
-					if (err == -EAGAIN) {
-						err = 0;
-						set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
-					}
 				} else {
 					set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
 					spin_unlock_irqrestore(&bitmap->lock, flags);
 				}
-				put_page(lastpage);
 				if (err)
 					bitmap_file_kick(bitmap);
 			} else
@@ -1107,131 +1083,19 @@ int bitmap_daemon_work(struct bitmap *bitmap)
 	/* now sync the final page */
 	if (lastpage != NULL) {
 		spin_lock_irqsave(&bitmap->lock, flags);
-		if (get_page_attr(bitmap, lastpage) &BITMAP_PAGE_NEEDWRITE) {
+		if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) {
 			clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
 			spin_unlock_irqrestore(&bitmap->lock, flags);
 			err = write_page(bitmap, lastpage, 0);
-			if (err == -EAGAIN) {
-				set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
-				err = 0;
-			}
 		} else {
 			set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
 			spin_unlock_irqrestore(&bitmap->lock, flags);
 		}
-
-		put_page(lastpage);
 	}
 
 	return err;
 }
 
-static void daemon_exit(struct bitmap *bitmap, mdk_thread_t **daemon)
-{
-	mdk_thread_t *dmn;
-	unsigned long flags;
-
-	/* if no one is waiting on us, we'll free the md thread struct
-	 * and exit, otherwise we let the waiter clean things up */
-	spin_lock_irqsave(&bitmap->lock, flags);
-	if ((dmn = *daemon)) { /* no one is waiting, cleanup and exit */
-		*daemon = NULL;
-		spin_unlock_irqrestore(&bitmap->lock, flags);
-		kfree(dmn);
-		complete_and_exit(NULL, 0); /* do_exit not exported */
-	}
-	spin_unlock_irqrestore(&bitmap->lock, flags);
-}
-
-static void bitmap_writeback_daemon(mddev_t *mddev)
-{
-	struct bitmap *bitmap = mddev->bitmap;
-	struct page *page;
-	struct page_list *item;
-	int err = 0;
-
-	if (signal_pending(current)) {
-		printk(KERN_INFO
-		       "%s: bitmap writeback daemon got signal, exiting...\n",
-		       bmname(bitmap));
-		err = -EINTR;
-		goto out;
-	}
-	if (bitmap == NULL)
-		/* about to be stopped. */
-		return;
-
-	PRINTK("%s: bitmap writeback daemon woke up...\n", bmname(bitmap));
-	/* wait on bitmap page writebacks */
-	while ((item = dequeue_page(bitmap))) {
-		page = item->page;
-		mempool_free(item, bitmap->write_pool);
-		PRINTK("wait on page writeback: %p\n", page);
-		wait_on_page_writeback(page);
-		PRINTK("finished page writeback: %p\n", page);
-
-		err = PageError(page);
-		put_page(page);
-		if (err) {
-			printk(KERN_WARNING "%s: bitmap file writeback "
-			       "failed (page %lu): %d\n",
-			       bmname(bitmap), page->index, err);
-			bitmap_file_kick(bitmap);
-			goto out;
-		}
-	}
- out:
-	wake_up(&bitmap->write_wait);
-	if (err) {
-		printk(KERN_INFO "%s: bitmap writeback daemon exiting (%d)\n",
-		       bmname(bitmap), err);
-		daemon_exit(bitmap, &bitmap->writeback_daemon);
-	}
-}
-
-static mdk_thread_t *bitmap_start_daemon(struct bitmap *bitmap,
-					 void (*func)(mddev_t *), char *name)
-{
-	mdk_thread_t *daemon;
-	char namebuf[32];
-
-#ifdef INJECT_FATAL_FAULT_2
-	daemon = NULL;
-#else
-	sprintf(namebuf, "%%s_%s", name);
-	daemon = md_register_thread(func, bitmap->mddev, namebuf);
-#endif
-	if (!daemon) {
-		printk(KERN_ERR "%s: failed to start bitmap daemon\n",
-		       bmname(bitmap));
-		return ERR_PTR(-ECHILD);
-	}
-
-	md_wakeup_thread(daemon); /* start it running */
-
-	PRINTK("%s: %s daemon (pid %d) started...\n",
-	       bmname(bitmap), name, daemon->tsk->pid);
-
-	return daemon;
-}
-
-static void bitmap_stop_daemon(struct bitmap *bitmap)
-{
-	/* the daemon can't stop itself... it'll just exit instead... */
-	if (bitmap->writeback_daemon && ! IS_ERR(bitmap->writeback_daemon) &&
-	    current->pid != bitmap->writeback_daemon->tsk->pid) {
-		mdk_thread_t *daemon;
-		unsigned long flags;
-
-		spin_lock_irqsave(&bitmap->lock, flags);
-		daemon = bitmap->writeback_daemon;
-		bitmap->writeback_daemon = NULL;
-		spin_unlock_irqrestore(&bitmap->lock, flags);
-		if (daemon && ! IS_ERR(daemon))
-			md_unregister_thread(daemon); /* destroy the thread */
-	}
-}
-
 static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
 					    sector_t offset, int *blocks,
 					    int create)
@@ -1500,8 +1364,6 @@ static void bitmap_free(struct bitmap *bitmap)
 
 	/* free all allocated memory */
 
-	mempool_destroy(bitmap->write_pool);
-
 	if (bp) /* deallocate the page memory */
 		for (k = 0; k < pages; k++)
 			if (bp[k].map && !bp[k].hijacked)
@@ -1549,20 +1411,20 @@ int bitmap_create(mddev_t *mddev)
 		return -ENOMEM;
 
 	spin_lock_init(&bitmap->lock);
-	bitmap->mddev = mddev;
-
-	spin_lock_init(&bitmap->write_lock);
-	INIT_LIST_HEAD(&bitmap->complete_pages);
+	atomic_set(&bitmap->pending_writes, 0);
 	init_waitqueue_head(&bitmap->write_wait);
-	bitmap->write_pool = mempool_create_kmalloc_pool(WRITE_POOL_SIZE,
-						sizeof(struct page_list));
-	err = -ENOMEM;
-	if (!bitmap->write_pool)
-		goto error;
+
+	bitmap->mddev = mddev;
 
 	bitmap->file = file;
 	bitmap->offset = mddev->bitmap_offset;
-	if (file) get_file(file);
+	if (file) {
+		get_file(file);
+		do_sync_file_range(file, 0, LLONG_MAX,
+				   SYNC_FILE_RANGE_WAIT_BEFORE |
+				   SYNC_FILE_RANGE_WRITE |
+				   SYNC_FILE_RANGE_WAIT_AFTER);
+	}
 	/* read superblock from bitmap file (this sets bitmap->chunksize) */
 	err = bitmap_read_sb(bitmap);
 	if (err)
@@ -1594,8 +1456,6 @@ int bitmap_create(mddev_t *mddev)
 	if (!bitmap->bp)
 		goto error;
 
-	bitmap->flags |= BITMAP_ACTIVE;
-
 	/* now that we have some pages available, initialize the in-memory
 	 * bitmap from the on-disk bitmap */
 	start = 0;
@@ -1613,15 +1473,6 @@
 
 	mddev->bitmap = bitmap;
 
-	if (file)
-		/* kick off the bitmap writeback daemon */
-		bitmap->writeback_daemon =
-			bitmap_start_daemon(bitmap,
-					    bitmap_writeback_daemon,
-					    "bitmap_wb");
-
-	if (IS_ERR(bitmap->writeback_daemon))
-		return PTR_ERR(bitmap->writeback_daemon);
 	mddev->thread->timeout = bitmap->daemon_sleep * HZ;
 
 	return bitmap_update_sb(bitmap);
@@ -1638,4 +1489,3 @@ EXPORT_SYMBOL(bitmap_start_sync)
 EXPORT_SYMBOL(bitmap_end_sync);
 EXPORT_SYMBOL(bitmap_unplug);
 EXPORT_SYMBOL(bitmap_close_sync);
-EXPORT_SYMBOL(bitmap_daemon_work);
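Throughout the bitmap.c changes above, the dedicated writeback daemon and its page list are replaced by a simple completion pattern: each submitted buffer bumps an atomic pending_writes counter, end_bitmap_write() drops it and wakes a wait queue, and callers block until it reaches zero. A hedged pthread sketch of the same idea (user-space illustration only; the kernel code uses atomic_t, wait_event() and wake_up()):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
    static int pending_writes;          /* in-flight I/Os */

    static void submit_one(void)        /* before handing a buffer to the device */
    {
            pthread_mutex_lock(&lock);
            pending_writes++;
            pthread_mutex_unlock(&lock);
    }

    static void *completion(void *unused)   /* analogue of end_bitmap_write() */
    {
            pthread_mutex_lock(&lock);
            if (--pending_writes == 0)
                    pthread_cond_broadcast(&done);  /* analogue of wake_up() */
            pthread_mutex_unlock(&lock);
            return NULL;
    }

    static void wait_for_writes(void)   /* analogue of wait_event(..., pending == 0) */
    {
            pthread_mutex_lock(&lock);
            while (pending_writes)
                    pthread_cond_wait(&done, &lock);
            pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
            pthread_t t;

            submit_one();
            pthread_create(&t, NULL, completion, NULL);
            wait_for_writes();
            pthread_join(t, NULL);
            printf("all writes completed\n");
            return 0;
    }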
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 61a590bb6241..bdbd34993a80 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -5,6 +5,7 @@
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
7 7
8#include <linux/err.h>
8#include <linux/module.h> 9#include <linux/module.h>
9#include <linux/init.h> 10#include <linux/init.h>
10#include <linux/kernel.h> 11#include <linux/kernel.h>
@@ -20,7 +21,7 @@
20 21
21#include "dm.h" 22#include "dm.h"
22 23
23#define PFX "crypt: " 24#define DM_MSG_PREFIX "crypt"
24 25
25/* 26/*
26 * per bio private data 27 * per bio private data
@@ -78,11 +79,13 @@ struct crypt_config {
78 */ 79 */
79 struct crypt_iv_operations *iv_gen_ops; 80 struct crypt_iv_operations *iv_gen_ops;
80 char *iv_mode; 81 char *iv_mode;
81 void *iv_gen_private; 82 struct crypto_cipher *iv_gen_private;
82 sector_t iv_offset; 83 sector_t iv_offset;
83 unsigned int iv_size; 84 unsigned int iv_size;
84 85
85 struct crypto_tfm *tfm; 86 char cipher[CRYPTO_MAX_ALG_NAME];
87 char chainmode[CRYPTO_MAX_ALG_NAME];
88 struct crypto_blkcipher *tfm;
86 unsigned int key_size; 89 unsigned int key_size;
87 u8 key[0]; 90 u8 key[0];
88}; 91};
@@ -96,12 +99,12 @@ static kmem_cache_t *_crypt_io_pool;
96/* 99/*
97 * Different IV generation algorithms: 100 * Different IV generation algorithms:
98 * 101 *
99 * plain: the initial vector is the 32-bit low-endian version of the sector 102 * plain: the initial vector is the 32-bit little-endian version of the sector
100 * number, padded with zeros if neccessary. 103 * number, padded with zeros if neccessary.
101 * 104 *
102 * ess_iv: "encrypted sector|salt initial vector", the sector number is 105 * essiv: "encrypted sector|salt initial vector", the sector number is
103 * encrypted with the bulk cipher using a salt as key. The salt 106 * encrypted with the bulk cipher using a salt as key. The salt
104 * should be derived from the bulk cipher's key via hashing. 107 * should be derived from the bulk cipher's key via hashing.
105 * 108 *
106 * plumb: unimplemented, see: 109 * plumb: unimplemented, see:
107 * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 110 * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
@@ -118,88 +121,84 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
118static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, 121static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
119 const char *opts) 122 const char *opts)
120{ 123{
121 struct crypto_tfm *essiv_tfm; 124 struct crypto_cipher *essiv_tfm;
122 struct crypto_tfm *hash_tfm; 125 struct crypto_hash *hash_tfm;
126 struct hash_desc desc;
123 struct scatterlist sg; 127 struct scatterlist sg;
124 unsigned int saltsize; 128 unsigned int saltsize;
125 u8 *salt; 129 u8 *salt;
130 int err;
126 131
127 if (opts == NULL) { 132 if (opts == NULL) {
128 ti->error = PFX "Digest algorithm missing for ESSIV mode"; 133 ti->error = "Digest algorithm missing for ESSIV mode";
129 return -EINVAL; 134 return -EINVAL;
130 } 135 }
131 136
132 /* Hash the cipher key with the given hash algorithm */ 137 /* Hash the cipher key with the given hash algorithm */
133 hash_tfm = crypto_alloc_tfm(opts, CRYPTO_TFM_REQ_MAY_SLEEP); 138 hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC);
134 if (hash_tfm == NULL) { 139 if (IS_ERR(hash_tfm)) {
135 ti->error = PFX "Error initializing ESSIV hash"; 140 ti->error = "Error initializing ESSIV hash";
136 return -EINVAL; 141 return PTR_ERR(hash_tfm);
137 } 142 }
138 143
139 if (crypto_tfm_alg_type(hash_tfm) != CRYPTO_ALG_TYPE_DIGEST) { 144 saltsize = crypto_hash_digestsize(hash_tfm);
140 ti->error = PFX "Expected digest algorithm for ESSIV hash";
141 crypto_free_tfm(hash_tfm);
142 return -EINVAL;
143 }
144
145 saltsize = crypto_tfm_alg_digestsize(hash_tfm);
146 salt = kmalloc(saltsize, GFP_KERNEL); 145 salt = kmalloc(saltsize, GFP_KERNEL);
147 if (salt == NULL) { 146 if (salt == NULL) {
148 ti->error = PFX "Error kmallocing salt storage in ESSIV"; 147 ti->error = "Error kmallocing salt storage in ESSIV";
149 crypto_free_tfm(hash_tfm); 148 crypto_free_hash(hash_tfm);
150 return -ENOMEM; 149 return -ENOMEM;
151 } 150 }
152 151
153 sg_set_buf(&sg, cc->key, cc->key_size); 152 sg_set_buf(&sg, cc->key, cc->key_size);
154 crypto_digest_digest(hash_tfm, &sg, 1, salt); 153 desc.tfm = hash_tfm;
155 crypto_free_tfm(hash_tfm); 154 desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
155 err = crypto_hash_digest(&desc, &sg, cc->key_size, salt);
156 crypto_free_hash(hash_tfm);
157
158 if (err) {
159 ti->error = "Error calculating hash in ESSIV";
160 return err;
161 }
156 162
157 /* Setup the essiv_tfm with the given salt */ 163 /* Setup the essiv_tfm with the given salt */
158 essiv_tfm = crypto_alloc_tfm(crypto_tfm_alg_name(cc->tfm), 164 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
159 CRYPTO_TFM_MODE_ECB | 165 if (IS_ERR(essiv_tfm)) {
160 CRYPTO_TFM_REQ_MAY_SLEEP); 166 ti->error = "Error allocating crypto tfm for ESSIV";
161 if (essiv_tfm == NULL) {
162 ti->error = PFX "Error allocating crypto tfm for ESSIV";
163 kfree(salt); 167 kfree(salt);
164 return -EINVAL; 168 return PTR_ERR(essiv_tfm);
165 } 169 }
166 if (crypto_tfm_alg_blocksize(essiv_tfm) 170 if (crypto_cipher_blocksize(essiv_tfm) !=
167 != crypto_tfm_alg_ivsize(cc->tfm)) { 171 crypto_blkcipher_ivsize(cc->tfm)) {
168 ti->error = PFX "Block size of ESSIV cipher does " 172 ti->error = "Block size of ESSIV cipher does "
169 "not match IV size of block cipher"; 173 "not match IV size of block cipher";
170 crypto_free_tfm(essiv_tfm); 174 crypto_free_cipher(essiv_tfm);
171 kfree(salt); 175 kfree(salt);
172 return -EINVAL; 176 return -EINVAL;
173 } 177 }
174 if (crypto_cipher_setkey(essiv_tfm, salt, saltsize) < 0) { 178 err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
175 ti->error = PFX "Failed to set key for ESSIV cipher"; 179 if (err) {
176 crypto_free_tfm(essiv_tfm); 180 ti->error = "Failed to set key for ESSIV cipher";
181 crypto_free_cipher(essiv_tfm);
177 kfree(salt); 182 kfree(salt);
178 return -EINVAL; 183 return err;
179 } 184 }
180 kfree(salt); 185 kfree(salt);
181 186
182 cc->iv_gen_private = (void *)essiv_tfm; 187 cc->iv_gen_private = essiv_tfm;
183 return 0; 188 return 0;
184} 189}
185 190
186static void crypt_iv_essiv_dtr(struct crypt_config *cc) 191static void crypt_iv_essiv_dtr(struct crypt_config *cc)
187{ 192{
188 crypto_free_tfm((struct crypto_tfm *)cc->iv_gen_private); 193 crypto_free_cipher(cc->iv_gen_private);
189 cc->iv_gen_private = NULL; 194 cc->iv_gen_private = NULL;
190} 195}
191 196
192static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 197static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
193{ 198{
194 struct scatterlist sg;
195
196 memset(iv, 0, cc->iv_size); 199 memset(iv, 0, cc->iv_size);
197 *(u64 *)iv = cpu_to_le64(sector); 200 *(u64 *)iv = cpu_to_le64(sector);
198 201 crypto_cipher_encrypt_one(cc->iv_gen_private, iv, iv);
199 sg_set_buf(&sg, iv, cc->iv_size);
200 crypto_cipher_encrypt((struct crypto_tfm *)cc->iv_gen_private,
201 &sg, &sg, cc->iv_size);
202
203 return 0; 202 return 0;
204} 203}
205 204
@@ -220,6 +219,11 @@ crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out,
220 int write, sector_t sector) 219 int write, sector_t sector)
221{ 220{
222 u8 iv[cc->iv_size]; 221 u8 iv[cc->iv_size];
222 struct blkcipher_desc desc = {
223 .tfm = cc->tfm,
224 .info = iv,
225 .flags = CRYPTO_TFM_REQ_MAY_SLEEP,
226 };
223 int r; 227 int r;
224 228
225 if (cc->iv_gen_ops) { 229 if (cc->iv_gen_ops) {
@@ -228,14 +232,14 @@ crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out,
228 return r; 232 return r;
229 233
230 if (write) 234 if (write)
231 r = crypto_cipher_encrypt_iv(cc->tfm, out, in, length, iv); 235 r = crypto_blkcipher_encrypt_iv(&desc, out, in, length);
232 else 236 else
233 r = crypto_cipher_decrypt_iv(cc->tfm, out, in, length, iv); 237 r = crypto_blkcipher_decrypt_iv(&desc, out, in, length);
234 } else { 238 } else {
235 if (write) 239 if (write)
236 r = crypto_cipher_encrypt(cc->tfm, out, in, length); 240 r = crypto_blkcipher_encrypt(&desc, out, in, length);
237 else 241 else
238 r = crypto_cipher_decrypt(cc->tfm, out, in, length); 242 r = crypto_blkcipher_decrypt(&desc, out, in, length);
239 } 243 }
240 244
241 return r; 245 return r;
@@ -510,18 +514,17 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
510static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) 514static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
511{ 515{
512 struct crypt_config *cc; 516 struct crypt_config *cc;
513 struct crypto_tfm *tfm; 517 struct crypto_blkcipher *tfm;
514 char *tmp; 518 char *tmp;
515 char *cipher; 519 char *cipher;
516 char *chainmode; 520 char *chainmode;
517 char *ivmode; 521 char *ivmode;
518 char *ivopts; 522 char *ivopts;
519 unsigned int crypto_flags;
520 unsigned int key_size; 523 unsigned int key_size;
521 unsigned long long tmpll; 524 unsigned long long tmpll;
522 525
523 if (argc != 5) { 526 if (argc != 5) {
524 ti->error = PFX "Not enough arguments"; 527 ti->error = "Not enough arguments";
525 return -EINVAL; 528 return -EINVAL;
526 } 529 }
527 530
@@ -532,21 +535,21 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
532 ivmode = strsep(&ivopts, ":"); 535 ivmode = strsep(&ivopts, ":");
533 536
534 if (tmp) 537 if (tmp)
535 DMWARN(PFX "Unexpected additional cipher options"); 538 DMWARN("Unexpected additional cipher options");
536 539
537 key_size = strlen(argv[1]) >> 1; 540 key_size = strlen(argv[1]) >> 1;
538 541
539 cc = kmalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); 542 cc = kmalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL);
540 if (cc == NULL) { 543 if (cc == NULL) {
541 ti->error = 544 ti->error =
542 PFX "Cannot allocate transparent encryption context"; 545 "Cannot allocate transparent encryption context";
543 return -ENOMEM; 546 return -ENOMEM;
544 } 547 }
545 548
546 cc->key_size = key_size; 549 cc->key_size = key_size;
547 if ((!key_size && strcmp(argv[1], "-") != 0) || 550 if ((!key_size && strcmp(argv[1], "-") != 0) ||
548 (key_size && crypt_decode_key(cc->key, argv[1], key_size) < 0)) { 551 (key_size && crypt_decode_key(cc->key, argv[1], key_size) < 0)) {
549 ti->error = PFX "Error decoding key"; 552 ti->error = "Error decoding key";
550 goto bad1; 553 goto bad1;
551 } 554 }
552 555
@@ -556,31 +559,25 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
556 ivmode = "plain"; 559 ivmode = "plain";
557 } 560 }
558 561
559 /* Choose crypto_flags according to chainmode */ 562 if (strcmp(chainmode, "ecb") && !ivmode) {
560 if (strcmp(chainmode, "cbc") == 0) 563 ti->error = "This chaining mode requires an IV mechanism";
561 crypto_flags = CRYPTO_TFM_MODE_CBC;
562 else if (strcmp(chainmode, "ecb") == 0)
563 crypto_flags = CRYPTO_TFM_MODE_ECB;
564 else {
565 ti->error = PFX "Unknown chaining mode";
566 goto bad1; 564 goto bad1;
567 } 565 }
568 566
569 if (crypto_flags != CRYPTO_TFM_MODE_ECB && !ivmode) { 567 if (snprintf(cc->cipher, CRYPTO_MAX_ALG_NAME, "%s(%s)", chainmode,
570 ti->error = PFX "This chaining mode requires an IV mechanism"; 568 cipher) >= CRYPTO_MAX_ALG_NAME) {
569 ti->error = "Chain mode + cipher name is too long";
571 goto bad1; 570 goto bad1;
572 } 571 }
573 572
574 tfm = crypto_alloc_tfm(cipher, crypto_flags | CRYPTO_TFM_REQ_MAY_SLEEP); 573 tfm = crypto_alloc_blkcipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
575 if (!tfm) { 574 if (IS_ERR(tfm)) {
576 ti->error = PFX "Error allocating crypto tfm"; 575 ti->error = "Error allocating crypto tfm";
577 goto bad1; 576 goto bad1;
578 } 577 }
579 if (crypto_tfm_alg_type(tfm) != CRYPTO_ALG_TYPE_CIPHER) {
580 ti->error = PFX "Expected cipher algorithm";
581 goto bad2;
582 }
583 578
579 strcpy(cc->cipher, cipher);
580 strcpy(cc->chainmode, chainmode);
584 cc->tfm = tfm; 581 cc->tfm = tfm;
585 582
586 /* 583 /*
@@ -595,7 +592,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
595 else if (strcmp(ivmode, "essiv") == 0) 592 else if (strcmp(ivmode, "essiv") == 0)
596 cc->iv_gen_ops = &crypt_iv_essiv_ops; 593 cc->iv_gen_ops = &crypt_iv_essiv_ops;
597 else { 594 else {
598 ti->error = PFX "Invalid IV mode"; 595 ti->error = "Invalid IV mode";
599 goto bad2; 596 goto bad2;
600 } 597 }
601 598
@@ -603,14 +600,14 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
603 cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) 600 cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0)
604 goto bad2; 601 goto bad2;
605 602
606 if (tfm->crt_cipher.cit_decrypt_iv && tfm->crt_cipher.cit_encrypt_iv) 603 cc->iv_size = crypto_blkcipher_ivsize(tfm);
604 if (cc->iv_size)
607 /* at least a 64 bit sector number should fit in our buffer */ 605 /* at least a 64 bit sector number should fit in our buffer */
608 cc->iv_size = max(crypto_tfm_alg_ivsize(tfm), 606 cc->iv_size = max(cc->iv_size,
609 (unsigned int)(sizeof(u64) / sizeof(u8))); 607 (unsigned int)(sizeof(u64) / sizeof(u8)));
610 else { 608 else {
611 cc->iv_size = 0;
612 if (cc->iv_gen_ops) { 609 if (cc->iv_gen_ops) {
613 DMWARN(PFX "Selected cipher does not support IVs"); 610 DMWARN("Selected cipher does not support IVs");
614 if (cc->iv_gen_ops->dtr) 611 if (cc->iv_gen_ops->dtr)
615 cc->iv_gen_ops->dtr(cc); 612 cc->iv_gen_ops->dtr(cc);
616 cc->iv_gen_ops = NULL; 613 cc->iv_gen_ops = NULL;
@@ -619,36 +616,36 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
619 616
620 cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool); 617 cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool);
621 if (!cc->io_pool) { 618 if (!cc->io_pool) {
622 ti->error = PFX "Cannot allocate crypt io mempool"; 619 ti->error = "Cannot allocate crypt io mempool";
623 goto bad3; 620 goto bad3;
624 } 621 }
625 622
626 cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); 623 cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
627 if (!cc->page_pool) { 624 if (!cc->page_pool) {
628 ti->error = PFX "Cannot allocate page mempool"; 625 ti->error = "Cannot allocate page mempool";
629 goto bad4; 626 goto bad4;
630 } 627 }
631 628
632 if (tfm->crt_cipher.cit_setkey(tfm, cc->key, key_size) < 0) { 629 if (crypto_blkcipher_setkey(tfm, cc->key, key_size) < 0) {
633 ti->error = PFX "Error setting key"; 630 ti->error = "Error setting key";
634 goto bad5; 631 goto bad5;
635 } 632 }
636 633
637 if (sscanf(argv[2], "%llu", &tmpll) != 1) { 634 if (sscanf(argv[2], "%llu", &tmpll) != 1) {
638 ti->error = PFX "Invalid iv_offset sector"; 635 ti->error = "Invalid iv_offset sector";
639 goto bad5; 636 goto bad5;
640 } 637 }
641 cc->iv_offset = tmpll; 638 cc->iv_offset = tmpll;
642 639
643 if (sscanf(argv[4], "%llu", &tmpll) != 1) { 640 if (sscanf(argv[4], "%llu", &tmpll) != 1) {
644 ti->error = PFX "Invalid device sector"; 641 ti->error = "Invalid device sector";
645 goto bad5; 642 goto bad5;
646 } 643 }
647 cc->start = tmpll; 644 cc->start = tmpll;
648 645
649 if (dm_get_device(ti, argv[3], cc->start, ti->len, 646 if (dm_get_device(ti, argv[3], cc->start, ti->len,
650 dm_table_get_mode(ti->table), &cc->dev)) { 647 dm_table_get_mode(ti->table), &cc->dev)) {
651 ti->error = PFX "Device lookup failed"; 648 ti->error = "Device lookup failed";
652 goto bad5; 649 goto bad5;
653 } 650 }
654 651
@@ -657,7 +654,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
657 *(ivopts - 1) = ':'; 654 *(ivopts - 1) = ':';
658 cc->iv_mode = kmalloc(strlen(ivmode) + 1, GFP_KERNEL); 655 cc->iv_mode = kmalloc(strlen(ivmode) + 1, GFP_KERNEL);
659 if (!cc->iv_mode) { 656 if (!cc->iv_mode) {
660 ti->error = PFX "Error kmallocing iv_mode string"; 657 ti->error = "Error kmallocing iv_mode string";
661 goto bad5; 658 goto bad5;
662 } 659 }
663 strcpy(cc->iv_mode, ivmode); 660 strcpy(cc->iv_mode, ivmode);
@@ -675,7 +672,7 @@ bad3:
675 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) 672 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
676 cc->iv_gen_ops->dtr(cc); 673 cc->iv_gen_ops->dtr(cc);
677bad2: 674bad2:
678 crypto_free_tfm(tfm); 675 crypto_free_blkcipher(tfm);
679bad1: 676bad1:
680 /* Must zero key material before freeing */ 677 /* Must zero key material before freeing */
681 memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); 678 memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8));
@@ -693,7 +690,7 @@ static void crypt_dtr(struct dm_target *ti)
693 kfree(cc->iv_mode); 690 kfree(cc->iv_mode);
694 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) 691 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
695 cc->iv_gen_ops->dtr(cc); 692 cc->iv_gen_ops->dtr(cc);
696 crypto_free_tfm(cc->tfm); 693 crypto_free_blkcipher(cc->tfm);
697 dm_put_device(ti, cc->dev); 694 dm_put_device(ti, cc->dev);
698 695
699 /* Must zero key material before freeing */ 696 /* Must zero key material before freeing */
@@ -858,18 +855,9 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
858 break; 855 break;
859 856
860 case STATUSTYPE_TABLE: 857 case STATUSTYPE_TABLE:
861 cipher = crypto_tfm_alg_name(cc->tfm); 858 cipher = crypto_blkcipher_name(cc->tfm);
862 859
863 switch(cc->tfm->crt_cipher.cit_mode) { 860 chainmode = cc->chainmode;
864 case CRYPTO_TFM_MODE_CBC:
865 chainmode = "cbc";
866 break;
867 case CRYPTO_TFM_MODE_ECB:
868 chainmode = "ecb";
869 break;
870 default:
871 BUG();
872 }
873 861
874 if (cc->iv_mode) 862 if (cc->iv_mode)
875 DMEMIT("%s-%s-%s ", cipher, chainmode, cc->iv_mode); 863 DMEMIT("%s-%s-%s ", cipher, chainmode, cc->iv_mode);
@@ -918,13 +906,13 @@ static int __init dm_crypt_init(void)
918 _kcryptd_workqueue = create_workqueue("kcryptd"); 906 _kcryptd_workqueue = create_workqueue("kcryptd");
919 if (!_kcryptd_workqueue) { 907 if (!_kcryptd_workqueue) {
920 r = -ENOMEM; 908 r = -ENOMEM;
921 DMERR(PFX "couldn't create kcryptd"); 909 DMERR("couldn't create kcryptd");
922 goto bad1; 910 goto bad1;
923 } 911 }
924 912
925 r = dm_register_target(&crypt_target); 913 r = dm_register_target(&crypt_target);
926 if (r < 0) { 914 if (r < 0) {
927 DMERR(PFX "register failed %d", r); 915 DMERR("register failed %d", r);
928 goto bad2; 916 goto bad2;
929 } 917 }
930 918
@@ -942,7 +930,7 @@ static void __exit dm_crypt_exit(void)
942 int r = dm_unregister_target(&crypt_target); 930 int r = dm_unregister_target(&crypt_target);
943 931
944 if (r < 0) 932 if (r < 0)
945 DMERR(PFX "unregister failed %d", r); 933 DMERR("unregister failed %d", r);
946 934
947 destroy_workqueue(_kcryptd_workqueue); 935 destroy_workqueue(_kcryptd_workqueue);
948 kmem_cache_destroy(_crypt_io_pool); 936 kmem_cache_destroy(_crypt_io_pool);
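The dm-crypt hunks above complete the move from the legacy crypto_tfm interface (crt_cipher.cit_setkey, crypto_tfm_alg_name, CRYPTO_TFM_MODE_*) to the blkcipher API, and crypt_status() now reports the chaining mode from the string saved at construction time instead of decoding tfm flags. A minimal sketch of the matching setup path, assuming crypt_ctr()'s local names (cipher, chainmode, key_size) and illustrative allocation flags rather than text quoted from this patch:

#include <linux/crypto.h>
#include <linux/err.h>

/* Sketch only: chaining mode and cipher are combined into one
 * "cbc(aes)"-style algorithm name for the blkcipher API.        */
static int crypt_setup_cipher_sketch(struct crypt_config *cc, char *cipher,
				     char *chainmode, unsigned int key_size)
{
	char algname[CRYPTO_MAX_ALG_NAME];
	struct crypto_blkcipher *tfm;

	if (snprintf(algname, CRYPTO_MAX_ALG_NAME, "%s(%s)",
		     chainmode, cipher) >= CRYPTO_MAX_ALG_NAME)
		return -EINVAL;

	tfm = crypto_alloc_blkcipher(algname, 0, CRYPTO_ALG_ASYNC);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	if (crypto_blkcipher_setkey(tfm, cc->key, key_size) < 0) {
		crypto_free_blkcipher(tfm);
		return -EINVAL;
	}

	cc->tfm = tfm;
	cc->chainmode = chainmode;	/* reported verbatim by crypt_status() */
	return 0;
}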
diff --git a/drivers/md/dm-emc.c b/drivers/md/dm-emc.c
index c7067674dcb7..2a374ccb30dd 100644
--- a/drivers/md/dm-emc.c
+++ b/drivers/md/dm-emc.c
@@ -12,6 +12,8 @@
12#include <scsi/scsi.h> 12#include <scsi/scsi.h>
13#include <scsi/scsi_cmnd.h> 13#include <scsi/scsi_cmnd.h>
14 14
15#define DM_MSG_PREFIX "multipath emc"
16
15struct emc_handler { 17struct emc_handler {
16 spinlock_t lock; 18 spinlock_t lock;
17 19
@@ -66,7 +68,7 @@ static struct bio *get_failover_bio(struct path *path, unsigned data_size)
66 68
67 bio = bio_alloc(GFP_ATOMIC, 1); 69 bio = bio_alloc(GFP_ATOMIC, 1);
68 if (!bio) { 70 if (!bio) {
69 DMERR("dm-emc: get_failover_bio: bio_alloc() failed."); 71 DMERR("get_failover_bio: bio_alloc() failed.");
70 return NULL; 72 return NULL;
71 } 73 }
72 74
@@ -78,13 +80,13 @@ static struct bio *get_failover_bio(struct path *path, unsigned data_size)
78 80
79 page = alloc_page(GFP_ATOMIC); 81 page = alloc_page(GFP_ATOMIC);
80 if (!page) { 82 if (!page) {
81 DMERR("dm-emc: get_failover_bio: alloc_page() failed."); 83 DMERR("get_failover_bio: alloc_page() failed.");
82 bio_put(bio); 84 bio_put(bio);
83 return NULL; 85 return NULL;
84 } 86 }
85 87
86 if (bio_add_page(bio, page, data_size, 0) != data_size) { 88 if (bio_add_page(bio, page, data_size, 0) != data_size) {
87 DMERR("dm-emc: get_failover_bio: alloc_page() failed."); 89 DMERR("get_failover_bio: alloc_page() failed.");
88 __free_page(page); 90 __free_page(page);
89 bio_put(bio); 91 bio_put(bio);
90 return NULL; 92 return NULL;
@@ -103,7 +105,7 @@ static struct request *get_failover_req(struct emc_handler *h,
103 /* FIXME: Figure out why it fails with GFP_ATOMIC. */ 105 /* FIXME: Figure out why it fails with GFP_ATOMIC. */
104 rq = blk_get_request(q, WRITE, __GFP_WAIT); 106 rq = blk_get_request(q, WRITE, __GFP_WAIT);
105 if (!rq) { 107 if (!rq) {
106 DMERR("dm-emc: get_failover_req: blk_get_request failed"); 108 DMERR("get_failover_req: blk_get_request failed");
107 return NULL; 109 return NULL;
108 } 110 }
109 111
@@ -160,7 +162,7 @@ static struct request *emc_trespass_get(struct emc_handler *h,
160 162
161 bio = get_failover_bio(path, data_size); 163 bio = get_failover_bio(path, data_size);
162 if (!bio) { 164 if (!bio) {
163 DMERR("dm-emc: emc_trespass_get: no bio"); 165 DMERR("emc_trespass_get: no bio");
164 return NULL; 166 return NULL;
165 } 167 }
166 168
@@ -173,7 +175,7 @@ static struct request *emc_trespass_get(struct emc_handler *h,
173 /* get request for block layer packet command */ 175 /* get request for block layer packet command */
174 rq = get_failover_req(h, bio, path); 176 rq = get_failover_req(h, bio, path);
175 if (!rq) { 177 if (!rq) {
176 DMERR("dm-emc: emc_trespass_get: no rq"); 178 DMERR("emc_trespass_get: no rq");
177 free_bio(bio); 179 free_bio(bio);
178 return NULL; 180 return NULL;
179 } 181 }
@@ -200,18 +202,18 @@ static void emc_pg_init(struct hw_handler *hwh, unsigned bypassed,
200 * initial state passed into us and then get an update here. 202 * initial state passed into us and then get an update here.
201 */ 203 */
202 if (!q) { 204 if (!q) {
203 DMINFO("dm-emc: emc_pg_init: no queue"); 205 DMINFO("emc_pg_init: no queue");
204 goto fail_path; 206 goto fail_path;
205 } 207 }
206 208
207 /* FIXME: The request should be pre-allocated. */ 209 /* FIXME: The request should be pre-allocated. */
208 rq = emc_trespass_get(hwh->context, path); 210 rq = emc_trespass_get(hwh->context, path);
209 if (!rq) { 211 if (!rq) {
210 DMERR("dm-emc: emc_pg_init: no rq"); 212 DMERR("emc_pg_init: no rq");
211 goto fail_path; 213 goto fail_path;
212 } 214 }
213 215
214 DMINFO("dm-emc: emc_pg_init: sending switch-over command"); 216 DMINFO("emc_pg_init: sending switch-over command");
215 elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 1); 217 elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 1);
216 return; 218 return;
217 219
@@ -241,18 +243,18 @@ static int emc_create(struct hw_handler *hwh, unsigned argc, char **argv)
241 hr = 0; 243 hr = 0;
242 short_trespass = 0; 244 short_trespass = 0;
243 } else if (argc != 2) { 245 } else if (argc != 2) {
244 DMWARN("dm-emc hwhandler: incorrect number of arguments"); 246 DMWARN("incorrect number of arguments");
245 return -EINVAL; 247 return -EINVAL;
246 } else { 248 } else {
247 if ((sscanf(argv[0], "%u", &short_trespass) != 1) 249 if ((sscanf(argv[0], "%u", &short_trespass) != 1)
248 || (short_trespass > 1)) { 250 || (short_trespass > 1)) {
249 DMWARN("dm-emc: invalid trespass mode selected"); 251 DMWARN("invalid trespass mode selected");
250 return -EINVAL; 252 return -EINVAL;
251 } 253 }
252 254
253 if ((sscanf(argv[1], "%u", &hr) != 1) 255 if ((sscanf(argv[1], "%u", &hr) != 1)
254 || (hr > 1)) { 256 || (hr > 1)) {
255 DMWARN("dm-emc: invalid honor reservation flag selected"); 257 DMWARN("invalid honor reservation flag selected");
256 return -EINVAL; 258 return -EINVAL;
257 } 259 }
258 } 260 }
@@ -264,14 +266,14 @@ static int emc_create(struct hw_handler *hwh, unsigned argc, char **argv)
264 hwh->context = h; 266 hwh->context = h;
265 267
266 if ((h->short_trespass = short_trespass)) 268 if ((h->short_trespass = short_trespass))
267 DMWARN("dm-emc: short trespass command will be send"); 269 DMWARN("short trespass command will be send");
268 else 270 else
269 DMWARN("dm-emc: long trespass command will be send"); 271 DMWARN("long trespass command will be send");
270 272
271 if ((h->hr = hr)) 273 if ((h->hr = hr))
272 DMWARN("dm-emc: honor reservation bit will be set"); 274 DMWARN("honor reservation bit will be set");
273 else 275 else
274 DMWARN("dm-emc: honor reservation bit will not be set (default)"); 276 DMWARN("honor reservation bit will not be set (default)");
275 277
276 return 0; 278 return 0;
277} 279}
@@ -336,9 +338,9 @@ static int __init dm_emc_init(void)
336 int r = dm_register_hw_handler(&emc_hwh); 338 int r = dm_register_hw_handler(&emc_hwh);
337 339
338 if (r < 0) 340 if (r < 0)
339 DMERR("emc: register failed %d", r); 341 DMERR("register failed %d", r);
340 342
341 DMINFO("dm-emc version 0.0.3 loaded"); 343 DMINFO("version 0.0.3 loaded");
342 344
343 return r; 345 return r;
344} 346}
@@ -348,7 +350,7 @@ static void __exit dm_emc_exit(void)
348 int r = dm_unregister_hw_handler(&emc_hwh); 350 int r = dm_unregister_hw_handler(&emc_hwh);
349 351
350 if (r < 0) 352 if (r < 0)
351 DMERR("emc: unregister failed %d", r); 353 DMERR("unregister failed %d", r);
352} 354}
353 355
354module_init(dm_emc_init); 356module_init(dm_emc_init);
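Nearly every dm-emc.c change above is the same mechanical edit: the hand-written "dm-emc: " prefixes are dropped because the DM_MSG_PREFIX defined at the top of the file is now folded into the shared logging macros. Roughly how those macros re-add the prefix (a sketch of the dm.h definitions this series relies on, not text taken from this patch):

#define DM_NAME "device-mapper"

#define DMERR(f, arg...) \
	printk(KERN_ERR DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
#define DMWARN(f, arg...) \
	printk(KERN_WARNING DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
#define DMINFO(f, arg...) \
	printk(KERN_INFO DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)

/* e.g. DMERR("register failed %d", r) in dm-emc.c now prints
 * "device-mapper: multipath emc: register failed -22".          */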
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index cc07bbebbb16..d12379b5cdb5 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -16,6 +16,8 @@
16#include <linux/vmalloc.h> 16#include <linux/vmalloc.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18 18
19#define DM_MSG_PREFIX "snapshots"
20
19/*----------------------------------------------------------------- 21/*-----------------------------------------------------------------
20 * Persistent snapshots, by persistent we mean that the snapshot 22 * Persistent snapshots, by persistent we mean that the snapshot
21 * will survive a reboot. 23 * will survive a reboot.
@@ -91,7 +93,6 @@ struct pstore {
91 struct dm_snapshot *snap; /* up pointer to my snapshot */ 93 struct dm_snapshot *snap; /* up pointer to my snapshot */
92 int version; 94 int version;
93 int valid; 95 int valid;
94 uint32_t chunk_size;
95 uint32_t exceptions_per_area; 96 uint32_t exceptions_per_area;
96 97
97 /* 98 /*
@@ -133,7 +134,7 @@ static int alloc_area(struct pstore *ps)
133 int r = -ENOMEM; 134 int r = -ENOMEM;
134 size_t len; 135 size_t len;
135 136
136 len = ps->chunk_size << SECTOR_SHIFT; 137 len = ps->snap->chunk_size << SECTOR_SHIFT;
137 138
138 /* 139 /*
139 * Allocate the chunk_size block of memory that will hold 140 * Allocate the chunk_size block of memory that will hold
@@ -160,8 +161,8 @@ static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
160 unsigned long bits; 161 unsigned long bits;
161 162
162 where.bdev = ps->snap->cow->bdev; 163 where.bdev = ps->snap->cow->bdev;
163 where.sector = ps->chunk_size * chunk; 164 where.sector = ps->snap->chunk_size * chunk;
164 where.count = ps->chunk_size; 165 where.count = ps->snap->chunk_size;
165 166
166 return dm_io_sync_vm(1, &where, rw, ps->area, &bits); 167 return dm_io_sync_vm(1, &where, rw, ps->area, &bits);
167} 168}
@@ -188,7 +189,7 @@ static int area_io(struct pstore *ps, uint32_t area, int rw)
188 189
189static int zero_area(struct pstore *ps, uint32_t area) 190static int zero_area(struct pstore *ps, uint32_t area)
190{ 191{
191 memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); 192 memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
192 return area_io(ps, area, WRITE); 193 return area_io(ps, area, WRITE);
193} 194}
194 195
@@ -196,6 +197,7 @@ static int read_header(struct pstore *ps, int *new_snapshot)
196{ 197{
197 int r; 198 int r;
198 struct disk_header *dh; 199 struct disk_header *dh;
200 chunk_t chunk_size;
199 201
200 r = chunk_io(ps, 0, READ); 202 r = chunk_io(ps, 0, READ);
201 if (r) 203 if (r)
@@ -210,8 +212,29 @@ static int read_header(struct pstore *ps, int *new_snapshot)
210 *new_snapshot = 0; 212 *new_snapshot = 0;
211 ps->valid = le32_to_cpu(dh->valid); 213 ps->valid = le32_to_cpu(dh->valid);
212 ps->version = le32_to_cpu(dh->version); 214 ps->version = le32_to_cpu(dh->version);
213 ps->chunk_size = le32_to_cpu(dh->chunk_size); 215 chunk_size = le32_to_cpu(dh->chunk_size);
214 216 if (ps->snap->chunk_size != chunk_size) {
217 DMWARN("chunk size %llu in device metadata overrides "
218 "table chunk size of %llu.",
219 (unsigned long long)chunk_size,
220 (unsigned long long)ps->snap->chunk_size);
221
222 /* We had a bogus chunk_size. Fix stuff up. */
223 dm_io_put(sectors_to_pages(ps->snap->chunk_size));
224 free_area(ps);
225
226 ps->snap->chunk_size = chunk_size;
227 ps->snap->chunk_mask = chunk_size - 1;
228 ps->snap->chunk_shift = ffs(chunk_size) - 1;
229
230 r = alloc_area(ps);
231 if (r)
232 return r;
233
234 r = dm_io_get(sectors_to_pages(chunk_size));
235 if (r)
236 return r;
237 }
215 } else { 238 } else {
216 DMWARN("Invalid/corrupt snapshot"); 239 DMWARN("Invalid/corrupt snapshot");
217 r = -ENXIO; 240 r = -ENXIO;
@@ -224,13 +247,13 @@ static int write_header(struct pstore *ps)
224{ 247{
225 struct disk_header *dh; 248 struct disk_header *dh;
226 249
227 memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); 250 memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
228 251
229 dh = (struct disk_header *) ps->area; 252 dh = (struct disk_header *) ps->area;
230 dh->magic = cpu_to_le32(SNAP_MAGIC); 253 dh->magic = cpu_to_le32(SNAP_MAGIC);
231 dh->valid = cpu_to_le32(ps->valid); 254 dh->valid = cpu_to_le32(ps->valid);
232 dh->version = cpu_to_le32(ps->version); 255 dh->version = cpu_to_le32(ps->version);
233 dh->chunk_size = cpu_to_le32(ps->chunk_size); 256 dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);
234 257
235 return chunk_io(ps, 0, WRITE); 258 return chunk_io(ps, 0, WRITE);
236} 259}
@@ -365,7 +388,7 @@ static void persistent_destroy(struct exception_store *store)
365{ 388{
366 struct pstore *ps = get_info(store); 389 struct pstore *ps = get_info(store);
367 390
368 dm_io_put(sectors_to_pages(ps->chunk_size)); 391 dm_io_put(sectors_to_pages(ps->snap->chunk_size));
369 vfree(ps->callbacks); 392 vfree(ps->callbacks);
370 free_area(ps); 393 free_area(ps);
371 kfree(ps); 394 kfree(ps);
@@ -384,6 +407,16 @@ static int persistent_read_metadata(struct exception_store *store)
384 return r; 407 return r;
385 408
386 /* 409 /*
410 * Now we know correct chunk_size, complete the initialisation.
411 */
412 ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
413 sizeof(struct disk_exception);
414 ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
415 sizeof(*ps->callbacks));
416 if (!ps->callbacks)
417 return -ENOMEM;
418
419 /*
387 * Do we need to setup a new snapshot ? 420 * Do we need to setup a new snapshot ?
388 */ 421 */
389 if (new_snapshot) { 422 if (new_snapshot) {
@@ -533,9 +566,6 @@ int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
533 ps->snap = store->snap; 566 ps->snap = store->snap;
534 ps->valid = 1; 567 ps->valid = 1;
535 ps->version = SNAPSHOT_DISK_VERSION; 568 ps->version = SNAPSHOT_DISK_VERSION;
536 ps->chunk_size = chunk_size;
537 ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) /
538 sizeof(struct disk_exception);
539 ps->next_free = 2; /* skipping the header and first area */ 569 ps->next_free = 2; /* skipping the header and first area */
540 ps->current_committed = 0; 570 ps->current_committed = 0;
541 571
@@ -543,18 +573,9 @@ int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
543 if (r) 573 if (r)
544 goto bad; 574 goto bad;
545 575
546 /*
547 * Allocate space for all the callbacks.
548 */
549 ps->callback_count = 0; 576 ps->callback_count = 0;
550 atomic_set(&ps->pending_count, 0); 577 atomic_set(&ps->pending_count, 0);
551 ps->callbacks = dm_vcalloc(ps->exceptions_per_area, 578 ps->callbacks = NULL;
552 sizeof(*ps->callbacks));
553
554 if (!ps->callbacks) {
555 r = -ENOMEM;
556 goto bad;
557 }
558 579
559 store->destroy = persistent_destroy; 580 store->destroy = persistent_destroy;
560 store->read_metadata = persistent_read_metadata; 581 store->read_metadata = persistent_read_metadata;
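The exception-store change makes the chunk_size stored in the on-disk header authoritative: when it differs from the table-supplied value, read_header() reallocates the area buffer and re-derives the snapshot's chunk_mask and chunk_shift, and the per-area callback array is now sized only once the real chunk_size is known. The derived values with one worked example (chunk_size is a power of two; the 16-byte disk_exception size is an assumption used only for the arithmetic):

/* e.g. chunk_size = 16 sectors (8 KiB with 512-byte sectors):     */
ps->snap->chunk_size  = chunk_size;		/* 16                 */
ps->snap->chunk_mask  = chunk_size - 1;		/* 0xf                */
ps->snap->chunk_shift = ffs(chunk_size) - 1;	/* 4                  */

ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
			  sizeof(struct disk_exception);
						/* 8192 / 16 = 512    */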
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 8edd6435414d..d13bb15a8a02 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2004 - 2005 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004 - 2006 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
@@ -13,12 +13,12 @@
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/wait.h> 14#include <linux/wait.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/devfs_fs_kernel.h>
17#include <linux/dm-ioctl.h> 16#include <linux/dm-ioctl.h>
18#include <linux/hdreg.h> 17#include <linux/hdreg.h>
19 18
20#include <asm/uaccess.h> 19#include <asm/uaccess.h>
21 20
21#define DM_MSG_PREFIX "ioctl"
22#define DM_DRIVER_EMAIL "dm-devel@redhat.com" 22#define DM_DRIVER_EMAIL "dm-devel@redhat.com"
23 23
24/*----------------------------------------------------------------- 24/*-----------------------------------------------------------------
@@ -48,7 +48,7 @@ struct vers_iter {
48static struct list_head _name_buckets[NUM_BUCKETS]; 48static struct list_head _name_buckets[NUM_BUCKETS];
49static struct list_head _uuid_buckets[NUM_BUCKETS]; 49static struct list_head _uuid_buckets[NUM_BUCKETS];
50 50
51static void dm_hash_remove_all(void); 51static void dm_hash_remove_all(int keep_open_devices);
52 52
53/* 53/*
54 * Guards access to both hash tables. 54 * Guards access to both hash tables.
@@ -67,14 +67,12 @@ static int dm_hash_init(void)
67{ 67{
68 init_buckets(_name_buckets); 68 init_buckets(_name_buckets);
69 init_buckets(_uuid_buckets); 69 init_buckets(_uuid_buckets);
70 devfs_mk_dir(DM_DIR);
71 return 0; 70 return 0;
72} 71}
73 72
74static void dm_hash_exit(void) 73static void dm_hash_exit(void)
75{ 74{
76 dm_hash_remove_all(); 75 dm_hash_remove_all(0);
77 devfs_remove(DM_DIR);
78} 76}
79 77
80/*----------------------------------------------------------------- 78/*-----------------------------------------------------------------
@@ -102,8 +100,10 @@ static struct hash_cell *__get_name_cell(const char *str)
102 unsigned int h = hash_str(str); 100 unsigned int h = hash_str(str);
103 101
104 list_for_each_entry (hc, _name_buckets + h, name_list) 102 list_for_each_entry (hc, _name_buckets + h, name_list)
105 if (!strcmp(hc->name, str)) 103 if (!strcmp(hc->name, str)) {
104 dm_get(hc->md);
106 return hc; 105 return hc;
106 }
107 107
108 return NULL; 108 return NULL;
109} 109}
@@ -114,8 +114,10 @@ static struct hash_cell *__get_uuid_cell(const char *str)
114 unsigned int h = hash_str(str); 114 unsigned int h = hash_str(str);
115 115
116 list_for_each_entry (hc, _uuid_buckets + h, uuid_list) 116 list_for_each_entry (hc, _uuid_buckets + h, uuid_list)
117 if (!strcmp(hc->uuid, str)) 117 if (!strcmp(hc->uuid, str)) {
118 dm_get(hc->md);
118 return hc; 119 return hc;
120 }
119 121
120 return NULL; 122 return NULL;
121} 123}
@@ -167,31 +169,12 @@ static void free_cell(struct hash_cell *hc)
167} 169}
168 170
169/* 171/*
170 * devfs stuff.
171 */
172static int register_with_devfs(struct hash_cell *hc)
173{
174 struct gendisk *disk = dm_disk(hc->md);
175
176 devfs_mk_bdev(MKDEV(disk->major, disk->first_minor),
177 S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP,
178 DM_DIR "/%s", hc->name);
179 return 0;
180}
181
182static int unregister_with_devfs(struct hash_cell *hc)
183{
184 devfs_remove(DM_DIR"/%s", hc->name);
185 return 0;
186}
187
188/*
189 * The kdev_t and uuid of a device can never change once it is 172 * The kdev_t and uuid of a device can never change once it is
190 * initially inserted. 173 * initially inserted.
191 */ 174 */
192static int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md) 175static int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md)
193{ 176{
194 struct hash_cell *cell; 177 struct hash_cell *cell, *hc;
195 178
196 /* 179 /*
197 * Allocate the new cells. 180 * Allocate the new cells.
@@ -204,19 +187,23 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi
204 * Insert the cell into both hash tables. 187 * Insert the cell into both hash tables.
205 */ 188 */
206 down_write(&_hash_lock); 189 down_write(&_hash_lock);
207 if (__get_name_cell(name)) 190 hc = __get_name_cell(name);
191 if (hc) {
192 dm_put(hc->md);
208 goto bad; 193 goto bad;
194 }
209 195
210 list_add(&cell->name_list, _name_buckets + hash_str(name)); 196 list_add(&cell->name_list, _name_buckets + hash_str(name));
211 197
212 if (uuid) { 198 if (uuid) {
213 if (__get_uuid_cell(uuid)) { 199 hc = __get_uuid_cell(uuid);
200 if (hc) {
214 list_del(&cell->name_list); 201 list_del(&cell->name_list);
202 dm_put(hc->md);
215 goto bad; 203 goto bad;
216 } 204 }
217 list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); 205 list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid));
218 } 206 }
219 register_with_devfs(cell);
220 dm_get(md); 207 dm_get(md);
221 dm_set_mdptr(md, cell); 208 dm_set_mdptr(md, cell);
222 up_write(&_hash_lock); 209 up_write(&_hash_lock);
@@ -236,7 +223,6 @@ static void __hash_remove(struct hash_cell *hc)
236 /* remove from the dev hash */ 223 /* remove from the dev hash */
237 list_del(&hc->uuid_list); 224 list_del(&hc->uuid_list);
238 list_del(&hc->name_list); 225 list_del(&hc->name_list);
239 unregister_with_devfs(hc);
240 dm_set_mdptr(hc->md, NULL); 226 dm_set_mdptr(hc->md, NULL);
241 227
242 table = dm_get_table(hc->md); 228 table = dm_get_table(hc->md);
@@ -251,19 +237,41 @@ static void __hash_remove(struct hash_cell *hc)
251 free_cell(hc); 237 free_cell(hc);
252} 238}
253 239
254static void dm_hash_remove_all(void) 240static void dm_hash_remove_all(int keep_open_devices)
255{ 241{
256 int i; 242 int i, dev_skipped, dev_removed;
257 struct hash_cell *hc; 243 struct hash_cell *hc;
258 struct list_head *tmp, *n; 244 struct list_head *tmp, *n;
259 245
260 down_write(&_hash_lock); 246 down_write(&_hash_lock);
247
248retry:
249 dev_skipped = dev_removed = 0;
261 for (i = 0; i < NUM_BUCKETS; i++) { 250 for (i = 0; i < NUM_BUCKETS; i++) {
262 list_for_each_safe (tmp, n, _name_buckets + i) { 251 list_for_each_safe (tmp, n, _name_buckets + i) {
263 hc = list_entry(tmp, struct hash_cell, name_list); 252 hc = list_entry(tmp, struct hash_cell, name_list);
253
254 if (keep_open_devices &&
255 dm_lock_for_deletion(hc->md)) {
256 dev_skipped++;
257 continue;
258 }
264 __hash_remove(hc); 259 __hash_remove(hc);
260 dev_removed = 1;
265 } 261 }
266 } 262 }
263
264 /*
265 * Some mapped devices may be using other mapped devices, so if any
266 * still exist, repeat until we make no further progress.
267 */
268 if (dev_skipped) {
269 if (dev_removed)
270 goto retry;
271
272 DMWARN("remove_all left %d open device(s)", dev_skipped);
273 }
274
267 up_write(&_hash_lock); 275 up_write(&_hash_lock);
268} 276}
269 277
@@ -289,6 +297,7 @@ static int dm_hash_rename(const char *old, const char *new)
289 if (hc) { 297 if (hc) {
290 DMWARN("asked to rename to an already existing name %s -> %s", 298 DMWARN("asked to rename to an already existing name %s -> %s",
291 old, new); 299 old, new);
300 dm_put(hc->md);
292 up_write(&_hash_lock); 301 up_write(&_hash_lock);
293 kfree(new_name); 302 kfree(new_name);
294 return -EBUSY; 303 return -EBUSY;
@@ -309,16 +318,11 @@ static int dm_hash_rename(const char *old, const char *new)
309 /* 318 /*
310 * rename and move the name cell. 319 * rename and move the name cell.
311 */ 320 */
312 unregister_with_devfs(hc);
313
314 list_del(&hc->name_list); 321 list_del(&hc->name_list);
315 old_name = hc->name; 322 old_name = hc->name;
316 hc->name = new_name; 323 hc->name = new_name;
317 list_add(&hc->name_list, _name_buckets + hash_str(new_name)); 324 list_add(&hc->name_list, _name_buckets + hash_str(new_name));
318 325
319 /* rename the device node in devfs */
320 register_with_devfs(hc);
321
322 /* 326 /*
323 * Wake up any dm event waiters. 327 * Wake up any dm event waiters.
324 */ 328 */
@@ -328,6 +332,7 @@ static int dm_hash_rename(const char *old, const char *new)
328 dm_table_put(table); 332 dm_table_put(table);
329 } 333 }
330 334
335 dm_put(hc->md);
331 up_write(&_hash_lock); 336 up_write(&_hash_lock);
332 kfree(old_name); 337 kfree(old_name);
333 return 0; 338 return 0;
@@ -344,7 +349,7 @@ typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
344 349
345static int remove_all(struct dm_ioctl *param, size_t param_size) 350static int remove_all(struct dm_ioctl *param, size_t param_size)
346{ 351{
347 dm_hash_remove_all(); 352 dm_hash_remove_all(1);
348 param->data_size = 0; 353 param->data_size = 0;
349 return 0; 354 return 0;
350} 355}
@@ -524,7 +529,6 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
524{ 529{
525 struct gendisk *disk = dm_disk(md); 530 struct gendisk *disk = dm_disk(md);
526 struct dm_table *table; 531 struct dm_table *table;
527 struct block_device *bdev;
528 532
529 param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | 533 param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
530 DM_ACTIVE_PRESENT_FLAG); 534 DM_ACTIVE_PRESENT_FLAG);
@@ -534,20 +538,12 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
534 538
535 param->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor)); 539 param->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor));
536 540
537 if (!(param->flags & DM_SKIP_BDGET_FLAG)) { 541 /*
538 bdev = bdget_disk(disk, 0); 542 * Yes, this will be out of date by the time it gets back
539 if (!bdev) 543 * to userland, but it is still very useful for
540 return -ENXIO; 544 * debugging.
541 545 */
542 /* 546 param->open_count = dm_open_count(md);
543 * Yes, this will be out of date by the time it gets back
544 * to userland, but it is still very useful for
545 * debugging.
546 */
547 param->open_count = bdev->bd_openers;
548 bdput(bdev);
549 } else
550 param->open_count = -1;
551 547
552 if (disk->policy) 548 if (disk->policy)
553 param->flags |= DM_READONLY_FLAG; 549 param->flags |= DM_READONLY_FLAG;
@@ -567,7 +563,7 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
567 563
568static int dev_create(struct dm_ioctl *param, size_t param_size) 564static int dev_create(struct dm_ioctl *param, size_t param_size)
569{ 565{
570 int r; 566 int r, m = DM_ANY_MINOR;
571 struct mapped_device *md; 567 struct mapped_device *md;
572 568
573 r = check_name(param->name); 569 r = check_name(param->name);
@@ -575,10 +571,9 @@ static int dev_create(struct dm_ioctl *param, size_t param_size)
575 return r; 571 return r;
576 572
577 if (param->flags & DM_PERSISTENT_DEV_FLAG) 573 if (param->flags & DM_PERSISTENT_DEV_FLAG)
578 r = dm_create_with_minor(MINOR(huge_decode_dev(param->dev)), &md); 574 m = MINOR(huge_decode_dev(param->dev));
579 else
580 r = dm_create(&md);
581 575
576 r = dm_create(m, &md);
582 if (r) 577 if (r)
583 return r; 578 return r;
584 579
@@ -611,10 +606,8 @@ static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
611 return __get_name_cell(param->name); 606 return __get_name_cell(param->name);
612 607
613 md = dm_get_md(huge_decode_dev(param->dev)); 608 md = dm_get_md(huge_decode_dev(param->dev));
614 if (md) { 609 if (md)
615 mdptr = dm_get_mdptr(md); 610 mdptr = dm_get_mdptr(md);
616 dm_put(md);
617 }
618 611
619 return mdptr; 612 return mdptr;
620} 613}
@@ -628,7 +621,6 @@ static struct mapped_device *find_device(struct dm_ioctl *param)
628 hc = __find_device_hash_cell(param); 621 hc = __find_device_hash_cell(param);
629 if (hc) { 622 if (hc) {
630 md = hc->md; 623 md = hc->md;
631 dm_get(md);
632 624
633 /* 625 /*
634 * Sneakily write in both the name and the uuid 626 * Sneakily write in both the name and the uuid
@@ -653,6 +645,8 @@ static struct mapped_device *find_device(struct dm_ioctl *param)
653static int dev_remove(struct dm_ioctl *param, size_t param_size) 645static int dev_remove(struct dm_ioctl *param, size_t param_size)
654{ 646{
655 struct hash_cell *hc; 647 struct hash_cell *hc;
648 struct mapped_device *md;
649 int r;
656 650
657 down_write(&_hash_lock); 651 down_write(&_hash_lock);
658 hc = __find_device_hash_cell(param); 652 hc = __find_device_hash_cell(param);
@@ -663,8 +657,22 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
663 return -ENXIO; 657 return -ENXIO;
664 } 658 }
665 659
660 md = hc->md;
661
662 /*
663 * Ensure the device is not open and nothing further can open it.
664 */
665 r = dm_lock_for_deletion(md);
666 if (r) {
667 DMWARN("unable to remove open device %s", hc->name);
668 up_write(&_hash_lock);
669 dm_put(md);
670 return r;
671 }
672
666 __hash_remove(hc); 673 __hash_remove(hc);
667 up_write(&_hash_lock); 674 up_write(&_hash_lock);
675 dm_put(md);
668 param->data_size = 0; 676 param->data_size = 0;
669 return 0; 677 return 0;
670} 678}
@@ -790,7 +798,6 @@ static int do_resume(struct dm_ioctl *param)
790 } 798 }
791 799
792 md = hc->md; 800 md = hc->md;
793 dm_get(md);
794 801
795 new_map = hc->new_map; 802 new_map = hc->new_map;
796 hc->new_map = NULL; 803 hc->new_map = NULL;
@@ -1078,6 +1085,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
1078{ 1085{
1079 int r; 1086 int r;
1080 struct hash_cell *hc; 1087 struct hash_cell *hc;
1088 struct mapped_device *md;
1081 1089
1082 down_write(&_hash_lock); 1090 down_write(&_hash_lock);
1083 1091
@@ -1096,7 +1104,9 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
1096 param->flags &= ~DM_INACTIVE_PRESENT_FLAG; 1104 param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
1097 1105
1098 r = __dev_status(hc->md, param); 1106 r = __dev_status(hc->md, param);
1107 md = hc->md;
1099 up_write(&_hash_lock); 1108 up_write(&_hash_lock);
1109 dm_put(md);
1100 return r; 1110 return r;
1101} 1111}
1102 1112
@@ -1462,7 +1472,6 @@ static struct file_operations _ctl_fops = {
1462static struct miscdevice _dm_misc = { 1472static struct miscdevice _dm_misc = {
1463 .minor = MISC_DYNAMIC_MINOR, 1473 .minor = MISC_DYNAMIC_MINOR,
1464 .name = DM_NAME, 1474 .name = DM_NAME,
1465 .devfs_name = "mapper/control",
1466 .fops = &_ctl_fops 1475 .fops = &_ctl_fops
1467}; 1476};
1468 1477
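The other recurring change in dm-ioctl.c is reference discipline: __get_name_cell() and __get_uuid_cell() now take a reference on the mapped device they return, dev_create() funnels both the persistent- and dynamic-minor cases through a single dm_create(m, &md), and dev_remove() calls dm_lock_for_deletion() so an open device can no longer be removed out from under its users. The contract the lookup helpers now impose on their callers, sketched with error handling trimmed:

down_write(&_hash_lock);
hc = __get_name_cell(name);	/* on success this also did dm_get(hc->md) */
if (!hc) {
	up_write(&_hash_lock);
	return -ENXIO;
}

md = hc->md;
/* ... work on hc / md while holding the hash lock ... */
up_write(&_hash_lock);

dm_put(md);			/* balance the reference taken by the lookup */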
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index daf586c0898d..47b3c62bbdb8 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -12,6 +12,8 @@
12#include <linux/bio.h> 12#include <linux/bio.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14 14
15#define DM_MSG_PREFIX "linear"
16
15/* 17/*
16 * Linear: maps a linear range of a device. 18 * Linear: maps a linear range of a device.
17 */ 19 */
@@ -29,7 +31,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
29 unsigned long long tmp; 31 unsigned long long tmp;
30 32
31 if (argc != 2) { 33 if (argc != 2) {
32 ti->error = "dm-linear: Invalid argument count"; 34 ti->error = "Invalid argument count";
33 return -EINVAL; 35 return -EINVAL;
34 } 36 }
35 37
@@ -111,7 +113,7 @@ int __init dm_linear_init(void)
111 int r = dm_register_target(&linear_target); 113 int r = dm_register_target(&linear_target);
112 114
113 if (r < 0) 115 if (r < 0)
114 DMERR("linear: register failed %d", r); 116 DMERR("register failed %d", r);
115 117
116 return r; 118 return r;
117} 119}
@@ -121,5 +123,5 @@ void dm_linear_exit(void)
121 int r = dm_unregister_target(&linear_target); 123 int r = dm_unregister_target(&linear_target);
122 124
123 if (r < 0) 125 if (r < 0)
124 DMERR("linear: unregister failed %d", r); 126 DMERR("unregister failed %d", r);
125} 127}
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index d73779a42417..64b764bd02cc 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -12,6 +12,8 @@
12#include "dm-log.h" 12#include "dm-log.h"
13#include "dm-io.h" 13#include "dm-io.h"
14 14
15#define DM_MSG_PREFIX "mirror log"
16
15static LIST_HEAD(_log_types); 17static LIST_HEAD(_log_types);
16static DEFINE_SPINLOCK(_lock); 18static DEFINE_SPINLOCK(_lock);
17 19
@@ -155,8 +157,6 @@ struct log_c {
155 157
156 struct io_region header_location; 158 struct io_region header_location;
157 struct log_header *disk_header; 159 struct log_header *disk_header;
158
159 struct io_region bits_location;
160}; 160};
161 161
162/* 162/*
@@ -241,43 +241,21 @@ static inline int write_header(struct log_c *log)
241} 241}
242 242
243/*---------------------------------------------------------------- 243/*----------------------------------------------------------------
244 * Bits IO
245 *--------------------------------------------------------------*/
246static int read_bits(struct log_c *log)
247{
248 int r;
249 unsigned long ebits;
250
251 r = dm_io_sync_vm(1, &log->bits_location, READ,
252 log->clean_bits, &ebits);
253 if (r)
254 return r;
255
256 return 0;
257}
258
259static int write_bits(struct log_c *log)
260{
261 unsigned long ebits;
262 return dm_io_sync_vm(1, &log->bits_location, WRITE,
263 log->clean_bits, &ebits);
264}
265
266/*----------------------------------------------------------------
267 * core log constructor/destructor 244 * core log constructor/destructor
268 * 245 *
269 * argv contains region_size followed optionally by [no]sync 246 * argv contains region_size followed optionally by [no]sync
270 *--------------------------------------------------------------*/ 247 *--------------------------------------------------------------*/
271#define BYTE_SHIFT 3 248#define BYTE_SHIFT 3
272static int core_ctr(struct dirty_log *log, struct dm_target *ti, 249static int create_log_context(struct dirty_log *log, struct dm_target *ti,
273 unsigned int argc, char **argv) 250 unsigned int argc, char **argv,
251 struct dm_dev *dev)
274{ 252{
275 enum sync sync = DEFAULTSYNC; 253 enum sync sync = DEFAULTSYNC;
276 254
277 struct log_c *lc; 255 struct log_c *lc;
278 uint32_t region_size; 256 uint32_t region_size;
279 unsigned int region_count; 257 unsigned int region_count;
280 size_t bitset_size; 258 size_t bitset_size, buf_size;
281 259
282 if (argc < 1 || argc > 2) { 260 if (argc < 1 || argc > 2) {
283 DMWARN("wrong number of arguments to mirror log"); 261 DMWARN("wrong number of arguments to mirror log");
@@ -319,22 +297,53 @@ static int core_ctr(struct dirty_log *log, struct dm_target *ti,
319 * Work out how many "unsigned long"s we need to hold the bitset. 297 * Work out how many "unsigned long"s we need to hold the bitset.
320 */ 298 */
321 bitset_size = dm_round_up(region_count, 299 bitset_size = dm_round_up(region_count,
322 sizeof(unsigned long) << BYTE_SHIFT); 300 sizeof(*lc->clean_bits) << BYTE_SHIFT);
323 bitset_size >>= BYTE_SHIFT; 301 bitset_size >>= BYTE_SHIFT;
324 302
325 lc->bitset_uint32_count = bitset_size / 4; 303 lc->bitset_uint32_count = bitset_size / sizeof(*lc->clean_bits);
326 lc->clean_bits = vmalloc(bitset_size); 304
327 if (!lc->clean_bits) { 305 /*
328 DMWARN("couldn't allocate clean bitset"); 306 * Disk log?
329 kfree(lc); 307 */
330 return -ENOMEM; 308 if (!dev) {
309 lc->clean_bits = vmalloc(bitset_size);
310 if (!lc->clean_bits) {
311 DMWARN("couldn't allocate clean bitset");
312 kfree(lc);
313 return -ENOMEM;
314 }
315 lc->disk_header = NULL;
316 } else {
317 lc->log_dev = dev;
318 lc->header_location.bdev = lc->log_dev->bdev;
319 lc->header_location.sector = 0;
320
321 /*
322 * Buffer holds both header and bitset.
323 */
324 buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) +
325 bitset_size, ti->limits.hardsect_size);
326 lc->header_location.count = buf_size >> SECTOR_SHIFT;
327
328 lc->disk_header = vmalloc(buf_size);
329 if (!lc->disk_header) {
330 DMWARN("couldn't allocate disk log buffer");
331 kfree(lc);
332 return -ENOMEM;
333 }
334
335 lc->clean_bits = (void *)lc->disk_header +
336 (LOG_OFFSET << SECTOR_SHIFT);
331 } 337 }
338
332 memset(lc->clean_bits, -1, bitset_size); 339 memset(lc->clean_bits, -1, bitset_size);
333 340
334 lc->sync_bits = vmalloc(bitset_size); 341 lc->sync_bits = vmalloc(bitset_size);
335 if (!lc->sync_bits) { 342 if (!lc->sync_bits) {
336 DMWARN("couldn't allocate sync bitset"); 343 DMWARN("couldn't allocate sync bitset");
337 vfree(lc->clean_bits); 344 if (!dev)
345 vfree(lc->clean_bits);
346 vfree(lc->disk_header);
338 kfree(lc); 347 kfree(lc);
339 return -ENOMEM; 348 return -ENOMEM;
340 } 349 }
@@ -345,25 +354,40 @@ static int core_ctr(struct dirty_log *log, struct dm_target *ti,
345 if (!lc->recovering_bits) { 354 if (!lc->recovering_bits) {
346 DMWARN("couldn't allocate sync bitset"); 355 DMWARN("couldn't allocate sync bitset");
347 vfree(lc->sync_bits); 356 vfree(lc->sync_bits);
348 vfree(lc->clean_bits); 357 if (!dev)
358 vfree(lc->clean_bits);
359 vfree(lc->disk_header);
349 kfree(lc); 360 kfree(lc);
350 return -ENOMEM; 361 return -ENOMEM;
351 } 362 }
352 memset(lc->recovering_bits, 0, bitset_size); 363 memset(lc->recovering_bits, 0, bitset_size);
353 lc->sync_search = 0; 364 lc->sync_search = 0;
354 log->context = lc; 365 log->context = lc;
366
355 return 0; 367 return 0;
356} 368}
357 369
358static void core_dtr(struct dirty_log *log) 370static int core_ctr(struct dirty_log *log, struct dm_target *ti,
371 unsigned int argc, char **argv)
372{
373 return create_log_context(log, ti, argc, argv, NULL);
374}
375
376static void destroy_log_context(struct log_c *lc)
359{ 377{
360 struct log_c *lc = (struct log_c *) log->context;
361 vfree(lc->clean_bits);
362 vfree(lc->sync_bits); 378 vfree(lc->sync_bits);
363 vfree(lc->recovering_bits); 379 vfree(lc->recovering_bits);
364 kfree(lc); 380 kfree(lc);
365} 381}
366 382
383static void core_dtr(struct dirty_log *log)
384{
385 struct log_c *lc = (struct log_c *) log->context;
386
387 vfree(lc->clean_bits);
388 destroy_log_context(lc);
389}
390
367/*---------------------------------------------------------------- 391/*----------------------------------------------------------------
368 * disk log constructor/destructor 392 * disk log constructor/destructor
369 * 393 *
@@ -373,8 +397,6 @@ static int disk_ctr(struct dirty_log *log, struct dm_target *ti,
373 unsigned int argc, char **argv) 397 unsigned int argc, char **argv)
374{ 398{
375 int r; 399 int r;
376 size_t size;
377 struct log_c *lc;
378 struct dm_dev *dev; 400 struct dm_dev *dev;
379 401
380 if (argc < 2 || argc > 3) { 402 if (argc < 2 || argc > 3) {
@@ -387,49 +409,22 @@ static int disk_ctr(struct dirty_log *log, struct dm_target *ti,
387 if (r) 409 if (r)
388 return r; 410 return r;
389 411
390 r = core_ctr(log, ti, argc - 1, argv + 1); 412 r = create_log_context(log, ti, argc - 1, argv + 1, dev);
391 if (r) { 413 if (r) {
392 dm_put_device(ti, dev); 414 dm_put_device(ti, dev);
393 return r; 415 return r;
394 } 416 }
395 417
396 lc = (struct log_c *) log->context;
397 lc->log_dev = dev;
398
399 /* setup the disk header fields */
400 lc->header_location.bdev = lc->log_dev->bdev;
401 lc->header_location.sector = 0;
402 lc->header_location.count = 1;
403
404 /*
405 * We can't read less than this amount, even though we'll
406 * not be using most of this space.
407 */
408 lc->disk_header = vmalloc(1 << SECTOR_SHIFT);
409 if (!lc->disk_header)
410 goto bad;
411
412 /* setup the disk bitset fields */
413 lc->bits_location.bdev = lc->log_dev->bdev;
414 lc->bits_location.sector = LOG_OFFSET;
415
416 size = dm_round_up(lc->bitset_uint32_count * sizeof(uint32_t),
417 1 << SECTOR_SHIFT);
418 lc->bits_location.count = size >> SECTOR_SHIFT;
419 return 0; 418 return 0;
420
421 bad:
422 dm_put_device(ti, lc->log_dev);
423 core_dtr(log);
424 return -ENOMEM;
425} 419}
426 420
427static void disk_dtr(struct dirty_log *log) 421static void disk_dtr(struct dirty_log *log)
428{ 422{
429 struct log_c *lc = (struct log_c *) log->context; 423 struct log_c *lc = (struct log_c *) log->context;
424
430 dm_put_device(lc->ti, lc->log_dev); 425 dm_put_device(lc->ti, lc->log_dev);
431 vfree(lc->disk_header); 426 vfree(lc->disk_header);
432 core_dtr(log); 427 destroy_log_context(lc);
433} 428}
434 429
435static int count_bits32(uint32_t *addr, unsigned size) 430static int count_bits32(uint32_t *addr, unsigned size)
@@ -454,12 +449,7 @@ static int disk_resume(struct dirty_log *log)
454 if (r) 449 if (r)
455 return r; 450 return r;
456 451
457 /* read the bits */ 452 /* set or clear any new bits -- device has grown */
458 r = read_bits(lc);
459 if (r)
460 return r;
461
462 /* set or clear any new bits */
463 if (lc->sync == NOSYNC) 453 if (lc->sync == NOSYNC)
464 for (i = lc->header.nr_regions; i < lc->region_count; i++) 454 for (i = lc->header.nr_regions; i < lc->region_count; i++)
465 /* FIXME: amazingly inefficient */ 455 /* FIXME: amazingly inefficient */
@@ -469,15 +459,14 @@ static int disk_resume(struct dirty_log *log)
469 /* FIXME: amazingly inefficient */ 459 /* FIXME: amazingly inefficient */
470 log_clear_bit(lc, lc->clean_bits, i); 460 log_clear_bit(lc, lc->clean_bits, i);
471 461
462 /* clear any old bits -- device has shrunk */
463 for (i = lc->region_count; i % (sizeof(*lc->clean_bits) << BYTE_SHIFT); i++)
464 log_clear_bit(lc, lc->clean_bits, i);
465
472 /* copy clean across to sync */ 466 /* copy clean across to sync */
473 memcpy(lc->sync_bits, lc->clean_bits, size); 467 memcpy(lc->sync_bits, lc->clean_bits, size);
474 lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count); 468 lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count);
475 469
476 /* write the bits */
477 r = write_bits(lc);
478 if (r)
479 return r;
480
481 /* set the correct number of regions in the header */ 470 /* set the correct number of regions in the header */
482 lc->header.nr_regions = lc->region_count; 471 lc->header.nr_regions = lc->region_count;
483 472
@@ -518,7 +507,7 @@ static int disk_flush(struct dirty_log *log)
518 if (!lc->touched) 507 if (!lc->touched)
519 return 0; 508 return 0;
520 509
521 r = write_bits(lc); 510 r = write_header(lc);
522 if (!r) 511 if (!r)
523 lc->touched = 0; 512 lc->touched = 0;
524 513
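The mirror-log rework above folds the separate header and bitset regions into one vmalloc'd buffer: the header sits at the start, the clean bitset begins LOG_OFFSET sectors in, and a single write_header() flushes both, which is why read_bits()/write_bits() could be deleted and why disk_flush() now calls write_header(). The layout arithmetic, with a worked example that assumes LOG_OFFSET is 2 sectors and a 512-byte hardsect_size:

/* One buffer holds header + bitset, rounded up to the hard sector
 * size so chunk_io() can move it in a single transfer.             */
bitset_size = dm_round_up(region_count,
			  sizeof(*lc->clean_bits) << BYTE_SHIFT) >> BYTE_SHIFT;

buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + bitset_size,
		       ti->limits.hardsect_size);

lc->disk_header = vmalloc(buf_size);
lc->clean_bits  = (void *)lc->disk_header + (LOG_OFFSET << SECTOR_SHIFT);
lc->header_location.count = buf_size >> SECTOR_SHIFT;

/* e.g. region_count = 1000 with a 32-bit bitset word:
 *   bitset_size = 128 bytes, header area = 2 * 512 = 1024 bytes,
 *   buf_size    = round_up(1024 + 128, 512) = 1536 bytes = 3 sectors */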
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 1816f30678ed..93f701ea87bc 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -21,6 +21,7 @@
21#include <linux/workqueue.h> 21#include <linux/workqueue.h>
22#include <asm/atomic.h> 22#include <asm/atomic.h>
23 23
24#define DM_MSG_PREFIX "multipath"
24#define MESG_STR(x) x, sizeof(x) 25#define MESG_STR(x) x, sizeof(x)
25 26
26/* Path properties */ 27/* Path properties */
@@ -446,8 +447,6 @@ struct param {
446 char *error; 447 char *error;
447}; 448};
448 449
449#define ESTR(s) ("dm-multipath: " s)
450
451static int read_param(struct param *param, char *str, unsigned *v, char **error) 450static int read_param(struct param *param, char *str, unsigned *v, char **error)
452{ 451{
453 if (!str || 452 if (!str ||
@@ -495,12 +494,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
495 unsigned ps_argc; 494 unsigned ps_argc;
496 495
497 static struct param _params[] = { 496 static struct param _params[] = {
498 {0, 1024, ESTR("invalid number of path selector args")}, 497 {0, 1024, "invalid number of path selector args"},
499 }; 498 };
500 499
501 pst = dm_get_path_selector(shift(as)); 500 pst = dm_get_path_selector(shift(as));
502 if (!pst) { 501 if (!pst) {
503 ti->error = ESTR("unknown path selector type"); 502 ti->error = "unknown path selector type";
504 return -EINVAL; 503 return -EINVAL;
505 } 504 }
506 505
@@ -511,7 +510,7 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
511 r = pst->create(&pg->ps, ps_argc, as->argv); 510 r = pst->create(&pg->ps, ps_argc, as->argv);
512 if (r) { 511 if (r) {
513 dm_put_path_selector(pst); 512 dm_put_path_selector(pst);
514 ti->error = ESTR("path selector constructor failed"); 513 ti->error = "path selector constructor failed";
515 return r; 514 return r;
516 } 515 }
517 516
@@ -529,7 +528,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
529 528
530 /* we need at least a path arg */ 529 /* we need at least a path arg */
531 if (as->argc < 1) { 530 if (as->argc < 1) {
532 ti->error = ESTR("no device given"); 531 ti->error = "no device given";
533 return NULL; 532 return NULL;
534 } 533 }
535 534
@@ -540,7 +539,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
540 r = dm_get_device(ti, shift(as), ti->begin, ti->len, 539 r = dm_get_device(ti, shift(as), ti->begin, ti->len,
541 dm_table_get_mode(ti->table), &p->path.dev); 540 dm_table_get_mode(ti->table), &p->path.dev);
542 if (r) { 541 if (r) {
543 ti->error = ESTR("error getting device"); 542 ti->error = "error getting device";
544 goto bad; 543 goto bad;
545 } 544 }
546 545
@@ -562,8 +561,8 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
562 struct dm_target *ti) 561 struct dm_target *ti)
563{ 562{
564 static struct param _params[] = { 563 static struct param _params[] = {
565 {1, 1024, ESTR("invalid number of paths")}, 564 {1, 1024, "invalid number of paths"},
566 {0, 1024, ESTR("invalid number of selector args")} 565 {0, 1024, "invalid number of selector args"}
567 }; 566 };
568 567
569 int r; 568 int r;
@@ -572,13 +571,13 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
572 571
573 if (as->argc < 2) { 572 if (as->argc < 2) {
574 as->argc = 0; 573 as->argc = 0;
575 ti->error = ESTR("not enough priority group aruments"); 574 ti->error = "not enough priority group aruments";
576 return NULL; 575 return NULL;
577 } 576 }
578 577
579 pg = alloc_priority_group(); 578 pg = alloc_priority_group();
580 if (!pg) { 579 if (!pg) {
581 ti->error = ESTR("couldn't allocate priority group"); 580 ti->error = "couldn't allocate priority group";
582 return NULL; 581 return NULL;
583 } 582 }
584 pg->m = m; 583 pg->m = m;
@@ -633,7 +632,7 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m,
633 unsigned hw_argc; 632 unsigned hw_argc;
634 633
635 static struct param _params[] = { 634 static struct param _params[] = {
636 {0, 1024, ESTR("invalid number of hardware handler args")}, 635 {0, 1024, "invalid number of hardware handler args"},
637 }; 636 };
638 637
639 r = read_param(_params, shift(as), &hw_argc, &ti->error); 638 r = read_param(_params, shift(as), &hw_argc, &ti->error);
@@ -645,14 +644,14 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m,
645 644
646 hwht = dm_get_hw_handler(shift(as)); 645 hwht = dm_get_hw_handler(shift(as));
647 if (!hwht) { 646 if (!hwht) {
648 ti->error = ESTR("unknown hardware handler type"); 647 ti->error = "unknown hardware handler type";
649 return -EINVAL; 648 return -EINVAL;
650 } 649 }
651 650
652 r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv); 651 r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv);
653 if (r) { 652 if (r) {
654 dm_put_hw_handler(hwht); 653 dm_put_hw_handler(hwht);
655 ti->error = ESTR("hardware handler constructor failed"); 654 ti->error = "hardware handler constructor failed";
656 return r; 655 return r;
657 } 656 }
658 657
@@ -669,7 +668,7 @@ static int parse_features(struct arg_set *as, struct multipath *m,
669 unsigned argc; 668 unsigned argc;
670 669
671 static struct param _params[] = { 670 static struct param _params[] = {
672 {0, 1, ESTR("invalid number of feature args")}, 671 {0, 1, "invalid number of feature args"},
673 }; 672 };
674 673
675 r = read_param(_params, shift(as), &argc, &ti->error); 674 r = read_param(_params, shift(as), &argc, &ti->error);
@@ -692,8 +691,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
692{ 691{
693 /* target parameters */ 692 /* target parameters */
694 static struct param _params[] = { 693 static struct param _params[] = {
695 {1, 1024, ESTR("invalid number of priority groups")}, 694 {1, 1024, "invalid number of priority groups"},
696 {1, 1024, ESTR("invalid initial priority group number")}, 695 {1, 1024, "invalid initial priority group number"},
697 }; 696 };
698 697
699 int r; 698 int r;
@@ -707,10 +706,12 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
707 706
708 m = alloc_multipath(); 707 m = alloc_multipath();
709 if (!m) { 708 if (!m) {
710 ti->error = ESTR("can't allocate multipath"); 709 ti->error = "can't allocate multipath";
711 return -EINVAL; 710 return -EINVAL;
712 } 711 }
713 712
713 m->ti = ti;
714
714 r = parse_features(&as, m, ti); 715 r = parse_features(&as, m, ti);
715 if (r) 716 if (r)
716 goto bad; 717 goto bad;
@@ -746,13 +747,12 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
746 } 747 }
747 748
748 if (pg_count != m->nr_priority_groups) { 749 if (pg_count != m->nr_priority_groups) {
749 ti->error = ESTR("priority group count mismatch"); 750 ti->error = "priority group count mismatch";
750 r = -EINVAL; 751 r = -EINVAL;
751 goto bad; 752 goto bad;
752 } 753 }
753 754
754 ti->private = m; 755 ti->private = m;
755 m->ti = ti;
756 756
757 return 0; 757 return 0;
758 758
@@ -807,7 +807,7 @@ static int fail_path(struct pgpath *pgpath)
807 if (!pgpath->path.is_active) 807 if (!pgpath->path.is_active)
808 goto out; 808 goto out;
809 809
810 DMWARN("dm-multipath: Failing path %s.", pgpath->path.dev->name); 810 DMWARN("Failing path %s.", pgpath->path.dev->name);
811 811
812 pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path); 812 pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
813 pgpath->path.is_active = 0; 813 pgpath->path.is_active = 0;
@@ -1250,7 +1250,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1250 r = dm_get_device(ti, argv[1], ti->begin, ti->len, 1250 r = dm_get_device(ti, argv[1], ti->begin, ti->len,
1251 dm_table_get_mode(ti->table), &dev); 1251 dm_table_get_mode(ti->table), &dev);
1252 if (r) { 1252 if (r) {
1253 DMWARN("dm-multipath message: error getting device %s", 1253 DMWARN("message: error getting device %s",
1254 argv[1]); 1254 argv[1]);
1255 return -EINVAL; 1255 return -EINVAL;
1256 } 1256 }
@@ -1309,7 +1309,7 @@ static int __init dm_multipath_init(void)
1309 return -ENOMEM; 1309 return -ENOMEM;
1310 } 1310 }
1311 1311
1312 DMINFO("dm-multipath version %u.%u.%u loaded", 1312 DMINFO("version %u.%u.%u loaded",
1313 multipath_target.version[0], multipath_target.version[1], 1313 multipath_target.version[0], multipath_target.version[1],
1314 multipath_target.version[2]); 1314 multipath_target.version[2]);
1315 1315
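Apart from dropping the ESTR() wrapper now that DM_MSG_PREFIX carries the "multipath" prefix, the functional change in multipath_ctr() above is that m->ti is set before parse_features() runs instead of at the end of the constructor. The param-table idiom all of those error strings feed into, paraphrased from the structure visible above rather than quoted verbatim:

struct param {
	unsigned min;
	unsigned max;
	char *error;		/* becomes ti->error on a parse or range failure */
};

static int read_param(struct param *param, char *str,
		      unsigned *v, char **error)
{
	if (!str || sscanf(str, "%u", v) != 1 ||
	    *v < param->min || *v > param->max) {
		*error = param->error;
		return -EINVAL;
	}
	return 0;
}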
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index d12cf3e5e076..c54de989eb00 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -20,6 +20,8 @@
20#include <linux/vmalloc.h> 20#include <linux/vmalloc.h>
21#include <linux/workqueue.h> 21#include <linux/workqueue.h>
22 22
23#define DM_MSG_PREFIX "raid1"
24
23static struct workqueue_struct *_kmirrord_wq; 25static struct workqueue_struct *_kmirrord_wq;
24static struct work_struct _kmirrord_work; 26static struct work_struct _kmirrord_work;
25 27
@@ -106,12 +108,42 @@ struct region {
106 struct bio_list delayed_bios; 108 struct bio_list delayed_bios;
107}; 109};
108 110
111
112/*-----------------------------------------------------------------
113 * Mirror set structures.
114 *---------------------------------------------------------------*/
115struct mirror {
116 atomic_t error_count;
117 struct dm_dev *dev;
118 sector_t offset;
119};
120
121struct mirror_set {
122 struct dm_target *ti;
123 struct list_head list;
124 struct region_hash rh;
125 struct kcopyd_client *kcopyd_client;
126
127 spinlock_t lock; /* protects the next two lists */
128 struct bio_list reads;
129 struct bio_list writes;
130
131 /* recovery */
132 region_t nr_regions;
133 int in_sync;
134
135 struct mirror *default_mirror; /* Default mirror */
136
137 unsigned int nr_mirrors;
138 struct mirror mirror[0];
139};
140
109/* 141/*
110 * Conversion fns 142 * Conversion fns
111 */ 143 */
112static inline region_t bio_to_region(struct region_hash *rh, struct bio *bio) 144static inline region_t bio_to_region(struct region_hash *rh, struct bio *bio)
113{ 145{
114 return bio->bi_sector >> rh->region_shift; 146 return (bio->bi_sector - rh->ms->ti->begin) >> rh->region_shift;
115} 147}
116 148
117static inline sector_t region_to_sector(struct region_hash *rh, region_t region) 149static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
@@ -223,7 +255,9 @@ static struct region *__rh_alloc(struct region_hash *rh, region_t region)
223 struct region *reg, *nreg; 255 struct region *reg, *nreg;
224 256
225 read_unlock(&rh->hash_lock); 257 read_unlock(&rh->hash_lock);
226 nreg = mempool_alloc(rh->region_pool, GFP_NOIO); 258 nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
259 if (unlikely(!nreg))
260 nreg = kmalloc(sizeof(struct region), GFP_NOIO);
227 nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? 261 nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
228 RH_CLEAN : RH_NOSYNC; 262 RH_CLEAN : RH_NOSYNC;
229 nreg->rh = rh; 263 nreg->rh = rh;
@@ -458,11 +492,9 @@ static int __rh_recovery_prepare(struct region_hash *rh)
458 /* Already quiesced ? */ 492 /* Already quiesced ? */
459 if (atomic_read(&reg->pending)) 493 if (atomic_read(&reg->pending))
460 list_del_init(&reg->list); 494 list_del_init(&reg->list);
495 else
496 list_move(&reg->list, &rh->quiesced_regions);
461 497
462 else {
463 list_del_init(&reg->list);
464 list_add(&reg->list, &rh->quiesced_regions);
465 }
466 spin_unlock_irq(&rh->region_lock); 498 spin_unlock_irq(&rh->region_lock);
467 499
468 return 1; 500 return 1;
@@ -541,35 +573,6 @@ static void rh_start_recovery(struct region_hash *rh)
541 wake(); 573 wake();
542} 574}
543 575
544/*-----------------------------------------------------------------
545 * Mirror set structures.
546 *---------------------------------------------------------------*/
547struct mirror {
548 atomic_t error_count;
549 struct dm_dev *dev;
550 sector_t offset;
551};
552
553struct mirror_set {
554 struct dm_target *ti;
555 struct list_head list;
556 struct region_hash rh;
557 struct kcopyd_client *kcopyd_client;
558
559 spinlock_t lock; /* protects the next two lists */
560 struct bio_list reads;
561 struct bio_list writes;
562
563 /* recovery */
564 region_t nr_regions;
565 int in_sync;
566
567 struct mirror *default_mirror; /* Default mirror */
568
569 unsigned int nr_mirrors;
570 struct mirror mirror[0];
571};
572
573/* 576/*
574 * Every mirror should look like this one. 577 * Every mirror should look like this one.
575 */ 578 */
@@ -603,7 +606,7 @@ static void recovery_complete(int read_err, unsigned int write_err,
603 struct region *reg = (struct region *) context; 606 struct region *reg = (struct region *) context;
604 607
605 /* FIXME: better error handling */ 608 /* FIXME: better error handling */
606 rh_recovery_end(reg, read_err || write_err); 609 rh_recovery_end(reg, !(read_err || write_err));
607} 610}
608 611
609static int recover(struct mirror_set *ms, struct region *reg) 612static int recover(struct mirror_set *ms, struct region *reg)
@@ -893,7 +896,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
893 896
894 ms = kmalloc(len, GFP_KERNEL); 897 ms = kmalloc(len, GFP_KERNEL);
895 if (!ms) { 898 if (!ms) {
896 ti->error = "dm-mirror: Cannot allocate mirror context"; 899 ti->error = "Cannot allocate mirror context";
897 return NULL; 900 return NULL;
898 } 901 }
899 902
@@ -907,7 +910,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
907 ms->default_mirror = &ms->mirror[DEFAULT_MIRROR]; 910 ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];
908 911
909 if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { 912 if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
910 ti->error = "dm-mirror: Error creating dirty region hash"; 913 ti->error = "Error creating dirty region hash";
911 kfree(ms); 914 kfree(ms);
912 return NULL; 915 return NULL;
913 } 916 }
@@ -937,14 +940,14 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
937 unsigned long long offset; 940 unsigned long long offset;
938 941
939 if (sscanf(argv[1], "%llu", &offset) != 1) { 942 if (sscanf(argv[1], "%llu", &offset) != 1) {
940 ti->error = "dm-mirror: Invalid offset"; 943 ti->error = "Invalid offset";
941 return -EINVAL; 944 return -EINVAL;
942 } 945 }
943 946
944 if (dm_get_device(ti, argv[0], offset, ti->len, 947 if (dm_get_device(ti, argv[0], offset, ti->len,
945 dm_table_get_mode(ti->table), 948 dm_table_get_mode(ti->table),
946 &ms->mirror[mirror].dev)) { 949 &ms->mirror[mirror].dev)) {
947 ti->error = "dm-mirror: Device lookup failure"; 950 ti->error = "Device lookup failure";
948 return -ENXIO; 951 return -ENXIO;
949 } 952 }
950 953
@@ -981,30 +984,30 @@ static struct dirty_log *create_dirty_log(struct dm_target *ti,
981 struct dirty_log *dl; 984 struct dirty_log *dl;
982 985
983 if (argc < 2) { 986 if (argc < 2) {
984 ti->error = "dm-mirror: Insufficient mirror log arguments"; 987 ti->error = "Insufficient mirror log arguments";
985 return NULL; 988 return NULL;
986 } 989 }
987 990
988 if (sscanf(argv[1], "%u", &param_count) != 1) { 991 if (sscanf(argv[1], "%u", &param_count) != 1) {
989 ti->error = "dm-mirror: Invalid mirror log argument count"; 992 ti->error = "Invalid mirror log argument count";
990 return NULL; 993 return NULL;
991 } 994 }
992 995
993 *args_used = 2 + param_count; 996 *args_used = 2 + param_count;
994 997
995 if (argc < *args_used) { 998 if (argc < *args_used) {
996 ti->error = "dm-mirror: Insufficient mirror log arguments"; 999 ti->error = "Insufficient mirror log arguments";
997 return NULL; 1000 return NULL;
998 } 1001 }
999 1002
1000 dl = dm_create_dirty_log(argv[0], ti, param_count, argv + 2); 1003 dl = dm_create_dirty_log(argv[0], ti, param_count, argv + 2);
1001 if (!dl) { 1004 if (!dl) {
1002 ti->error = "dm-mirror: Error creating mirror dirty log"; 1005 ti->error = "Error creating mirror dirty log";
1003 return NULL; 1006 return NULL;
1004 } 1007 }
1005 1008
1006 if (!_check_region_size(ti, dl->type->get_region_size(dl))) { 1009 if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
1007 ti->error = "dm-mirror: Invalid region size"; 1010 ti->error = "Invalid region size";
1008 dm_destroy_dirty_log(dl); 1011 dm_destroy_dirty_log(dl);
1009 return NULL; 1012 return NULL;
1010 } 1013 }
@@ -1038,7 +1041,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1038 1041
1039 if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 || 1042 if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
1040 nr_mirrors < 2 || nr_mirrors > KCOPYD_MAX_REGIONS + 1) { 1043 nr_mirrors < 2 || nr_mirrors > KCOPYD_MAX_REGIONS + 1) {
1041 ti->error = "dm-mirror: Invalid number of mirrors"; 1044 ti->error = "Invalid number of mirrors";
1042 dm_destroy_dirty_log(dl); 1045 dm_destroy_dirty_log(dl);
1043 return -EINVAL; 1046 return -EINVAL;
1044 } 1047 }
@@ -1046,7 +1049,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1046 argv++, argc--; 1049 argv++, argc--;
1047 1050
1048 if (argc != nr_mirrors * 2) { 1051 if (argc != nr_mirrors * 2) {
1049 ti->error = "dm-mirror: Wrong number of mirror arguments"; 1052 ti->error = "Wrong number of mirror arguments";
1050 dm_destroy_dirty_log(dl); 1053 dm_destroy_dirty_log(dl);
1051 return -EINVAL; 1054 return -EINVAL;
1052 } 1055 }
@@ -1115,7 +1118,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
1115 struct mirror *m; 1118 struct mirror *m;
1116 struct mirror_set *ms = ti->private; 1119 struct mirror_set *ms = ti->private;
1117 1120
1118 map_context->ll = bio->bi_sector >> ms->rh.region_shift; 1121 map_context->ll = bio_to_region(&ms->rh, bio);
1119 1122
1120 if (rw == WRITE) { 1123 if (rw == WRITE) {
1121 queue_bio(ms, bio, rw); 1124 queue_bio(ms, bio, rw);
@@ -1221,7 +1224,7 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
1221 1224
1222static struct target_type mirror_target = { 1225static struct target_type mirror_target = {
1223 .name = "mirror", 1226 .name = "mirror",
1224 .version = {1, 0, 1}, 1227 .version = {1, 0, 2},
1225 .module = THIS_MODULE, 1228 .module = THIS_MODULE,
1226 .ctr = mirror_ctr, 1229 .ctr = mirror_ctr,
1227 .dtr = mirror_dtr, 1230 .dtr = mirror_dtr,
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index d0024865a789..c5a16c550122 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -14,6 +14,8 @@
14 14
15#include <linux/slab.h> 15#include <linux/slab.h>
16 16
17#define DM_MSG_PREFIX "multipath round-robin"
18
17/*----------------------------------------------------------------- 19/*-----------------------------------------------------------------
18 * Path-handling code, paths are held in lists 20 * Path-handling code, paths are held in lists
19 *---------------------------------------------------------------*/ 21 *---------------------------------------------------------------*/
@@ -191,9 +193,9 @@ static int __init dm_rr_init(void)
191 int r = dm_register_path_selector(&rr_ps); 193 int r = dm_register_path_selector(&rr_ps);
192 194
193 if (r < 0) 195 if (r < 0)
194 DMERR("round-robin: register failed %d", r); 196 DMERR("register failed %d", r);
195 197
196 DMINFO("dm-round-robin version 1.0.0 loaded"); 198 DMINFO("version 1.0.0 loaded");
197 199
198 return r; 200 return r;
199} 201}
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 08312b46463a..1d0fafda0f76 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -7,7 +7,6 @@
7 */ 7 */
8 8
9#include <linux/blkdev.h> 9#include <linux/blkdev.h>
10#include <linux/config.h>
11#include <linux/ctype.h> 10#include <linux/ctype.h>
12#include <linux/device-mapper.h> 11#include <linux/device-mapper.h>
13#include <linux/fs.h> 12#include <linux/fs.h>
@@ -23,6 +22,8 @@
23#include "dm-bio-list.h" 22#include "dm-bio-list.h"
24#include "kcopyd.h" 23#include "kcopyd.h"
25 24
25#define DM_MSG_PREFIX "snapshots"
26
26/* 27/*
27 * The percentage increment we will wake up users at 28 * The percentage increment we will wake up users at
28 */ 29 */
@@ -117,7 +118,7 @@ static int init_origin_hash(void)
117 _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head), 118 _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
118 GFP_KERNEL); 119 GFP_KERNEL);
119 if (!_origins) { 120 if (!_origins) {
120 DMERR("Device mapper: Snapshot: unable to allocate memory"); 121 DMERR("unable to allocate memory");
121 return -ENOMEM; 122 return -ENOMEM;
122 } 123 }
123 124
@@ -412,7 +413,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
412 int blocksize; 413 int blocksize;
413 414
414 if (argc < 4) { 415 if (argc < 4) {
415 ti->error = "dm-snapshot: requires exactly 4 arguments"; 416 ti->error = "requires exactly 4 arguments";
416 r = -EINVAL; 417 r = -EINVAL;
417 goto bad1; 418 goto bad1;
418 } 419 }
@@ -530,7 +531,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
530 } 531 }
531 532
532 ti->private = s; 533 ti->private = s;
533 ti->split_io = chunk_size; 534 ti->split_io = s->chunk_size;
534 535
535 return 0; 536 return 0;
536 537
@@ -1127,7 +1128,7 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1127 struct dm_dev *dev; 1128 struct dm_dev *dev;
1128 1129
1129 if (argc != 1) { 1130 if (argc != 1) {
1130 ti->error = "dm-origin: incorrect number of arguments"; 1131 ti->error = "origin: incorrect number of arguments";
1131 return -EINVAL; 1132 return -EINVAL;
1132 } 1133 }
1133 1134
@@ -1204,7 +1205,7 @@ static int origin_status(struct dm_target *ti, status_type_t type, char *result,
1204 1205
1205static struct target_type origin_target = { 1206static struct target_type origin_target = {
1206 .name = "snapshot-origin", 1207 .name = "snapshot-origin",
1207 .version = {1, 1, 0}, 1208 .version = {1, 4, 0},
1208 .module = THIS_MODULE, 1209 .module = THIS_MODULE,
1209 .ctr = origin_ctr, 1210 .ctr = origin_ctr,
1210 .dtr = origin_dtr, 1211 .dtr = origin_dtr,
@@ -1215,7 +1216,7 @@ static struct target_type origin_target = {
1215 1216
1216static struct target_type snapshot_target = { 1217static struct target_type snapshot_target = {
1217 .name = "snapshot", 1218 .name = "snapshot",
1218 .version = {1, 1, 0}, 1219 .version = {1, 4, 0},
1219 .module = THIS_MODULE, 1220 .module = THIS_MODULE,
1220 .ctr = snapshot_ctr, 1221 .ctr = snapshot_ctr,
1221 .dtr = snapshot_dtr, 1222 .dtr = snapshot_dtr,
@@ -1236,7 +1237,7 @@ static int __init dm_snapshot_init(void)
1236 1237
1237 r = dm_register_target(&origin_target); 1238 r = dm_register_target(&origin_target);
1238 if (r < 0) { 1239 if (r < 0) {
1239 DMERR("Device mapper: Origin: register failed %d\n", r); 1240 DMERR("Origin target register failed %d", r);
1240 goto bad1; 1241 goto bad1;
1241 } 1242 }
1242 1243
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 08328a8f5a3c..6c29fcecd892 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -12,6 +12,8 @@
12#include <linux/bio.h> 12#include <linux/bio.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14 14
15#define DM_MSG_PREFIX "striped"
16
15struct stripe { 17struct stripe {
16 struct dm_dev *dev; 18 struct dm_dev *dev;
17 sector_t physical_start; 19 sector_t physical_start;
@@ -78,19 +80,19 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
78 unsigned int i; 80 unsigned int i;
79 81
80 if (argc < 2) { 82 if (argc < 2) {
81 ti->error = "dm-stripe: Not enough arguments"; 83 ti->error = "Not enough arguments";
82 return -EINVAL; 84 return -EINVAL;
83 } 85 }
84 86
85 stripes = simple_strtoul(argv[0], &end, 10); 87 stripes = simple_strtoul(argv[0], &end, 10);
86 if (*end) { 88 if (*end) {
87 ti->error = "dm-stripe: Invalid stripe count"; 89 ti->error = "Invalid stripe count";
88 return -EINVAL; 90 return -EINVAL;
89 } 91 }
90 92
91 chunk_size = simple_strtoul(argv[1], &end, 10); 93 chunk_size = simple_strtoul(argv[1], &end, 10);
92 if (*end) { 94 if (*end) {
93 ti->error = "dm-stripe: Invalid chunk_size"; 95 ti->error = "Invalid chunk_size";
94 return -EINVAL; 96 return -EINVAL;
95 } 97 }
96 98
@@ -99,19 +101,19 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
99 */ 101 */
100 if (!chunk_size || (chunk_size & (chunk_size - 1)) || 102 if (!chunk_size || (chunk_size & (chunk_size - 1)) ||
101 (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) { 103 (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) {
102 ti->error = "dm-stripe: Invalid chunk size"; 104 ti->error = "Invalid chunk size";
103 return -EINVAL; 105 return -EINVAL;
104 } 106 }
105 107
106 if (ti->len & (chunk_size - 1)) { 108 if (ti->len & (chunk_size - 1)) {
107 ti->error = "dm-stripe: Target length not divisible by " 109 ti->error = "Target length not divisible by "
108 "chunk size"; 110 "chunk size";
109 return -EINVAL; 111 return -EINVAL;
110 } 112 }
111 113
112 width = ti->len; 114 width = ti->len;
113 if (sector_div(width, stripes)) { 115 if (sector_div(width, stripes)) {
114 ti->error = "dm-stripe: Target length not divisible by " 116 ti->error = "Target length not divisible by "
115 "number of stripes"; 117 "number of stripes";
116 return -EINVAL; 118 return -EINVAL;
117 } 119 }
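
The chunk_size validation earlier in this hunk relies on the usual power-of-two test: for a non-zero n, n & (n - 1) clears the lowest set bit, so the result is zero only when exactly one bit is set. A small standalone check of the same idiom, assuming 512-byte sectors and a 4 KiB page (the real code takes these from SECTOR_SHIFT and PAGE_SIZE):

#include <stdio.h>

#define SECTOR_SHIFT	9		/* 512-byte sectors */
#define PAGE_SIZE	4096UL		/* assumed page size for this example */

static int valid_chunk_size(unsigned long chunk_size)
{
	if (!chunk_size)
		return 0;
	if (chunk_size & (chunk_size - 1))		/* not a power of two */
		return 0;
	if (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))	/* smaller than one page */
		return 0;
	return 1;
}

int main(void)
{
	unsigned long sizes[] = { 0, 6, 8, 64, 96, 128 };

	for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("%lu sectors: %s\n", sizes[i],
		       valid_chunk_size(sizes[i]) ? "ok" : "rejected");
	return 0;
}
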
@@ -120,14 +122,14 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
120 * Do we have enough arguments for that many stripes ? 122 * Do we have enough arguments for that many stripes ?
121 */ 123 */
122 if (argc != (2 + 2 * stripes)) { 124 if (argc != (2 + 2 * stripes)) {
123 ti->error = "dm-stripe: Not enough destinations " 125 ti->error = "Not enough destinations "
124 "specified"; 126 "specified";
125 return -EINVAL; 127 return -EINVAL;
126 } 128 }
127 129
128 sc = alloc_context(stripes); 130 sc = alloc_context(stripes);
129 if (!sc) { 131 if (!sc) {
130 ti->error = "dm-stripe: Memory allocation for striped context " 132 ti->error = "Memory allocation for striped context "
131 "failed"; 133 "failed";
132 return -ENOMEM; 134 return -ENOMEM;
133 } 135 }
@@ -149,8 +151,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
149 151
150 r = get_stripe(ti, sc, i, argv); 152 r = get_stripe(ti, sc, i, argv);
151 if (r < 0) { 153 if (r < 0) {
152 ti->error = "dm-stripe: Couldn't parse stripe " 154 ti->error = "Couldn't parse stripe destination";
153 "destination";
154 while (i--) 155 while (i--)
155 dm_put_device(ti, sc->stripe[i].dev); 156 dm_put_device(ti, sc->stripe[i].dev);
156 kfree(sc); 157 kfree(sc);
@@ -227,7 +228,7 @@ int __init dm_stripe_init(void)
227 228
228 r = dm_register_target(&stripe_target); 229 r = dm_register_target(&stripe_target);
229 if (r < 0) 230 if (r < 0)
230 DMWARN("striped target registration failed"); 231 DMWARN("target registration failed");
231 232
232 return r; 233 return r;
233} 234}
@@ -235,7 +236,7 @@ int __init dm_stripe_init(void)
235void dm_stripe_exit(void) 236void dm_stripe_exit(void)
236{ 237{
237 if (dm_unregister_target(&stripe_target)) 238 if (dm_unregister_target(&stripe_target))
238 DMWARN("striped target unregistration failed"); 239 DMWARN("target unregistration failed");
239 240
240 return; 241 return;
241} 242}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 8f56a54cf0ce..75fe9493e6af 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -17,6 +17,8 @@
17#include <linux/mutex.h> 17#include <linux/mutex.h>
18#include <asm/atomic.h> 18#include <asm/atomic.h>
19 19
20#define DM_MSG_PREFIX "table"
21
20#define MAX_DEPTH 16 22#define MAX_DEPTH 16
21#define NODE_SIZE L1_CACHE_BYTES 23#define NODE_SIZE L1_CACHE_BYTES
22#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) 24#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
@@ -237,6 +239,44 @@ int dm_table_create(struct dm_table **result, int mode,
237 return 0; 239 return 0;
238} 240}
239 241
242int dm_create_error_table(struct dm_table **result, struct mapped_device *md)
243{
244 struct dm_table *t;
245 sector_t dev_size = 1;
246 int r;
247
248 /*
249 * Find current size of device.
250 * Default to 1 sector if inactive.
251 */
252 t = dm_get_table(md);
253 if (t) {
254 dev_size = dm_table_get_size(t);
255 dm_table_put(t);
256 }
257
258 r = dm_table_create(&t, FMODE_READ, 1, md);
259 if (r)
260 return r;
261
262 r = dm_table_add_target(t, "error", 0, dev_size, NULL);
263 if (r)
264 goto out;
265
266 r = dm_table_complete(t);
267 if (r)
268 goto out;
269
270 *result = t;
271
272out:
273 if (r)
274 dm_table_put(t);
275
276 return r;
277}
278EXPORT_SYMBOL_GPL(dm_create_error_table);
279
240static void free_devices(struct list_head *devices) 280static void free_devices(struct list_head *devices)
241{ 281{
242 struct list_head *tmp, *next; 282 struct list_head *tmp, *next;
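
dm_create_error_table() builds a one-target table whose single "error" target spans the current device size, falling back to 1 sector when no table is loaded yet. A userspace model of that sizing-and-fallback logic; the structs and helpers below are stand-ins, not the device-mapper API:

#include <stdio.h>
#include <stdlib.h>

typedef unsigned long long sector_t;

struct table {
	sector_t size;		/* length covered by the single target */
	const char *target;	/* target type name */
};

/* Stand-in for "what size is the device right now?": the size of the
 * currently loaded table, or 0 when nothing is loaded. */
static sector_t current_size(const struct table *live)
{
	return live ? live->size : 0;
}

static struct table *create_error_table(const struct table *live)
{
	struct table *t = malloc(sizeof(*t));

	if (!t)
		return NULL;
	/* Default to 1 sector if the device is inactive, like the helper above. */
	t->size = current_size(live) ? current_size(live) : 1;
	t->target = "error";
	return t;
}

int main(void)
{
	struct table live = { .size = 2097152, .target = "linear" };
	struct table *a = create_error_table(&live);	/* active device */
	struct table *b = create_error_table(NULL);	/* inactive device */

	if (!a || !b)
		return 1;
	printf("active:   %llu sectors of \"%s\"\n", a->size, a->target);
	printf("inactive: %llu sectors of \"%s\"\n", b->size, b->target);
	free(a);
	free(b);
	return 0;
}
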
@@ -590,6 +630,12 @@ int dm_split_args(int *argc, char ***argvp, char *input)
590 unsigned array_size = 0; 630 unsigned array_size = 0;
591 631
592 *argc = 0; 632 *argc = 0;
633
634 if (!input) {
635 *argvp = NULL;
636 return 0;
637 }
638
593 argv = realloc_argv(&array_size, argv); 639 argv = realloc_argv(&array_size, argv);
594 if (!argv) 640 if (!argv)
595 return -ENOMEM; 641 return -ENOMEM;
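
The new guard lets callers hand dm_split_args() a NULL parameter string and get back argc == 0 instead of a crash, which is exactly what dm_create_error_table() does when it adds the "error" target with NULL params. A simplified splitter with the same contract (plain whitespace splitting only; the real dm_split_args also handles quoting and backslash escapes):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int split_args(int *argc, char ***argvp, char *input)
{
	char **argv = NULL;
	size_t cap = 0;
	char *tok;

	*argc = 0;

	/* NULL input means "no optional arguments": report zero args. */
	if (!input) {
		*argvp = NULL;
		return 0;
	}

	for (tok = strtok(input, " \t"); tok; tok = strtok(NULL, " \t")) {
		if ((size_t)*argc == cap) {
			char **bigger;

			cap = cap ? cap * 2 : 4;
			bigger = realloc(argv, cap * sizeof(*argv));
			if (!bigger) {
				free(argv);
				return -1;
			}
			argv = bigger;
		}
		argv[(*argc)++] = tok;
	}
	*argvp = argv;
	return 0;
}

int main(void)
{
	char line[] = "core 1 1024";
	char **argv;
	int argc;

	split_args(&argc, &argv, NULL);	/* no params: argc stays 0 */
	printf("NULL input -> argc=%d\n", argc);

	split_args(&argc, &argv, line);
	printf("\"core 1 1024\" -> argc=%d, argv[0]=%s\n", argc, argv[0]);
	free(argv);
	return 0;
}
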
@@ -671,15 +717,14 @@ int dm_table_add_target(struct dm_table *t, const char *type,
671 memset(tgt, 0, sizeof(*tgt)); 717 memset(tgt, 0, sizeof(*tgt));
672 718
673 if (!len) { 719 if (!len) {
674 tgt->error = "zero-length target"; 720 DMERR("%s: zero-length target", dm_device_name(t->md));
675 DMERR("%s", tgt->error);
676 return -EINVAL; 721 return -EINVAL;
677 } 722 }
678 723
679 tgt->type = dm_get_target_type(type); 724 tgt->type = dm_get_target_type(type);
680 if (!tgt->type) { 725 if (!tgt->type) {
681 tgt->error = "unknown target type"; 726 DMERR("%s: %s: unknown target type", dm_device_name(t->md),
682 DMERR("%s", tgt->error); 727 type);
683 return -EINVAL; 728 return -EINVAL;
684 } 729 }
685 730
@@ -716,7 +761,7 @@ int dm_table_add_target(struct dm_table *t, const char *type,
716 return 0; 761 return 0;
717 762
718 bad: 763 bad:
719 DMERR("%s", tgt->error); 764 DMERR("%s: %s: %s", dm_device_name(t->md), type, tgt->error);
720 dm_put_target_type(tgt->type); 765 dm_put_target_type(tgt->type);
721 return r; 766 return r;
722} 767}
@@ -802,7 +847,7 @@ sector_t dm_table_get_size(struct dm_table *t)
802 847
803struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) 848struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
804{ 849{
805 if (index > t->num_targets) 850 if (index >= t->num_targets)
806 return NULL; 851 return NULL;
807 852
808 return t->targets + index; 853 return t->targets + index;
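
The bound check above changes > to >=: with num_targets entries, valid indices run from 0 to num_targets - 1, so index == num_targets must also return NULL rather than a pointer one element past the array. In sketch form:

#include <stdio.h>

struct target { const char *name; };

static struct target *get_target(struct target *targets,
				 unsigned int num_targets, unsigned int index)
{
	if (index >= num_targets)	/* ">" would let index == num_targets through */
		return NULL;
	return targets + index;
}

int main(void)
{
	struct target t[2] = { { "linear" }, { "error" } };

	printf("%s\n", get_target(t, 2, 1)->name);	/* last valid entry */
	printf("%p\n", (void *)get_target(t, 2, 2));	/* out of range -> NULL */
	return 0;
}
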
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 64fd8e79ea4c..477a041a41cf 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -12,6 +12,8 @@
12#include <linux/bio.h> 12#include <linux/bio.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14 14
15#define DM_MSG_PREFIX "target"
16
15struct tt_internal { 17struct tt_internal {
16 struct target_type tt; 18 struct target_type tt;
17 19
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index 51c0639b2487..ea569f7348d2 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -10,13 +10,15 @@
10#include <linux/init.h> 10#include <linux/init.h>
11#include <linux/bio.h> 11#include <linux/bio.h>
12 12
13#define DM_MSG_PREFIX "zero"
14
13/* 15/*
14 * Construct a dummy mapping that only returns zeros 16 * Construct a dummy mapping that only returns zeros
15 */ 17 */
16static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv) 18static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv)
17{ 19{
18 if (argc != 0) { 20 if (argc != 0) {
19 ti->error = "dm-zero: No arguments required"; 21 ti->error = "No arguments required";
20 return -EINVAL; 22 return -EINVAL;
21 } 23 }
22 24
@@ -60,7 +62,7 @@ static int __init dm_zero_init(void)
60 int r = dm_register_target(&zero_target); 62 int r = dm_register_target(&zero_target);
61 63
62 if (r < 0) 64 if (r < 0)
63 DMERR("zero: register failed %d", r); 65 DMERR("register failed %d", r);
64 66
65 return r; 67 return r;
66} 68}
@@ -70,7 +72,7 @@ static void __exit dm_zero_exit(void)
70 int r = dm_unregister_target(&zero_target); 72 int r = dm_unregister_target(&zero_target);
71 73
72 if (r < 0) 74 if (r < 0)
73 DMERR("zero: unregister failed %d", r); 75 DMERR("unregister failed %d", r);
74} 76}
75 77
76module_init(dm_zero_init) 78module_init(dm_zero_init)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4d710b7a133b..c99bf9f01759 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2004 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
@@ -21,11 +21,14 @@
21#include <linux/hdreg.h> 21#include <linux/hdreg.h>
22#include <linux/blktrace_api.h> 22#include <linux/blktrace_api.h>
23 23
24#define DM_MSG_PREFIX "core"
25
24static const char *_name = DM_NAME; 26static const char *_name = DM_NAME;
25 27
26static unsigned int major = 0; 28static unsigned int major = 0;
27static unsigned int _major = 0; 29static unsigned int _major = 0;
28 30
31static DEFINE_SPINLOCK(_minor_lock);
29/* 32/*
30 * One of these is allocated per bio. 33 * One of these is allocated per bio.
31 */ 34 */
@@ -49,23 +52,28 @@ struct target_io {
49 52
50union map_info *dm_get_mapinfo(struct bio *bio) 53union map_info *dm_get_mapinfo(struct bio *bio)
51{ 54{
52 if (bio && bio->bi_private) 55 if (bio && bio->bi_private)
53 return &((struct target_io *)bio->bi_private)->info; 56 return &((struct target_io *)bio->bi_private)->info;
54 return NULL; 57 return NULL;
55} 58}
56 59
60#define MINOR_ALLOCED ((void *)-1)
61
57/* 62/*
58 * Bits for the md->flags field. 63 * Bits for the md->flags field.
59 */ 64 */
60#define DMF_BLOCK_IO 0 65#define DMF_BLOCK_IO 0
61#define DMF_SUSPENDED 1 66#define DMF_SUSPENDED 1
62#define DMF_FROZEN 2 67#define DMF_FROZEN 2
68#define DMF_FREEING 3
69#define DMF_DELETING 4
63 70
64struct mapped_device { 71struct mapped_device {
65 struct rw_semaphore io_lock; 72 struct rw_semaphore io_lock;
66 struct semaphore suspend_lock; 73 struct semaphore suspend_lock;
67 rwlock_t map_lock; 74 rwlock_t map_lock;
68 atomic_t holders; 75 atomic_t holders;
76 atomic_t open_count;
69 77
70 unsigned long flags; 78 unsigned long flags;
71 79
@@ -159,7 +167,7 @@ static void local_exit(void)
159 bioset_free(dm_set); 167 bioset_free(dm_set);
160 168
161 if (unregister_blkdev(_major, _name) < 0) 169 if (unregister_blkdev(_major, _name) < 0)
162 DMERR("devfs_unregister_blkdev failed"); 170 DMERR("unregister_blkdev failed");
163 171
164 _major = 0; 172 _major = 0;
165 173
@@ -218,9 +226,25 @@ static int dm_blk_open(struct inode *inode, struct file *file)
218{ 226{
219 struct mapped_device *md; 227 struct mapped_device *md;
220 228
229 spin_lock(&_minor_lock);
230
221 md = inode->i_bdev->bd_disk->private_data; 231 md = inode->i_bdev->bd_disk->private_data;
232 if (!md)
233 goto out;
234
235 if (test_bit(DMF_FREEING, &md->flags) ||
236 test_bit(DMF_DELETING, &md->flags)) {
237 md = NULL;
238 goto out;
239 }
240
222 dm_get(md); 241 dm_get(md);
223 return 0; 242 atomic_inc(&md->open_count);
243
244out:
245 spin_unlock(&_minor_lock);
246
247 return md ? 0 : -ENXIO;
224} 248}
225 249
226static int dm_blk_close(struct inode *inode, struct file *file) 250static int dm_blk_close(struct inode *inode, struct file *file)
@@ -228,10 +252,35 @@ static int dm_blk_close(struct inode *inode, struct file *file)
228 struct mapped_device *md; 252 struct mapped_device *md;
229 253
230 md = inode->i_bdev->bd_disk->private_data; 254 md = inode->i_bdev->bd_disk->private_data;
255 atomic_dec(&md->open_count);
231 dm_put(md); 256 dm_put(md);
232 return 0; 257 return 0;
233} 258}
234 259
260int dm_open_count(struct mapped_device *md)
261{
262 return atomic_read(&md->open_count);
263}
264
265/*
266 * Guarantees nothing is using the device before it's deleted.
267 */
268int dm_lock_for_deletion(struct mapped_device *md)
269{
270 int r = 0;
271
272 spin_lock(&_minor_lock);
273
274 if (dm_open_count(md))
275 r = -EBUSY;
276 else
277 set_bit(DMF_DELETING, &md->flags);
278
279 spin_unlock(&_minor_lock);
280
281 return r;
282}
283
235static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 284static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
236{ 285{
237 struct mapped_device *md = bdev->bd_disk->private_data; 286 struct mapped_device *md = bdev->bd_disk->private_data;
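
dm_blk_open(), dm_blk_close() and dm_lock_for_deletion() above form a small interlock: the open count is maintained under _minor_lock, deletion only proceeds when that count is zero, and once DMF_DELETING (or DMF_FREEING) is set further opens fail. A userspace model of the same protocol, with a pthread mutex standing in for the spinlock and stand-in names throughout:

#include <pthread.h>
#include <stdio.h>

#define FLAG_DELETING	0x1

struct device {
	pthread_mutex_t lock;
	int open_count;
	unsigned flags;
};

static int dev_open(struct device *d)
{
	int r = 0;

	pthread_mutex_lock(&d->lock);
	if (d->flags & FLAG_DELETING)
		r = -1;			/* device is going away: refuse */
	else
		d->open_count++;
	pthread_mutex_unlock(&d->lock);
	return r;
}

static void dev_close(struct device *d)
{
	pthread_mutex_lock(&d->lock);
	d->open_count--;
	pthread_mutex_unlock(&d->lock);
}

/* Succeeds only when nobody has the device open; from then on opens fail. */
static int lock_for_deletion(struct device *d)
{
	int r = 0;

	pthread_mutex_lock(&d->lock);
	if (d->open_count)
		r = -1;			/* -EBUSY in the kernel version */
	else
		d->flags |= FLAG_DELETING;
	pthread_mutex_unlock(&d->lock);
	return r;
}

int main(void)
{
	struct device d = { .lock = PTHREAD_MUTEX_INITIALIZER };

	dev_open(&d);
	printf("delete while open: %d\n", lock_for_deletion(&d));	/* busy */
	dev_close(&d);
	printf("delete when idle:  %d\n", lock_for_deletion(&d));	/* ok */
	printf("open after delete: %d\n", dev_open(&d));		/* refused */
	return 0;
}
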
@@ -456,8 +505,8 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
456 if (r > 0) { 505 if (r > 0) {
457 /* the bio has been remapped so dispatch it */ 506 /* the bio has been remapped so dispatch it */
458 507
459 blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, 508 blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
460 tio->io->bio->bi_bdev->bd_dev, sector, 509 tio->io->bio->bi_bdev->bd_dev, sector,
461 clone->bi_sector); 510 clone->bi_sector);
462 511
463 generic_make_request(clone); 512 generic_make_request(clone);
@@ -744,43 +793,39 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
744/*----------------------------------------------------------------- 793/*-----------------------------------------------------------------
745 * An IDR is used to keep track of allocated minor numbers. 794 * An IDR is used to keep track of allocated minor numbers.
746 *---------------------------------------------------------------*/ 795 *---------------------------------------------------------------*/
747static DEFINE_MUTEX(_minor_lock);
748static DEFINE_IDR(_minor_idr); 796static DEFINE_IDR(_minor_idr);
749 797
750static void free_minor(unsigned int minor) 798static void free_minor(int minor)
751{ 799{
752 mutex_lock(&_minor_lock); 800 spin_lock(&_minor_lock);
753 idr_remove(&_minor_idr, minor); 801 idr_remove(&_minor_idr, minor);
754 mutex_unlock(&_minor_lock); 802 spin_unlock(&_minor_lock);
755} 803}
756 804
757/* 805/*
758 * See if the device with a specific minor # is free. 806 * See if the device with a specific minor # is free.
759 */ 807 */
760static int specific_minor(struct mapped_device *md, unsigned int minor) 808static int specific_minor(struct mapped_device *md, int minor)
761{ 809{
762 int r, m; 810 int r, m;
763 811
764 if (minor >= (1 << MINORBITS)) 812 if (minor >= (1 << MINORBITS))
765 return -EINVAL; 813 return -EINVAL;
766 814
767 mutex_lock(&_minor_lock); 815 r = idr_pre_get(&_minor_idr, GFP_KERNEL);
816 if (!r)
817 return -ENOMEM;
818
819 spin_lock(&_minor_lock);
768 820
769 if (idr_find(&_minor_idr, minor)) { 821 if (idr_find(&_minor_idr, minor)) {
770 r = -EBUSY; 822 r = -EBUSY;
771 goto out; 823 goto out;
772 } 824 }
773 825
774 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 826 r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
775 if (!r) { 827 if (r)
776 r = -ENOMEM;
777 goto out;
778 }
779
780 r = idr_get_new_above(&_minor_idr, md, minor, &m);
781 if (r) {
782 goto out; 828 goto out;
783 }
784 829
785 if (m != minor) { 830 if (m != minor) {
786 idr_remove(&_minor_idr, m); 831 idr_remove(&_minor_idr, m);
@@ -789,24 +834,21 @@ static int specific_minor(struct mapped_device *md, unsigned int minor)
789 } 834 }
790 835
791out: 836out:
792 mutex_unlock(&_minor_lock); 837 spin_unlock(&_minor_lock);
793 return r; 838 return r;
794} 839}
795 840
796static int next_free_minor(struct mapped_device *md, unsigned int *minor) 841static int next_free_minor(struct mapped_device *md, int *minor)
797{ 842{
798 int r; 843 int r, m;
799 unsigned int m;
800
801 mutex_lock(&_minor_lock);
802 844
803 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 845 r = idr_pre_get(&_minor_idr, GFP_KERNEL);
804 if (!r) { 846 if (!r)
805 r = -ENOMEM; 847 return -ENOMEM;
806 goto out; 848
807 } 849 spin_lock(&_minor_lock);
808 850
809 r = idr_get_new(&_minor_idr, md, &m); 851 r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
810 if (r) { 852 if (r) {
811 goto out; 853 goto out;
812 } 854 }
@@ -820,7 +862,7 @@ static int next_free_minor(struct mapped_device *md, unsigned int *minor)
820 *minor = m; 862 *minor = m;
821 863
822out: 864out:
823 mutex_unlock(&_minor_lock); 865 spin_unlock(&_minor_lock);
824 return r; 866 return r;
825} 867}
826 868
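
With _minor_lock now a spinlock rather than a mutex, the GFP_KERNEL preallocation (idr_pre_get(), which may sleep) has to run before the lock is taken; only the non-sleeping insertion happens under it. A userspace sketch of that ordering, with malloc() standing in for the sleeping allocation and a pthread spinlock for _minor_lock:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_spinlock_t minor_lock;
static void *slots[16];			/* toy stand-in for the minor IDR */

static int assign_minor(int wanted)
{
	void *node = malloc(64);	/* "may sleep": done before locking */
	int r = 0;

	if (!node)
		return -1;

	pthread_spin_lock(&minor_lock);	/* no sleeping allowed from here on */
	if (slots[wanted])
		r = -1;			/* minor already in use (-EBUSY) */
	else
		slots[wanted] = node;
	pthread_spin_unlock(&minor_lock);

	if (r)
		free(node);		/* preallocation turned out unneeded */
	return r;
}

int main(void)
{
	pthread_spin_init(&minor_lock, PTHREAD_PROCESS_PRIVATE);
	printf("first claim of minor 3:  %d\n", assign_minor(3));
	printf("second claim of minor 3: %d\n", assign_minor(3));
	pthread_spin_destroy(&minor_lock);
	return 0;
}
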
@@ -829,18 +871,25 @@ static struct block_device_operations dm_blk_dops;
829/* 871/*
830 * Allocate and initialise a blank device with a given minor. 872 * Allocate and initialise a blank device with a given minor.
831 */ 873 */
832static struct mapped_device *alloc_dev(unsigned int minor, int persistent) 874static struct mapped_device *alloc_dev(int minor)
833{ 875{
834 int r; 876 int r;
835 struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL); 877 struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);
878 void *old_md;
836 879
837 if (!md) { 880 if (!md) {
838 DMWARN("unable to allocate device, out of memory."); 881 DMWARN("unable to allocate device, out of memory.");
839 return NULL; 882 return NULL;
840 } 883 }
841 884
885 if (!try_module_get(THIS_MODULE))
886 goto bad0;
887
842 /* get a minor number for the dev */ 888 /* get a minor number for the dev */
843 r = persistent ? specific_minor(md, minor) : next_free_minor(md, &minor); 889 if (minor == DM_ANY_MINOR)
890 r = next_free_minor(md, &minor);
891 else
892 r = specific_minor(md, minor);
844 if (r < 0) 893 if (r < 0)
845 goto bad1; 894 goto bad1;
846 895
@@ -849,6 +898,7 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
849 init_MUTEX(&md->suspend_lock); 898 init_MUTEX(&md->suspend_lock);
850 rwlock_init(&md->map_lock); 899 rwlock_init(&md->map_lock);
851 atomic_set(&md->holders, 1); 900 atomic_set(&md->holders, 1);
901 atomic_set(&md->open_count, 0);
852 atomic_set(&md->event_nr, 0); 902 atomic_set(&md->event_nr, 0);
853 903
854 md->queue = blk_alloc_queue(GFP_KERNEL); 904 md->queue = blk_alloc_queue(GFP_KERNEL);
@@ -875,6 +925,10 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
875 if (!md->disk) 925 if (!md->disk)
876 goto bad4; 926 goto bad4;
877 927
928 atomic_set(&md->pending, 0);
929 init_waitqueue_head(&md->wait);
930 init_waitqueue_head(&md->eventq);
931
878 md->disk->major = _major; 932 md->disk->major = _major;
879 md->disk->first_minor = minor; 933 md->disk->first_minor = minor;
880 md->disk->fops = &dm_blk_dops; 934 md->disk->fops = &dm_blk_dops;
@@ -884,9 +938,12 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
884 add_disk(md->disk); 938 add_disk(md->disk);
885 format_dev_t(md->name, MKDEV(_major, minor)); 939 format_dev_t(md->name, MKDEV(_major, minor));
886 940
887 atomic_set(&md->pending, 0); 941 /* Populate the mapping, nobody knows we exist yet */
888 init_waitqueue_head(&md->wait); 942 spin_lock(&_minor_lock);
889 init_waitqueue_head(&md->eventq); 943 old_md = idr_replace(&_minor_idr, md, minor);
944 spin_unlock(&_minor_lock);
945
946 BUG_ON(old_md != MINOR_ALLOCED);
890 947
891 return md; 948 return md;
892 949
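
alloc_dev() now reserves the minor with the MINOR_ALLOCED sentinel, finishes building the device, and only then publishes the real pointer with idr_replace(); dm_find_md() treats the sentinel (and DMF_FREEING devices) as "not there". A userspace model of reserve-then-publish, with a plain array standing in for the IDR:

#include <stdio.h>
#include <stdlib.h>

#define MINOR_ALLOCED	((void *)-1)	/* slot reserved, device not ready yet */

static void *minors[8];			/* toy minor table */

struct mapped_device { char name[16]; };

static struct mapped_device *find_md(int minor)
{
	void *p = minors[minor];

	/* A reserved-but-unpublished slot must not be returned to lookups. */
	if (!p || p == MINOR_ALLOCED)
		return NULL;
	return p;
}

static struct mapped_device *alloc_dev(int minor)
{
	struct mapped_device *md;

	minors[minor] = MINOR_ALLOCED;		/* reserve the minor first */
	printf("during init, lookup sees: %p\n", (void *)find_md(minor));

	md = malloc(sizeof(*md));		/* ... long initialisation ... */
	if (!md) {
		minors[minor] = NULL;
		return NULL;
	}
	snprintf(md->name, sizeof(md->name), "dm-%d", minor);

	minors[minor] = md;			/* publish only once complete */
	return md;
}

int main(void)
{
	if (!alloc_dev(5))
		return 1;
	printf("after publish, lookup sees: %s\n", find_md(5)->name);
	return 0;
}

The teardown side in dm_put() below mirrors this: the final reference is dropped under _minor_lock, the slot is pointed back at the sentinel via idr_replace(), and DMF_FREEING stops concurrent lookups and opens from resurrecting the device.
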
@@ -898,13 +955,15 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
898 blk_cleanup_queue(md->queue); 955 blk_cleanup_queue(md->queue);
899 free_minor(minor); 956 free_minor(minor);
900 bad1: 957 bad1:
958 module_put(THIS_MODULE);
959 bad0:
901 kfree(md); 960 kfree(md);
902 return NULL; 961 return NULL;
903} 962}
904 963
905static void free_dev(struct mapped_device *md) 964static void free_dev(struct mapped_device *md)
906{ 965{
907 unsigned int minor = md->disk->first_minor; 966 int minor = md->disk->first_minor;
908 967
909 if (md->suspended_bdev) { 968 if (md->suspended_bdev) {
910 thaw_bdev(md->suspended_bdev, NULL); 969 thaw_bdev(md->suspended_bdev, NULL);
@@ -914,8 +973,14 @@ static void free_dev(struct mapped_device *md)
914 mempool_destroy(md->io_pool); 973 mempool_destroy(md->io_pool);
915 del_gendisk(md->disk); 974 del_gendisk(md->disk);
916 free_minor(minor); 975 free_minor(minor);
976
977 spin_lock(&_minor_lock);
978 md->disk->private_data = NULL;
979 spin_unlock(&_minor_lock);
980
917 put_disk(md->disk); 981 put_disk(md->disk);
918 blk_cleanup_queue(md->queue); 982 blk_cleanup_queue(md->queue);
983 module_put(THIS_MODULE);
919 kfree(md); 984 kfree(md);
920} 985}
921 986
@@ -984,12 +1049,11 @@ static void __unbind(struct mapped_device *md)
984/* 1049/*
985 * Constructor for a new device. 1050 * Constructor for a new device.
986 */ 1051 */
987static int create_aux(unsigned int minor, int persistent, 1052int dm_create(int minor, struct mapped_device **result)
988 struct mapped_device **result)
989{ 1053{
990 struct mapped_device *md; 1054 struct mapped_device *md;
991 1055
992 md = alloc_dev(minor, persistent); 1056 md = alloc_dev(minor);
993 if (!md) 1057 if (!md)
994 return -ENXIO; 1058 return -ENXIO;
995 1059
@@ -997,16 +1061,6 @@ static int create_aux(unsigned int minor, int persistent,
997 return 0; 1061 return 0;
998} 1062}
999 1063
1000int dm_create(struct mapped_device **result)
1001{
1002 return create_aux(0, 0, result);
1003}
1004
1005int dm_create_with_minor(unsigned int minor, struct mapped_device **result)
1006{
1007 return create_aux(minor, 1, result);
1008}
1009
1010static struct mapped_device *dm_find_md(dev_t dev) 1064static struct mapped_device *dm_find_md(dev_t dev)
1011{ 1065{
1012 struct mapped_device *md; 1066 struct mapped_device *md;
@@ -1015,13 +1069,18 @@ static struct mapped_device *dm_find_md(dev_t dev)
1015 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 1069 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
1016 return NULL; 1070 return NULL;
1017 1071
1018 mutex_lock(&_minor_lock); 1072 spin_lock(&_minor_lock);
1019 1073
1020 md = idr_find(&_minor_idr, minor); 1074 md = idr_find(&_minor_idr, minor);
1021 if (!md || (dm_disk(md)->first_minor != minor)) 1075 if (md && (md == MINOR_ALLOCED ||
1076 (dm_disk(md)->first_minor != minor) ||
1077 test_bit(DMF_FREEING, &md->flags))) {
1022 md = NULL; 1078 md = NULL;
1079 goto out;
1080 }
1023 1081
1024 mutex_unlock(&_minor_lock); 1082out:
1083 spin_unlock(&_minor_lock);
1025 1084
1026 return md; 1085 return md;
1027} 1086}
@@ -1051,12 +1110,23 @@ void dm_get(struct mapped_device *md)
1051 atomic_inc(&md->holders); 1110 atomic_inc(&md->holders);
1052} 1111}
1053 1112
1113const char *dm_device_name(struct mapped_device *md)
1114{
1115 return md->name;
1116}
1117EXPORT_SYMBOL_GPL(dm_device_name);
1118
1054void dm_put(struct mapped_device *md) 1119void dm_put(struct mapped_device *md)
1055{ 1120{
1056 struct dm_table *map; 1121 struct dm_table *map;
1057 1122
1058 if (atomic_dec_and_test(&md->holders)) { 1123 BUG_ON(test_bit(DMF_FREEING, &md->flags));
1124
1125 if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
1059 map = dm_get_table(md); 1126 map = dm_get_table(md);
1127 idr_replace(&_minor_idr, MINOR_ALLOCED, dm_disk(md)->first_minor);
1128 set_bit(DMF_FREEING, &md->flags);
1129 spin_unlock(&_minor_lock);
1060 if (!dm_suspended(md)) { 1130 if (!dm_suspended(md)) {
1061 dm_table_presuspend_targets(map); 1131 dm_table_presuspend_targets(map);
1062 dm_table_postsuspend_targets(map); 1132 dm_table_postsuspend_targets(map);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index fd90bc8f9e45..3c03c0ecab7e 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -2,7 +2,7 @@
2 * Internal header file for device mapper 2 * Internal header file for device mapper
3 * 3 *
4 * Copyright (C) 2001, 2002 Sistina Software 4 * Copyright (C) 2001, 2002 Sistina Software
5 * Copyright (C) 2004 Red Hat, Inc. All rights reserved. 5 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
6 * 6 *
7 * This file is released under the LGPL. 7 * This file is released under the LGPL.
8 */ 8 */
@@ -17,9 +17,10 @@
17#include <linux/hdreg.h> 17#include <linux/hdreg.h>
18 18
19#define DM_NAME "device-mapper" 19#define DM_NAME "device-mapper"
20#define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x) 20
21#define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x) 21#define DMERR(f, arg...) printk(KERN_ERR DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
22#define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x) 22#define DMWARN(f, arg...) printk(KERN_WARNING DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
23#define DMINFO(f, arg...) printk(KERN_INFO DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
23 24
24#define DMEMIT(x...) sz += ((sz >= maxlen) ? \ 25#define DMEMIT(x...) sz += ((sz >= maxlen) ? \
25 0 : scnprintf(result + sz, maxlen - sz, x)) 26 0 : scnprintf(result + sz, maxlen - sz, x))
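
The reworked DMERR/DMWARN/DMINFO macros splice a per-file DM_MSG_PREFIX between the driver name and the message, which is why the individual targets in this patch drop their hand-written "dm-mirror:", "zero:" and similar prefixes. A userspace approximation showing the expansion, with printf standing in for printk and the log levels:

#include <stdio.h>

#define DM_NAME		"device-mapper"
#define DM_MSG_PREFIX	"zero"		/* each source file picks its own */

/* Same shape as the kernel macros, with printf in place of printk. */
#define DMERR(f, arg...)  printf("error: "   DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
#define DMWARN(f, arg...) printf("warning: " DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)

int main(void)
{
	DMERR("register failed %d", -22);
	DMWARN("target registration failed");
	return 0;
}
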
@@ -39,83 +40,16 @@ struct dm_dev {
39}; 40};
40 41
41struct dm_table; 42struct dm_table;
42struct mapped_device;
43
44/*-----------------------------------------------------------------
45 * Functions for manipulating a struct mapped_device.
46 * Drop the reference with dm_put when you finish with the object.
47 *---------------------------------------------------------------*/
48int dm_create(struct mapped_device **md);
49int dm_create_with_minor(unsigned int minor, struct mapped_device **md);
50void dm_set_mdptr(struct mapped_device *md, void *ptr);
51void *dm_get_mdptr(struct mapped_device *md);
52struct mapped_device *dm_get_md(dev_t dev);
53
54/*
55 * Reference counting for md.
56 */
57void dm_get(struct mapped_device *md);
58void dm_put(struct mapped_device *md);
59
60/*
61 * A device can still be used while suspended, but I/O is deferred.
62 */
63int dm_suspend(struct mapped_device *md, int with_lockfs);
64int dm_resume(struct mapped_device *md);
65
66/*
67 * The device must be suspended before calling this method.
68 */
69int dm_swap_table(struct mapped_device *md, struct dm_table *t);
70
71/*
72 * Drop a reference on the table when you've finished with the
73 * result.
74 */
75struct dm_table *dm_get_table(struct mapped_device *md);
76
77/*
78 * Event functions.
79 */
80uint32_t dm_get_event_nr(struct mapped_device *md);
81int dm_wait_event(struct mapped_device *md, int event_nr);
82
83/*
84 * Info functions.
85 */
86struct gendisk *dm_disk(struct mapped_device *md);
87int dm_suspended(struct mapped_device *md);
88
89/*
90 * Geometry functions.
91 */
92int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo);
93int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo);
94 43
95/*----------------------------------------------------------------- 44/*-----------------------------------------------------------------
96 * Functions for manipulating a table. Tables are also reference 45 * Internal table functions.
97 * counted.
98 *---------------------------------------------------------------*/ 46 *---------------------------------------------------------------*/
99int dm_table_create(struct dm_table **result, int mode,
100 unsigned num_targets, struct mapped_device *md);
101
102void dm_table_get(struct dm_table *t);
103void dm_table_put(struct dm_table *t);
104
105int dm_table_add_target(struct dm_table *t, const char *type,
106 sector_t start, sector_t len, char *params);
107int dm_table_complete(struct dm_table *t);
108void dm_table_event_callback(struct dm_table *t, 47void dm_table_event_callback(struct dm_table *t,
109 void (*fn)(void *), void *context); 48 void (*fn)(void *), void *context);
110void dm_table_event(struct dm_table *t);
111sector_t dm_table_get_size(struct dm_table *t);
112struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); 49struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
113struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); 50struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
114void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q); 51void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q);
115unsigned int dm_table_get_num_targets(struct dm_table *t);
116struct list_head *dm_table_get_devices(struct dm_table *t); 52struct list_head *dm_table_get_devices(struct dm_table *t);
117int dm_table_get_mode(struct dm_table *t);
118struct mapped_device *dm_table_get_md(struct dm_table *t);
119void dm_table_presuspend_targets(struct dm_table *t); 53void dm_table_presuspend_targets(struct dm_table *t);
120void dm_table_postsuspend_targets(struct dm_table *t); 54void dm_table_postsuspend_targets(struct dm_table *t);
121void dm_table_resume_targets(struct dm_table *t); 55void dm_table_resume_targets(struct dm_table *t);
@@ -133,7 +67,6 @@ void dm_put_target_type(struct target_type *t);
133int dm_target_iterate(void (*iter_func)(struct target_type *tt, 67int dm_target_iterate(void (*iter_func)(struct target_type *tt,
134 void *param), void *param); 68 void *param), void *param);
135 69
136
137/*----------------------------------------------------------------- 70/*-----------------------------------------------------------------
138 * Useful inlines. 71 * Useful inlines.
139 *---------------------------------------------------------------*/ 72 *---------------------------------------------------------------*/
@@ -191,5 +124,7 @@ void dm_stripe_exit(void);
191 124
192void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size); 125void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
193union map_info *dm_get_mapinfo(struct bio *bio); 126union map_info *dm_get_mapinfo(struct bio *bio);
127int dm_open_count(struct mapped_device *md);
128int dm_lock_for_deletion(struct mapped_device *md);
194 129
195#endif 130#endif
diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c
index 72480a48d88b..f1db6eff4857 100644
--- a/drivers/md/kcopyd.c
+++ b/drivers/md/kcopyd.c
@@ -12,7 +12,6 @@
12#include <asm/atomic.h> 12#include <asm/atomic.h>
13 13
14#include <linux/blkdev.h> 14#include <linux/blkdev.h>
15#include <linux/config.h>
16#include <linux/fs.h> 15#include <linux/fs.h>
17#include <linux/init.h> 16#include <linux/init.h>
18#include <linux/list.h> 17#include <linux/list.h>
@@ -314,7 +313,7 @@ static void complete_io(unsigned long error, void *context)
314 313
315 if (error) { 314 if (error) {
316 if (job->rw == WRITE) 315 if (job->rw == WRITE)
317 job->write_err &= error; 316 job->write_err |= error;
318 else 317 else
319 job->read_err = 1; 318 job->read_err = 1;
320 319
@@ -460,7 +459,7 @@ static void segment_complete(int read_err,
460 job->read_err = 1; 459 job->read_err = 1;
461 460
462 if (write_err) 461 if (write_err)
463 job->write_err &= write_err; 462 job->write_err |= write_err;
464 463
465 /* 464 /*
466 * Only dispatch more work if there hasn't been an error. 465 * Only dispatch more work if there hasn't been an error.
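
The two kcopyd hunks change write_err accumulation from &= to |=. write_err collects one error bit per destination; since it starts at zero, AND-ing each new status into it keeps it zero forever and every failure is lost, while OR-ing retains each error bit as it arrives. A short demonstration:

#include <stdio.h>

int main(void)
{
	unsigned long and_err = 0, or_err = 0;	/* job->write_err starts at 0 */
	unsigned long errors[] = { 0x2, 0x4 };	/* failures on destinations 1 and 2 */

	for (int i = 0; i < 2; i++) {
		and_err &= errors[i];	/* old code: 0 & anything == 0, errors lost */
		or_err  |= errors[i];	/* new code: accumulate every error bit */
	}
	printf("with &=: %#lx   with |=: %#lx\n", and_err, or_err);
	return 0;
}
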
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 777585458c85..b99c19c7eb22 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -111,7 +111,7 @@ static int linear_issue_flush(request_queue_t *q, struct gendisk *disk,
111 return ret; 111 return ret;
112} 112}
113 113
114static int linear_run (mddev_t *mddev) 114static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
115{ 115{
116 linear_conf_t *conf; 116 linear_conf_t *conf;
117 dev_info_t **table; 117 dev_info_t **table;
@@ -121,20 +121,21 @@ static int linear_run (mddev_t *mddev)
121 sector_t curr_offset; 121 sector_t curr_offset;
122 struct list_head *tmp; 122 struct list_head *tmp;
123 123
124 conf = kzalloc (sizeof (*conf) + mddev->raid_disks*sizeof(dev_info_t), 124 conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t),
125 GFP_KERNEL); 125 GFP_KERNEL);
126 if (!conf) 126 if (!conf)
127 goto out; 127 return NULL;
128
128 mddev->private = conf; 129 mddev->private = conf;
129 130
130 cnt = 0; 131 cnt = 0;
131 mddev->array_size = 0; 132 conf->array_size = 0;
132 133
133 ITERATE_RDEV(mddev,rdev,tmp) { 134 ITERATE_RDEV(mddev,rdev,tmp) {
134 int j = rdev->raid_disk; 135 int j = rdev->raid_disk;
135 dev_info_t *disk = conf->disks + j; 136 dev_info_t *disk = conf->disks + j;
136 137
137 if (j < 0 || j > mddev->raid_disks || disk->rdev) { 138 if (j < 0 || j > raid_disks || disk->rdev) {
138 printk("linear: disk numbering problem. Aborting!\n"); 139 printk("linear: disk numbering problem. Aborting!\n");
139 goto out; 140 goto out;
140 } 141 }
@@ -152,16 +153,16 @@ static int linear_run (mddev_t *mddev)
152 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 153 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
153 154
154 disk->size = rdev->size; 155 disk->size = rdev->size;
155 mddev->array_size += rdev->size; 156 conf->array_size += rdev->size;
156 157
157 cnt++; 158 cnt++;
158 } 159 }
159 if (cnt != mddev->raid_disks) { 160 if (cnt != raid_disks) {
160 printk("linear: not enough drives present. Aborting!\n"); 161 printk("linear: not enough drives present. Aborting!\n");
161 goto out; 162 goto out;
162 } 163 }
163 164
164 min_spacing = mddev->array_size; 165 min_spacing = conf->array_size;
165 sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *)); 166 sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *));
166 167
167 /* min_spacing is the minimum spacing that will fit the hash 168 /* min_spacing is the minimum spacing that will fit the hash
@@ -170,7 +171,7 @@ static int linear_run (mddev_t *mddev)
170 * that is larger than min_spacing as use the size of that as 171 * that is larger than min_spacing as use the size of that as
171 * the actual spacing 172 * the actual spacing
172 */ 173 */
173 conf->hash_spacing = mddev->array_size; 174 conf->hash_spacing = conf->array_size;
174 for (i=0; i < cnt-1 ; i++) { 175 for (i=0; i < cnt-1 ; i++) {
175 sector_t sz = 0; 176 sector_t sz = 0;
176 int j; 177 int j;
@@ -200,7 +201,7 @@ static int linear_run (mddev_t *mddev)
200 unsigned round; 201 unsigned round;
201 unsigned long base; 202 unsigned long base;
202 203
203 sz = mddev->array_size >> conf->preshift; 204 sz = conf->array_size >> conf->preshift;
204 sz += 1; /* force round-up */ 205 sz += 1; /* force round-up */
205 base = conf->hash_spacing >> conf->preshift; 206 base = conf->hash_spacing >> conf->preshift;
206 round = sector_div(sz, base); 207 round = sector_div(sz, base);
@@ -227,7 +228,7 @@ static int linear_run (mddev_t *mddev)
227 curr_offset = 0; 228 curr_offset = 0;
228 i = 0; 229 i = 0;
229 for (curr_offset = 0; 230 for (curr_offset = 0;
230 curr_offset < mddev->array_size; 231 curr_offset < conf->array_size;
231 curr_offset += conf->hash_spacing) { 232 curr_offset += conf->hash_spacing) {
232 233
233 while (i < mddev->raid_disks-1 && 234 while (i < mddev->raid_disks-1 &&
@@ -247,14 +248,56 @@ static int linear_run (mddev_t *mddev)
247 248
248 BUG_ON(table - conf->hash_table > nb_zone); 249 BUG_ON(table - conf->hash_table > nb_zone);
249 250
251 return conf;
252
253out:
254 kfree(conf);
255 return NULL;
256}
257
258static int linear_run (mddev_t *mddev)
259{
260 linear_conf_t *conf;
261
262 conf = linear_conf(mddev, mddev->raid_disks);
263
264 if (!conf)
265 return 1;
266 mddev->private = conf;
267 mddev->array_size = conf->array_size;
268
250 blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); 269 blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
251 mddev->queue->unplug_fn = linear_unplug; 270 mddev->queue->unplug_fn = linear_unplug;
252 mddev->queue->issue_flush_fn = linear_issue_flush; 271 mddev->queue->issue_flush_fn = linear_issue_flush;
253 return 0; 272 return 0;
273}
254 274
255out: 275static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
256 kfree(conf); 276{
257 return 1; 277 /* Adding a drive to a linear array allows the array to grow.
278 * It is permitted if the new drive has a matching superblock
279 * already on it, with raid_disk equal to raid_disks.
280 * It is achieved by creating a new linear_private_data structure
281 * and swapping it in, in place of the current one.
282 * The current one is never freed until the array is stopped.
283 * This avoids races.
284 */
285 linear_conf_t *newconf;
286
287 if (rdev->raid_disk != mddev->raid_disks)
288 return -EINVAL;
289
290 newconf = linear_conf(mddev,mddev->raid_disks+1);
291
292 if (!newconf)
293 return -ENOMEM;
294
295 newconf->prev = mddev_to_conf(mddev);
296 mddev->private = newconf;
297 mddev->raid_disks++;
298 mddev->array_size = newconf->array_size;
299 set_capacity(mddev->gendisk, mddev->array_size << 1);
300 return 0;
258} 301}
259 302
260static int linear_stop (mddev_t *mddev) 303static int linear_stop (mddev_t *mddev)
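
linear_add() grows the array by building a fresh configuration for raid_disks + 1, chaining the old one behind it through ->prev, and swapping the new one in; nothing is freed until linear_stop() walks the chain, so readers still holding the old pointer never touch freed memory. A userspace model of that publish-and-defer-free pattern (no locking shown; the kernel side relies on the old conf simply staying valid):

#include <stdio.h>
#include <stdlib.h>

struct conf {
	int disks;
	struct conf *prev;	/* older configs, kept alive until stop */
};

static struct conf *active;	/* what readers currently see */

static int grow(void)
{
	struct conf *newconf = malloc(sizeof(*newconf));

	if (!newconf)
		return -1;
	newconf->disks = active ? active->disks + 1 : 1;
	newconf->prev = active;		/* keep the old config reachable */
	active = newconf;		/* publish; old readers keep their pointer */
	return 0;
}

static void stop(void)
{
	/* Only now is it safe to free the whole chain. */
	while (active) {
		struct conf *prev = active->prev;

		free(active);
		active = prev;
	}
}

int main(void)
{
	grow();
	grow();
	grow();
	printf("active config has %d disks\n", active->disks);
	stop();
	return 0;
}
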
@@ -262,8 +305,12 @@ static int linear_stop (mddev_t *mddev)
262 linear_conf_t *conf = mddev_to_conf(mddev); 305 linear_conf_t *conf = mddev_to_conf(mddev);
263 306
264 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 307 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
265 kfree(conf->hash_table); 308 do {
266 kfree(conf); 309 linear_conf_t *t = conf->prev;
310 kfree(conf->hash_table);
311 kfree(conf);
312 conf = t;
313 } while (conf);
267 314
268 return 0; 315 return 0;
269} 316}
@@ -360,6 +407,7 @@ static struct mdk_personality linear_personality =
360 .run = linear_run, 407 .run = linear_run,
361 .stop = linear_stop, 408 .stop = linear_stop,
362 .status = linear_status, 409 .status = linear_status,
410 .hot_add_disk = linear_add,
363}; 411};
364 412
365static int __init linear_init (void) 413static int __init linear_init (void)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index f19b874753a9..8dbab2ef3885 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -33,17 +33,16 @@
33*/ 33*/
34 34
35#include <linux/module.h> 35#include <linux/module.h>
36#include <linux/config.h>
37#include <linux/kthread.h> 36#include <linux/kthread.h>
38#include <linux/linkage.h> 37#include <linux/linkage.h>
39#include <linux/raid/md.h> 38#include <linux/raid/md.h>
40#include <linux/raid/bitmap.h> 39#include <linux/raid/bitmap.h>
41#include <linux/sysctl.h> 40#include <linux/sysctl.h>
42#include <linux/devfs_fs_kernel.h>
43#include <linux/buffer_head.h> /* for invalidate_bdev */ 41#include <linux/buffer_head.h> /* for invalidate_bdev */
44#include <linux/suspend.h> 42#include <linux/suspend.h>
45#include <linux/poll.h> 43#include <linux/poll.h>
46#include <linux/mutex.h> 44#include <linux/mutex.h>
45#include <linux/ctype.h>
47 46
48#include <linux/init.h> 47#include <linux/init.h>
49 48
@@ -72,6 +71,10 @@ static void autostart_arrays (int part);
72static LIST_HEAD(pers_list); 71static LIST_HEAD(pers_list);
73static DEFINE_SPINLOCK(pers_lock); 72static DEFINE_SPINLOCK(pers_lock);
74 73
74static void md_print_devices(void);
75
76#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
77
75/* 78/*
76 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 79 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
77 * is 1000 KB/sec, so the extra system load does not show up that much. 80 * is 1000 KB/sec, so the extra system load does not show up that much.
@@ -107,7 +110,7 @@ static ctl_table raid_table[] = {
107 .procname = "speed_limit_min", 110 .procname = "speed_limit_min",
108 .data = &sysctl_speed_limit_min, 111 .data = &sysctl_speed_limit_min,
109 .maxlen = sizeof(int), 112 .maxlen = sizeof(int),
110 .mode = 0644, 113 .mode = S_IRUGO|S_IWUSR,
111 .proc_handler = &proc_dointvec, 114 .proc_handler = &proc_dointvec,
112 }, 115 },
113 { 116 {
@@ -115,7 +118,7 @@ static ctl_table raid_table[] = {
115 .procname = "speed_limit_max", 118 .procname = "speed_limit_max",
116 .data = &sysctl_speed_limit_max, 119 .data = &sysctl_speed_limit_max,
117 .maxlen = sizeof(int), 120 .maxlen = sizeof(int),
118 .mode = 0644, 121 .mode = S_IRUGO|S_IWUSR,
119 .proc_handler = &proc_dointvec, 122 .proc_handler = &proc_dointvec,
120 }, 123 },
121 { .ctl_name = 0 } 124 { .ctl_name = 0 }
@@ -126,7 +129,7 @@ static ctl_table raid_dir_table[] = {
126 .ctl_name = DEV_RAID, 129 .ctl_name = DEV_RAID,
127 .procname = "raid", 130 .procname = "raid",
128 .maxlen = 0, 131 .maxlen = 0,
129 .mode = 0555, 132 .mode = S_IRUGO|S_IXUGO,
130 .child = raid_table, 133 .child = raid_table,
131 }, 134 },
132 { .ctl_name = 0 } 135 { .ctl_name = 0 }
@@ -170,7 +173,7 @@ EXPORT_SYMBOL_GPL(md_new_event);
170/* Alternate version that can be called from interrupts 173/* Alternate version that can be called from interrupts
171 * when calling sysfs_notify isn't needed. 174 * when calling sysfs_notify isn't needed.
172 */ 175 */
173void md_new_event_inintr(mddev_t *mddev) 176static void md_new_event_inintr(mddev_t *mddev)
174{ 177{
175 atomic_inc(&md_event_count); 178 atomic_inc(&md_event_count);
176 wake_up(&md_event_waiters); 179 wake_up(&md_event_waiters);
@@ -732,6 +735,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
732{ 735{
733 mdp_disk_t *desc; 736 mdp_disk_t *desc;
734 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 737 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
738 __u64 ev1 = md_event(sb);
735 739
736 rdev->raid_disk = -1; 740 rdev->raid_disk = -1;
737 rdev->flags = 0; 741 rdev->flags = 0;
@@ -748,7 +752,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
748 mddev->layout = sb->layout; 752 mddev->layout = sb->layout;
749 mddev->raid_disks = sb->raid_disks; 753 mddev->raid_disks = sb->raid_disks;
750 mddev->size = sb->size; 754 mddev->size = sb->size;
751 mddev->events = md_event(sb); 755 mddev->events = ev1;
752 mddev->bitmap_offset = 0; 756 mddev->bitmap_offset = 0;
753 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 757 mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
754 758
@@ -797,7 +801,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
797 801
798 } else if (mddev->pers == NULL) { 802 } else if (mddev->pers == NULL) {
799 /* Insist on good event counter while assembling */ 803 /* Insist on good event counter while assembling */
800 __u64 ev1 = md_event(sb);
801 ++ev1; 804 ++ev1;
802 if (ev1 < mddev->events) 805 if (ev1 < mddev->events)
803 return -EINVAL; 806 return -EINVAL;
@@ -805,19 +808,21 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
805 /* if adding to array with a bitmap, then we can accept an 808 /* if adding to array with a bitmap, then we can accept an
806 * older device ... but not too old. 809 * older device ... but not too old.
807 */ 810 */
808 __u64 ev1 = md_event(sb);
809 if (ev1 < mddev->bitmap->events_cleared) 811 if (ev1 < mddev->bitmap->events_cleared)
810 return 0; 812 return 0;
811 } else /* just a hot-add of a new device, leave raid_disk at -1 */ 813 } else {
812 return 0; 814 if (ev1 < mddev->events)
815 /* just a hot-add of a new device, leave raid_disk at -1 */
816 return 0;
817 }
813 818
814 if (mddev->level != LEVEL_MULTIPATH) { 819 if (mddev->level != LEVEL_MULTIPATH) {
815 desc = sb->disks + rdev->desc_nr; 820 desc = sb->disks + rdev->desc_nr;
816 821
817 if (desc->state & (1<<MD_DISK_FAULTY)) 822 if (desc->state & (1<<MD_DISK_FAULTY))
818 set_bit(Faulty, &rdev->flags); 823 set_bit(Faulty, &rdev->flags);
819 else if (desc->state & (1<<MD_DISK_SYNC) && 824 else if (desc->state & (1<<MD_DISK_SYNC) /* &&
820 desc->raid_disk < mddev->raid_disks) { 825 desc->raid_disk < mddev->raid_disks */) {
821 set_bit(In_sync, &rdev->flags); 826 set_bit(In_sync, &rdev->flags);
822 rdev->raid_disk = desc->raid_disk; 827 rdev->raid_disk = desc->raid_disk;
823 } 828 }
@@ -1057,6 +1062,11 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1057 if (rdev->sb_size & bmask) 1062 if (rdev->sb_size & bmask)
1058 rdev-> sb_size = (rdev->sb_size | bmask)+1; 1063 rdev-> sb_size = (rdev->sb_size | bmask)+1;
1059 1064
1065 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1066 rdev->desc_nr = -1;
1067 else
1068 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1069
1060 if (refdev == 0) 1070 if (refdev == 0)
1061 ret = 1; 1071 ret = 1;
1062 else { 1072 else {
@@ -1100,6 +1110,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1100static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1110static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1101{ 1111{
1102 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1112 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1113 __u64 ev1 = le64_to_cpu(sb->events);
1103 1114
1104 rdev->raid_disk = -1; 1115 rdev->raid_disk = -1;
1105 rdev->flags = 0; 1116 rdev->flags = 0;
@@ -1115,7 +1126,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1115 mddev->layout = le32_to_cpu(sb->layout); 1126 mddev->layout = le32_to_cpu(sb->layout);
1116 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1127 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1117 mddev->size = le64_to_cpu(sb->size)/2; 1128 mddev->size = le64_to_cpu(sb->size)/2;
1118 mddev->events = le64_to_cpu(sb->events); 1129 mddev->events = ev1;
1119 mddev->bitmap_offset = 0; 1130 mddev->bitmap_offset = 0;
1120 mddev->default_bitmap_offset = 1024 >> 9; 1131 mddev->default_bitmap_offset = 1024 >> 9;
1121 1132
@@ -1149,7 +1160,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1149 1160
1150 } else if (mddev->pers == NULL) { 1161 } else if (mddev->pers == NULL) {
1151 /* Insist of good event counter while assembling */ 1162 /* Insist of good event counter while assembling */
1152 __u64 ev1 = le64_to_cpu(sb->events);
1153 ++ev1; 1163 ++ev1;
1154 if (ev1 < mddev->events) 1164 if (ev1 < mddev->events)
1155 return -EINVAL; 1165 return -EINVAL;
@@ -1157,15 +1167,15 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1157 /* If adding to array with a bitmap, then we can accept an 1167 /* If adding to array with a bitmap, then we can accept an
1158 * older device, but not too old. 1168 * older device, but not too old.
1159 */ 1169 */
1160 __u64 ev1 = le64_to_cpu(sb->events);
1161 if (ev1 < mddev->bitmap->events_cleared) 1170 if (ev1 < mddev->bitmap->events_cleared)
1162 return 0; 1171 return 0;
1163 } else /* just a hot-add of a new device, leave raid_disk at -1 */ 1172 } else {
1164 return 0; 1173 if (ev1 < mddev->events)
1165 1174 /* just a hot-add of a new device, leave raid_disk at -1 */
1175 return 0;
1176 }
1166 if (mddev->level != LEVEL_MULTIPATH) { 1177 if (mddev->level != LEVEL_MULTIPATH) {
1167 int role; 1178 int role;
1168 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1169 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1179 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1170 switch(role) { 1180 switch(role) {
1171 case 0xffff: /* spare */ 1181 case 0xffff: /* spare */
@@ -1174,7 +1184,11 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1174 set_bit(Faulty, &rdev->flags); 1184 set_bit(Faulty, &rdev->flags);
1175 break; 1185 break;
1176 default: 1186 default:
1177 set_bit(In_sync, &rdev->flags); 1187 if ((le32_to_cpu(sb->feature_map) &
1188 MD_FEATURE_RECOVERY_OFFSET))
1189 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1190 else
1191 set_bit(In_sync, &rdev->flags);
1178 rdev->raid_disk = role; 1192 rdev->raid_disk = role;
1179 break; 1193 break;
1180 } 1194 }
@@ -1198,6 +1212,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1198 1212
1199 sb->feature_map = 0; 1213 sb->feature_map = 0;
1200 sb->pad0 = 0; 1214 sb->pad0 = 0;
1215 sb->recovery_offset = cpu_to_le64(0);
1201 memset(sb->pad1, 0, sizeof(sb->pad1)); 1216 memset(sb->pad1, 0, sizeof(sb->pad1));
1202 memset(sb->pad2, 0, sizeof(sb->pad2)); 1217 memset(sb->pad2, 0, sizeof(sb->pad2));
1203 memset(sb->pad3, 0, sizeof(sb->pad3)); 1218 memset(sb->pad3, 0, sizeof(sb->pad3));
@@ -1218,6 +1233,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1218 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1233 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
1219 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1234 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1220 } 1235 }
1236
1237 if (rdev->raid_disk >= 0 &&
1238 !test_bit(In_sync, &rdev->flags) &&
1239 rdev->recovery_offset > 0) {
1240 sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1241 sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
1242 }
1243
1221 if (mddev->reshape_position != MaxSector) { 1244 if (mddev->reshape_position != MaxSector) {
1222 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1245 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1223 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 1246 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
@@ -1242,11 +1265,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1242 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1265 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1243 else if (test_bit(In_sync, &rdev2->flags)) 1266 else if (test_bit(In_sync, &rdev2->flags))
1244 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1267 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1268 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
1269 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1245 else 1270 else
1246 sb->dev_roles[i] = cpu_to_le16(0xffff); 1271 sb->dev_roles[i] = cpu_to_le16(0xffff);
1247 } 1272 }
1248 1273
1249 sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
1250 sb->sb_csum = calc_sb_1_csum(sb); 1274 sb->sb_csum = calc_sb_1_csum(sb);
1251} 1275}
1252 1276
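
Note: the super_1_validate()/super_1_sync() hunks above let an interrupted rebuild survive a restart by persisting a per-device recovery checkpoint in the version-1 superblock. A rough, standalone sketch of that rule follows; the struct, the flag value and the helper name are illustrative stand-ins, not the kernel's mdp_superblock_1 layout.

    #include <stdint.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical flag value, standing in for the kernel's MD_FEATURE_RECOVERY_OFFSET. */
    #define FEATURE_RECOVERY_OFFSET 0x1

    struct member {
        int      raid_disk;        /* slot in the array, or -1 */
        bool     faulty;
        bool     in_sync;          /* fully synced member */
        uint64_t recovery_offset;  /* sectors already rebuilt on this member */
    };

    /* Sketch of what super_1_sync now records in a member's own superblock. */
    static void record_member(const struct member *m,
                              uint32_t *feature_map, uint64_t *sb_recovery_offset,
                              uint16_t *role)
    {
        *sb_recovery_offset = 0;
        if (m->raid_disk >= 0 && !m->in_sync && m->recovery_offset > 0) {
            *feature_map |= FEATURE_RECOVERY_OFFSET;   /* remember the checkpoint */
            *sb_recovery_offset = m->recovery_offset;
        }
        if (m->faulty)
            *role = 0xfffe;                    /* faulty */
        else if (m->in_sync)
            *role = (uint16_t)m->raid_disk;    /* fully in-sync member */
        else if (m->raid_disk >= 0 && m->recovery_offset > 0)
            *role = (uint16_t)m->raid_disk;    /* rebuilding: keep the slot */
        else
            *role = 0xffff;                    /* spare */
    }

    int main(void)
    {
        struct member m = { .raid_disk = 2, .faulty = false,
                            .in_sync = false, .recovery_offset = 123456 };
        uint32_t fmap = 0;
        uint64_t off = 0;
        uint16_t role = 0;

        record_member(&m, &fmap, &off, &role);
        printf("role=%#x feature_map=%#x recovery_offset=%llu\n",
               role, fmap, (unsigned long long)off);
        return 0;
    }

On the read side the same information flows back: a non-spare role combined with the recovery-offset feature bit restores rdev->recovery_offset instead of setting In_sync, which is what later lets md_do_sync() resume from the checkpoint rather than from sector 0.
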
@@ -1384,7 +1408,7 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
1384 struct block_device *bdev; 1408 struct block_device *bdev;
1385 char b[BDEVNAME_SIZE]; 1409 char b[BDEVNAME_SIZE];
1386 1410
1387 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1411 bdev = open_partition_by_devnum(dev, FMODE_READ|FMODE_WRITE);
1388 if (IS_ERR(bdev)) { 1412 if (IS_ERR(bdev)) {
1389 printk(KERN_ERR "md: could not open %s.\n", 1413 printk(KERN_ERR "md: could not open %s.\n",
1390 __bdevname(dev, b)); 1414 __bdevname(dev, b));
@@ -1394,7 +1418,7 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
1394 if (err) { 1418 if (err) {
1395 printk(KERN_ERR "md: could not bd_claim %s.\n", 1419 printk(KERN_ERR "md: could not bd_claim %s.\n",
1396 bdevname(bdev, b)); 1420 bdevname(bdev, b));
1397 blkdev_put(bdev); 1421 blkdev_put_partition(bdev);
1398 return err; 1422 return err;
1399 } 1423 }
1400 rdev->bdev = bdev; 1424 rdev->bdev = bdev;
@@ -1408,7 +1432,7 @@ static void unlock_rdev(mdk_rdev_t *rdev)
1408 if (!bdev) 1432 if (!bdev)
1409 MD_BUG(); 1433 MD_BUG();
1410 bd_release(bdev); 1434 bd_release(bdev);
1411 blkdev_put(bdev); 1435 blkdev_put_partition(bdev);
1412} 1436}
1413 1437
1414void md_autodetect_dev(dev_t dev); 1438void md_autodetect_dev(dev_t dev);
@@ -1507,7 +1531,7 @@ static void print_rdev(mdk_rdev_t *rdev)
1507 printk(KERN_INFO "md: no rdev superblock!\n"); 1531 printk(KERN_INFO "md: no rdev superblock!\n");
1508} 1532}
1509 1533
1510void md_print_devices(void) 1534static void md_print_devices(void)
1511{ 1535{
1512 struct list_head *tmp, *tmp2; 1536 struct list_head *tmp, *tmp2;
1513 mdk_rdev_t *rdev; 1537 mdk_rdev_t *rdev;
@@ -1536,15 +1560,30 @@ void md_print_devices(void)
1536} 1560}
1537 1561
1538 1562
1539static void sync_sbs(mddev_t * mddev) 1563static void sync_sbs(mddev_t * mddev, int nospares)
1540{ 1564{
1565 /* Update each superblock (in-memory image), but
1566 * if we are allowed to, skip spares which already
1567 * have the right event counter, or have one earlier
1568 * (which would mean they aren't being marked as dirty
1569 * with the rest of the array)
1570 */
1541 mdk_rdev_t *rdev; 1571 mdk_rdev_t *rdev;
1542 struct list_head *tmp; 1572 struct list_head *tmp;
1543 1573
1544 ITERATE_RDEV(mddev,rdev,tmp) { 1574 ITERATE_RDEV(mddev,rdev,tmp) {
1545 super_types[mddev->major_version]. 1575 if (rdev->sb_events == mddev->events ||
1546 sync_super(mddev, rdev); 1576 (nospares &&
1547 rdev->sb_loaded = 1; 1577 rdev->raid_disk < 0 &&
1578 (rdev->sb_events&1)==0 &&
1579 rdev->sb_events+1 == mddev->events)) {
1580 /* Don't update this superblock */
1581 rdev->sb_loaded = 2;
1582 } else {
1583 super_types[mddev->major_version].
1584 sync_super(mddev, rdev);
1585 rdev->sb_loaded = 1;
1586 }
1548 } 1587 }
1549} 1588}
1550 1589
@@ -1554,12 +1593,55 @@ void md_update_sb(mddev_t * mddev)
1554 struct list_head *tmp; 1593 struct list_head *tmp;
1555 mdk_rdev_t *rdev; 1594 mdk_rdev_t *rdev;
1556 int sync_req; 1595 int sync_req;
1596 int nospares = 0;
1557 1597
1558repeat: 1598repeat:
1559 spin_lock_irq(&mddev->write_lock); 1599 spin_lock_irq(&mddev->write_lock);
1600
1601 if (mddev->degraded && mddev->sb_dirty == 3)
1602 /* If the array is degraded, then skipping spares is both
1603 * dangerous and fairly pointless.
1604 * Dangerous because a device that was removed from the array
1605 * might have an event_count that still looks up-to-date,
1606 * so it can be re-added without a resync.
1607 * Pointless because if there are any spares to skip,
1608 * then a recovery will happen and soon that array won't
1609 * be degraded any more and the spare can go back to sleep then.
1610 */
1611 mddev->sb_dirty = 1;
1612
1560 sync_req = mddev->in_sync; 1613 sync_req = mddev->in_sync;
1561 mddev->utime = get_seconds(); 1614 mddev->utime = get_seconds();
1562 mddev->events ++; 1615 if (mddev->sb_dirty == 3)
1616 /* just a clean <-> dirty transition, possibly leave spares alone,
1617 * though if 'events' ends up with the wrong parity (even/odd), we will
1618 * have to update the spares after all
1619 */
1620 nospares = 1;
1621
1622 /* If this is just a dirty<->clean transition, and the array is clean
1623 * and 'events' is odd, we can roll back to the previous clean state */
1624 if (mddev->sb_dirty == 3
1625 && (mddev->in_sync && mddev->recovery_cp == MaxSector)
1626 && (mddev->events & 1))
1627 mddev->events--;
1628 else {
1629 /* otherwise we have to go forward and ... */
1630 mddev->events ++;
1631 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
1632 /* .. if the array isn't clean, insist on an odd 'events' */
1633 if ((mddev->events&1)==0) {
1634 mddev->events++;
1635 nospares = 0;
1636 }
1637 } else {
1638 /* otherwise insist on an even 'events' (for clean states) */
1639 if ((mddev->events&1)) {
1640 mddev->events++;
1641 nospares = 0;
1642 }
1643 }
1644 }
1563 1645
1564 if (!mddev->events) { 1646 if (!mddev->events) {
1565 /* 1647 /*
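
Note: the hunk above relies on a parity convention for the event counter that is easy to miss when reading the diff: an even 'events' value is written for clean states, an odd one for active/dirty states, and a pure clean<->dirty flip may step the counter back instead of forward so that spare superblocks written earlier stay valid. A minimal standalone model of that bookkeeping (hypothetical helper, plain integers instead of the mddev fields):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Sketch of the event-counter bookkeeping: even = written for a clean state,
     * odd = written for a dirty one.  Returns true when spare superblocks may be
     * left untouched ("nospares").
     */
    static bool step_events(uint64_t *events, bool clean_dirty_only, bool array_clean)
    {
        bool nospares = clean_dirty_only;

        if (clean_dirty_only && array_clean && (*events & 1)) {
            (*events)--;          /* roll back to the previous clean state */
            return nospares;
        }
        (*events)++;
        if (!array_clean && (*events & 1) == 0) {
            (*events)++;          /* dirty states want an odd counter */
            nospares = false;
        } else if (array_clean && (*events & 1)) {
            (*events)++;          /* clean states want an even counter */
            nospares = false;
        }
        return nospares;
    }

    int main(void)
    {
        uint64_t ev = 41;                          /* odd: last write was "dirty" */
        bool skip = step_events(&ev, true, true);  /* dirty -> clean transition */
        printf("events=%llu nospares=%d\n", (unsigned long long)ev, skip);
        return 0;
    }

sync_sbs() then skips any spare whose recorded sb_events already equals the new counter, or, when nospares is set, sits at the even value exactly one below it, so a clean<->dirty flip no longer forces a write to every spare device.
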
@@ -1571,7 +1653,7 @@ repeat:
1571 mddev->events --; 1653 mddev->events --;
1572 } 1654 }
1573 mddev->sb_dirty = 2; 1655 mddev->sb_dirty = 2;
1574 sync_sbs(mddev); 1656 sync_sbs(mddev, nospares);
1575 1657
1576 /* 1658 /*
1577 * do not write anything to disk if using 1659 * do not write anything to disk if using
@@ -1593,6 +1675,8 @@ repeat:
1593 ITERATE_RDEV(mddev,rdev,tmp) { 1675 ITERATE_RDEV(mddev,rdev,tmp) {
1594 char b[BDEVNAME_SIZE]; 1676 char b[BDEVNAME_SIZE];
1595 dprintk(KERN_INFO "md: "); 1677 dprintk(KERN_INFO "md: ");
1678 if (rdev->sb_loaded != 1)
1679 continue; /* no noise on spare devices */
1596 if (test_bit(Faulty, &rdev->flags)) 1680 if (test_bit(Faulty, &rdev->flags))
1597 dprintk("(skipping faulty "); 1681 dprintk("(skipping faulty ");
1598 1682
@@ -1604,6 +1688,7 @@ repeat:
1604 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", 1688 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1605 bdevname(rdev->bdev,b), 1689 bdevname(rdev->bdev,b),
1606 (unsigned long long)rdev->sb_offset); 1690 (unsigned long long)rdev->sb_offset);
1691 rdev->sb_events = mddev->events;
1607 1692
1608 } else 1693 } else
1609 dprintk(")\n"); 1694 dprintk(")\n");
@@ -1667,6 +1752,10 @@ state_show(mdk_rdev_t *rdev, char *page)
1667 len += sprintf(page+len, "%sin_sync",sep); 1752 len += sprintf(page+len, "%sin_sync",sep);
1668 sep = ","; 1753 sep = ",";
1669 } 1754 }
1755 if (test_bit(WriteMostly, &rdev->flags)) {
1756 len += sprintf(page+len, "%swrite_mostly",sep);
1757 sep = ",";
1758 }
1670 if (!test_bit(Faulty, &rdev->flags) && 1759 if (!test_bit(Faulty, &rdev->flags) &&
1671 !test_bit(In_sync, &rdev->flags)) { 1760 !test_bit(In_sync, &rdev->flags)) {
1672 len += sprintf(page+len, "%sspare", sep); 1761 len += sprintf(page+len, "%sspare", sep);
@@ -1675,8 +1764,40 @@ state_show(mdk_rdev_t *rdev, char *page)
1675 return len+sprintf(page+len, "\n"); 1764 return len+sprintf(page+len, "\n");
1676} 1765}
1677 1766
1678static struct rdev_sysfs_entry 1767static ssize_t
1679rdev_state = __ATTR_RO(state); 1768state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1769{
1770 /* can write
1771 * faulty - simulates an error
1772 * remove - disconnects the device
1773 * writemostly - sets write_mostly
1774 * -writemostly - clears write_mostly
1775 */
1776 int err = -EINVAL;
1777 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
1778 md_error(rdev->mddev, rdev);
1779 err = 0;
1780 } else if (cmd_match(buf, "remove")) {
1781 if (rdev->raid_disk >= 0)
1782 err = -EBUSY;
1783 else {
1784 mddev_t *mddev = rdev->mddev;
1785 kick_rdev_from_array(rdev);
1786 md_update_sb(mddev);
1787 md_new_event(mddev);
1788 err = 0;
1789 }
1790 } else if (cmd_match(buf, "writemostly")) {
1791 set_bit(WriteMostly, &rdev->flags);
1792 err = 0;
1793 } else if (cmd_match(buf, "-writemostly")) {
1794 clear_bit(WriteMostly, &rdev->flags);
1795 err = 0;
1796 }
1797 return err ? err : len;
1798}
1799static struct rdev_sysfs_entry rdev_state =
1800__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
1680 1801
1681static ssize_t 1802static ssize_t
1682super_show(mdk_rdev_t *rdev, char *page) 1803super_show(mdk_rdev_t *rdev, char *page)
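
Note: the per-device 'state' attribute added above is meant to be driven from userspace. A hedged illustration follows; the sysfs path and device names are examples only and error handling is trimmed. 'remove' only succeeds once the personality has released the slot (raid_disk < 0), hence the pause after the simulated failure.

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* Write one keyword to an rdev state file, e.g.
     * /sys/block/md0/md/dev-sdb1/state (example path only).
     */
    static int write_state(const char *path, const char *word)
    {
        int fd = open(path, O_WRONLY);
        if (fd < 0) {
            perror(path);
            return -1;
        }
        ssize_t n = write(fd, word, strlen(word));
        close(fd);
        return n < 0 ? -1 : 0;
    }

    int main(void)
    {
        const char *dev = "/sys/block/md0/md/dev-sdb1/state";

        write_state(dev, "faulty");    /* simulate an error on this member */
        sleep(1);                      /* give md time to kick the device  */
        write_state(dev, "remove");    /* then detach it from the array    */
        return 0;
    }
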
@@ -1707,7 +1828,7 @@ errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1707 return -EINVAL; 1828 return -EINVAL;
1708} 1829}
1709static struct rdev_sysfs_entry rdev_errors = 1830static struct rdev_sysfs_entry rdev_errors =
1710__ATTR(errors, 0644, errors_show, errors_store); 1831__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
1711 1832
1712static ssize_t 1833static ssize_t
1713slot_show(mdk_rdev_t *rdev, char *page) 1834slot_show(mdk_rdev_t *rdev, char *page)
@@ -1741,7 +1862,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1741 1862
1742 1863
1743static struct rdev_sysfs_entry rdev_slot = 1864static struct rdev_sysfs_entry rdev_slot =
1744__ATTR(slot, 0644, slot_show, slot_store); 1865__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
1745 1866
1746static ssize_t 1867static ssize_t
1747offset_show(mdk_rdev_t *rdev, char *page) 1868offset_show(mdk_rdev_t *rdev, char *page)
@@ -1763,7 +1884,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1763} 1884}
1764 1885
1765static struct rdev_sysfs_entry rdev_offset = 1886static struct rdev_sysfs_entry rdev_offset =
1766__ATTR(offset, 0644, offset_show, offset_store); 1887__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
1767 1888
1768static ssize_t 1889static ssize_t
1769rdev_size_show(mdk_rdev_t *rdev, char *page) 1890rdev_size_show(mdk_rdev_t *rdev, char *page)
@@ -1787,7 +1908,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1787} 1908}
1788 1909
1789static struct rdev_sysfs_entry rdev_size = 1910static struct rdev_sysfs_entry rdev_size =
1790__ATTR(size, 0644, rdev_size_show, rdev_size_store); 1911__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
1791 1912
1792static struct attribute *rdev_default_attrs[] = { 1913static struct attribute *rdev_default_attrs[] = {
1793 &rdev_state.attr, 1914 &rdev_state.attr,
@@ -1818,6 +1939,8 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
1818 1939
1819 if (!entry->store) 1940 if (!entry->store)
1820 return -EIO; 1941 return -EIO;
1942 if (!capable(CAP_SYS_ADMIN))
1943 return -EACCES;
1821 return entry->store(rdev, page, length); 1944 return entry->store(rdev, page, length);
1822} 1945}
1823 1946
@@ -1873,6 +1996,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
1873 rdev->desc_nr = -1; 1996 rdev->desc_nr = -1;
1874 rdev->flags = 0; 1997 rdev->flags = 0;
1875 rdev->data_offset = 0; 1998 rdev->data_offset = 0;
1999 rdev->sb_events = 0;
1876 atomic_set(&rdev->nr_pending, 0); 2000 atomic_set(&rdev->nr_pending, 0);
1877 atomic_set(&rdev->read_errors, 0); 2001 atomic_set(&rdev->read_errors, 0);
1878 atomic_set(&rdev->corrected_errors, 0); 2002 atomic_set(&rdev->corrected_errors, 0);
@@ -1978,6 +2102,54 @@ static void analyze_sbs(mddev_t * mddev)
1978} 2102}
1979 2103
1980static ssize_t 2104static ssize_t
2105safe_delay_show(mddev_t *mddev, char *page)
2106{
2107 int msec = (mddev->safemode_delay*1000)/HZ;
2108 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
2109}
2110static ssize_t
2111safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2112{
2113 int scale=1;
2114 int dot=0;
2115 int i;
2116 unsigned long msec;
2117 char buf[30];
2118 char *e;
2119 /* remove a period, and count digits after it */
2120 if (len >= sizeof(buf))
2121 return -EINVAL;
2122 strlcpy(buf, cbuf, len);
2123 buf[len] = 0;
2124 for (i=0; i<len; i++) {
2125 if (dot) {
2126 if (isdigit(buf[i])) {
2127 buf[i-1] = buf[i];
2128 scale *= 10;
2129 }
2130 buf[i] = 0;
2131 } else if (buf[i] == '.') {
2132 dot=1;
2133 buf[i] = 0;
2134 }
2135 }
2136 msec = simple_strtoul(buf, &e, 10);
2137 if (e == buf || (*e && *e != '\n'))
2138 return -EINVAL;
2139 msec = (msec * 1000) / scale;
2140 if (msec == 0)
2141 mddev->safemode_delay = 0;
2142 else {
2143 mddev->safemode_delay = (msec*HZ)/1000;
2144 if (mddev->safemode_delay == 0)
2145 mddev->safemode_delay = 1;
2146 }
2147 return len;
2148}
2149static struct md_sysfs_entry md_safe_delay =
2150__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
2151
2152static ssize_t
1981level_show(mddev_t *mddev, char *page) 2153level_show(mddev_t *mddev, char *page)
1982{ 2154{
1983 struct mdk_personality *p = mddev->pers; 2155 struct mdk_personality *p = mddev->pers;
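
Note: safe_mode_delay is parsed above as a decimal number of seconds with millisecond resolution and stored in jiffies, never rounding a non-zero request down to zero ticks; writing 0 disables the timer, and md_write_end() (further down in this patch) skips arming it in that case. A small worked sketch of the conversion, with HZ chosen only as an example:

    #include <stdio.h>

    #define HZ 250   /* example tick rate; the real value is a kernel build choice */

    /* "0.050" -> 50 ms -> 50*HZ/1000 jiffies, but never rounded down to zero. */
    static unsigned long msec_to_safemode_delay(unsigned long msec)
    {
        unsigned long delay;

        if (msec == 0)
            return 0;           /* 0 disables the safemode timer entirely */
        delay = (msec * HZ) / 1000;
        return delay ? delay : 1;
    }

    int main(void)
    {
        printf("%lu\n", msec_to_safemode_delay(50));   /* 12 jiffies at HZ=250 */
        printf("%lu\n", msec_to_safemode_delay(1));    /* clamps up to 1 jiffy */
        return 0;
    }
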
@@ -2010,7 +2182,33 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
2010} 2182}
2011 2183
2012static struct md_sysfs_entry md_level = 2184static struct md_sysfs_entry md_level =
2013__ATTR(level, 0644, level_show, level_store); 2185__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
2186
2187
2188static ssize_t
2189layout_show(mddev_t *mddev, char *page)
2190{
2191 /* just a number, not meaningful for all levels */
2192 return sprintf(page, "%d\n", mddev->layout);
2193}
2194
2195static ssize_t
2196layout_store(mddev_t *mddev, const char *buf, size_t len)
2197{
2198 char *e;
2199 unsigned long n = simple_strtoul(buf, &e, 10);
2200 if (mddev->pers)
2201 return -EBUSY;
2202
2203 if (!*buf || (*e && *e != '\n'))
2204 return -EINVAL;
2205
2206 mddev->layout = n;
2207 return len;
2208}
2209static struct md_sysfs_entry md_layout =
2210__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
2211
2014 2212
2015static ssize_t 2213static ssize_t
2016raid_disks_show(mddev_t *mddev, char *page) 2214raid_disks_show(mddev_t *mddev, char *page)
@@ -2040,7 +2238,7 @@ raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
2040 return rv ? rv : len; 2238 return rv ? rv : len;
2041} 2239}
2042static struct md_sysfs_entry md_raid_disks = 2240static struct md_sysfs_entry md_raid_disks =
2043__ATTR(raid_disks, 0644, raid_disks_show, raid_disks_store); 2241__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
2044 2242
2045static ssize_t 2243static ssize_t
2046chunk_size_show(mddev_t *mddev, char *page) 2244chunk_size_show(mddev_t *mddev, char *page)
@@ -2064,7 +2262,202 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
2064 return len; 2262 return len;
2065} 2263}
2066static struct md_sysfs_entry md_chunk_size = 2264static struct md_sysfs_entry md_chunk_size =
2067__ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store); 2265__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
2266
2267static ssize_t
2268resync_start_show(mddev_t *mddev, char *page)
2269{
2270 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
2271}
2272
2273static ssize_t
2274resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2275{
2276 /* can only set resync_start if array is not yet active */
2277 char *e;
2278 unsigned long long n = simple_strtoull(buf, &e, 10);
2279
2280 if (mddev->pers)
2281 return -EBUSY;
2282 if (!*buf || (*e && *e != '\n'))
2283 return -EINVAL;
2284
2285 mddev->recovery_cp = n;
2286 return len;
2287}
2288static struct md_sysfs_entry md_resync_start =
2289__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
2290
2291/*
2292 * The array state can be:
2293 *
2294 * clear
2295 * No devices, no size, no level
2296 * Equivalent to STOP_ARRAY ioctl
2297 * inactive
2298 * May have some settings, but array is not active
2299 * all IO results in error
2300 * When written, doesn't tear down array, but just stops it
2301 * suspended (not supported yet)
2302 * All IO requests will block. The array can be reconfigured.
2303 * Writing this, if accepted, will block until array is quiescent
2304 * readonly
2305 * no resync can happen. no superblocks get written.
2306 * write requests fail
2307 * read-auto
2308 * like readonly, but behaves like 'clean' on a write request.
2309 *
2310 * clean - no pending writes, but otherwise active.
2311 * When written to inactive array, starts without resync
2312 * If a write request arrives then
2313 * if metadata is known, mark 'dirty' and switch to 'active'.
2314 * if not known, block and switch to write-pending
2315 * If written to an active array that has pending writes, then fails.
2316 * active
2317 * fully active: IO and resync can be happening.
2318 * When written to inactive array, starts with resync
2319 *
2320 * write-pending
2321 * clean, but writes are blocked waiting for 'active' to be written.
2322 *
2323 * active-idle
2324 * like active, but no writes have been seen for a while (100msec).
2325 *
2326 */
2327enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
2328 write_pending, active_idle, bad_word};
2329static char *array_states[] = {
2330 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
2331 "write-pending", "active-idle", NULL };
2332
2333static int match_word(const char *word, char **list)
2334{
2335 int n;
2336 for (n=0; list[n]; n++)
2337 if (cmd_match(word, list[n]))
2338 break;
2339 return n;
2340}
2341
2342static ssize_t
2343array_state_show(mddev_t *mddev, char *page)
2344{
2345 enum array_state st = inactive;
2346
2347 if (mddev->pers)
2348 switch(mddev->ro) {
2349 case 1:
2350 st = readonly;
2351 break;
2352 case 2:
2353 st = read_auto;
2354 break;
2355 case 0:
2356 if (mddev->in_sync)
2357 st = clean;
2358 else if (mddev->safemode)
2359 st = active_idle;
2360 else
2361 st = active;
2362 }
2363 else {
2364 if (list_empty(&mddev->disks) &&
2365 mddev->raid_disks == 0 &&
2366 mddev->size == 0)
2367 st = clear;
2368 else
2369 st = inactive;
2370 }
2371 return sprintf(page, "%s\n", array_states[st]);
2372}
2373
2374static int do_md_stop(mddev_t * mddev, int ro);
2375static int do_md_run(mddev_t * mddev);
2376static int restart_array(mddev_t *mddev);
2377
2378static ssize_t
2379array_state_store(mddev_t *mddev, const char *buf, size_t len)
2380{
2381 int err = -EINVAL;
2382 enum array_state st = match_word(buf, array_states);
2383 switch(st) {
2384 case bad_word:
2385 break;
2386 case clear:
2387 /* stopping an active array */
2388 if (mddev->pers) {
2389 if (atomic_read(&mddev->active) > 1)
2390 return -EBUSY;
2391 err = do_md_stop(mddev, 0);
2392 }
2393 break;
2394 case inactive:
2395 /* stopping an active array */
2396 if (mddev->pers) {
2397 if (atomic_read(&mddev->active) > 1)
2398 return -EBUSY;
2399 err = do_md_stop(mddev, 2);
2400 }
2401 break;
2402 case suspended:
2403 break; /* not supported yet */
2404 case readonly:
2405 if (mddev->pers)
2406 err = do_md_stop(mddev, 1);
2407 else {
2408 mddev->ro = 1;
2409 err = do_md_run(mddev);
2410 }
2411 break;
2412 case read_auto:
2413 /* stopping an active array */
2414 if (mddev->pers) {
2415 err = do_md_stop(mddev, 1);
2416 if (err == 0)
2417 mddev->ro = 2; /* FIXME mark devices writable */
2418 } else {
2419 mddev->ro = 2;
2420 err = do_md_run(mddev);
2421 }
2422 break;
2423 case clean:
2424 if (mddev->pers) {
2425 restart_array(mddev);
2426 spin_lock_irq(&mddev->write_lock);
2427 if (atomic_read(&mddev->writes_pending) == 0) {
2428 mddev->in_sync = 1;
2429 mddev->sb_dirty = 1;
2430 }
2431 spin_unlock_irq(&mddev->write_lock);
2432 } else {
2433 mddev->ro = 0;
2434 mddev->recovery_cp = MaxSector;
2435 err = do_md_run(mddev);
2436 }
2437 break;
2438 case active:
2439 if (mddev->pers) {
2440 restart_array(mddev);
2441 mddev->sb_dirty = 0;
2442 wake_up(&mddev->sb_wait);
2443 err = 0;
2444 } else {
2445 mddev->ro = 0;
2446 err = do_md_run(mddev);
2447 }
2448 break;
2449 case write_pending:
2450 case active_idle:
2451 /* these cannot be set */
2452 break;
2453 }
2454 if (err)
2455 return err;
2456 else
2457 return len;
2458}
2459static struct md_sysfs_entry md_array_state =
2460__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
2068 2461
2069static ssize_t 2462static ssize_t
2070null_show(mddev_t *mddev, char *page) 2463null_show(mddev_t *mddev, char *page)
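
Note: array_state gives scripts a single file that reflects, and can drive, the lifecycle described in the comment block above: writing 'readonly', 'read-auto', 'clear' or 'inactive' ends up in do_md_stop() with the matching mode, while 'clean' and 'active' either restart an assembled array or run an inactive one. A small illustrative poller (the sysfs path is an example) that waits for the array to report itself clean:

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* Poll an array_state file (example path) until the array reports "clean",
     * i.e. no writes are pending and the superblock matches the data.
     */
    int main(void)
    {
        char state[32];

        for (;;) {
            FILE *f = fopen("/sys/block/md0/md/array_state", "r");
            if (!f || !fgets(state, sizeof(state), f)) {
                if (f)
                    fclose(f);
                return 1;
            }
            fclose(f);
            state[strcspn(state, "\n")] = '\0';
            if (strcmp(state, "clean") == 0)
                break;
            sleep(1);
        }
        return 0;
    }
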
@@ -2124,7 +2517,7 @@ new_dev_store(mddev_t *mddev, const char *buf, size_t len)
2124} 2517}
2125 2518
2126static struct md_sysfs_entry md_new_device = 2519static struct md_sysfs_entry md_new_device =
2127__ATTR(new_dev, 0200, null_show, new_dev_store); 2520__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
2128 2521
2129static ssize_t 2522static ssize_t
2130size_show(mddev_t *mddev, char *page) 2523size_show(mddev_t *mddev, char *page)
@@ -2162,7 +2555,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
2162} 2555}
2163 2556
2164static struct md_sysfs_entry md_size = 2557static struct md_sysfs_entry md_size =
2165__ATTR(component_size, 0644, size_show, size_store); 2558__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
2166 2559
2167 2560
2168 /* Metadata version. 2561 /* Metadata version.
@@ -2210,7 +2603,7 @@ metadata_store(mddev_t *mddev, const char *buf, size_t len)
2210} 2603}
2211 2604
2212static struct md_sysfs_entry md_metadata = 2605static struct md_sysfs_entry md_metadata =
2213__ATTR(metadata_version, 0644, metadata_show, metadata_store); 2606__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
2214 2607
2215static ssize_t 2608static ssize_t
2216action_show(mddev_t *mddev, char *page) 2609action_show(mddev_t *mddev, char *page)
@@ -2278,12 +2671,11 @@ mismatch_cnt_show(mddev_t *mddev, char *page)
2278 (unsigned long long) mddev->resync_mismatches); 2671 (unsigned long long) mddev->resync_mismatches);
2279} 2672}
2280 2673
2281static struct md_sysfs_entry 2674static struct md_sysfs_entry md_scan_mode =
2282md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 2675__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
2283 2676
2284 2677
2285static struct md_sysfs_entry 2678static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
2286md_mismatches = __ATTR_RO(mismatch_cnt);
2287 2679
2288static ssize_t 2680static ssize_t
2289sync_min_show(mddev_t *mddev, char *page) 2681sync_min_show(mddev_t *mddev, char *page)
@@ -2342,15 +2734,14 @@ static ssize_t
2342sync_speed_show(mddev_t *mddev, char *page) 2734sync_speed_show(mddev_t *mddev, char *page)
2343{ 2735{
2344 unsigned long resync, dt, db; 2736 unsigned long resync, dt, db;
2345 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); 2737 resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active));
2346 dt = ((jiffies - mddev->resync_mark) / HZ); 2738 dt = ((jiffies - mddev->resync_mark) / HZ);
2347 if (!dt) dt++; 2739 if (!dt) dt++;
2348 db = resync - (mddev->resync_mark_cnt); 2740 db = resync - (mddev->resync_mark_cnt);
2349 return sprintf(page, "%ld\n", db/dt/2); /* K/sec */ 2741 return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
2350} 2742}
2351 2743
2352static struct md_sysfs_entry 2744static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
2353md_sync_speed = __ATTR_RO(sync_speed);
2354 2745
2355static ssize_t 2746static ssize_t
2356sync_completed_show(mddev_t *mddev, char *page) 2747sync_completed_show(mddev_t *mddev, char *page)
@@ -2366,8 +2757,7 @@ sync_completed_show(mddev_t *mddev, char *page)
2366 return sprintf(page, "%lu / %lu\n", resync, max_blocks); 2757 return sprintf(page, "%lu / %lu\n", resync, max_blocks);
2367} 2758}
2368 2759
2369static struct md_sysfs_entry 2760static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
2370md_sync_completed = __ATTR_RO(sync_completed);
2371 2761
2372static ssize_t 2762static ssize_t
2373suspend_lo_show(mddev_t *mddev, char *page) 2763suspend_lo_show(mddev_t *mddev, char *page)
@@ -2428,11 +2818,15 @@ __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
2428 2818
2429static struct attribute *md_default_attrs[] = { 2819static struct attribute *md_default_attrs[] = {
2430 &md_level.attr, 2820 &md_level.attr,
2821 &md_layout.attr,
2431 &md_raid_disks.attr, 2822 &md_raid_disks.attr,
2432 &md_chunk_size.attr, 2823 &md_chunk_size.attr,
2433 &md_size.attr, 2824 &md_size.attr,
2825 &md_resync_start.attr,
2434 &md_metadata.attr, 2826 &md_metadata.attr,
2435 &md_new_device.attr, 2827 &md_new_device.attr,
2828 &md_safe_delay.attr,
2829 &md_array_state.attr,
2436 NULL, 2830 NULL,
2437}; 2831};
2438 2832
@@ -2480,6 +2874,8 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
2480 2874
2481 if (!entry->store) 2875 if (!entry->store)
2482 return -EIO; 2876 return -EIO;
2877 if (!capable(CAP_SYS_ADMIN))
2878 return -EACCES;
2483 rv = mddev_lock(mddev); 2879 rv = mddev_lock(mddev);
2484 if (!rv) { 2880 if (!rv) {
2485 rv = entry->store(mddev, page, length); 2881 rv = entry->store(mddev, page, length);
@@ -2532,13 +2928,10 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
2532 } 2928 }
2533 disk->major = MAJOR(dev); 2929 disk->major = MAJOR(dev);
2534 disk->first_minor = unit << shift; 2930 disk->first_minor = unit << shift;
2535 if (partitioned) { 2931 if (partitioned)
2536 sprintf(disk->disk_name, "md_d%d", unit); 2932 sprintf(disk->disk_name, "md_d%d", unit);
2537 sprintf(disk->devfs_name, "md/d%d", unit); 2933 else
2538 } else {
2539 sprintf(disk->disk_name, "md%d", unit); 2934 sprintf(disk->disk_name, "md%d", unit);
2540 sprintf(disk->devfs_name, "md/%d", unit);
2541 }
2542 disk->fops = &md_fops; 2935 disk->fops = &md_fops;
2543 disk->private_data = mddev; 2936 disk->private_data = mddev;
2544 disk->queue = mddev->queue; 2937 disk->queue = mddev->queue;
@@ -2553,8 +2946,6 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
2553 return NULL; 2946 return NULL;
2554} 2947}
2555 2948
2556void md_wakeup_thread(mdk_thread_t *thread);
2557
2558static void md_safemode_timeout(unsigned long data) 2949static void md_safemode_timeout(unsigned long data)
2559{ 2950{
2560 mddev_t *mddev = (mddev_t *) data; 2951 mddev_t *mddev = (mddev_t *) data;
@@ -2708,7 +3099,7 @@ static int do_md_run(mddev_t * mddev)
2708 mddev->safemode = 0; 3099 mddev->safemode = 0;
2709 mddev->safemode_timer.function = md_safemode_timeout; 3100 mddev->safemode_timer.function = md_safemode_timeout;
2710 mddev->safemode_timer.data = (unsigned long) mddev; 3101 mddev->safemode_timer.data = (unsigned long) mddev;
2711 mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ 3102 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
2712 mddev->in_sync = 1; 3103 mddev->in_sync = 1;
2713 3104
2714 ITERATE_RDEV(mddev,rdev,tmp) 3105 ITERATE_RDEV(mddev,rdev,tmp)
@@ -2719,7 +3110,6 @@ static int do_md_run(mddev_t * mddev)
2719 } 3110 }
2720 3111
2721 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3112 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2722 md_wakeup_thread(mddev->thread);
2723 3113
2724 if (mddev->sb_dirty) 3114 if (mddev->sb_dirty)
2725 md_update_sb(mddev); 3115 md_update_sb(mddev);
@@ -2736,6 +3126,37 @@ static int do_md_run(mddev_t * mddev)
2736 mddev->queue->queuedata = mddev; 3126 mddev->queue->queuedata = mddev;
2737 mddev->queue->make_request_fn = mddev->pers->make_request; 3127 mddev->queue->make_request_fn = mddev->pers->make_request;
2738 3128
3129 /* If there is a partially-recovered drive we need to
3130 * start recovery here. If we leave it to md_check_recovery,
3131 * it will remove the drives and not do the right thing
3132 */
3133 if (mddev->degraded && !mddev->sync_thread) {
3134 struct list_head *rtmp;
3135 int spares = 0;
3136 ITERATE_RDEV(mddev,rdev,rtmp)
3137 if (rdev->raid_disk >= 0 &&
3138 !test_bit(In_sync, &rdev->flags) &&
3139 !test_bit(Faulty, &rdev->flags))
3140 /* complete an interrupted recovery */
3141 spares++;
3142 if (spares && mddev->pers->sync_request) {
3143 mddev->recovery = 0;
3144 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3145 mddev->sync_thread = md_register_thread(md_do_sync,
3146 mddev,
3147 "%s_resync");
3148 if (!mddev->sync_thread) {
3149 printk(KERN_ERR "%s: could not start resync"
3150 " thread...\n",
3151 mdname(mddev));
3152 /* leave the spares where they are, it shouldn't hurt */
3153 mddev->recovery = 0;
3154 }
3155 }
3156 }
3157 md_wakeup_thread(mddev->thread);
3158 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
3159
2739 mddev->changed = 1; 3160 mddev->changed = 1;
2740 md_new_event(mddev); 3161 md_new_event(mddev);
2741 return 0; 3162 return 0;
@@ -2769,18 +3190,47 @@ static int restart_array(mddev_t *mddev)
2769 */ 3190 */
2770 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3191 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2771 md_wakeup_thread(mddev->thread); 3192 md_wakeup_thread(mddev->thread);
3193 md_wakeup_thread(mddev->sync_thread);
2772 err = 0; 3194 err = 0;
2773 } else { 3195 } else
2774 printk(KERN_ERR "md: %s has no personality assigned.\n",
2775 mdname(mddev));
2776 err = -EINVAL; 3196 err = -EINVAL;
2777 }
2778 3197
2779out: 3198out:
2780 return err; 3199 return err;
2781} 3200}
2782 3201
2783static int do_md_stop(mddev_t * mddev, int ro) 3202/* similar to deny_write_access, but accounts for our holding a reference
3203 * to the file ourselves */
3204static int deny_bitmap_write_access(struct file * file)
3205{
3206 struct inode *inode = file->f_mapping->host;
3207
3208 spin_lock(&inode->i_lock);
3209 if (atomic_read(&inode->i_writecount) > 1) {
3210 spin_unlock(&inode->i_lock);
3211 return -ETXTBSY;
3212 }
3213 atomic_set(&inode->i_writecount, -1);
3214 spin_unlock(&inode->i_lock);
3215
3216 return 0;
3217}
3218
3219static void restore_bitmap_write_access(struct file *file)
3220{
3221 struct inode *inode = file->f_mapping->host;
3222
3223 spin_lock(&inode->i_lock);
3224 atomic_set(&inode->i_writecount, 1);
3225 spin_unlock(&inode->i_lock);
3226}
3227
3228/* mode:
3229 * 0 - completely stop and dis-assemble array
3230 * 1 - switch to readonly
3231 * 2 - stop but do not disassemble array
3232 */
3233static int do_md_stop(mddev_t * mddev, int mode)
2784{ 3234{
2785 int err = 0; 3235 int err = 0;
2786 struct gendisk *disk = mddev->gendisk; 3236 struct gendisk *disk = mddev->gendisk;
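
Note: deny_bitmap_write_access(), now paired with restore_bitmap_write_access(), plays the same trick as deny_write_access(): a positive i_writecount counts write references (md itself holds one), while -1 means further write opens are refused. A toy model of the state transitions, using a plain integer and no locking, purely for illustration:

    #include <stdio.h>

    /* Toy model of the bitmap file's i_writecount:
     *   >0  writable; the value counts write references (md holds one)
     *   -1  all further write opens are refused
     */
    static int i_writecount = 1;    /* md's own reference after opening the file */

    static int deny_bitmap_write_access(void)
    {
        if (i_writecount > 1)
            return -1;              /* someone else has it open for write: -ETXTBSY */
        i_writecount = -1;          /* block any future write opens */
        return 0;
    }

    static void restore_bitmap_write_access(void)
    {
        i_writecount = 1;           /* back to "only md's reference" */
    }

    int main(void)
    {
        if (deny_bitmap_write_access() == 0)
            printf("bitmap file locked for md's exclusive use\n");
        restore_bitmap_write_access();   /* paired on stop / bitmap removal */
        return 0;
    }

do_md_stop() and set_bitmap_file() call the restore helper before fput() instead of poking i_writecount directly.
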
@@ -2792,6 +3242,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
2792 } 3242 }
2793 3243
2794 if (mddev->sync_thread) { 3244 if (mddev->sync_thread) {
3245 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
2795 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3246 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2796 md_unregister_thread(mddev->sync_thread); 3247 md_unregister_thread(mddev->sync_thread);
2797 mddev->sync_thread = NULL; 3248 mddev->sync_thread = NULL;
@@ -2801,12 +3252,15 @@ static int do_md_stop(mddev_t * mddev, int ro)
2801 3252
2802 invalidate_partition(disk, 0); 3253 invalidate_partition(disk, 0);
2803 3254
2804 if (ro) { 3255 switch(mode) {
3256 case 1: /* readonly */
2805 err = -ENXIO; 3257 err = -ENXIO;
2806 if (mddev->ro==1) 3258 if (mddev->ro==1)
2807 goto out; 3259 goto out;
2808 mddev->ro = 1; 3260 mddev->ro = 1;
2809 } else { 3261 break;
3262 case 0: /* disassemble */
3263 case 2: /* stop */
2810 bitmap_flush(mddev); 3264 bitmap_flush(mddev);
2811 md_super_wait(mddev); 3265 md_super_wait(mddev);
2812 if (mddev->ro) 3266 if (mddev->ro)
@@ -2821,19 +3275,20 @@ static int do_md_stop(mddev_t * mddev, int ro)
2821 if (mddev->ro) 3275 if (mddev->ro)
2822 mddev->ro = 0; 3276 mddev->ro = 0;
2823 } 3277 }
2824 if (!mddev->in_sync) { 3278 if (!mddev->in_sync || mddev->sb_dirty) {
2825 /* mark array as shutdown cleanly */ 3279 /* mark array as shutdown cleanly */
2826 mddev->in_sync = 1; 3280 mddev->in_sync = 1;
2827 md_update_sb(mddev); 3281 md_update_sb(mddev);
2828 } 3282 }
2829 if (ro) 3283 if (mode == 1)
2830 set_disk_ro(disk, 1); 3284 set_disk_ro(disk, 1);
3285 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
2831 } 3286 }
2832 3287
2833 /* 3288 /*
2834 * Free resources if final stop 3289 * Free resources if final stop
2835 */ 3290 */
2836 if (!ro) { 3291 if (mode == 0) {
2837 mdk_rdev_t *rdev; 3292 mdk_rdev_t *rdev;
2838 struct list_head *tmp; 3293 struct list_head *tmp;
2839 struct gendisk *disk; 3294 struct gendisk *disk;
@@ -2841,7 +3296,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
2841 3296
2842 bitmap_destroy(mddev); 3297 bitmap_destroy(mddev);
2843 if (mddev->bitmap_file) { 3298 if (mddev->bitmap_file) {
2844 atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1); 3299 restore_bitmap_write_access(mddev->bitmap_file);
2845 fput(mddev->bitmap_file); 3300 fput(mddev->bitmap_file);
2846 mddev->bitmap_file = NULL; 3301 mddev->bitmap_file = NULL;
2847 } 3302 }
@@ -2857,11 +3312,15 @@ static int do_md_stop(mddev_t * mddev, int ro)
2857 export_array(mddev); 3312 export_array(mddev);
2858 3313
2859 mddev->array_size = 0; 3314 mddev->array_size = 0;
3315 mddev->size = 0;
3316 mddev->raid_disks = 0;
3317 mddev->recovery_cp = 0;
3318
2860 disk = mddev->gendisk; 3319 disk = mddev->gendisk;
2861 if (disk) 3320 if (disk)
2862 set_capacity(disk, 0); 3321 set_capacity(disk, 0);
2863 mddev->changed = 1; 3322 mddev->changed = 1;
2864 } else 3323 } else if (mddev->pers)
2865 printk(KERN_INFO "md: %s switched to read-only mode.\n", 3324 printk(KERN_INFO "md: %s switched to read-only mode.\n",
2866 mdname(mddev)); 3325 mdname(mddev));
2867 err = 0; 3326 err = 0;
@@ -3264,6 +3723,17 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
3264 3723
3265 rdev->raid_disk = -1; 3724 rdev->raid_disk = -1;
3266 err = bind_rdev_to_array(rdev, mddev); 3725 err = bind_rdev_to_array(rdev, mddev);
3726 if (!err && !mddev->pers->hot_remove_disk) {
3727 /* If there is hot_add_disk but no hot_remove_disk
3728 * then added disks are for geometry changes,
3729 * and should be added immediately.
3730 */
3731 super_types[mddev->major_version].
3732 validate_super(mddev, rdev);
3733 err = mddev->pers->hot_add_disk(mddev, rdev);
3734 if (err)
3735 unbind_rdev_from_array(rdev);
3736 }
3267 if (err) 3737 if (err)
3268 export_rdev(rdev); 3738 export_rdev(rdev);
3269 3739
@@ -3434,23 +3904,6 @@ abort_export:
3434 return err; 3904 return err;
3435} 3905}
3436 3906
3437/* similar to deny_write_access, but accounts for our holding a reference
3438 * to the file ourselves */
3439static int deny_bitmap_write_access(struct file * file)
3440{
3441 struct inode *inode = file->f_mapping->host;
3442
3443 spin_lock(&inode->i_lock);
3444 if (atomic_read(&inode->i_writecount) > 1) {
3445 spin_unlock(&inode->i_lock);
3446 return -ETXTBSY;
3447 }
3448 atomic_set(&inode->i_writecount, -1);
3449 spin_unlock(&inode->i_lock);
3450
3451 return 0;
3452}
3453
3454static int set_bitmap_file(mddev_t *mddev, int fd) 3907static int set_bitmap_file(mddev_t *mddev, int fd)
3455{ 3908{
3456 int err; 3909 int err;
@@ -3491,12 +3944,17 @@ static int set_bitmap_file(mddev_t *mddev, int fd)
3491 mddev->pers->quiesce(mddev, 1); 3944 mddev->pers->quiesce(mddev, 1);
3492 if (fd >= 0) 3945 if (fd >= 0)
3493 err = bitmap_create(mddev); 3946 err = bitmap_create(mddev);
3494 if (fd < 0 || err) 3947 if (fd < 0 || err) {
3495 bitmap_destroy(mddev); 3948 bitmap_destroy(mddev);
3949 fd = -1; /* make sure to put the file */
3950 }
3496 mddev->pers->quiesce(mddev, 0); 3951 mddev->pers->quiesce(mddev, 0);
3497 } else if (fd < 0) { 3952 }
3498 if (mddev->bitmap_file) 3953 if (fd < 0) {
3954 if (mddev->bitmap_file) {
3955 restore_bitmap_write_access(mddev->bitmap_file);
3499 fput(mddev->bitmap_file); 3956 fput(mddev->bitmap_file);
3957 }
3500 mddev->bitmap_file = NULL; 3958 mddev->bitmap_file = NULL;
3501 } 3959 }
3502 3960
@@ -3977,11 +4435,6 @@ static int md_ioctl(struct inode *inode, struct file *file,
3977 goto done_unlock; 4435 goto done_unlock;
3978 4436
3979 default: 4437 default:
3980 if (_IOC_TYPE(cmd) == MD_MAJOR)
3981 printk(KERN_WARNING "md: %s(pid %d) used"
3982 " obsolete MD ioctl, upgrade your"
3983 " software to use new ictls.\n",
3984 current->comm, current->pid);
3985 err = -EINVAL; 4438 err = -EINVAL;
3986 goto abort_unlock; 4439 goto abort_unlock;
3987 } 4440 }
@@ -4152,6 +4605,8 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
4152 __builtin_return_address(0),__builtin_return_address(1), 4605 __builtin_return_address(0),__builtin_return_address(1),
4153 __builtin_return_address(2),__builtin_return_address(3)); 4606 __builtin_return_address(2),__builtin_return_address(3));
4154*/ 4607*/
4608 if (!mddev->pers)
4609 return;
4155 if (!mddev->pers->error_handler) 4610 if (!mddev->pers->error_handler)
4156 return; 4611 return;
4157 mddev->pers->error_handler(mddev,rdev); 4612 mddev->pers->error_handler(mddev,rdev);
@@ -4249,12 +4704,13 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
4249 */ 4704 */
4250 dt = ((jiffies - mddev->resync_mark) / HZ); 4705 dt = ((jiffies - mddev->resync_mark) / HZ);
4251 if (!dt) dt++; 4706 if (!dt) dt++;
4252 db = resync - (mddev->resync_mark_cnt/2); 4707 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
4253 rt = (dt * ((unsigned long)(max_blocks-resync) / (db/100+1)))/100; 4708 - mddev->resync_mark_cnt;
4709 rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100;
4254 4710
4255 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 4711 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
4256 4712
4257 seq_printf(seq, " speed=%ldK/sec", db/dt); 4713 seq_printf(seq, " speed=%ldK/sec", db/2/dt);
4258} 4714}
4259 4715
4260static void *md_seq_start(struct seq_file *seq, loff_t *pos) 4716static void *md_seq_start(struct seq_file *seq, loff_t *pos)
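
Note: both sync_speed and the /proc/mdstat estimate now start from curr_mark_cnt (sectors issued since the last mark) minus the still-outstanding recovery_active, and only then divide by two to convert 512-byte sectors into KiB. A standalone sketch of the arithmetic with made-up numbers:

    #include <stdio.h>

    int main(void)
    {
        unsigned long curr_mark_cnt   = 262144; /* sectors issued since resync_mark   */
        unsigned long recovery_active = 2048;   /* sectors submitted but not finished  */
        unsigned long resync_mark_cnt = 0;      /* sectors already counted at the mark */
        unsigned long dt = 30;                  /* seconds since resync_mark           */

        /* completed sectors in the window, then KiB (512-byte sectors / 2) */
        unsigned long db = (curr_mark_cnt - recovery_active) - resync_mark_cnt;
        unsigned long speed = db / 2 / dt;      /* KiB per second, as printed */

        /* remaining work and rough ETA, mirroring status_resync() */
        unsigned long max_blocks = 4194304, resync = 262144;   /* KiB totals */
        unsigned long rt = (dt * ((max_blocks - resync) / (db / 2 / 100 + 1))) / 100;

        printf("speed=%luK/sec finish=%lu.%lumin\n", speed, rt / 60, (rt % 60) / 6);
        return 0;
    }
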
@@ -4586,7 +5042,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
4586 spin_lock_irq(&mddev->write_lock); 5042 spin_lock_irq(&mddev->write_lock);
4587 if (mddev->in_sync) { 5043 if (mddev->in_sync) {
4588 mddev->in_sync = 0; 5044 mddev->in_sync = 0;
4589 mddev->sb_dirty = 1; 5045 mddev->sb_dirty = 3;
4590 md_wakeup_thread(mddev->thread); 5046 md_wakeup_thread(mddev->thread);
4591 } 5047 }
4592 spin_unlock_irq(&mddev->write_lock); 5048 spin_unlock_irq(&mddev->write_lock);
@@ -4599,7 +5055,7 @@ void md_write_end(mddev_t *mddev)
4599 if (atomic_dec_and_test(&mddev->writes_pending)) { 5055 if (atomic_dec_and_test(&mddev->writes_pending)) {
4600 if (mddev->safemode == 2) 5056 if (mddev->safemode == 2)
4601 md_wakeup_thread(mddev->thread); 5057 md_wakeup_thread(mddev->thread);
4602 else 5058 else if (mddev->safemode_delay)
4603 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 5059 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
4604 } 5060 }
4605} 5061}
@@ -4620,10 +5076,14 @@ void md_do_sync(mddev_t *mddev)
4620 struct list_head *tmp; 5076 struct list_head *tmp;
4621 sector_t last_check; 5077 sector_t last_check;
4622 int skipped = 0; 5078 int skipped = 0;
5079 struct list_head *rtmp;
5080 mdk_rdev_t *rdev;
4623 5081
4624 /* just in case thread restarts... */ 5082 /* just in case thread restarts... */
4625 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 5083 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
4626 return; 5084 return;
5085 if (mddev->ro) /* never try to sync a read-only array */
5086 return;
4627 5087
4628 /* we overload curr_resync somewhat here. 5088 /* we overload curr_resync somewhat here.
4629 * 0 == not engaged in resync at all 5089 * 0 == not engaged in resync at all
@@ -4682,17 +5142,30 @@ void md_do_sync(mddev_t *mddev)
4682 } 5142 }
4683 } while (mddev->curr_resync < 2); 5143 } while (mddev->curr_resync < 2);
4684 5144
5145 j = 0;
4685 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5146 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
4686 /* resync follows the size requested by the personality, 5147 /* resync follows the size requested by the personality,
4687 * which defaults to physical size, but can be virtual size 5148 * which defaults to physical size, but can be virtual size
4688 */ 5149 */
4689 max_sectors = mddev->resync_max_sectors; 5150 max_sectors = mddev->resync_max_sectors;
4690 mddev->resync_mismatches = 0; 5151 mddev->resync_mismatches = 0;
5152 /* we don't use the checkpoint if there's a bitmap */
5153 if (!mddev->bitmap &&
5154 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
5155 j = mddev->recovery_cp;
4691 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5156 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4692 max_sectors = mddev->size << 1; 5157 max_sectors = mddev->size << 1;
4693 else 5158 else {
4694 /* recovery follows the physical size of devices */ 5159 /* recovery follows the physical size of devices */
4695 max_sectors = mddev->size << 1; 5160 max_sectors = mddev->size << 1;
5161 j = MaxSector;
5162 ITERATE_RDEV(mddev,rdev,rtmp)
5163 if (rdev->raid_disk >= 0 &&
5164 !test_bit(Faulty, &rdev->flags) &&
5165 !test_bit(In_sync, &rdev->flags) &&
5166 rdev->recovery_offset < j)
5167 j = rdev->recovery_offset;
5168 }
4696 5169
4697 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); 5170 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
4698 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" 5171 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
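
Note: with per-device recovery_offset available, a recovery pass no longer always starts at sector 0; it starts at the lowest checkpoint among members that are still rebuilding (resync, by contrast, still uses recovery_cp unless a bitmap is present). A simplified sketch of that selection, using a hypothetical member array instead of the rdev list:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MAX_SECTOR (~(uint64_t)0)   /* stand-in for the kernel's MaxSector */

    struct member {
        int      raid_disk;
        bool     faulty, in_sync;
        uint64_t recovery_offset;       /* sectors already rebuilt on this member */
    };

    /* Recovery restarts from the least-recovered member still being rebuilt. */
    static uint64_t recovery_start(const struct member *m, int n)
    {
        uint64_t j = MAX_SECTOR;
        for (int i = 0; i < n; i++)
            if (m[i].raid_disk >= 0 && !m[i].faulty && !m[i].in_sync &&
                m[i].recovery_offset < j)
                j = m[i].recovery_offset;
        return j == MAX_SECTOR ? 0 : j; /* no partial member: this sketch starts at 0 */
    }

    int main(void)
    {
        struct member m[] = {
            { 0, false, true,  0 },         /* healthy, fully in-sync member   */
            { 1, false, false, 1 << 20 },   /* rebuild has reached this sector */
        };
        printf("%llu\n", (unsigned long long)recovery_start(m, 2));
        return 0;
    }
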
@@ -4702,12 +5175,7 @@ void md_do_sync(mddev_t *mddev)
4702 speed_max(mddev)); 5175 speed_max(mddev));
4703 5176
4704 is_mddev_idle(mddev); /* this also initializes IO event counters */ 5177 is_mddev_idle(mddev); /* this also initializes IO event counters */
4705 /* we don't use the checkpoint if there's a bitmap */ 5178
4706 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap
4707 && ! test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
4708 j = mddev->recovery_cp;
4709 else
4710 j = 0;
4711 io_sectors = 0; 5179 io_sectors = 0;
4712 for (m = 0; m < SYNC_MARKS; m++) { 5180 for (m = 0; m < SYNC_MARKS; m++) {
4713 mark[m] = jiffies; 5181 mark[m] = jiffies;
@@ -4753,6 +5221,7 @@ void md_do_sync(mddev_t *mddev)
4753 5221
4754 j += sectors; 5222 j += sectors;
4755 if (j>1) mddev->curr_resync = j; 5223 if (j>1) mddev->curr_resync = j;
5224 mddev->curr_mark_cnt = io_sectors;
4756 if (last_check == 0) 5225 if (last_check == 0)
4757 /* this is the earliest that rebuild will be 5226 /* this is the earliest that rebuild will be
4758 * visible in /proc/mdstat 5227 * visible in /proc/mdstat
@@ -4828,15 +5297,28 @@ void md_do_sync(mddev_t *mddev)
4828 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 5297 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
4829 test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 5298 test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
4830 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 5299 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
4831 mddev->curr_resync > 2 && 5300 mddev->curr_resync > 2) {
4832 mddev->curr_resync >= mddev->recovery_cp) { 5301 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
4833 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5302 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4834 printk(KERN_INFO 5303 if (mddev->curr_resync >= mddev->recovery_cp) {
4835 "md: checkpointing recovery of %s.\n", 5304 printk(KERN_INFO
4836 mdname(mddev)); 5305 "md: checkpointing recovery of %s.\n",
4837 mddev->recovery_cp = mddev->curr_resync; 5306 mdname(mddev));
4838 } else 5307 mddev->recovery_cp = mddev->curr_resync;
4839 mddev->recovery_cp = MaxSector; 5308 }
5309 } else
5310 mddev->recovery_cp = MaxSector;
5311 } else {
5312 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5313 mddev->curr_resync = MaxSector;
5314 ITERATE_RDEV(mddev,rdev,rtmp)
5315 if (rdev->raid_disk >= 0 &&
5316 !test_bit(Faulty, &rdev->flags) &&
5317 !test_bit(In_sync, &rdev->flags) &&
5318 rdev->recovery_offset < mddev->curr_resync)
5319 rdev->recovery_offset = mddev->curr_resync;
5320 mddev->sb_dirty = 1;
5321 }
4840 } 5322 }
4841 5323
4842 skip: 5324 skip:
@@ -4908,7 +5390,7 @@ void md_check_recovery(mddev_t *mddev)
4908 if (mddev->safemode && !atomic_read(&mddev->writes_pending) && 5390 if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
4909 !mddev->in_sync && mddev->recovery_cp == MaxSector) { 5391 !mddev->in_sync && mddev->recovery_cp == MaxSector) {
4910 mddev->in_sync = 1; 5392 mddev->in_sync = 1;
4911 mddev->sb_dirty = 1; 5393 mddev->sb_dirty = 3;
4912 } 5394 }
4913 if (mddev->safemode == 1) 5395 if (mddev->safemode == 1)
4914 mddev->safemode = 0; 5396 mddev->safemode = 0;
@@ -4957,6 +5439,8 @@ void md_check_recovery(mddev_t *mddev)
4957 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 5439 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
4958 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 5440 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
4959 5441
5442 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
5443 goto unlock;
4960 /* no recovery is running. 5444 /* no recovery is running.
4961 * remove any failed drives, then 5445 * remove any failed drives, then
4962 * add spares if possible. 5446 * add spares if possible.
@@ -4979,6 +5463,7 @@ void md_check_recovery(mddev_t *mddev)
4979 ITERATE_RDEV(mddev,rdev,rtmp) 5463 ITERATE_RDEV(mddev,rdev,rtmp)
4980 if (rdev->raid_disk < 0 5464 if (rdev->raid_disk < 0
4981 && !test_bit(Faulty, &rdev->flags)) { 5465 && !test_bit(Faulty, &rdev->flags)) {
5466 rdev->recovery_offset = 0;
4982 if (mddev->pers->hot_add_disk(mddev,rdev)) { 5467 if (mddev->pers->hot_add_disk(mddev,rdev)) {
4983 char nm[20]; 5468 char nm[20];
4984 sprintf(nm, "rd%d", rdev->raid_disk); 5469 sprintf(nm, "rd%d", rdev->raid_disk);
@@ -5071,8 +5556,6 @@ static void md_geninit(void)
5071 5556
5072static int __init md_init(void) 5557static int __init md_init(void)
5073{ 5558{
5074 int minor;
5075
5076 printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," 5559 printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
5077 " MD_SB_DISKS=%d\n", 5560 " MD_SB_DISKS=%d\n",
5078 MD_MAJOR_VERSION, MD_MINOR_VERSION, 5561 MD_MAJOR_VERSION, MD_MINOR_VERSION,
@@ -5086,23 +5569,11 @@ static int __init md_init(void)
5086 unregister_blkdev(MAJOR_NR, "md"); 5569 unregister_blkdev(MAJOR_NR, "md");
5087 return -1; 5570 return -1;
5088 } 5571 }
5089 devfs_mk_dir("md");
5090 blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, 5572 blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
5091 md_probe, NULL, NULL); 5573 md_probe, NULL, NULL);
5092 blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE, 5574 blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE,
5093 md_probe, NULL, NULL); 5575 md_probe, NULL, NULL);
5094 5576
5095 for (minor=0; minor < MAX_MD_DEVS; ++minor)
5096 devfs_mk_bdev(MKDEV(MAJOR_NR, minor),
5097 S_IFBLK|S_IRUSR|S_IWUSR,
5098 "md/%d", minor);
5099
5100 for (minor=0; minor < MAX_MD_DEVS; ++minor)
5101 devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift),
5102 S_IFBLK|S_IRUSR|S_IWUSR,
5103 "md/mdp%d", minor);
5104
5105
5106 register_reboot_notifier(&md_notifier); 5577 register_reboot_notifier(&md_notifier);
5107 raid_table_header = register_sysctl_table(raid_root_table, 1); 5578 raid_table_header = register_sysctl_table(raid_root_table, 1);
5108 5579
@@ -5158,15 +5629,9 @@ static __exit void md_exit(void)
5158{ 5629{
5159 mddev_t *mddev; 5630 mddev_t *mddev;
5160 struct list_head *tmp; 5631 struct list_head *tmp;
5161 int i; 5632
5162 blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); 5633 blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
5163 blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift); 5634 blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift);
5164 for (i=0; i < MAX_MD_DEVS; i++)
5165 devfs_remove("md/%d", i);
5166 for (i=0; i < MAX_MD_DEVS; i++)
5167 devfs_remove("md/d%d", i);
5168
5169 devfs_remove("md");
5170 5635
5171 unregister_blkdev(MAJOR_NR,"md"); 5636 unregister_blkdev(MAJOR_NR,"md");
5172 unregister_blkdev(mdp_major, "mdp"); 5637 unregister_blkdev(mdp_major, "mdp");
@@ -5203,8 +5668,8 @@ static int set_ro(const char *val, struct kernel_param *kp)
5203 return -EINVAL; 5668 return -EINVAL;
5204} 5669}
5205 5670
5206module_param_call(start_ro, set_ro, get_ro, NULL, 0600); 5671module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
5207module_param(start_dirty_degraded, int, 0644); 5672module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
5208 5673
5209 5674
5210EXPORT_SYMBOL(register_md_personality); 5675EXPORT_SYMBOL(register_md_personality);
@@ -5216,7 +5681,6 @@ EXPORT_SYMBOL(md_write_end);
5216EXPORT_SYMBOL(md_register_thread); 5681EXPORT_SYMBOL(md_register_thread);
5217EXPORT_SYMBOL(md_unregister_thread); 5682EXPORT_SYMBOL(md_unregister_thread);
5218EXPORT_SYMBOL(md_wakeup_thread); 5683EXPORT_SYMBOL(md_wakeup_thread);
5219EXPORT_SYMBOL(md_print_devices);
5220EXPORT_SYMBOL(md_check_recovery); 5684EXPORT_SYMBOL(md_check_recovery);
5221MODULE_LICENSE("GPL"); 5685MODULE_LICENSE("GPL");
5222MODULE_ALIAS("md"); 5686MODULE_ALIAS("md");
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4070eff6f0f8..3b4d69c05623 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -374,26 +374,26 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
374 * already. 374 * already.
375 */ 375 */
376 if (atomic_dec_and_test(&r1_bio->remaining)) { 376 if (atomic_dec_and_test(&r1_bio->remaining)) {
377 if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) { 377 if (test_bit(R1BIO_BarrierRetry, &r1_bio->state))
378 reschedule_retry(r1_bio); 378 reschedule_retry(r1_bio);
379 goto out; 379 else {
380 } 380 /* it really is the end of this request */
381 /* it really is the end of this request */ 381 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
382 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 382 /* free extra copy of the data pages */
383 /* free extra copy of the data pages */ 383 int i = bio->bi_vcnt;
384 int i = bio->bi_vcnt; 384 while (i--)
385 while (i--) 385 safe_put_page(bio->bi_io_vec[i].bv_page);
386 safe_put_page(bio->bi_io_vec[i].bv_page); 386 }
387 /* clear the bitmap if all writes complete successfully */
388 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
389 r1_bio->sectors,
390 !test_bit(R1BIO_Degraded, &r1_bio->state),
391 behind);
392 md_write_end(r1_bio->mddev);
393 raid_end_bio_io(r1_bio);
387 } 394 }
388 /* clear the bitmap if all writes complete successfully */
389 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
390 r1_bio->sectors,
391 !test_bit(R1BIO_Degraded, &r1_bio->state),
392 behind);
393 md_write_end(r1_bio->mddev);
394 raid_end_bio_io(r1_bio);
395 } 395 }
396 out: 396
397 if (to_put) 397 if (to_put)
398 bio_put(to_put); 398 bio_put(to_put);
399 399
@@ -930,10 +930,13 @@ static void status(struct seq_file *seq, mddev_t *mddev)
930 930
931 seq_printf(seq, " [%d/%d] [", conf->raid_disks, 931 seq_printf(seq, " [%d/%d] [", conf->raid_disks,
932 conf->working_disks); 932 conf->working_disks);
933 for (i = 0; i < conf->raid_disks; i++) 933 rcu_read_lock();
934 for (i = 0; i < conf->raid_disks; i++) {
935 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
934 seq_printf(seq, "%s", 936 seq_printf(seq, "%s",
935 conf->mirrors[i].rdev && 937 rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
936 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); 938 }
939 rcu_read_unlock();
937 seq_printf(seq, "]"); 940 seq_printf(seq, "]");
938} 941}
939 942
@@ -975,7 +978,6 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
975static void print_conf(conf_t *conf) 978static void print_conf(conf_t *conf)
976{ 979{
977 int i; 980 int i;
978 mirror_info_t *tmp;
979 981
980 printk("RAID1 conf printout:\n"); 982 printk("RAID1 conf printout:\n");
981 if (!conf) { 983 if (!conf) {
@@ -985,14 +987,17 @@ static void print_conf(conf_t *conf)
985 printk(" --- wd:%d rd:%d\n", conf->working_disks, 987 printk(" --- wd:%d rd:%d\n", conf->working_disks,
986 conf->raid_disks); 988 conf->raid_disks);
987 989
990 rcu_read_lock();
988 for (i = 0; i < conf->raid_disks; i++) { 991 for (i = 0; i < conf->raid_disks; i++) {
989 char b[BDEVNAME_SIZE]; 992 char b[BDEVNAME_SIZE];
990 tmp = conf->mirrors + i; 993 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
991 if (tmp->rdev) 994 if (rdev)
992 printk(" disk %d, wo:%d, o:%d, dev:%s\n", 995 printk(" disk %d, wo:%d, o:%d, dev:%s\n",
993 i, !test_bit(In_sync, &tmp->rdev->flags), !test_bit(Faulty, &tmp->rdev->flags), 996 i, !test_bit(In_sync, &rdev->flags),
994 bdevname(tmp->rdev->bdev,b)); 997 !test_bit(Faulty, &rdev->flags),
998 bdevname(rdev->bdev,b));
995 } 999 }
1000 rcu_read_unlock();
996} 1001}
997 1002
998static void close_sync(conf_t *conf) 1003static void close_sync(conf_t *conf)
@@ -1008,20 +1013,20 @@ static int raid1_spare_active(mddev_t *mddev)
1008{ 1013{
1009 int i; 1014 int i;
1010 conf_t *conf = mddev->private; 1015 conf_t *conf = mddev->private;
1011 mirror_info_t *tmp;
1012 1016
1013 /* 1017 /*
1014 * Find all failed disks within the RAID1 configuration 1018 * Find all failed disks within the RAID1 configuration
1015 * and mark them readable 1019 * and mark them readable.
1020 * Called under mddev lock, so rcu protection not needed.
1016 */ 1021 */
1017 for (i = 0; i < conf->raid_disks; i++) { 1022 for (i = 0; i < conf->raid_disks; i++) {
1018 tmp = conf->mirrors + i; 1023 mdk_rdev_t *rdev = conf->mirrors[i].rdev;
1019 if (tmp->rdev 1024 if (rdev
1020 && !test_bit(Faulty, &tmp->rdev->flags) 1025 && !test_bit(Faulty, &rdev->flags)
1021 && !test_bit(In_sync, &tmp->rdev->flags)) { 1026 && !test_bit(In_sync, &rdev->flags)) {
1022 conf->working_disks++; 1027 conf->working_disks++;
1023 mddev->degraded--; 1028 mddev->degraded--;
1024 set_bit(In_sync, &tmp->rdev->flags); 1029 set_bit(In_sync, &rdev->flags);
1025 } 1030 }
1026 } 1031 }
1027 1032
@@ -1145,7 +1150,7 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
1145 long sectors_to_go = r1_bio->sectors; 1150 long sectors_to_go = r1_bio->sectors;
1146 /* make sure these bits don't get cleared. */ 1151 /* make sure these bits don't get cleared. */
1147 do { 1152 do {
1148 bitmap_end_sync(mddev->bitmap, r1_bio->sector, 1153 bitmap_end_sync(mddev->bitmap, s,
1149 &sync_blocks, 1); 1154 &sync_blocks, 1);
1150 s += sync_blocks; 1155 s += sync_blocks;
1151 sectors_to_go -= sync_blocks; 1156 sectors_to_go -= sync_blocks;
@@ -1237,7 +1242,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1237 /* ouch - failed to read all of that. 1242 /* ouch - failed to read all of that.
1238 * Try some synchronous reads of other devices to get 1243 * Try some synchronous reads of other devices to get
1239 * good data, much like with normal read errors. Only 1244 * good data, much like with normal read errors. Only
1240 * read into the pages we already have so they we don't 1245 * read into the pages we already have so we don't
1241 * need to re-issue the read request. 1246 * need to re-issue the read request.
1242 * We don't need to freeze the array, because being in an 1247 * We don't need to freeze the array, because being in an
1243 * active sync request, there is no normal IO, and 1248 * active sync request, there is no normal IO, and
@@ -1257,6 +1262,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1257 s = PAGE_SIZE >> 9; 1262 s = PAGE_SIZE >> 9;
1258 do { 1263 do {
1259 if (r1_bio->bios[d]->bi_end_io == end_sync_read) { 1264 if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
1265 /* No rcu protection needed here; devices
1266 * can only be removed when no resync is
1267 * active, and resync is currently active
1268 */
1260 rdev = conf->mirrors[d].rdev; 1269 rdev = conf->mirrors[d].rdev;
1261 if (sync_page_io(rdev->bdev, 1270 if (sync_page_io(rdev->bdev,
1262 sect + rdev->data_offset, 1271 sect + rdev->data_offset,
@@ -1463,6 +1472,11 @@ static void raid1d(mddev_t *mddev)
1463 s = PAGE_SIZE >> 9; 1472 s = PAGE_SIZE >> 9;
1464 1473
1465 do { 1474 do {
1475 /* Note: no rcu protection needed here
1476 * as this is synchronous in the raid1d thread
1477 * which is the thread that might remove
1478 * a device. If raid1d ever becomes multi-threaded....
1479 */
1466 rdev = conf->mirrors[d].rdev; 1480 rdev = conf->mirrors[d].rdev;
1467 if (rdev && 1481 if (rdev &&
1468 test_bit(In_sync, &rdev->flags) && 1482 test_bit(In_sync, &rdev->flags) &&
@@ -1486,7 +1500,6 @@ static void raid1d(mddev_t *mddev)
1486 d = conf->raid_disks; 1500 d = conf->raid_disks;
1487 d--; 1501 d--;
1488 rdev = conf->mirrors[d].rdev; 1502 rdev = conf->mirrors[d].rdev;
1489 atomic_add(s, &rdev->corrected_errors);
1490 if (rdev && 1503 if (rdev &&
1491 test_bit(In_sync, &rdev->flags)) { 1504 test_bit(In_sync, &rdev->flags)) {
1492 if (sync_page_io(rdev->bdev, 1505 if (sync_page_io(rdev->bdev,
@@ -1509,6 +1522,11 @@ static void raid1d(mddev_t *mddev)
1509 s<<9, conf->tmppage, READ) == 0) 1522 s<<9, conf->tmppage, READ) == 0)
1510 /* Well, this device is dead */ 1523 /* Well, this device is dead */
1511 md_error(mddev, rdev); 1524 md_error(mddev, rdev);
1525 else {
1526 atomic_add(s, &rdev->corrected_errors);
1527 printk(KERN_INFO "raid1:%s: read error corrected (%d sectors at %llu on %s)\n",
1528 mdname(mddev), s, (unsigned long long)(sect + rdev->data_offset), bdevname(rdev->bdev, b));
1529 }
1512 } 1530 }
1513 } 1531 }
1514 } else { 1532 } else {
@@ -1622,6 +1640,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1622 return 0; 1640 return 0;
1623 } 1641 }
1624 1642
1643 if (mddev->bitmap == NULL &&
1644 mddev->recovery_cp == MaxSector &&
1645 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
1646 conf->fullsync == 0) {
1647 *skipped = 1;
1648 return max_sector - sector_nr;
1649 }
1625 /* before building a request, check if we can skip these blocks.. 1650 /* before building a request, check if we can skip these blocks..
1626 * This call the bitmap_start_sync doesn't actually record anything 1651 * This call the bitmap_start_sync doesn't actually record anything
1627 */ 1652 */
@@ -1777,19 +1802,17 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1777 for (i=0; i<conf->raid_disks; i++) { 1802 for (i=0; i<conf->raid_disks; i++) {
1778 bio = r1_bio->bios[i]; 1803 bio = r1_bio->bios[i];
1779 if (bio->bi_end_io == end_sync_read) { 1804 if (bio->bi_end_io == end_sync_read) {
1780 md_sync_acct(conf->mirrors[i].rdev->bdev, nr_sectors); 1805 md_sync_acct(bio->bi_bdev, nr_sectors);
1781 generic_make_request(bio); 1806 generic_make_request(bio);
1782 } 1807 }
1783 } 1808 }
1784 } else { 1809 } else {
1785 atomic_set(&r1_bio->remaining, 1); 1810 atomic_set(&r1_bio->remaining, 1);
1786 bio = r1_bio->bios[r1_bio->read_disk]; 1811 bio = r1_bio->bios[r1_bio->read_disk];
1787 md_sync_acct(conf->mirrors[r1_bio->read_disk].rdev->bdev, 1812 md_sync_acct(bio->bi_bdev, nr_sectors);
1788 nr_sectors);
1789 generic_make_request(bio); 1813 generic_make_request(bio);
1790 1814
1791 } 1815 }
1792
1793 return nr_sectors; 1816 return nr_sectors;
1794} 1817}
1795 1818
@@ -1888,7 +1911,8 @@ static int run(mddev_t *mddev)
1888 1911
1889 disk = conf->mirrors + i; 1912 disk = conf->mirrors + i;
1890 1913
1891 if (!disk->rdev) { 1914 if (!disk->rdev ||
1915 !test_bit(In_sync, &disk->rdev->flags)) {
1892 disk->head_position = 0; 1916 disk->head_position = 0;
1893 mddev->degraded++; 1917 mddev->degraded++;
1894 } 1918 }
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 1440935414e6..016ddb831c9b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -29,6 +29,7 @@
29 * raid_disks 29 * raid_disks
30 * near_copies (stored in low byte of layout) 30 * near_copies (stored in low byte of layout)
31 * far_copies (stored in second byte of layout) 31 * far_copies (stored in second byte of layout)
32 * far_offset (stored in bit 16 of layout)
32 * 33 *
33 * The data to be stored is divided into chunks using chunksize. 34 * The data to be stored is divided into chunks using chunksize.
34 * Each device is divided into far_copies sections. 35 * Each device is divided into far_copies sections.
@@ -36,10 +37,14 @@
36 * near_copies copies of each chunk is stored (each on a different drive). 37 * near_copies copies of each chunk is stored (each on a different drive).
37 * The starting device for each section is offset near_copies from the starting 38 * The starting device for each section is offset near_copies from the starting
38 * device of the previous section. 39 * device of the previous section.
39 * Thus there are (near_copies*far_copies) of each chunk, and each is on a different 40 * Thus there are (near_copies*far_copies) of each chunk, and each is on a different
40 * drive. 41 * drive.
41 * near_copies and far_copies must be at least one, and their product is at most 42 * near_copies and far_copies must be at least one, and their product is at most
42 * raid_disks. 43 * raid_disks.
44 *
45 * If far_offset is true, then the far_copies are handled a bit differently.
46 * The copies are still in different stripes, but instead of being very far apart
47 * on disk, they are in adjacent stripes.
43 */ 48 */
44 49
45/* 50/*
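
The layout rules described in the comment above are easier to see with a small
user-space model. The sketch below is illustrative only and is not part of the
patch; it follows the arithmetic of raid10_find_phys() for an assumed geometry
of 4 disks, 64-sector chunks, near_copies=2 and far_copies=2 with far_offset
set (layout word 0x10202), and "stride" stands in for conf->stride, the
per-device distance between far copies.

#include <stdio.h>

struct layout {
	int raid_disks;
	int near_copies;	/* low byte of mddev->layout */
	int far_copies;		/* second byte of mddev->layout */
	int far_offset;		/* bit 16 of mddev->layout */
	long chunk;		/* chunk size, in sectors */
	long stride;		/* distance between far copies, in sectors */
};

/* Print every physical (device, sector) copy of one virtual sector,
 * following the same steps as raid10_find_phys(). */
static void find_phys(const struct layout *l, long vsector)
{
	long chunk = vsector / l->chunk;
	long offset = vsector % l->chunk;
	long stripe, sector;
	int dev, n, f, copy = 0;

	chunk *= l->near_copies;
	stripe = chunk / l->raid_disks;
	dev = (int)(chunk % l->raid_disks);
	if (l->far_offset)		/* far copies sit on the very next stripes */
		stripe *= l->far_copies;
	sector = stripe * l->chunk + offset;

	for (n = 0; n < l->near_copies; n++) {
		int d = dev;
		long s = sector;

		for (f = 0; f < l->far_copies; f++) {
			printf("copy %d: device %d, sector %ld\n", copy++, d, s);
			d = (d + l->near_copies) % l->raid_disks;
			s += l->stride;
		}
		if (++dev >= l->raid_disks) {
			dev = 0;
			sector += l->chunk;	/* next near copy wraps to the next chunk */
		}
	}
}

int main(void)
{
	/* assumed geometry, not taken from the patch */
	struct layout l = {
		.raid_disks = 4, .near_copies = 2, .far_copies = 2,
		.far_offset = 1, .chunk = 64, .stride = 64,
	};

	find_phys(&l, 200);	/* four copies of virtual sector 200 */
	return 0;
}

With these numbers the near copies of virtual sector 200 land on devices 2 and
3 at sector 136, and the offset copies land one chunk later, on devices 0 and 1
at sector 200, i.e. in adjacent stripes rather than in a far-away section of
each device.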
@@ -357,8 +362,7 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in
357 * With this layout, and block is never stored twice on the one device. 362 * With this layout, and block is never stored twice on the one device.
358 * 363 *
359 * raid10_find_phys finds the sector offset of a given virtual sector 364 * raid10_find_phys finds the sector offset of a given virtual sector
360 * on each device that it is on. If a block isn't on a device, 365 * on each device that it is on.
361 * that entry in the array is set to MaxSector.
362 * 366 *
363 * raid10_find_virt does the reverse mapping, from a device and a 367 * raid10_find_virt does the reverse mapping, from a device and a
364 * sector offset to a virtual address 368 * sector offset to a virtual address
@@ -381,6 +385,8 @@ static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio)
381 chunk *= conf->near_copies; 385 chunk *= conf->near_copies;
382 stripe = chunk; 386 stripe = chunk;
383 dev = sector_div(stripe, conf->raid_disks); 387 dev = sector_div(stripe, conf->raid_disks);
388 if (conf->far_offset)
389 stripe *= conf->far_copies;
384 390
385 sector += stripe << conf->chunk_shift; 391 sector += stripe << conf->chunk_shift;
386 392
@@ -414,16 +420,24 @@ static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev)
414{ 420{
415 sector_t offset, chunk, vchunk; 421 sector_t offset, chunk, vchunk;
416 422
417 while (sector > conf->stride) {
418 sector -= conf->stride;
419 if (dev < conf->near_copies)
420 dev += conf->raid_disks - conf->near_copies;
421 else
422 dev -= conf->near_copies;
423 }
424
425 offset = sector & conf->chunk_mask; 423 offset = sector & conf->chunk_mask;
426 chunk = sector >> conf->chunk_shift; 424 if (conf->far_offset) {
425 int fc;
426 chunk = sector >> conf->chunk_shift;
427 fc = sector_div(chunk, conf->far_copies);
428 dev -= fc * conf->near_copies;
429 if (dev < 0)
430 dev += conf->raid_disks;
431 } else {
432 while (sector > conf->stride) {
433 sector -= conf->stride;
434 if (dev < conf->near_copies)
435 dev += conf->raid_disks - conf->near_copies;
436 else
437 dev -= conf->near_copies;
438 }
439 chunk = sector >> conf->chunk_shift;
440 }
427 vchunk = chunk * conf->raid_disks + dev; 441 vchunk = chunk * conf->raid_disks + dev;
428 sector_div(vchunk, conf->near_copies); 442 sector_div(vchunk, conf->near_copies);
429 return (vchunk << conf->chunk_shift) + offset; 443 return (vchunk << conf->chunk_shift) + offset;
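
A quick arithmetic check of the new far_offset branch, using the same assumed
geometry as the sketch above (4 disks, 64-sector chunks, near_copies=2,
far_copies=2): take the near copy that the forward map placed on device 3 at
physical sector 136. Then offset = 136 & 63 = 8, chunk = 136 >> 6 = 2,
sector_div(chunk, far_copies) leaves chunk = 1 with remainder fc = 0, so dev
stays 3; vchunk = 1 * raid_disks + 3 = 7, dividing by near_copies gives 3, and
(3 << 6) + 8 = 200, which is exactly the virtual sector the forward map
started from.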
@@ -900,9 +914,12 @@ static void status(struct seq_file *seq, mddev_t *mddev)
900 seq_printf(seq, " %dK chunks", mddev->chunk_size/1024); 914 seq_printf(seq, " %dK chunks", mddev->chunk_size/1024);
901 if (conf->near_copies > 1) 915 if (conf->near_copies > 1)
902 seq_printf(seq, " %d near-copies", conf->near_copies); 916 seq_printf(seq, " %d near-copies", conf->near_copies);
903 if (conf->far_copies > 1) 917 if (conf->far_copies > 1) {
904 seq_printf(seq, " %d far-copies", conf->far_copies); 918 if (conf->far_offset)
905 919 seq_printf(seq, " %d offset-copies", conf->far_copies);
920 else
921 seq_printf(seq, " %d far-copies", conf->far_copies);
922 }
906 seq_printf(seq, " [%d/%d] [", conf->raid_disks, 923 seq_printf(seq, " [%d/%d] [", conf->raid_disks,
907 conf->working_disks); 924 conf->working_disks);
908 for (i = 0; i < conf->raid_disks; i++) 925 for (i = 0; i < conf->raid_disks; i++)
@@ -1475,6 +1492,10 @@ static void raid10d(mddev_t *mddev)
1475 s<<9, conf->tmppage, READ) == 0) 1492 s<<9, conf->tmppage, READ) == 0)
1476 /* Well, this device is dead */ 1493 /* Well, this device is dead */
1477 md_error(mddev, rdev); 1494 md_error(mddev, rdev);
1495 else
1496 printk(KERN_INFO "raid10:%s: read error corrected (%d sectors at %llu on %s)\n",
1497 mdname(mddev), s, (unsigned long long)(sect+rdev->data_offset), bdevname(rdev->bdev, b));
1498
1478 rdev_dec_pending(rdev, mddev); 1499 rdev_dec_pending(rdev, mddev);
1479 rcu_read_lock(); 1500 rcu_read_lock();
1480 } 1501 }
@@ -1915,7 +1936,7 @@ static int run(mddev_t *mddev)
1915 mirror_info_t *disk; 1936 mirror_info_t *disk;
1916 mdk_rdev_t *rdev; 1937 mdk_rdev_t *rdev;
1917 struct list_head *tmp; 1938 struct list_head *tmp;
1918 int nc, fc; 1939 int nc, fc, fo;
1919 sector_t stride, size; 1940 sector_t stride, size;
1920 1941
1921 if (mddev->chunk_size == 0) { 1942 if (mddev->chunk_size == 0) {
@@ -1925,8 +1946,9 @@ static int run(mddev_t *mddev)
1925 1946
1926 nc = mddev->layout & 255; 1947 nc = mddev->layout & 255;
1927 fc = (mddev->layout >> 8) & 255; 1948 fc = (mddev->layout >> 8) & 255;
1949 fo = mddev->layout & (1<<16);
1928 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || 1950 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
1929 (mddev->layout >> 16)) { 1951 (mddev->layout >> 17)) {
1930 printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n", 1952 printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n",
1931 mdname(mddev), mddev->layout); 1953 mdname(mddev), mddev->layout);
1932 goto out; 1954 goto out;
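
For reference, the layout word assumed in the examples above decodes under
these rules as nc = 0x10202 & 255 = 2, fc = (0x10202 >> 8) & 255 = 2 and
fo = 0x10202 & (1<<16) set, which passes the check on a 4-disk array
(nc*fc = 4, and 0x10202 >> 17 == 0). Something like 0x30202 would be rejected
here because it has bits above bit 16 set (layout >> 17 != 0).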
@@ -1958,12 +1980,16 @@ static int run(mddev_t *mddev)
1958 conf->near_copies = nc; 1980 conf->near_copies = nc;
1959 conf->far_copies = fc; 1981 conf->far_copies = fc;
1960 conf->copies = nc*fc; 1982 conf->copies = nc*fc;
1983 conf->far_offset = fo;
1961 conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; 1984 conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1;
1962 conf->chunk_shift = ffz(~mddev->chunk_size) - 9; 1985 conf->chunk_shift = ffz(~mddev->chunk_size) - 9;
1963 stride = mddev->size >> (conf->chunk_shift-1); 1986 if (fo)
1964 sector_div(stride, fc); 1987 conf->stride = 1 << conf->chunk_shift;
1965 conf->stride = stride << conf->chunk_shift; 1988 else {
1966 1989 stride = mddev->size >> (conf->chunk_shift-1);
1990 sector_div(stride, fc);
1991 conf->stride = stride << conf->chunk_shift;
1992 }
1967 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, 1993 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
1968 r10bio_pool_free, conf); 1994 r10bio_pool_free, conf);
1969 if (!conf->r10bio_pool) { 1995 if (!conf->r10bio_pool) {
@@ -2015,7 +2041,8 @@ static int run(mddev_t *mddev)
2015 2041
2016 disk = conf->mirrors + i; 2042 disk = conf->mirrors + i;
2017 2043
2018 if (!disk->rdev) { 2044 if (!disk->rdev ||
2045 !test_bit(In_sync, &rdev->flags)) {
2019 disk->head_position = 0; 2046 disk->head_position = 0;
2020 mddev->degraded++; 2047 mddev->degraded++;
2021 } 2048 }
@@ -2037,7 +2064,13 @@ static int run(mddev_t *mddev)
2037 /* 2064 /*
2038 * Ok, everything is just fine now 2065 * Ok, everything is just fine now
2039 */ 2066 */
2040 size = conf->stride * conf->raid_disks; 2067 if (conf->far_offset) {
2068 size = mddev->size >> (conf->chunk_shift-1);
2069 size *= conf->raid_disks;
2070 size <<= conf->chunk_shift;
2071 sector_div(size, conf->far_copies);
2072 } else
2073 size = conf->stride * conf->raid_disks;
2041 sector_div(size, conf->near_copies); 2074 sector_div(size, conf->near_copies);
2042 mddev->array_size = size/2; 2075 mddev->array_size = size/2;
2043 mddev->resync_max_sectors = size; 2076 mddev->resync_max_sectors = size;
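
A worked example of the new far_offset sizing, with assumed numbers rather
than anything from the patch: 4 devices of mddev->size = 3200 (KB) each and
32 KB chunks (chunk_shift = 6) give mddev->size >> (chunk_shift-1) = 100
chunks per device; multiplying by raid_disks and shifting back up yields
4 * 100 * 64 = 25600 sectors of raw space, and dividing by far_copies (2) and
then near_copies (2) leaves 6400 sectors, so array_size becomes 3200 KB and
resync_max_sectors 6400. The non-offset branch reaches the same figure through
conf->stride, which already has the far_copies division folded in.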
@@ -2050,7 +2083,7 @@ static int run(mddev_t *mddev)
2050 * maybe... 2083 * maybe...
2051 */ 2084 */
2052 { 2085 {
2053 int stripe = conf->raid_disks * mddev->chunk_size / PAGE_SIZE; 2086 int stripe = conf->raid_disks * (mddev->chunk_size / PAGE_SIZE);
2054 stripe /= conf->near_copies; 2087 stripe /= conf->near_copies;
2055 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) 2088 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
2056 mddev->queue->backing_dev_info.ra_pages = 2* stripe; 2089 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 31843604049c..450066007160 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2,8 +2,11 @@
2 * raid5.c : Multiple Devices driver for Linux 2 * raid5.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman 3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar 4 * Copyright (C) 1999, 2000 Ingo Molnar
5 * Copyright (C) 2002, 2003 H. Peter Anvin
5 * 6 *
6 * RAID-5 management functions. 7 * RAID-4/5/6 management functions.
8 * Thanks to Penguin Computing for making the RAID-6 development possible
9 * by donating a test server!
7 * 10 *
8 * This program is free software; you can redistribute it and/or modify 11 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by 12 * it under the terms of the GNU General Public License as published by
@@ -15,15 +18,38 @@
15 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
16 */ 19 */
17 20
21/*
22 * BITMAP UNPLUGGING:
23 *
24 * The sequencing for updating the bitmap reliably is a little
25 * subtle (and I got it wrong the first time) so it deserves some
26 * explanation.
27 *
28 * We group bitmap updates into batches. Each batch has a number.
29 * We may write out several batches at once, but that isn't very important.
30 * conf->bm_write is the number of the last batch successfully written.
31 * conf->bm_flush is the number of the last batch that was closed to
32 * new additions.
33 * When we discover that we will need to write to any block in a stripe
34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
35 * the number of the batch it will be in. This is bm_flush+1.
36 * When we are ready to do a write, if that batch hasn't been written yet,
37 * we plug the array and queue the stripe for later.
38 * When an unplug happens, we increment bm_flush, thus closing the current
39 * batch.
40 * When we notice that bm_flush > bm_write, we write out all pending updates
41 * to the bitmap, and advance bm_write to where bm_flush was.
42 * This may occasionally write a bit out twice, but is sure never to
43 * miss any bits.
44 */
18 45
19#include <linux/config.h>
20#include <linux/module.h> 46#include <linux/module.h>
21#include <linux/slab.h> 47#include <linux/slab.h>
22#include <linux/raid/raid5.h>
23#include <linux/highmem.h> 48#include <linux/highmem.h>
24#include <linux/bitops.h> 49#include <linux/bitops.h>
25#include <linux/kthread.h> 50#include <linux/kthread.h>
26#include <asm/atomic.h> 51#include <asm/atomic.h>
52#include "raid6.h"
27 53
28#include <linux/raid/bitmap.h> 54#include <linux/raid/bitmap.h>
29 55
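
The batch numbering described in the BITMAP UNPLUGGING comment above can be
modelled in a few lines of user-space C. This is an illustrative sketch only;
seq_flush and seq_write mirror the conf fields that add_stripe_bio() and
__release_stripe() use further down, and none of it is code from the patch.

#include <stdio.h>

static unsigned int seq_flush;	/* last batch closed to new additions */
static unsigned int seq_write;	/* last batch whose bitmap is on disk */

/* a stripe picking up its first write is stamped with the open batch */
static unsigned int stamp_stripe(void)	{ return seq_flush + 1; }
/* an unplug closes the current batch */
static void unplug(void)		{ seq_flush++; }
/* the bitmap writeout catches seq_write up to seq_flush */
static void bitmap_written(void)	{ seq_write = seq_flush; }
/* same test as __release_stripe(): delay while bm_seq is still ahead */
static int stripe_may_write(unsigned int bm_seq)
{
	return (int)(bm_seq - seq_write) <= 0;
}

int main(void)
{
	unsigned int s1 = stamp_stripe();	/* stripe joins batch 1 */

	printf("%d\n", stripe_may_write(s1));	/* 0: batch still open */
	unplug();				/* batch 1 closed */
	printf("%d\n", stripe_may_write(s1));	/* 0: bitmap not written yet */
	bitmap_written();			/* bitmap for batch 1 on disk */
	printf("%d\n", stripe_may_write(s1));	/* 1: safe to write the stripe */
	return 0;
}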
@@ -68,6 +94,16 @@
68#define __inline__ 94#define __inline__
69#endif 95#endif
70 96
97#if !RAID6_USE_EMPTY_ZERO_PAGE
98/* In .bss so it's zeroed */
99const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
100#endif
101
102static inline int raid6_next_disk(int disk, int raid_disks)
103{
104 disk++;
105 return (disk < raid_disks) ? disk : 0;
106}
71static void print_raid5_conf (raid5_conf_t *conf); 107static void print_raid5_conf (raid5_conf_t *conf);
72 108
73static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) 109static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
@@ -76,12 +112,14 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
76 BUG_ON(!list_empty(&sh->lru)); 112 BUG_ON(!list_empty(&sh->lru));
77 BUG_ON(atomic_read(&conf->active_stripes)==0); 113 BUG_ON(atomic_read(&conf->active_stripes)==0);
78 if (test_bit(STRIPE_HANDLE, &sh->state)) { 114 if (test_bit(STRIPE_HANDLE, &sh->state)) {
79 if (test_bit(STRIPE_DELAYED, &sh->state)) 115 if (test_bit(STRIPE_DELAYED, &sh->state)) {
80 list_add_tail(&sh->lru, &conf->delayed_list); 116 list_add_tail(&sh->lru, &conf->delayed_list);
81 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 117 blk_plug_device(conf->mddev->queue);
82 conf->seq_write == sh->bm_seq) 118 } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
119 sh->bm_seq - conf->seq_write > 0) {
83 list_add_tail(&sh->lru, &conf->bitmap_list); 120 list_add_tail(&sh->lru, &conf->bitmap_list);
84 else { 121 blk_plug_device(conf->mddev->queue);
122 } else {
85 clear_bit(STRIPE_BIT_DELAY, &sh->state); 123 clear_bit(STRIPE_BIT_DELAY, &sh->state);
86 list_add_tail(&sh->lru, &conf->handle_list); 124 list_add_tail(&sh->lru, &conf->handle_list);
87 } 125 }
@@ -104,7 +142,7 @@ static void release_stripe(struct stripe_head *sh)
104{ 142{
105 raid5_conf_t *conf = sh->raid_conf; 143 raid5_conf_t *conf = sh->raid_conf;
106 unsigned long flags; 144 unsigned long flags;
107 145
108 spin_lock_irqsave(&conf->device_lock, flags); 146 spin_lock_irqsave(&conf->device_lock, flags);
109 __release_stripe(conf, sh); 147 __release_stripe(conf, sh);
110 spin_unlock_irqrestore(&conf->device_lock, flags); 148 spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -117,7 +155,7 @@ static inline void remove_hash(struct stripe_head *sh)
117 hlist_del_init(&sh->hash); 155 hlist_del_init(&sh->hash);
118} 156}
119 157
120static void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) 158static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
121{ 159{
122 struct hlist_head *hp = stripe_hash(conf, sh->sector); 160 struct hlist_head *hp = stripe_hash(conf, sh->sector);
123 161
@@ -190,7 +228,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
190 (unsigned long long)sh->sector); 228 (unsigned long long)sh->sector);
191 229
192 remove_hash(sh); 230 remove_hash(sh);
193 231
194 sh->sector = sector; 232 sh->sector = sector;
195 sh->pd_idx = pd_idx; 233 sh->pd_idx = pd_idx;
196 sh->state = 0; 234 sh->state = 0;
@@ -258,7 +296,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
258 < (conf->max_nr_stripes *3/4) 296 < (conf->max_nr_stripes *3/4)
259 || !conf->inactive_blocked), 297 || !conf->inactive_blocked),
260 conf->device_lock, 298 conf->device_lock,
261 unplug_slaves(conf->mddev) 299 raid5_unplug_device(conf->mddev->queue)
262 ); 300 );
263 conf->inactive_blocked = 0; 301 conf->inactive_blocked = 0;
264 } else 302 } else
@@ -269,8 +307,10 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
269 } else { 307 } else {
270 if (!test_bit(STRIPE_HANDLE, &sh->state)) 308 if (!test_bit(STRIPE_HANDLE, &sh->state))
271 atomic_inc(&conf->active_stripes); 309 atomic_inc(&conf->active_stripes);
272 if (!list_empty(&sh->lru)) 310 if (list_empty(&sh->lru) &&
273 list_del_init(&sh->lru); 311 !test_bit(STRIPE_EXPANDING, &sh->state))
312 BUG();
313 list_del_init(&sh->lru);
274 } 314 }
275 } 315 }
276 } while (sh == NULL); 316 } while (sh == NULL);
@@ -321,10 +361,9 @@ static int grow_stripes(raid5_conf_t *conf, int num)
321 return 1; 361 return 1;
322 conf->slab_cache = sc; 362 conf->slab_cache = sc;
323 conf->pool_size = devs; 363 conf->pool_size = devs;
324 while (num--) { 364 while (num--)
325 if (!grow_one_stripe(conf)) 365 if (!grow_one_stripe(conf))
326 return 1; 366 return 1;
327 }
328 return 0; 367 return 0;
329} 368}
330 369
@@ -484,6 +523,8 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
484 raid5_conf_t *conf = sh->raid_conf; 523 raid5_conf_t *conf = sh->raid_conf;
485 int disks = sh->disks, i; 524 int disks = sh->disks, i;
486 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 525 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
526 char b[BDEVNAME_SIZE];
527 mdk_rdev_t *rdev;
487 528
488 if (bi->bi_size) 529 if (bi->bi_size)
489 return 1; 530 return 1;
@@ -531,25 +572,39 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
531 set_bit(R5_UPTODATE, &sh->dev[i].flags); 572 set_bit(R5_UPTODATE, &sh->dev[i].flags);
532#endif 573#endif
533 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 574 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
534 printk(KERN_INFO "raid5: read error corrected!!\n"); 575 rdev = conf->disks[i].rdev;
576 printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n",
577 mdname(conf->mddev), STRIPE_SECTORS,
578 (unsigned long long)sh->sector + rdev->data_offset,
579 bdevname(rdev->bdev, b));
535 clear_bit(R5_ReadError, &sh->dev[i].flags); 580 clear_bit(R5_ReadError, &sh->dev[i].flags);
536 clear_bit(R5_ReWrite, &sh->dev[i].flags); 581 clear_bit(R5_ReWrite, &sh->dev[i].flags);
537 } 582 }
538 if (atomic_read(&conf->disks[i].rdev->read_errors)) 583 if (atomic_read(&conf->disks[i].rdev->read_errors))
539 atomic_set(&conf->disks[i].rdev->read_errors, 0); 584 atomic_set(&conf->disks[i].rdev->read_errors, 0);
540 } else { 585 } else {
586 const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
541 int retry = 0; 587 int retry = 0;
588 rdev = conf->disks[i].rdev;
589
542 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 590 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
543 atomic_inc(&conf->disks[i].rdev->read_errors); 591 atomic_inc(&rdev->read_errors);
544 if (conf->mddev->degraded) 592 if (conf->mddev->degraded)
545 printk(KERN_WARNING "raid5: read error not correctable.\n"); 593 printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n",
594 mdname(conf->mddev),
595 (unsigned long long)sh->sector + rdev->data_offset,
596 bdn);
546 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 597 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
547 /* Oh, no!!! */ 598 /* Oh, no!!! */
548 printk(KERN_WARNING "raid5: read error NOT corrected!!\n"); 599 printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n",
549 else if (atomic_read(&conf->disks[i].rdev->read_errors) 600 mdname(conf->mddev),
601 (unsigned long long)sh->sector + rdev->data_offset,
602 bdn);
603 else if (atomic_read(&rdev->read_errors)
550 > conf->max_nr_stripes) 604 > conf->max_nr_stripes)
551 printk(KERN_WARNING 605 printk(KERN_WARNING
552 "raid5: Too many read errors, failing device.\n"); 606 "raid5:%s: Too many read errors, failing device %s.\n",
607 mdname(conf->mddev), bdn);
553 else 608 else
554 retry = 1; 609 retry = 1;
555 if (retry) 610 if (retry)
@@ -557,7 +612,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
557 else { 612 else {
558 clear_bit(R5_ReadError, &sh->dev[i].flags); 613 clear_bit(R5_ReadError, &sh->dev[i].flags);
559 clear_bit(R5_ReWrite, &sh->dev[i].flags); 614 clear_bit(R5_ReWrite, &sh->dev[i].flags);
560 md_error(conf->mddev, conf->disks[i].rdev); 615 md_error(conf->mddev, rdev);
561 } 616 }
562 } 617 }
563 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 618 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
@@ -631,8 +686,7 @@ static void raid5_build_block (struct stripe_head *sh, int i)
631 dev->req.bi_private = sh; 686 dev->req.bi_private = sh;
632 687
633 dev->flags = 0; 688 dev->flags = 0;
634 if (i != sh->pd_idx) 689 dev->sector = compute_blocknr(sh, i);
635 dev->sector = compute_blocknr(sh, i);
636} 690}
637 691
638static void error(mddev_t *mddev, mdk_rdev_t *rdev) 692static void error(mddev_t *mddev, mdk_rdev_t *rdev)
@@ -659,7 +713,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
659 " Operation continuing on %d devices\n", 713 " Operation continuing on %d devices\n",
660 bdevname(rdev->bdev,b), conf->working_disks); 714 bdevname(rdev->bdev,b), conf->working_disks);
661 } 715 }
662} 716}
663 717
664/* 718/*
665 * Input: a 'big' sector number, 719 * Input: a 'big' sector number,
@@ -697,9 +751,12 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
697 /* 751 /*
698 * Select the parity disk based on the user selected algorithm. 752 * Select the parity disk based on the user selected algorithm.
699 */ 753 */
700 if (conf->level == 4) 754 switch(conf->level) {
755 case 4:
701 *pd_idx = data_disks; 756 *pd_idx = data_disks;
702 else switch (conf->algorithm) { 757 break;
758 case 5:
759 switch (conf->algorithm) {
703 case ALGORITHM_LEFT_ASYMMETRIC: 760 case ALGORITHM_LEFT_ASYMMETRIC:
704 *pd_idx = data_disks - stripe % raid_disks; 761 *pd_idx = data_disks - stripe % raid_disks;
705 if (*dd_idx >= *pd_idx) 762 if (*dd_idx >= *pd_idx)
@@ -721,6 +778,39 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
721 default: 778 default:
722 printk(KERN_ERR "raid5: unsupported algorithm %d\n", 779 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
723 conf->algorithm); 780 conf->algorithm);
781 }
782 break;
783 case 6:
784
785 /**** FIX THIS ****/
786 switch (conf->algorithm) {
787 case ALGORITHM_LEFT_ASYMMETRIC:
788 *pd_idx = raid_disks - 1 - (stripe % raid_disks);
789 if (*pd_idx == raid_disks-1)
790 (*dd_idx)++; /* Q D D D P */
791 else if (*dd_idx >= *pd_idx)
792 (*dd_idx) += 2; /* D D P Q D */
793 break;
794 case ALGORITHM_RIGHT_ASYMMETRIC:
795 *pd_idx = stripe % raid_disks;
796 if (*pd_idx == raid_disks-1)
797 (*dd_idx)++; /* Q D D D P */
798 else if (*dd_idx >= *pd_idx)
799 (*dd_idx) += 2; /* D D P Q D */
800 break;
801 case ALGORITHM_LEFT_SYMMETRIC:
802 *pd_idx = raid_disks - 1 - (stripe % raid_disks);
803 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
804 break;
805 case ALGORITHM_RIGHT_SYMMETRIC:
806 *pd_idx = stripe % raid_disks;
807 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
808 break;
809 default:
810 printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
811 conf->algorithm);
812 }
813 break;
724 } 814 }
725 815
726 /* 816 /*
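
To make the new RAID-6 placement rules concrete, the small user-space sketch
below (illustrative only, not part of the patch) prints where P, Q and the raw
data indices land for the first few stripes of an assumed 5-disk array using
ALGORITHM_LEFT_ASYMMETRIC:

#include <stdio.h>

int main(void)
{
	int raid_disks = 5, data_disks = raid_disks - 2;
	long stripe;

	for (stripe = 0; stripe < 5; stripe++) {
		int pd = raid_disks - 1 - (int)(stripe % raid_disks);
		int qd = (pd + 1) % raid_disks;		/* raid6_next_disk() */
		char row[8];
		int d;

		for (d = 0; d < raid_disks; d++)
			row[d] = 'D';
		row[pd] = 'P';
		row[qd] = 'Q';

		printf("stripe %ld: ", stripe);
		for (d = 0; d < raid_disks; d++)
			printf("%c ", row[d]);

		/* where does each raw data index end up, per the rules above? */
		for (d = 0; d < data_disks; d++) {
			int dd = d;
			if (pd == raid_disks - 1)
				dd++;			/* Q D D D P */
			else if (dd >= pd)
				dd += 2;		/* D D P Q D */
			printf(" d%d->disk%d", d, dd);
		}
		printf("\n");
	}
	return 0;
}

Stripe 0 comes out as Q D D D P (Q wraps around to disk 0 because P sits on
the last disk), and stripe 2 comes out as D D P Q D, matching the annotations
in the switch above.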
@@ -742,12 +832,17 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
742 int chunk_number, dummy1, dummy2, dd_idx = i; 832 int chunk_number, dummy1, dummy2, dd_idx = i;
743 sector_t r_sector; 833 sector_t r_sector;
744 834
835
745 chunk_offset = sector_div(new_sector, sectors_per_chunk); 836 chunk_offset = sector_div(new_sector, sectors_per_chunk);
746 stripe = new_sector; 837 stripe = new_sector;
747 BUG_ON(new_sector != stripe); 838 BUG_ON(new_sector != stripe);
748 839
749 840 if (i == sh->pd_idx)
750 switch (conf->algorithm) { 841 return 0;
842 switch(conf->level) {
843 case 4: break;
844 case 5:
845 switch (conf->algorithm) {
751 case ALGORITHM_LEFT_ASYMMETRIC: 846 case ALGORITHM_LEFT_ASYMMETRIC:
752 case ALGORITHM_RIGHT_ASYMMETRIC: 847 case ALGORITHM_RIGHT_ASYMMETRIC:
753 if (i > sh->pd_idx) 848 if (i > sh->pd_idx)
@@ -761,7 +856,37 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
761 break; 856 break;
762 default: 857 default:
763 printk(KERN_ERR "raid5: unsupported algorithm %d\n", 858 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
859 conf->algorithm);
860 }
861 break;
862 case 6:
863 data_disks = raid_disks - 2;
864 if (i == raid6_next_disk(sh->pd_idx, raid_disks))
865 return 0; /* It is the Q disk */
866 switch (conf->algorithm) {
867 case ALGORITHM_LEFT_ASYMMETRIC:
868 case ALGORITHM_RIGHT_ASYMMETRIC:
869 if (sh->pd_idx == raid_disks-1)
870 i--; /* Q D D D P */
871 else if (i > sh->pd_idx)
872 i -= 2; /* D D P Q D */
873 break;
874 case ALGORITHM_LEFT_SYMMETRIC:
875 case ALGORITHM_RIGHT_SYMMETRIC:
876 if (sh->pd_idx == raid_disks-1)
877 i--; /* Q D D D P */
878 else {
879 /* D D P Q D */
880 if (i < sh->pd_idx)
881 i += raid_disks;
882 i -= (sh->pd_idx + 2);
883 }
884 break;
885 default:
886 printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
764 conf->algorithm); 887 conf->algorithm);
888 }
889 break;
765 } 890 }
766 891
767 chunk_number = stripe * data_disks + i; 892 chunk_number = stripe * data_disks + i;
@@ -778,10 +903,11 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
778 903
779 904
780/* 905/*
781 * Copy data between a page in the stripe cache, and a bio. 906 * Copy data between a page in the stripe cache, and one or more bion
782 * There are no alignment or size guarantees between the page or the 907 * The page could align with the middle of the bio, or there could be
783 * bio except that there is some overlap. 908 * several bion, each with several bio_vecs, which cover part of the page
784 * All iovecs in the bio must be considered. 909 * Multiple bion are linked together on bi_next. There may be extras
910 * at the end of this list. We ignore them.
785 */ 911 */
786static void copy_data(int frombio, struct bio *bio, 912static void copy_data(int frombio, struct bio *bio,
787 struct page *page, 913 struct page *page,
@@ -810,7 +936,7 @@ static void copy_data(int frombio, struct bio *bio,
810 if (len > 0 && page_offset + len > STRIPE_SIZE) 936 if (len > 0 && page_offset + len > STRIPE_SIZE)
811 clen = STRIPE_SIZE - page_offset; 937 clen = STRIPE_SIZE - page_offset;
812 else clen = len; 938 else clen = len;
813 939
814 if (clen > 0) { 940 if (clen > 0) {
815 char *ba = __bio_kmap_atomic(bio, i, KM_USER0); 941 char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
816 if (frombio) 942 if (frombio)
@@ -862,14 +988,14 @@ static void compute_block(struct stripe_head *sh, int dd_idx)
862 set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); 988 set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
863} 989}
864 990
865static void compute_parity(struct stripe_head *sh, int method) 991static void compute_parity5(struct stripe_head *sh, int method)
866{ 992{
867 raid5_conf_t *conf = sh->raid_conf; 993 raid5_conf_t *conf = sh->raid_conf;
868 int i, pd_idx = sh->pd_idx, disks = sh->disks, count; 994 int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
869 void *ptr[MAX_XOR_BLOCKS]; 995 void *ptr[MAX_XOR_BLOCKS];
870 struct bio *chosen; 996 struct bio *chosen;
871 997
872 PRINTK("compute_parity, stripe %llu, method %d\n", 998 PRINTK("compute_parity5, stripe %llu, method %d\n",
873 (unsigned long long)sh->sector, method); 999 (unsigned long long)sh->sector, method);
874 1000
875 count = 1; 1001 count = 1;
@@ -956,9 +1082,195 @@ static void compute_parity(struct stripe_head *sh, int method)
956 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 1082 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
957} 1083}
958 1084
1085static void compute_parity6(struct stripe_head *sh, int method)
1086{
1087 raid6_conf_t *conf = sh->raid_conf;
1088 int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
1089 struct bio *chosen;
1090 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1091 void *ptrs[disks];
1092
1093 qd_idx = raid6_next_disk(pd_idx, disks);
1094 d0_idx = raid6_next_disk(qd_idx, disks);
1095
1096 PRINTK("compute_parity, stripe %llu, method %d\n",
1097 (unsigned long long)sh->sector, method);
1098
1099 switch(method) {
1100 case READ_MODIFY_WRITE:
1101 BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */
1102 case RECONSTRUCT_WRITE:
1103 for (i= disks; i-- ;)
1104 if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
1105 chosen = sh->dev[i].towrite;
1106 sh->dev[i].towrite = NULL;
1107
1108 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1109 wake_up(&conf->wait_for_overlap);
1110
1111 if (sh->dev[i].written) BUG();
1112 sh->dev[i].written = chosen;
1113 }
1114 break;
1115 case CHECK_PARITY:
1116 BUG(); /* Not implemented yet */
1117 }
1118
1119 for (i = disks; i--;)
1120 if (sh->dev[i].written) {
1121 sector_t sector = sh->dev[i].sector;
1122 struct bio *wbi = sh->dev[i].written;
1123 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
1124 copy_data(1, wbi, sh->dev[i].page, sector);
1125 wbi = r5_next_bio(wbi, sector);
1126 }
1127
1128 set_bit(R5_LOCKED, &sh->dev[i].flags);
1129 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1130 }
1131
1132// switch(method) {
1133// case RECONSTRUCT_WRITE:
1134// case CHECK_PARITY:
1135// case UPDATE_PARITY:
1136 /* Note that unlike RAID-5, the ordering of the disks matters greatly. */
1137 /* FIX: Is this ordering of drives even remotely optimal? */
1138 count = 0;
1139 i = d0_idx;
1140 do {
1141 ptrs[count++] = page_address(sh->dev[i].page);
1142 if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
1143 printk("block %d/%d not uptodate on parity calc\n", i,count);
1144 i = raid6_next_disk(i, disks);
1145 } while ( i != d0_idx );
1146// break;
1147// }
1148
1149 raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs);
1150
1151 switch(method) {
1152 case RECONSTRUCT_WRITE:
1153 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1154 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1155 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1156 set_bit(R5_LOCKED, &sh->dev[qd_idx].flags);
1157 break;
1158 case UPDATE_PARITY:
1159 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1160 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1161 break;
1162 }
1163}
1164
1165
1166/* Compute one missing block */
1167static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1168{
1169 raid6_conf_t *conf = sh->raid_conf;
1170 int i, count, disks = conf->raid_disks;
1171 void *ptr[MAX_XOR_BLOCKS], *p;
1172 int pd_idx = sh->pd_idx;
1173 int qd_idx = raid6_next_disk(pd_idx, disks);
1174
1175 PRINTK("compute_block_1, stripe %llu, idx %d\n",
1176 (unsigned long long)sh->sector, dd_idx);
1177
1178 if ( dd_idx == qd_idx ) {
1179 /* We're actually computing the Q drive */
1180 compute_parity6(sh, UPDATE_PARITY);
1181 } else {
1182 ptr[0] = page_address(sh->dev[dd_idx].page);
1183 if (!nozero) memset(ptr[0], 0, STRIPE_SIZE);
1184 count = 1;
1185 for (i = disks ; i--; ) {
1186 if (i == dd_idx || i == qd_idx)
1187 continue;
1188 p = page_address(sh->dev[i].page);
1189 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
1190 ptr[count++] = p;
1191 else
1192 printk("compute_block() %d, stripe %llu, %d"
1193 " not present\n", dd_idx,
1194 (unsigned long long)sh->sector, i);
1195
1196 check_xor();
1197 }
1198 if (count != 1)
1199 xor_block(count, STRIPE_SIZE, ptr);
1200 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1201 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1202 }
1203}
1204
1205/* Compute two missing blocks */
1206static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1207{
1208 raid6_conf_t *conf = sh->raid_conf;
1209 int i, count, disks = conf->raid_disks;
1210 int pd_idx = sh->pd_idx;
1211 int qd_idx = raid6_next_disk(pd_idx, disks);
1212 int d0_idx = raid6_next_disk(qd_idx, disks);
1213 int faila, failb;
1214
1215 /* faila and failb are disk numbers relative to d0_idx */
1216 /* pd_idx becomes disks-2 and qd_idx becomes disks-1 */
1217 faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx;
1218 failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx;
1219
1220 BUG_ON(faila == failb);
1221 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
1222
1223 PRINTK("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
1224 (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
1225
1226 if ( failb == disks-1 ) {
1227 /* Q disk is one of the missing disks */
1228 if ( faila == disks-2 ) {
1229 /* Missing P+Q, just recompute */
1230 compute_parity6(sh, UPDATE_PARITY);
1231 return;
1232 } else {
1233 /* We're missing D+Q; recompute D from P */
1234 compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
1235 compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
1236 return;
1237 }
1238 }
1239
1240 /* We're missing D+P or D+D; build pointer table */
1241 {
1242 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1243 void *ptrs[disks];
1244
1245 count = 0;
1246 i = d0_idx;
1247 do {
1248 ptrs[count++] = page_address(sh->dev[i].page);
1249 i = raid6_next_disk(i, disks);
1250 if (i != dd_idx1 && i != dd_idx2 &&
1251 !test_bit(R5_UPTODATE, &sh->dev[i].flags))
1252 printk("compute_2 with missing block %d/%d\n", count, i);
1253 } while ( i != d0_idx );
1254
1255 if ( failb == disks-2 ) {
1256 /* We're missing D+P. */
1257 raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
1258 } else {
1259 /* We're missing D+D. */
1260 raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
1261 }
1262
1263 /* Both the above update both missing blocks */
1264 set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
1265 set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
1266 }
1267}
1268
1269
1270
959/* 1271/*
960 * Each stripe/dev can have one or more bion attached. 1272 * Each stripe/dev can have one or more bion attached.
961 * toread/towrite point to the first in a chain. 1273 * toread/towrite point to the first in a chain.
962 * The bi_next chain must be in order. 1274 * The bi_next chain must be in order.
963 */ 1275 */
964static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 1276static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
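
One detail worth calling out in compute_parity6() and compute_block_2() above
is the disk walk that starts at d0_idx, the disk after Q: it is what guarantees
that the syndrome helpers always see the data blocks first, then P, then Q.
The user-space sketch below is illustrative only (an assumed 6-disk stripe with
P on disk 3) and just demonstrates that invariant:

#include <stdio.h>

static int next_disk(int disk, int raid_disks)
{
	disk++;
	return disk < raid_disks ? disk : 0;
}

int main(void)
{
	int disks = 6, pd_idx = 3;		/* assumed stripe geometry */
	int qd_idx = next_disk(pd_idx, disks);
	int d0_idx = next_disk(qd_idx, disks);
	int slot = 0, i = d0_idx;

	/* walk the disks the way compute_parity6() builds ptrs[] */
	do {
		const char *what = (i == pd_idx) ? "P" :
				   (i == qd_idx) ? "Q" : "data";
		printf("slot %d <- disk %d (%s)\n", slot++, i, what);
		i = next_disk(i, disks);
	} while (i != d0_idx);
	/* slots 0..disks-3 are data in order, slot disks-2 is P, disks-1 is Q */
	return 0;
}

This ordering is also why compute_block_2() can renumber the failed disks
relative to d0_idx (faila/failb) and then simply treat slot disks-2 as P and
slot disks-1 as Q when choosing between raid6_datap_recov() and
raid6_2data_recov().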
@@ -1001,9 +1313,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
1001 (unsigned long long)sh->sector, dd_idx); 1313 (unsigned long long)sh->sector, dd_idx);
1002 1314
1003 if (conf->mddev->bitmap && firstwrite) { 1315 if (conf->mddev->bitmap && firstwrite) {
1004 sh->bm_seq = conf->seq_write;
1005 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 1316 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
1006 STRIPE_SECTORS, 0); 1317 STRIPE_SECTORS, 0);
1318 sh->bm_seq = conf->seq_flush+1;
1007 set_bit(STRIPE_BIT_DELAY, &sh->state); 1319 set_bit(STRIPE_BIT_DELAY, &sh->state);
1008 } 1320 }
1009 1321
@@ -1031,6 +1343,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
1031 1343
1032static void end_reshape(raid5_conf_t *conf); 1344static void end_reshape(raid5_conf_t *conf);
1033 1345
1346static int page_is_zero(struct page *p)
1347{
1348 char *a = page_address(p);
1349 return ((*(u32*)a) == 0 &&
1350 memcmp(a, a+4, STRIPE_SIZE-4)==0);
1351}
1352
1034static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) 1353static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1035{ 1354{
1036 int sectors_per_chunk = conf->chunk_size >> 9; 1355 int sectors_per_chunk = conf->chunk_size >> 9;
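
The page_is_zero() helper added in the hunk above relies on an overlapping
compare: if the first 32-bit word of the page is zero and memcmp(a, a+4,
STRIPE_SIZE-4) finds the buffer equal to itself shifted by four bytes, then
byte n equals byte n+4 for every n, so every byte collapses back onto the
known-zero first word and the whole page must be zero. Later hunks use it to
replace the open-coded version of the same test in the RAID-5 parity-check
path, and handle_stripe6() reuses it when checking P.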
@@ -1062,7 +1381,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1062 * 1381 *
1063 */ 1382 */
1064 1383
1065static void handle_stripe(struct stripe_head *sh) 1384static void handle_stripe5(struct stripe_head *sh)
1066{ 1385{
1067 raid5_conf_t *conf = sh->raid_conf; 1386 raid5_conf_t *conf = sh->raid_conf;
1068 int disks = sh->disks; 1387 int disks = sh->disks;
@@ -1394,7 +1713,7 @@ static void handle_stripe(struct stripe_head *sh)
1394 if (locked == 0 && (rcw == 0 ||rmw == 0) && 1713 if (locked == 0 && (rcw == 0 ||rmw == 0) &&
1395 !test_bit(STRIPE_BIT_DELAY, &sh->state)) { 1714 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
1396 PRINTK("Computing parity...\n"); 1715 PRINTK("Computing parity...\n");
1397 compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE); 1716 compute_parity5(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
1398 /* now every locked buffer is ready to be written */ 1717 /* now every locked buffer is ready to be written */
1399 for (i=disks; i--;) 1718 for (i=disks; i--;)
1400 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { 1719 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
@@ -1421,13 +1740,10 @@ static void handle_stripe(struct stripe_head *sh)
1421 !test_bit(STRIPE_INSYNC, &sh->state)) { 1740 !test_bit(STRIPE_INSYNC, &sh->state)) {
1422 set_bit(STRIPE_HANDLE, &sh->state); 1741 set_bit(STRIPE_HANDLE, &sh->state);
1423 if (failed == 0) { 1742 if (failed == 0) {
1424 char *pagea;
1425 BUG_ON(uptodate != disks); 1743 BUG_ON(uptodate != disks);
1426 compute_parity(sh, CHECK_PARITY); 1744 compute_parity5(sh, CHECK_PARITY);
1427 uptodate--; 1745 uptodate--;
1428 pagea = page_address(sh->dev[sh->pd_idx].page); 1746 if (page_is_zero(sh->dev[sh->pd_idx].page)) {
1429 if ((*(u32*)pagea) == 0 &&
1430 !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
1431 /* parity is correct (on disc, not in buffer any more) */ 1747 /* parity is correct (on disc, not in buffer any more) */
1432 set_bit(STRIPE_INSYNC, &sh->state); 1748 set_bit(STRIPE_INSYNC, &sh->state);
1433 } else { 1749 } else {
@@ -1487,7 +1803,7 @@ static void handle_stripe(struct stripe_head *sh)
1487 /* Need to write out all blocks after computing parity */ 1803 /* Need to write out all blocks after computing parity */
1488 sh->disks = conf->raid_disks; 1804 sh->disks = conf->raid_disks;
1489 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); 1805 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks);
1490 compute_parity(sh, RECONSTRUCT_WRITE); 1806 compute_parity5(sh, RECONSTRUCT_WRITE);
1491 for (i= conf->raid_disks; i--;) { 1807 for (i= conf->raid_disks; i--;) {
1492 set_bit(R5_LOCKED, &sh->dev[i].flags); 1808 set_bit(R5_LOCKED, &sh->dev[i].flags);
1493 locked++; 1809 locked++;
@@ -1615,6 +1931,569 @@ static void handle_stripe(struct stripe_head *sh)
1615 } 1931 }
1616} 1932}
1617 1933
1934static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
1935{
1936 raid6_conf_t *conf = sh->raid_conf;
1937 int disks = conf->raid_disks;
1938 struct bio *return_bi= NULL;
1939 struct bio *bi;
1940 int i;
1941 int syncing;
1942 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
1943 int non_overwrite = 0;
1944 int failed_num[2] = {0, 0};
1945 struct r5dev *dev, *pdev, *qdev;
1946 int pd_idx = sh->pd_idx;
1947 int qd_idx = raid6_next_disk(pd_idx, disks);
1948 int p_failed, q_failed;
1949
1950 PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n",
1951 (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count),
1952 pd_idx, qd_idx);
1953
1954 spin_lock(&sh->lock);
1955 clear_bit(STRIPE_HANDLE, &sh->state);
1956 clear_bit(STRIPE_DELAYED, &sh->state);
1957
1958 syncing = test_bit(STRIPE_SYNCING, &sh->state);
1959 /* Now to look around and see what can be done */
1960
1961 rcu_read_lock();
1962 for (i=disks; i--; ) {
1963 mdk_rdev_t *rdev;
1964 dev = &sh->dev[i];
1965 clear_bit(R5_Insync, &dev->flags);
1966
1967 PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
1968 i, dev->flags, dev->toread, dev->towrite, dev->written);
1969 /* maybe we can reply to a read */
1970 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
1971 struct bio *rbi, *rbi2;
1972 PRINTK("Return read for disc %d\n", i);
1973 spin_lock_irq(&conf->device_lock);
1974 rbi = dev->toread;
1975 dev->toread = NULL;
1976 if (test_and_clear_bit(R5_Overlap, &dev->flags))
1977 wake_up(&conf->wait_for_overlap);
1978 spin_unlock_irq(&conf->device_lock);
1979 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
1980 copy_data(0, rbi, dev->page, dev->sector);
1981 rbi2 = r5_next_bio(rbi, dev->sector);
1982 spin_lock_irq(&conf->device_lock);
1983 if (--rbi->bi_phys_segments == 0) {
1984 rbi->bi_next = return_bi;
1985 return_bi = rbi;
1986 }
1987 spin_unlock_irq(&conf->device_lock);
1988 rbi = rbi2;
1989 }
1990 }
1991
1992 /* now count some things */
1993 if (test_bit(R5_LOCKED, &dev->flags)) locked++;
1994 if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
1995
1996
1997 if (dev->toread) to_read++;
1998 if (dev->towrite) {
1999 to_write++;
2000 if (!test_bit(R5_OVERWRITE, &dev->flags))
2001 non_overwrite++;
2002 }
2003 if (dev->written) written++;
2004 rdev = rcu_dereference(conf->disks[i].rdev);
2005 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
2006 /* The ReadError flag will just be confusing now */
2007 clear_bit(R5_ReadError, &dev->flags);
2008 clear_bit(R5_ReWrite, &dev->flags);
2009 }
2010 if (!rdev || !test_bit(In_sync, &rdev->flags)
2011 || test_bit(R5_ReadError, &dev->flags)) {
2012 if ( failed < 2 )
2013 failed_num[failed] = i;
2014 failed++;
2015 } else
2016 set_bit(R5_Insync, &dev->flags);
2017 }
2018 rcu_read_unlock();
2019 PRINTK("locked=%d uptodate=%d to_read=%d"
2020 " to_write=%d failed=%d failed_num=%d,%d\n",
2021 locked, uptodate, to_read, to_write, failed,
2022 failed_num[0], failed_num[1]);
2023 /* check if the array has lost >2 devices and, if so, some requests might
2024 * need to be failed
2025 */
2026 if (failed > 2 && to_read+to_write+written) {
2027 for (i=disks; i--; ) {
2028 int bitmap_end = 0;
2029
2030 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2031 mdk_rdev_t *rdev;
2032 rcu_read_lock();
2033 rdev = rcu_dereference(conf->disks[i].rdev);
2034 if (rdev && test_bit(In_sync, &rdev->flags))
2035 /* multiple read failures in one stripe */
2036 md_error(conf->mddev, rdev);
2037 rcu_read_unlock();
2038 }
2039
2040 spin_lock_irq(&conf->device_lock);
2041 /* fail all writes first */
2042 bi = sh->dev[i].towrite;
2043 sh->dev[i].towrite = NULL;
2044 if (bi) { to_write--; bitmap_end = 1; }
2045
2046 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2047 wake_up(&conf->wait_for_overlap);
2048
2049 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
2050 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2051 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2052 if (--bi->bi_phys_segments == 0) {
2053 md_write_end(conf->mddev);
2054 bi->bi_next = return_bi;
2055 return_bi = bi;
2056 }
2057 bi = nextbi;
2058 }
2059 /* and fail all 'written' */
2060 bi = sh->dev[i].written;
2061 sh->dev[i].written = NULL;
2062 if (bi) bitmap_end = 1;
2063 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
2064 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2065 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2066 if (--bi->bi_phys_segments == 0) {
2067 md_write_end(conf->mddev);
2068 bi->bi_next = return_bi;
2069 return_bi = bi;
2070 }
2071 bi = bi2;
2072 }
2073
2074 /* fail any reads if this device is non-operational */
2075 if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
2076 test_bit(R5_ReadError, &sh->dev[i].flags)) {
2077 bi = sh->dev[i].toread;
2078 sh->dev[i].toread = NULL;
2079 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2080 wake_up(&conf->wait_for_overlap);
2081 if (bi) to_read--;
2082 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
2083 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2084 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2085 if (--bi->bi_phys_segments == 0) {
2086 bi->bi_next = return_bi;
2087 return_bi = bi;
2088 }
2089 bi = nextbi;
2090 }
2091 }
2092 spin_unlock_irq(&conf->device_lock);
2093 if (bitmap_end)
2094 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2095 STRIPE_SECTORS, 0, 0);
2096 }
2097 }
2098 if (failed > 2 && syncing) {
2099 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
2100 clear_bit(STRIPE_SYNCING, &sh->state);
2101 syncing = 0;
2102 }
2103
2104 /*
2105 * might be able to return some write requests if the parity blocks
2106 * are safe, or on a failed drive
2107 */
2108 pdev = &sh->dev[pd_idx];
2109 p_failed = (failed >= 1 && failed_num[0] == pd_idx)
2110 || (failed >= 2 && failed_num[1] == pd_idx);
2111 qdev = &sh->dev[qd_idx];
2112 q_failed = (failed >= 1 && failed_num[0] == qd_idx)
2113 || (failed >= 2 && failed_num[1] == qd_idx);
2114
2115 if ( written &&
2116 ( p_failed || ((test_bit(R5_Insync, &pdev->flags)
2117 && !test_bit(R5_LOCKED, &pdev->flags)
2118 && test_bit(R5_UPTODATE, &pdev->flags))) ) &&
2119 ( q_failed || ((test_bit(R5_Insync, &qdev->flags)
2120 && !test_bit(R5_LOCKED, &qdev->flags)
2121 && test_bit(R5_UPTODATE, &qdev->flags))) ) ) {
2122 /* any written block on an uptodate or failed drive can be
2123 * returned. Note that if we 'wrote' to a failed drive,
2124 * it will be UPTODATE, but never LOCKED, so we don't need
2125 * to test 'failed' directly.
2126 */
2127 for (i=disks; i--; )
2128 if (sh->dev[i].written) {
2129 dev = &sh->dev[i];
2130 if (!test_bit(R5_LOCKED, &dev->flags) &&
2131 test_bit(R5_UPTODATE, &dev->flags) ) {
2132 /* We can return any write requests */
2133 int bitmap_end = 0;
2134 struct bio *wbi, *wbi2;
2135 PRINTK("Return write for stripe %llu disc %d\n",
2136 (unsigned long long)sh->sector, i);
2137 spin_lock_irq(&conf->device_lock);
2138 wbi = dev->written;
2139 dev->written = NULL;
2140 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
2141 wbi2 = r5_next_bio(wbi, dev->sector);
2142 if (--wbi->bi_phys_segments == 0) {
2143 md_write_end(conf->mddev);
2144 wbi->bi_next = return_bi;
2145 return_bi = wbi;
2146 }
2147 wbi = wbi2;
2148 }
2149 if (dev->towrite == NULL)
2150 bitmap_end = 1;
2151 spin_unlock_irq(&conf->device_lock);
2152 if (bitmap_end)
2153 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2154 STRIPE_SECTORS,
2155 !test_bit(STRIPE_DEGRADED, &sh->state), 0);
2156 }
2157 }
2158 }
2159
2160 /* Now we might consider reading some blocks, either to check/generate
2161 * parity, or to satisfy requests
2162 * or to load a block that is being partially written.
2163 */
2164 if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) {
2165 for (i=disks; i--;) {
2166 dev = &sh->dev[i];
2167 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
2168 (dev->toread ||
2169 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2170 syncing ||
2171 (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
2172 (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
2173 )
2174 ) {
2175 /* we would like to get this block, possibly
2176 * by computing it, but we might not be able to
2177 */
2178 if (uptodate == disks-1) {
2179 PRINTK("Computing stripe %llu block %d\n",
2180 (unsigned long long)sh->sector, i);
2181 compute_block_1(sh, i, 0);
2182 uptodate++;
2183 } else if ( uptodate == disks-2 && failed >= 2 ) {
2184 /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
2185 int other;
2186 for (other=disks; other--;) {
2187 if ( other == i )
2188 continue;
2189 if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) )
2190 break;
2191 }
2192 BUG_ON(other < 0);
2193 PRINTK("Computing stripe %llu blocks %d,%d\n",
2194 (unsigned long long)sh->sector, i, other);
2195 compute_block_2(sh, i, other);
2196 uptodate += 2;
2197 } else if (test_bit(R5_Insync, &dev->flags)) {
2198 set_bit(R5_LOCKED, &dev->flags);
2199 set_bit(R5_Wantread, &dev->flags);
2200#if 0
2201 /* if I am just reading this block and we don't have
2202 a failed drive, or any pending writes then sidestep the cache */
2203 if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
2204 ! syncing && !failed && !to_write) {
2205 sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
2206 sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
2207 }
2208#endif
2209 locked++;
2210 PRINTK("Reading block %d (sync=%d)\n",
2211 i, syncing);
2212 }
2213 }
2214 }
2215 set_bit(STRIPE_HANDLE, &sh->state);
2216 }
2217
2218 /* now to consider writing and what else, if anything should be read */
2219 if (to_write) {
2220 int rcw=0, must_compute=0;
2221 for (i=disks ; i--;) {
2222 dev = &sh->dev[i];
2223 /* Would I have to read this buffer for reconstruct_write */
2224 if (!test_bit(R5_OVERWRITE, &dev->flags)
2225 && i != pd_idx && i != qd_idx
2226 && (!test_bit(R5_LOCKED, &dev->flags)
2227#if 0
2228 || sh->bh_page[i] != bh->b_page
2229#endif
2230 ) &&
2231 !test_bit(R5_UPTODATE, &dev->flags)) {
2232 if (test_bit(R5_Insync, &dev->flags)) rcw++;
2233 else {
2234 PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags);
2235 must_compute++;
2236 }
2237 }
2238 }
2239 PRINTK("for sector %llu, rcw=%d, must_compute=%d\n",
2240 (unsigned long long)sh->sector, rcw, must_compute);
2241 set_bit(STRIPE_HANDLE, &sh->state);
2242
2243 if (rcw > 0)
2244 /* want reconstruct write, but need to get some data */
2245 for (i=disks; i--;) {
2246 dev = &sh->dev[i];
2247 if (!test_bit(R5_OVERWRITE, &dev->flags)
2248 && !(failed == 0 && (i == pd_idx || i == qd_idx))
2249 && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
2250 test_bit(R5_Insync, &dev->flags)) {
2251 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
2252 {
2253 PRINTK("Read_old stripe %llu block %d for Reconstruct\n",
2254 (unsigned long long)sh->sector, i);
2255 set_bit(R5_LOCKED, &dev->flags);
2256 set_bit(R5_Wantread, &dev->flags);
2257 locked++;
2258 } else {
2259 PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
2260 (unsigned long long)sh->sector, i);
2261 set_bit(STRIPE_DELAYED, &sh->state);
2262 set_bit(STRIPE_HANDLE, &sh->state);
2263 }
2264 }
2265 }
2266 /* now if nothing is locked, and if we have enough data, we can start a write request */
2267 if (locked == 0 && rcw == 0 &&
2268 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2269 if ( must_compute > 0 ) {
2270 /* We have failed blocks and need to compute them */
2271 switch ( failed ) {
2272 case 0: BUG();
2273 case 1: compute_block_1(sh, failed_num[0], 0); break;
2274 case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;
2275 default: BUG(); /* This request should have been failed? */
2276 }
2277 }
2278
2279 PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector);
2280 compute_parity6(sh, RECONSTRUCT_WRITE);
2281 /* now every locked buffer is ready to be written */
2282 for (i=disks; i--;)
2283 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
2284 PRINTK("Writing stripe %llu block %d\n",
2285 (unsigned long long)sh->sector, i);
2286 locked++;
2287 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2288 }
2289 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
2290 set_bit(STRIPE_INSYNC, &sh->state);
2291
2292 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2293 atomic_dec(&conf->preread_active_stripes);
2294 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
2295 md_wakeup_thread(conf->mddev->thread);
2296 }
2297 }
2298 }
2299
2300 /* maybe we need to check and possibly fix the parity for this stripe
2301 * Any reads will already have been scheduled, so we just see if enough data
2302 * is available
2303 */
2304 if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) {
2305 int update_p = 0, update_q = 0;
2306 struct r5dev *dev;
2307
2308 set_bit(STRIPE_HANDLE, &sh->state);
2309
2310 BUG_ON(failed>2);
2311 BUG_ON(uptodate < disks);
2312 /* Want to check and possibly repair P and Q.
2313 * However there could be one 'failed' device, in which
2314 * case we can only check one of them, possibly using the
2315 * other to generate missing data
2316 */
2317
2318 /* If !tmp_page, we cannot do the calculations,
2319 * but as we have set STRIPE_HANDLE, we will soon be called
2320 * by handle_stripe with a tmp_page - just wait until then.
2321 */
2322 if (tmp_page) {
2323 if (failed == q_failed) {
2324 /* The only possible failed device holds 'Q', so it makes
2325 * sense to check P (If anything else were failed, we would
2326 * have used P to recreate it).
2327 */
2328 compute_block_1(sh, pd_idx, 1);
2329 if (!page_is_zero(sh->dev[pd_idx].page)) {
2330 compute_block_1(sh,pd_idx,0);
2331 update_p = 1;
2332 }
2333 }
2334 if (!q_failed && failed < 2) {
2335 /* q is not failed, and we didn't use it to generate
2336 * anything, so it makes sense to check it
2337 */
2338 memcpy(page_address(tmp_page),
2339 page_address(sh->dev[qd_idx].page),
2340 STRIPE_SIZE);
2341 compute_parity6(sh, UPDATE_PARITY);
2342 if (memcmp(page_address(tmp_page),
2343 page_address(sh->dev[qd_idx].page),
2344 STRIPE_SIZE)!= 0) {
2345 clear_bit(STRIPE_INSYNC, &sh->state);
2346 update_q = 1;
2347 }
2348 }
2349 if (update_p || update_q) {
2350 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2351 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2352 /* don't try to repair!! */
2353 update_p = update_q = 0;
2354 }
2355
2356 /* now write out any block on a failed drive,
2357 * or P or Q if they need it
2358 */
2359
2360 if (failed == 2) {
2361 dev = &sh->dev[failed_num[1]];
2362 locked++;
2363 set_bit(R5_LOCKED, &dev->flags);
2364 set_bit(R5_Wantwrite, &dev->flags);
2365 }
2366 if (failed >= 1) {
2367 dev = &sh->dev[failed_num[0]];
2368 locked++;
2369 set_bit(R5_LOCKED, &dev->flags);
2370 set_bit(R5_Wantwrite, &dev->flags);
2371 }
2372
2373 if (update_p) {
2374 dev = &sh->dev[pd_idx];
2375 locked ++;
2376 set_bit(R5_LOCKED, &dev->flags);
2377 set_bit(R5_Wantwrite, &dev->flags);
2378 }
2379 if (update_q) {
2380 dev = &sh->dev[qd_idx];
2381 locked++;
2382 set_bit(R5_LOCKED, &dev->flags);
2383 set_bit(R5_Wantwrite, &dev->flags);
2384 }
2385 clear_bit(STRIPE_DEGRADED, &sh->state);
2386
2387 set_bit(STRIPE_INSYNC, &sh->state);
2388 }
2389 }
2390
2391 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
2392 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
2393 clear_bit(STRIPE_SYNCING, &sh->state);
2394 }
2395
2396 /* If the failed drives are just a ReadError, then we might need
2397 * to progress the repair/check process
2398 */
2399 if (failed <= 2 && ! conf->mddev->ro)
2400 for (i=0; i<failed;i++) {
2401 dev = &sh->dev[failed_num[i]];
2402 if (test_bit(R5_ReadError, &dev->flags)
2403 && !test_bit(R5_LOCKED, &dev->flags)
2404 && test_bit(R5_UPTODATE, &dev->flags)
2405 ) {
2406 if (!test_bit(R5_ReWrite, &dev->flags)) {
2407 set_bit(R5_Wantwrite, &dev->flags);
2408 set_bit(R5_ReWrite, &dev->flags);
2409 set_bit(R5_LOCKED, &dev->flags);
2410 } else {
2411 /* let's read it back */
2412 set_bit(R5_Wantread, &dev->flags);
2413 set_bit(R5_LOCKED, &dev->flags);
2414 }
2415 }
2416 }
2417 spin_unlock(&sh->lock);
2418
2419 while ((bi=return_bi)) {
2420 int bytes = bi->bi_size;
2421
2422 return_bi = bi->bi_next;
2423 bi->bi_next = NULL;
2424 bi->bi_size = 0;
2425 bi->bi_end_io(bi, bytes, 0);
2426 }
2427 for (i=disks; i-- ;) {
2428 int rw;
2429 struct bio *bi;
2430 mdk_rdev_t *rdev;
2431 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
2432 rw = 1;
2433 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
2434 rw = 0;
2435 else
2436 continue;
2437
2438 bi = &sh->dev[i].req;
2439
2440 bi->bi_rw = rw;
2441 if (rw)
2442 bi->bi_end_io = raid5_end_write_request;
2443 else
2444 bi->bi_end_io = raid5_end_read_request;
2445
2446 rcu_read_lock();
2447 rdev = rcu_dereference(conf->disks[i].rdev);
2448 if (rdev && test_bit(Faulty, &rdev->flags))
2449 rdev = NULL;
2450 if (rdev)
2451 atomic_inc(&rdev->nr_pending);
2452 rcu_read_unlock();
2453
2454 if (rdev) {
2455 if (syncing)
2456 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
2457
2458 bi->bi_bdev = rdev->bdev;
2459 PRINTK("for %llu schedule op %ld on disc %d\n",
2460 (unsigned long long)sh->sector, bi->bi_rw, i);
2461 atomic_inc(&sh->count);
2462 bi->bi_sector = sh->sector + rdev->data_offset;
2463 bi->bi_flags = 1 << BIO_UPTODATE;
2464 bi->bi_vcnt = 1;
2465 bi->bi_max_vecs = 1;
2466 bi->bi_idx = 0;
2467 bi->bi_io_vec = &sh->dev[i].vec;
2468 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
2469 bi->bi_io_vec[0].bv_offset = 0;
2470 bi->bi_size = STRIPE_SIZE;
2471 bi->bi_next = NULL;
2472 if (rw == WRITE &&
2473 test_bit(R5_ReWrite, &sh->dev[i].flags))
2474 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
2475 generic_make_request(bi);
2476 } else {
2477 if (rw == 1)
2478 set_bit(STRIPE_DEGRADED, &sh->state);
2479 PRINTK("skip op %ld on disc %d for sector %llu\n",
2480 bi->bi_rw, i, (unsigned long long)sh->sector);
2481 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2482 set_bit(STRIPE_HANDLE, &sh->state);
2483 }
2484 }
2485}
2486
2487static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
2488{
2489 if (sh->raid_conf->level == 6)
2490 handle_stripe6(sh, tmp_page);
2491 else
2492 handle_stripe5(sh);
2493}
2494
2495
2496
1618static void raid5_activate_delayed(raid5_conf_t *conf) 2497static void raid5_activate_delayed(raid5_conf_t *conf)
1619{ 2498{
1620 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 2499 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
@@ -1718,13 +2597,6 @@ static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk,
1718 return ret; 2597 return ret;
1719} 2598}
1720 2599
1721static inline void raid5_plug_device(raid5_conf_t *conf)
1722{
1723 spin_lock_irq(&conf->device_lock);
1724 blk_plug_device(conf->mddev->queue);
1725 spin_unlock_irq(&conf->device_lock);
1726}
1727
1728static int make_request(request_queue_t *q, struct bio * bi) 2600static int make_request(request_queue_t *q, struct bio * bi)
1729{ 2601{
1730 mddev_t *mddev = q->queuedata; 2602 mddev_t *mddev = q->queuedata;
@@ -1753,7 +2625,7 @@ static int make_request(request_queue_t *q, struct bio * bi)
1753 2625
1754 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 2626 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
1755 DEFINE_WAIT(w); 2627 DEFINE_WAIT(w);
1756 int disks; 2628 int disks, data_disks;
1757 2629
1758 retry: 2630 retry:
1759 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 2631 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
@@ -1781,7 +2653,9 @@ static int make_request(request_queue_t *q, struct bio * bi)
1781 } 2653 }
1782 spin_unlock_irq(&conf->device_lock); 2654 spin_unlock_irq(&conf->device_lock);
1783 } 2655 }
1784 new_sector = raid5_compute_sector(logical_sector, disks, disks - 1, 2656 data_disks = disks - conf->max_degraded;
2657
2658 new_sector = raid5_compute_sector(logical_sector, disks, data_disks,
1785 &dd_idx, &pd_idx, conf); 2659 &dd_idx, &pd_idx, conf);
1786 PRINTK("raid5: make_request, sector %llu logical %llu\n", 2660 PRINTK("raid5: make_request, sector %llu logical %llu\n",
1787 (unsigned long long)new_sector, 2661 (unsigned long long)new_sector,
@@ -1832,8 +2706,7 @@ static int make_request(request_queue_t *q, struct bio * bi)
1832 goto retry; 2706 goto retry;
1833 } 2707 }
1834 finish_wait(&conf->wait_for_overlap, &w); 2708 finish_wait(&conf->wait_for_overlap, &w);
1835 raid5_plug_device(conf); 2709 handle_stripe(sh, NULL);
1836 handle_stripe(sh);
1837 release_stripe(sh); 2710 release_stripe(sh);
1838 } else { 2711 } else {
1839 /* cannot get stripe for read-ahead, just give-up */ 2712 /* cannot get stripe for read-ahead, just give-up */
@@ -1849,7 +2722,7 @@ static int make_request(request_queue_t *q, struct bio * bi)
1849 if (remaining == 0) { 2722 if (remaining == 0) {
1850 int bytes = bi->bi_size; 2723 int bytes = bi->bi_size;
1851 2724
1852 if ( bio_data_dir(bi) == WRITE ) 2725 if ( rw == WRITE )
1853 md_write_end(mddev); 2726 md_write_end(mddev);
1854 bi->bi_size = 0; 2727 bi->bi_size = 0;
1855 bi->bi_end_io(bi, bytes, 0); 2728 bi->bi_end_io(bi, bytes, 0);
@@ -1857,17 +2730,141 @@ static int make_request(request_queue_t *q, struct bio * bi)
1857 return 0; 2730 return 0;
1858} 2731}
1859 2732
1860/* FIXME go_faster isn't used */ 2733static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped)
1861static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1862{ 2734{
2735 /* reshaping is quite different to recovery/resync so it is
2736 * handled quite separately ... here.
2737 *
2738 * On each call to sync_request, we gather one chunk worth of
2739 * destination stripes and flag them as expanding.
2740 * Then we find all the source stripes and request reads.
2741 * As the reads complete, handle_stripe will copy the data
2742 * into the destination stripe and release that stripe.
2743 */
1863 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 2744 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1864 struct stripe_head *sh; 2745 struct stripe_head *sh;
1865 int pd_idx; 2746 int pd_idx;
1866 sector_t first_sector, last_sector; 2747 sector_t first_sector, last_sector;
2748 int raid_disks;
2749 int data_disks;
2750 int i;
2751 int dd_idx;
2752 sector_t writepos, safepos, gap;
2753
2754 if (sector_nr == 0 &&
2755 conf->expand_progress != 0) {
2756 /* restarting in the middle, skip the initial sectors */
2757 sector_nr = conf->expand_progress;
2758 sector_div(sector_nr, conf->raid_disks-1);
2759 *skipped = 1;
2760 return sector_nr;
2761 }
2762
2763 /* we update the metadata when there is more than 3Meg
2764 * in the block range (that is rather arbitrary, should
2765 * probably be time based) or when the data about to be
2766 * copied would over-write the source of the data at
2767 * the front of the range.
2768 * i.e. one new_stripe forward from expand_progress new_maps
2769 * to after where expand_lo old_maps to
2770 */
2771 writepos = conf->expand_progress +
2772 conf->chunk_size/512*(conf->raid_disks-1);
2773 sector_div(writepos, conf->raid_disks-1);
2774 safepos = conf->expand_lo;
2775 sector_div(safepos, conf->previous_raid_disks-1);
2776 gap = conf->expand_progress - conf->expand_lo;
2777
2778 if (writepos >= safepos ||
2779 gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) {
2780 /* Cannot proceed until we've updated the superblock... */
2781 wait_event(conf->wait_for_overlap,
2782 atomic_read(&conf->reshape_stripes)==0);
2783 mddev->reshape_position = conf->expand_progress;
2784 mddev->sb_dirty = 1;
2785 md_wakeup_thread(mddev->thread);
2786 wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
2787 kthread_should_stop());
2788 spin_lock_irq(&conf->device_lock);
2789 conf->expand_lo = mddev->reshape_position;
2790 spin_unlock_irq(&conf->device_lock);
2791 wake_up(&conf->wait_for_overlap);
2792 }
2793
2794 for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
2795 int j;
2796 int skipped = 0;
2797 pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
2798 sh = get_active_stripe(conf, sector_nr+i,
2799 conf->raid_disks, pd_idx, 0);
2800 set_bit(STRIPE_EXPANDING, &sh->state);
2801 atomic_inc(&conf->reshape_stripes);
2802 /* If any of this stripe is beyond the end of the old
2803 * array, then we need to zero those blocks
2804 */
2805 for (j=sh->disks; j--;) {
2806 sector_t s;
2807 if (j == sh->pd_idx)
2808 continue;
2809 s = compute_blocknr(sh, j);
2810 if (s < (mddev->array_size<<1)) {
2811 skipped = 1;
2812 continue;
2813 }
2814 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
2815 set_bit(R5_Expanded, &sh->dev[j].flags);
2816 set_bit(R5_UPTODATE, &sh->dev[j].flags);
2817 }
2818 if (!skipped) {
2819 set_bit(STRIPE_EXPAND_READY, &sh->state);
2820 set_bit(STRIPE_HANDLE, &sh->state);
2821 }
2822 release_stripe(sh);
2823 }
2824 spin_lock_irq(&conf->device_lock);
2825 conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1);
2826 spin_unlock_irq(&conf->device_lock);
 2827 /* Ok, those stripes are ready. We can start scheduling
2828 * reads on the source stripes.
2829 * The source stripes are determined by mapping the first and last
2830 * block on the destination stripes.
2831 */
2832 raid_disks = conf->previous_raid_disks;
2833 data_disks = raid_disks - 1;
2834 first_sector =
2835 raid5_compute_sector(sector_nr*(conf->raid_disks-1),
2836 raid_disks, data_disks,
2837 &dd_idx, &pd_idx, conf);
2838 last_sector =
2839 raid5_compute_sector((sector_nr+conf->chunk_size/512)
2840 *(conf->raid_disks-1) -1,
2841 raid_disks, data_disks,
2842 &dd_idx, &pd_idx, conf);
2843 if (last_sector >= (mddev->size<<1))
2844 last_sector = (mddev->size<<1)-1;
2845 while (first_sector <= last_sector) {
2846 pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks);
2847 sh = get_active_stripe(conf, first_sector,
2848 conf->previous_raid_disks, pd_idx, 0);
2849 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2850 set_bit(STRIPE_HANDLE, &sh->state);
2851 release_stripe(sh);
2852 first_sector += STRIPE_SECTORS;
2853 }
2854 return conf->chunk_size>>9;
2855}
2856
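For the superblock throttle above, both progress markers are array-relative sectors that have to be divided by the number of data disks before they can be compared as per-device positions. A stand-alone sketch of that arithmetic with illustrative numbers (a hypothetical grow from 4 to 5 drives with 64 KiB chunks; the figures are made up, this is not the kernel code):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long chunk_sectors = 128;		/* 64 KiB chunk, 512-byte sectors */
		int raid_disks = 5, previous_raid_disks = 4;	/* growing a 4-drive array to 5 */
		unsigned long long expand_progress = 4096;	/* array sectors already relocated */
		unsigned long long expand_lo = 1536;		/* safe point recorded in the superblock */

		/* one chunk ahead of expand_progress, expressed as a per-device sector */
		unsigned long long writepos =
			(expand_progress + chunk_sectors * (raid_disks - 1)) / (raid_disks - 1);
		/* where the old layout still reads from, as a per-device sector */
		unsigned long long safepos = expand_lo / (previous_raid_disks - 1);
		unsigned long long gap = expand_progress - expand_lo;

		printf("writepos=%llu safepos=%llu gap=%llu\n", writepos, safepos, gap);
		/* The driver refreshes the superblock when writepos >= safepos, or when
		 * gap exceeds (raid_disks-1)*3000*2 sectors (roughly 3 MB of moved data). */
		return 0;
	}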
2857/* FIXME go_faster isn't used */
2858static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
2859{
2860 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
2861 struct stripe_head *sh;
2862 int pd_idx;
1867 int raid_disks = conf->raid_disks; 2863 int raid_disks = conf->raid_disks;
1868 int data_disks = raid_disks-1;
1869 sector_t max_sector = mddev->size << 1; 2864 sector_t max_sector = mddev->size << 1;
1870 int sync_blocks; 2865 int sync_blocks;
2866 int still_degraded = 0;
2867 int i;
1871 2868
1872 if (sector_nr >= max_sector) { 2869 if (sector_nr >= max_sector) {
1873 /* just being told to finish up .. nothing much to do */ 2870 /* just being told to finish up .. nothing much to do */
@@ -1880,134 +2877,22 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1880 if (mddev->curr_resync < max_sector) /* aborted */ 2877 if (mddev->curr_resync < max_sector) /* aborted */
1881 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 2878 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1882 &sync_blocks, 1); 2879 &sync_blocks, 1);
1883 else /* compelted sync */ 2880 else /* completed sync */
1884 conf->fullsync = 0; 2881 conf->fullsync = 0;
1885 bitmap_close_sync(mddev->bitmap); 2882 bitmap_close_sync(mddev->bitmap);
1886 2883
1887 return 0; 2884 return 0;
1888 } 2885 }
1889 2886
1890 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 2887 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
1891 /* reshaping is quite different to recovery/resync so it is 2888 return reshape_request(mddev, sector_nr, skipped);
1892 * handled quite separately ... here. 2889
 1893 * 2890 /* if there are too many failed drives and we are trying
1894 * On each call to sync_request, we gather one chunk worth of
1895 * destination stripes and flag them as expanding.
1896 * Then we find all the source stripes and request reads.
1897 * As the reads complete, handle_stripe will copy the data
1898 * into the destination stripe and release that stripe.
1899 */
1900 int i;
1901 int dd_idx;
1902 sector_t writepos, safepos, gap;
1903
1904 if (sector_nr == 0 &&
1905 conf->expand_progress != 0) {
1906 /* restarting in the middle, skip the initial sectors */
1907 sector_nr = conf->expand_progress;
1908 sector_div(sector_nr, conf->raid_disks-1);
1909 *skipped = 1;
1910 return sector_nr;
1911 }
1912
1913 /* we update the metadata when there is more than 3Meg
1914 * in the block range (that is rather arbitrary, should
1915 * probably be time based) or when the data about to be
1916 * copied would over-write the source of the data at
1917 * the front of the range.
1918 * i.e. one new_stripe forward from expand_progress new_maps
1919 * to after where expand_lo old_maps to
1920 */
1921 writepos = conf->expand_progress +
1922 conf->chunk_size/512*(conf->raid_disks-1);
1923 sector_div(writepos, conf->raid_disks-1);
1924 safepos = conf->expand_lo;
1925 sector_div(safepos, conf->previous_raid_disks-1);
1926 gap = conf->expand_progress - conf->expand_lo;
1927
1928 if (writepos >= safepos ||
1929 gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) {
1930 /* Cannot proceed until we've updated the superblock... */
1931 wait_event(conf->wait_for_overlap,
1932 atomic_read(&conf->reshape_stripes)==0);
1933 mddev->reshape_position = conf->expand_progress;
1934 mddev->sb_dirty = 1;
1935 md_wakeup_thread(mddev->thread);
1936 wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
1937 kthread_should_stop());
1938 spin_lock_irq(&conf->device_lock);
1939 conf->expand_lo = mddev->reshape_position;
1940 spin_unlock_irq(&conf->device_lock);
1941 wake_up(&conf->wait_for_overlap);
1942 }
1943
1944 for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
1945 int j;
1946 int skipped = 0;
1947 pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
1948 sh = get_active_stripe(conf, sector_nr+i,
1949 conf->raid_disks, pd_idx, 0);
1950 set_bit(STRIPE_EXPANDING, &sh->state);
1951 atomic_inc(&conf->reshape_stripes);
1952 /* If any of this stripe is beyond the end of the old
1953 * array, then we need to zero those blocks
1954 */
1955 for (j=sh->disks; j--;) {
1956 sector_t s;
1957 if (j == sh->pd_idx)
1958 continue;
1959 s = compute_blocknr(sh, j);
1960 if (s < (mddev->array_size<<1)) {
1961 skipped = 1;
1962 continue;
1963 }
1964 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
1965 set_bit(R5_Expanded, &sh->dev[j].flags);
1966 set_bit(R5_UPTODATE, &sh->dev[j].flags);
1967 }
1968 if (!skipped) {
1969 set_bit(STRIPE_EXPAND_READY, &sh->state);
1970 set_bit(STRIPE_HANDLE, &sh->state);
1971 }
1972 release_stripe(sh);
1973 }
1974 spin_lock_irq(&conf->device_lock);
1975 conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1);
1976 spin_unlock_irq(&conf->device_lock);
1977 /* Ok, those stripe are ready. We can start scheduling
1978 * reads on the source stripes.
1979 * The source stripes are determined by mapping the first and last
1980 * block on the destination stripes.
1981 */
1982 raid_disks = conf->previous_raid_disks;
1983 data_disks = raid_disks - 1;
1984 first_sector =
1985 raid5_compute_sector(sector_nr*(conf->raid_disks-1),
1986 raid_disks, data_disks,
1987 &dd_idx, &pd_idx, conf);
1988 last_sector =
1989 raid5_compute_sector((sector_nr+conf->chunk_size/512)
1990 *(conf->raid_disks-1) -1,
1991 raid_disks, data_disks,
1992 &dd_idx, &pd_idx, conf);
1993 if (last_sector >= (mddev->size<<1))
1994 last_sector = (mddev->size<<1)-1;
1995 while (first_sector <= last_sector) {
1996 pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks);
1997 sh = get_active_stripe(conf, first_sector,
1998 conf->previous_raid_disks, pd_idx, 0);
1999 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2000 set_bit(STRIPE_HANDLE, &sh->state);
2001 release_stripe(sh);
2002 first_sector += STRIPE_SECTORS;
2003 }
2004 return conf->chunk_size>>9;
2005 }
2006 /* if there is 1 or more failed drives and we are trying
2007 * to resync, then assert that we are finished, because there is 2891 * to resync, then assert that we are finished, because there is
2008 * nothing we can do. 2892 * nothing we can do.
2009 */ 2893 */
2010 if (mddev->degraded >= 1 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2894 if (mddev->degraded >= conf->max_degraded &&
2895 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2011 sector_t rv = (mddev->size << 1) - sector_nr; 2896 sector_t rv = (mddev->size << 1) - sector_nr;
2012 *skipped = 1; 2897 *skipped = 1;
2013 return rv; 2898 return rv;
@@ -2026,17 +2911,26 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
2026 if (sh == NULL) { 2911 if (sh == NULL) {
2027 sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0); 2912 sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0);
2028 /* make sure we don't swamp the stripe cache if someone else 2913 /* make sure we don't swamp the stripe cache if someone else
2029 * is trying to get access 2914 * is trying to get access
2030 */ 2915 */
2031 schedule_timeout_uninterruptible(1); 2916 schedule_timeout_uninterruptible(1);
2032 } 2917 }
2033 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0); 2918 /* Need to check if array will still be degraded after recovery/resync
2034 spin_lock(&sh->lock); 2919 * We don't need to check the 'failed' flag as when that gets set,
2920 * recovery aborts.
2921 */
2922 for (i=0; i<mddev->raid_disks; i++)
2923 if (conf->disks[i].rdev == NULL)
2924 still_degraded = 1;
2925
2926 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
2927
2928 spin_lock(&sh->lock);
2035 set_bit(STRIPE_SYNCING, &sh->state); 2929 set_bit(STRIPE_SYNCING, &sh->state);
2036 clear_bit(STRIPE_INSYNC, &sh->state); 2930 clear_bit(STRIPE_INSYNC, &sh->state);
2037 spin_unlock(&sh->lock); 2931 spin_unlock(&sh->lock);
2038 2932
2039 handle_stripe(sh); 2933 handle_stripe(sh, NULL);
2040 release_stripe(sh); 2934 release_stripe(sh);
2041 2935
2042 return STRIPE_SECTORS; 2936 return STRIPE_SECTORS;
@@ -2064,7 +2958,7 @@ static void raid5d (mddev_t *mddev)
2064 while (1) { 2958 while (1) {
2065 struct list_head *first; 2959 struct list_head *first;
2066 2960
2067 if (conf->seq_flush - conf->seq_write > 0) { 2961 if (conf->seq_flush != conf->seq_write) {
2068 int seq = conf->seq_flush; 2962 int seq = conf->seq_flush;
2069 spin_unlock_irq(&conf->device_lock); 2963 spin_unlock_irq(&conf->device_lock);
2070 bitmap_unplug(mddev->bitmap); 2964 bitmap_unplug(mddev->bitmap);
@@ -2091,7 +2985,7 @@ static void raid5d (mddev_t *mddev)
2091 spin_unlock_irq(&conf->device_lock); 2985 spin_unlock_irq(&conf->device_lock);
2092 2986
2093 handled++; 2987 handled++;
2094 handle_stripe(sh); 2988 handle_stripe(sh, conf->spare_page);
2095 release_stripe(sh); 2989 release_stripe(sh);
2096 2990
2097 spin_lock_irq(&conf->device_lock); 2991 spin_lock_irq(&conf->device_lock);
@@ -2181,8 +3075,8 @@ static int run(mddev_t *mddev)
2181 struct disk_info *disk; 3075 struct disk_info *disk;
2182 struct list_head *tmp; 3076 struct list_head *tmp;
2183 3077
2184 if (mddev->level != 5 && mddev->level != 4) { 3078 if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) {
2185 printk(KERN_ERR "raid5: %s: raid level not set to 4/5 (%d)\n", 3079 printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
2186 mdname(mddev), mddev->level); 3080 mdname(mddev), mddev->level);
2187 return -EIO; 3081 return -EIO;
2188 } 3082 }
@@ -2251,6 +3145,11 @@ static int run(mddev_t *mddev)
2251 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 3145 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
2252 goto abort; 3146 goto abort;
2253 3147
3148 if (mddev->level == 6) {
3149 conf->spare_page = alloc_page(GFP_KERNEL);
3150 if (!conf->spare_page)
3151 goto abort;
3152 }
2254 spin_lock_init(&conf->device_lock); 3153 spin_lock_init(&conf->device_lock);
2255 init_waitqueue_head(&conf->wait_for_stripe); 3154 init_waitqueue_head(&conf->wait_for_stripe);
2256 init_waitqueue_head(&conf->wait_for_overlap); 3155 init_waitqueue_head(&conf->wait_for_overlap);
@@ -2282,12 +3181,16 @@ static int run(mddev_t *mddev)
2282 } 3181 }
2283 3182
2284 /* 3183 /*
2285 * 0 for a fully functional array, 1 for a degraded array. 3184 * 0 for a fully functional array, 1 or 2 for a degraded array.
2286 */ 3185 */
2287 mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks; 3186 mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks;
2288 conf->mddev = mddev; 3187 conf->mddev = mddev;
2289 conf->chunk_size = mddev->chunk_size; 3188 conf->chunk_size = mddev->chunk_size;
2290 conf->level = mddev->level; 3189 conf->level = mddev->level;
3190 if (conf->level == 6)
3191 conf->max_degraded = 2;
3192 else
3193 conf->max_degraded = 1;
2291 conf->algorithm = mddev->layout; 3194 conf->algorithm = mddev->layout;
2292 conf->max_nr_stripes = NR_STRIPES; 3195 conf->max_nr_stripes = NR_STRIPES;
2293 conf->expand_progress = mddev->reshape_position; 3196 conf->expand_progress = mddev->reshape_position;
@@ -2296,6 +3199,11 @@ static int run(mddev_t *mddev)
2296 mddev->size &= ~(mddev->chunk_size/1024 -1); 3199 mddev->size &= ~(mddev->chunk_size/1024 -1);
2297 mddev->resync_max_sectors = mddev->size << 1; 3200 mddev->resync_max_sectors = mddev->size << 1;
2298 3201
3202 if (conf->level == 6 && conf->raid_disks < 4) {
3203 printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
3204 mdname(mddev), conf->raid_disks);
3205 goto abort;
3206 }
2299 if (!conf->chunk_size || conf->chunk_size % 4) { 3207 if (!conf->chunk_size || conf->chunk_size % 4) {
2300 printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", 3208 printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
2301 conf->chunk_size, mdname(mddev)); 3209 conf->chunk_size, mdname(mddev));
@@ -2307,14 +3215,14 @@ static int run(mddev_t *mddev)
2307 conf->algorithm, mdname(mddev)); 3215 conf->algorithm, mdname(mddev));
2308 goto abort; 3216 goto abort;
2309 } 3217 }
2310 if (mddev->degraded > 1) { 3218 if (mddev->degraded > conf->max_degraded) {
2311 printk(KERN_ERR "raid5: not enough operational devices for %s" 3219 printk(KERN_ERR "raid5: not enough operational devices for %s"
2312 " (%d/%d failed)\n", 3220 " (%d/%d failed)\n",
2313 mdname(mddev), conf->failed_disks, conf->raid_disks); 3221 mdname(mddev), conf->failed_disks, conf->raid_disks);
2314 goto abort; 3222 goto abort;
2315 } 3223 }
2316 3224
2317 if (mddev->degraded == 1 && 3225 if (mddev->degraded > 0 &&
2318 mddev->recovery_cp != MaxSector) { 3226 mddev->recovery_cp != MaxSector) {
2319 if (mddev->ok_start_degraded) 3227 if (mddev->ok_start_degraded)
2320 printk(KERN_WARNING 3228 printk(KERN_WARNING
@@ -2373,17 +3281,15 @@ static int run(mddev_t *mddev)
2373 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 3281 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
2374 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 3282 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
2375 "%s_reshape"); 3283 "%s_reshape");
2376 /* FIXME if md_register_thread fails?? */
2377 md_wakeup_thread(mddev->sync_thread);
2378
2379 } 3284 }
2380 3285
2381 /* read-ahead size must cover two whole stripes, which is 3286 /* read-ahead size must cover two whole stripes, which is
 2382 * 2 * (n-1) * chunksize where 'n' is the number of raid devices 3287 * 2 * (datadisks) * chunksize where 'datadisks' is the number of data devices
2383 */ 3288 */
2384 { 3289 {
2385 int stripe = (mddev->raid_disks-1) * mddev->chunk_size 3290 int data_disks = conf->previous_raid_disks - conf->max_degraded;
2386 / PAGE_SIZE; 3291 int stripe = data_disks *
3292 (mddev->chunk_size / PAGE_SIZE);
2387 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 3293 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
2388 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 3294 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
2389 } 3295 }
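With max_degraded taken into account, the read-ahead sizing above works out as follows for, say, a hypothetical 6-drive RAID-6 with 64 KiB chunks and 4 KiB pages (a sketch with made-up numbers, not kernel code):

	#include <stdio.h>

	int main(void)
	{
		int raid_disks = 6, max_degraded = 2;		/* 6-drive RAID-6 */
		int chunk_size = 64 * 1024, page_size = 4096;	/* illustrative values */
		int data_disks = raid_disks - max_degraded;	/* 4 data devices per stripe */
		int stripe = data_disks * (chunk_size / page_size);	/* pages in one full stripe */

		printf("ra_pages raised to at least %d pages (%d KiB)\n",
		       2 * stripe, 2 * stripe * page_size / 1024);
		return 0;
	}

Here stripe = 4 * 16 = 64 pages, so read-ahead is bumped to at least 128 pages (512 KiB).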
@@ -2393,12 +3299,14 @@ static int run(mddev_t *mddev)
2393 3299
2394 mddev->queue->unplug_fn = raid5_unplug_device; 3300 mddev->queue->unplug_fn = raid5_unplug_device;
2395 mddev->queue->issue_flush_fn = raid5_issue_flush; 3301 mddev->queue->issue_flush_fn = raid5_issue_flush;
2396 mddev->array_size = mddev->size * (conf->previous_raid_disks - 1); 3302 mddev->array_size = mddev->size * (conf->previous_raid_disks -
3303 conf->max_degraded);
2397 3304
2398 return 0; 3305 return 0;
2399abort: 3306abort:
2400 if (conf) { 3307 if (conf) {
2401 print_raid5_conf(conf); 3308 print_raid5_conf(conf);
3309 safe_put_page(conf->spare_page);
2402 kfree(conf->disks); 3310 kfree(conf->disks);
2403 kfree(conf->stripe_hashtbl); 3311 kfree(conf->stripe_hashtbl);
2404 kfree(conf); 3312 kfree(conf);
@@ -2427,23 +3335,23 @@ static int stop(mddev_t *mddev)
2427} 3335}
2428 3336
2429#if RAID5_DEBUG 3337#if RAID5_DEBUG
2430static void print_sh (struct stripe_head *sh) 3338static void print_sh (struct seq_file *seq, struct stripe_head *sh)
2431{ 3339{
2432 int i; 3340 int i;
2433 3341
2434 printk("sh %llu, pd_idx %d, state %ld.\n", 3342 seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n",
2435 (unsigned long long)sh->sector, sh->pd_idx, sh->state); 3343 (unsigned long long)sh->sector, sh->pd_idx, sh->state);
2436 printk("sh %llu, count %d.\n", 3344 seq_printf(seq, "sh %llu, count %d.\n",
2437 (unsigned long long)sh->sector, atomic_read(&sh->count)); 3345 (unsigned long long)sh->sector, atomic_read(&sh->count));
2438 printk("sh %llu, ", (unsigned long long)sh->sector); 3346 seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector);
2439 for (i = 0; i < sh->disks; i++) { 3347 for (i = 0; i < sh->disks; i++) {
2440 printk("(cache%d: %p %ld) ", 3348 seq_printf(seq, "(cache%d: %p %ld) ",
2441 i, sh->dev[i].page, sh->dev[i].flags); 3349 i, sh->dev[i].page, sh->dev[i].flags);
2442 } 3350 }
2443 printk("\n"); 3351 seq_printf(seq, "\n");
2444} 3352}
2445 3353
2446static void printall (raid5_conf_t *conf) 3354static void printall (struct seq_file *seq, raid5_conf_t *conf)
2447{ 3355{
2448 struct stripe_head *sh; 3356 struct stripe_head *sh;
2449 struct hlist_node *hn; 3357 struct hlist_node *hn;
@@ -2454,7 +3362,7 @@ static void printall (raid5_conf_t *conf)
2454 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) { 3362 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
2455 if (sh->raid_conf != conf) 3363 if (sh->raid_conf != conf)
2456 continue; 3364 continue;
2457 print_sh(sh); 3365 print_sh(seq, sh);
2458 } 3366 }
2459 } 3367 }
2460 spin_unlock_irq(&conf->device_lock); 3368 spin_unlock_irq(&conf->device_lock);
@@ -2474,9 +3382,8 @@ static void status (struct seq_file *seq, mddev_t *mddev)
2474 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); 3382 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
2475 seq_printf (seq, "]"); 3383 seq_printf (seq, "]");
2476#if RAID5_DEBUG 3384#if RAID5_DEBUG
2477#define D(x) \ 3385 seq_printf (seq, "\n");
2478 seq_printf (seq, "<"#x":%d>", atomic_read(&conf->x)) 3386 printall(seq, conf);
2479 printall(conf);
2480#endif 3387#endif
2481} 3388}
2482 3389
@@ -2560,14 +3467,20 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
2560 int disk; 3467 int disk;
2561 struct disk_info *p; 3468 struct disk_info *p;
2562 3469
2563 if (mddev->degraded > 1) 3470 if (mddev->degraded > conf->max_degraded)
2564 /* no point adding a device */ 3471 /* no point adding a device */
2565 return 0; 3472 return 0;
2566 3473
2567 /* 3474 /*
2568 * find the disk ... 3475 * find the disk ... but prefer rdev->saved_raid_disk
3476 * if possible.
2569 */ 3477 */
2570 for (disk=0; disk < conf->raid_disks; disk++) 3478 if (rdev->saved_raid_disk >= 0 &&
3479 conf->disks[rdev->saved_raid_disk].rdev == NULL)
3480 disk = rdev->saved_raid_disk;
3481 else
3482 disk = 0;
3483 for ( ; disk < conf->raid_disks; disk++)
2571 if ((p=conf->disks + disk)->rdev == NULL) { 3484 if ((p=conf->disks + disk)->rdev == NULL) {
2572 clear_bit(In_sync, &rdev->flags); 3485 clear_bit(In_sync, &rdev->flags);
2573 rdev->raid_disk = disk; 3486 rdev->raid_disk = disk;
@@ -2590,8 +3503,10 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
2590 * any io in the removed space completes, but it hardly seems 3503 * any io in the removed space completes, but it hardly seems
2591 * worth it. 3504 * worth it.
2592 */ 3505 */
3506 raid5_conf_t *conf = mddev_to_conf(mddev);
3507
2593 sectors &= ~((sector_t)mddev->chunk_size/512 - 1); 3508 sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
2594 mddev->array_size = (sectors * (mddev->raid_disks-1))>>1; 3509 mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1;
2595 set_capacity(mddev->gendisk, mddev->array_size << 1); 3510 set_capacity(mddev->gendisk, mddev->array_size << 1);
2596 mddev->changed = 1; 3511 mddev->changed = 1;
2597 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { 3512 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
@@ -2680,6 +3595,7 @@ static int raid5_start_reshape(mddev_t *mddev)
2680 set_bit(In_sync, &rdev->flags); 3595 set_bit(In_sync, &rdev->flags);
2681 conf->working_disks++; 3596 conf->working_disks++;
2682 added_devices++; 3597 added_devices++;
3598 rdev->recovery_offset = 0;
2683 sprintf(nm, "rd%d", rdev->raid_disk); 3599 sprintf(nm, "rd%d", rdev->raid_disk);
2684 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 3600 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
2685 } else 3601 } else
@@ -2731,6 +3647,17 @@ static void end_reshape(raid5_conf_t *conf)
2731 conf->expand_progress = MaxSector; 3647 conf->expand_progress = MaxSector;
2732 spin_unlock_irq(&conf->device_lock); 3648 spin_unlock_irq(&conf->device_lock);
2733 conf->mddev->reshape_position = MaxSector; 3649 conf->mddev->reshape_position = MaxSector;
3650
3651 /* read-ahead size must cover two whole stripes, which is
 3652 * 2 * (datadisks) * chunksize where 'datadisks' is the number of data devices
3653 */
3654 {
3655 int data_disks = conf->previous_raid_disks - conf->max_degraded;
3656 int stripe = data_disks *
3657 (conf->mddev->chunk_size / PAGE_SIZE);
3658 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3659 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3660 }
2734 } 3661 }
2735} 3662}
2736 3663
@@ -2762,6 +3689,23 @@ static void raid5_quiesce(mddev_t *mddev, int state)
2762 } 3689 }
2763} 3690}
2764 3691
3692static struct mdk_personality raid6_personality =
3693{
3694 .name = "raid6",
3695 .level = 6,
3696 .owner = THIS_MODULE,
3697 .make_request = make_request,
3698 .run = run,
3699 .stop = stop,
3700 .status = status,
3701 .error_handler = error,
3702 .hot_add_disk = raid5_add_disk,
3703 .hot_remove_disk= raid5_remove_disk,
3704 .spare_active = raid5_spare_active,
3705 .sync_request = sync_request,
3706 .resize = raid5_resize,
3707 .quiesce = raid5_quiesce,
3708};
2765static struct mdk_personality raid5_personality = 3709static struct mdk_personality raid5_personality =
2766{ 3710{
2767 .name = "raid5", 3711 .name = "raid5",
@@ -2804,6 +3748,12 @@ static struct mdk_personality raid4_personality =
2804 3748
2805static int __init raid5_init(void) 3749static int __init raid5_init(void)
2806{ 3750{
3751 int e;
3752
3753 e = raid6_select_algo();
3754 if ( e )
3755 return e;
3756 register_md_personality(&raid6_personality);
2807 register_md_personality(&raid5_personality); 3757 register_md_personality(&raid5_personality);
2808 register_md_personality(&raid4_personality); 3758 register_md_personality(&raid4_personality);
2809 return 0; 3759 return 0;
@@ -2811,6 +3761,7 @@ static int __init raid5_init(void)
2811 3761
2812static void raid5_exit(void) 3762static void raid5_exit(void)
2813{ 3763{
3764 unregister_md_personality(&raid6_personality);
2814 unregister_md_personality(&raid5_personality); 3765 unregister_md_personality(&raid5_personality);
2815 unregister_md_personality(&raid4_personality); 3766 unregister_md_personality(&raid4_personality);
2816} 3767}
@@ -2823,3 +3774,10 @@ MODULE_ALIAS("md-raid5");
2823MODULE_ALIAS("md-raid4"); 3774MODULE_ALIAS("md-raid4");
2824MODULE_ALIAS("md-level-5"); 3775MODULE_ALIAS("md-level-5");
2825MODULE_ALIAS("md-level-4"); 3776MODULE_ALIAS("md-level-4");
3777MODULE_ALIAS("md-personality-8"); /* RAID6 */
3778MODULE_ALIAS("md-raid6");
3779MODULE_ALIAS("md-level-6");
3780
3781/* This used to be two separate modules, they were: */
3782MODULE_ALIAS("raid5");
3783MODULE_ALIAS("raid6");
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
deleted file mode 100644
index bc69355e0100..000000000000
--- a/drivers/md/raid6main.c
+++ /dev/null
@@ -1,2427 +0,0 @@
1/*
2 * raid6main.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar
5 * Copyright (C) 2002, 2003 H. Peter Anvin
6 *
7 * RAID-6 management functions. This code is derived from raid5.c.
8 * Last merge from raid5.c bkcvs version 1.79 (kernel 2.6.1).
9 *
10 * Thanks to Penguin Computing for making the RAID-6 development possible
11 * by donating a test server!
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2, or (at your option)
16 * any later version.
17 *
18 * You should have received a copy of the GNU General Public License
19 * (for example /usr/src/linux/COPYING); if not, write to the Free
20 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 */
22
23
24#include <linux/config.h>
25#include <linux/module.h>
26#include <linux/slab.h>
27#include <linux/highmem.h>
28#include <linux/bitops.h>
29#include <asm/atomic.h>
30#include "raid6.h"
31
32#include <linux/raid/bitmap.h>
33
34/*
35 * Stripe cache
36 */
37
38#define NR_STRIPES 256
39#define STRIPE_SIZE PAGE_SIZE
40#define STRIPE_SHIFT (PAGE_SHIFT - 9)
41#define STRIPE_SECTORS (STRIPE_SIZE>>9)
42#define IO_THRESHOLD 1
43#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
44#define HASH_MASK (NR_HASH - 1)
45
46#define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
47
48/* bio's attached to a stripe+device for I/O are linked together in bi_sector
49 * order without overlap. There may be several bio's per stripe+device, and
50 * a bio could span several devices.
51 * When walking this list for a particular stripe+device, we must never proceed
52 * beyond a bio that extends past this device, as the next bio might no longer
53 * be valid.
54 * This macro is used to determine the 'next' bio in the list, given the sector
55 * of the current stripe+device
56 */
57#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
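A minimal user-space sketch of that rule, with a stub type standing in for struct bio (illustrative only): advance along bi_next only while the current bio still ends inside this stripe+device's STRIPE_SECTORS window.

	#include <stdio.h>

	#define STRIPE_SECTORS 8

	struct bio_stub { long long bi_sector; unsigned bi_size; struct bio_stub *bi_next; };

	/* Same test as r5_next_bio: does this bio end before the stripe window does? */
	static struct bio_stub *r5_next_bio_stub(struct bio_stub *bio, long long sect)
	{
		return (bio->bi_sector + (bio->bi_size >> 9) < sect + STRIPE_SECTORS)
		       ? bio->bi_next : NULL;
	}

	int main(void)
	{
		struct bio_stub b2 = { 108, 4 * 512, NULL };	/* extends past this stripe */
		struct bio_stub b1 = { 100, 4 * 512, &b2 };	/* ends at sector 104, inside [100,108) */
		long long dev_sector = 100;

		printf("%s\n", r5_next_bio_stub(&b1, dev_sector) ? "walk to next bio" : "stop");
		printf("%s\n", r5_next_bio_stub(&b2, dev_sector) ? "walk to next bio" : "stop");
		return 0;
	}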
58/*
59 * The following can be used to debug the driver
60 */
61#define RAID6_DEBUG 0 /* Extremely verbose printk */
62#define RAID6_PARANOIA 1 /* Check spinlocks */
63#define RAID6_DUMPSTATE 0 /* Include stripe cache state in /proc/mdstat */
64#if RAID6_PARANOIA && defined(CONFIG_SMP)
65# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
66#else
67# define CHECK_DEVLOCK()
68#endif
69
70#define PRINTK(x...) ((void)(RAID6_DEBUG && printk(KERN_DEBUG x)))
71#if RAID6_DEBUG
72#undef inline
73#undef __inline__
74#define inline
75#define __inline__
76#endif
77
78#if !RAID6_USE_EMPTY_ZERO_PAGE
79/* In .bss so it's zeroed */
80const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
81#endif
82
83static inline int raid6_next_disk(int disk, int raid_disks)
84{
85 disk++;
86 return (disk < raid_disks) ? disk : 0;
87}
88
89static void print_raid6_conf (raid6_conf_t *conf);
90
91static void __release_stripe(raid6_conf_t *conf, struct stripe_head *sh)
92{
93 if (atomic_dec_and_test(&sh->count)) {
94 BUG_ON(!list_empty(&sh->lru));
95 BUG_ON(atomic_read(&conf->active_stripes)==0);
96 if (test_bit(STRIPE_HANDLE, &sh->state)) {
97 if (test_bit(STRIPE_DELAYED, &sh->state))
98 list_add_tail(&sh->lru, &conf->delayed_list);
99 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
100 conf->seq_write == sh->bm_seq)
101 list_add_tail(&sh->lru, &conf->bitmap_list);
102 else {
103 clear_bit(STRIPE_BIT_DELAY, &sh->state);
104 list_add_tail(&sh->lru, &conf->handle_list);
105 }
106 md_wakeup_thread(conf->mddev->thread);
107 } else {
108 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
109 atomic_dec(&conf->preread_active_stripes);
110 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
111 md_wakeup_thread(conf->mddev->thread);
112 }
113 list_add_tail(&sh->lru, &conf->inactive_list);
114 atomic_dec(&conf->active_stripes);
115 if (!conf->inactive_blocked ||
116 atomic_read(&conf->active_stripes) < (conf->max_nr_stripes*3/4))
117 wake_up(&conf->wait_for_stripe);
118 }
119 }
120}
121static void release_stripe(struct stripe_head *sh)
122{
123 raid6_conf_t *conf = sh->raid_conf;
124 unsigned long flags;
125
126 spin_lock_irqsave(&conf->device_lock, flags);
127 __release_stripe(conf, sh);
128 spin_unlock_irqrestore(&conf->device_lock, flags);
129}
130
131static inline void remove_hash(struct stripe_head *sh)
132{
133 PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector);
134
135 hlist_del_init(&sh->hash);
136}
137
138static inline void insert_hash(raid6_conf_t *conf, struct stripe_head *sh)
139{
140 struct hlist_head *hp = stripe_hash(conf, sh->sector);
141
142 PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector);
143
144 CHECK_DEVLOCK();
145 hlist_add_head(&sh->hash, hp);
146}
147
148
149/* find an idle stripe, make sure it is unhashed, and return it. */
150static struct stripe_head *get_free_stripe(raid6_conf_t *conf)
151{
152 struct stripe_head *sh = NULL;
153 struct list_head *first;
154
155 CHECK_DEVLOCK();
156 if (list_empty(&conf->inactive_list))
157 goto out;
158 first = conf->inactive_list.next;
159 sh = list_entry(first, struct stripe_head, lru);
160 list_del_init(first);
161 remove_hash(sh);
162 atomic_inc(&conf->active_stripes);
163out:
164 return sh;
165}
166
167static void shrink_buffers(struct stripe_head *sh, int num)
168{
169 struct page *p;
170 int i;
171
172 for (i=0; i<num ; i++) {
173 p = sh->dev[i].page;
174 if (!p)
175 continue;
176 sh->dev[i].page = NULL;
177 put_page(p);
178 }
179}
180
181static int grow_buffers(struct stripe_head *sh, int num)
182{
183 int i;
184
185 for (i=0; i<num; i++) {
186 struct page *page;
187
188 if (!(page = alloc_page(GFP_KERNEL))) {
189 return 1;
190 }
191 sh->dev[i].page = page;
192 }
193 return 0;
194}
195
196static void raid6_build_block (struct stripe_head *sh, int i);
197
198static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx)
199{
200 raid6_conf_t *conf = sh->raid_conf;
201 int disks = conf->raid_disks, i;
202
203 BUG_ON(atomic_read(&sh->count) != 0);
204 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
205
206 CHECK_DEVLOCK();
207 PRINTK("init_stripe called, stripe %llu\n",
208 (unsigned long long)sh->sector);
209
210 remove_hash(sh);
211
212 sh->sector = sector;
213 sh->pd_idx = pd_idx;
214 sh->state = 0;
215
216 for (i=disks; i--; ) {
217 struct r5dev *dev = &sh->dev[i];
218
219 if (dev->toread || dev->towrite || dev->written ||
220 test_bit(R5_LOCKED, &dev->flags)) {
221 PRINTK("sector=%llx i=%d %p %p %p %d\n",
222 (unsigned long long)sh->sector, i, dev->toread,
223 dev->towrite, dev->written,
224 test_bit(R5_LOCKED, &dev->flags));
225 BUG();
226 }
227 dev->flags = 0;
228 raid6_build_block(sh, i);
229 }
230 insert_hash(conf, sh);
231}
232
233static struct stripe_head *__find_stripe(raid6_conf_t *conf, sector_t sector)
234{
235 struct stripe_head *sh;
236 struct hlist_node *hn;
237
238 CHECK_DEVLOCK();
239 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
240 hlist_for_each_entry (sh, hn, stripe_hash(conf, sector), hash)
241 if (sh->sector == sector)
242 return sh;
243 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
244 return NULL;
245}
246
247static void unplug_slaves(mddev_t *mddev);
248
249static struct stripe_head *get_active_stripe(raid6_conf_t *conf, sector_t sector,
250 int pd_idx, int noblock)
251{
252 struct stripe_head *sh;
253
254 PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector);
255
256 spin_lock_irq(&conf->device_lock);
257
258 do {
259 wait_event_lock_irq(conf->wait_for_stripe,
260 conf->quiesce == 0,
261 conf->device_lock, /* nothing */);
262 sh = __find_stripe(conf, sector);
263 if (!sh) {
264 if (!conf->inactive_blocked)
265 sh = get_free_stripe(conf);
266 if (noblock && sh == NULL)
267 break;
268 if (!sh) {
269 conf->inactive_blocked = 1;
270 wait_event_lock_irq(conf->wait_for_stripe,
271 !list_empty(&conf->inactive_list) &&
272 (atomic_read(&conf->active_stripes)
273 < (conf->max_nr_stripes *3/4)
274 || !conf->inactive_blocked),
275 conf->device_lock,
276 unplug_slaves(conf->mddev);
277 );
278 conf->inactive_blocked = 0;
279 } else
280 init_stripe(sh, sector, pd_idx);
281 } else {
282 if (atomic_read(&sh->count)) {
283 BUG_ON(!list_empty(&sh->lru));
284 } else {
285 if (!test_bit(STRIPE_HANDLE, &sh->state))
286 atomic_inc(&conf->active_stripes);
287 BUG_ON(list_empty(&sh->lru));
288 list_del_init(&sh->lru);
289 }
290 }
291 } while (sh == NULL);
292
293 if (sh)
294 atomic_inc(&sh->count);
295
296 spin_unlock_irq(&conf->device_lock);
297 return sh;
298}
299
300static int grow_one_stripe(raid6_conf_t *conf)
301{
302 struct stripe_head *sh;
303 sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
304 if (!sh)
305 return 0;
306 memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
307 sh->raid_conf = conf;
308 spin_lock_init(&sh->lock);
309
310 if (grow_buffers(sh, conf->raid_disks)) {
311 shrink_buffers(sh, conf->raid_disks);
312 kmem_cache_free(conf->slab_cache, sh);
313 return 0;
314 }
315 /* we just created an active stripe so... */
316 atomic_set(&sh->count, 1);
317 atomic_inc(&conf->active_stripes);
318 INIT_LIST_HEAD(&sh->lru);
319 release_stripe(sh);
320 return 1;
321}
322
323static int grow_stripes(raid6_conf_t *conf, int num)
324{
325 kmem_cache_t *sc;
326 int devs = conf->raid_disks;
327
328 sprintf(conf->cache_name[0], "raid6/%s", mdname(conf->mddev));
329
330 sc = kmem_cache_create(conf->cache_name[0],
331 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
332 0, 0, NULL, NULL);
333 if (!sc)
334 return 1;
335 conf->slab_cache = sc;
336 while (num--)
337 if (!grow_one_stripe(conf))
338 return 1;
339 return 0;
340}
341
342static int drop_one_stripe(raid6_conf_t *conf)
343{
344 struct stripe_head *sh;
345 spin_lock_irq(&conf->device_lock);
346 sh = get_free_stripe(conf);
347 spin_unlock_irq(&conf->device_lock);
348 if (!sh)
349 return 0;
350 BUG_ON(atomic_read(&sh->count));
351 shrink_buffers(sh, conf->raid_disks);
352 kmem_cache_free(conf->slab_cache, sh);
353 atomic_dec(&conf->active_stripes);
354 return 1;
355}
356
357static void shrink_stripes(raid6_conf_t *conf)
358{
359 while (drop_one_stripe(conf))
360 ;
361
362 if (conf->slab_cache)
363 kmem_cache_destroy(conf->slab_cache);
364 conf->slab_cache = NULL;
365}
366
367static int raid6_end_read_request(struct bio * bi, unsigned int bytes_done,
368 int error)
369{
370 struct stripe_head *sh = bi->bi_private;
371 raid6_conf_t *conf = sh->raid_conf;
372 int disks = conf->raid_disks, i;
373 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
374
375 if (bi->bi_size)
376 return 1;
377
378 for (i=0 ; i<disks; i++)
379 if (bi == &sh->dev[i].req)
380 break;
381
382 PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n",
383 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
384 uptodate);
385 if (i == disks) {
386 BUG();
387 return 0;
388 }
389
390 if (uptodate) {
391#if 0
392 struct bio *bio;
393 unsigned long flags;
394 spin_lock_irqsave(&conf->device_lock, flags);
395 /* we can return a buffer if we bypassed the cache or
396 * if the top buffer is not in highmem. If there are
397 * multiple buffers, leave the extra work to
398 * handle_stripe
399 */
400 buffer = sh->bh_read[i];
401 if (buffer &&
402 (!PageHighMem(buffer->b_page)
403 || buffer->b_page == bh->b_page )
404 ) {
405 sh->bh_read[i] = buffer->b_reqnext;
406 buffer->b_reqnext = NULL;
407 } else
408 buffer = NULL;
409 spin_unlock_irqrestore(&conf->device_lock, flags);
410 if (sh->bh_page[i]==bh->b_page)
411 set_buffer_uptodate(bh);
412 if (buffer) {
413 if (buffer->b_page != bh->b_page)
414 memcpy(buffer->b_data, bh->b_data, bh->b_size);
415 buffer->b_end_io(buffer, 1);
416 }
417#else
418 set_bit(R5_UPTODATE, &sh->dev[i].flags);
419#endif
420 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
421 printk(KERN_INFO "raid6: read error corrected!!\n");
422 clear_bit(R5_ReadError, &sh->dev[i].flags);
423 clear_bit(R5_ReWrite, &sh->dev[i].flags);
424 }
425 if (atomic_read(&conf->disks[i].rdev->read_errors))
426 atomic_set(&conf->disks[i].rdev->read_errors, 0);
427 } else {
428 int retry = 0;
429 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
430 atomic_inc(&conf->disks[i].rdev->read_errors);
431 if (conf->mddev->degraded)
432 printk(KERN_WARNING "raid6: read error not correctable.\n");
433 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
434 /* Oh, no!!! */
435 printk(KERN_WARNING "raid6: read error NOT corrected!!\n");
436 else if (atomic_read(&conf->disks[i].rdev->read_errors)
437 > conf->max_nr_stripes)
438 printk(KERN_WARNING
439 "raid6: Too many read errors, failing device.\n");
440 else
441 retry = 1;
442 if (retry)
443 set_bit(R5_ReadError, &sh->dev[i].flags);
444 else {
445 clear_bit(R5_ReadError, &sh->dev[i].flags);
446 clear_bit(R5_ReWrite, &sh->dev[i].flags);
447 md_error(conf->mddev, conf->disks[i].rdev);
448 }
449 }
450 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
451#if 0
452 /* must restore b_page before unlocking buffer... */
453 if (sh->bh_page[i] != bh->b_page) {
454 bh->b_page = sh->bh_page[i];
455 bh->b_data = page_address(bh->b_page);
456 clear_buffer_uptodate(bh);
457 }
458#endif
459 clear_bit(R5_LOCKED, &sh->dev[i].flags);
460 set_bit(STRIPE_HANDLE, &sh->state);
461 release_stripe(sh);
462 return 0;
463}
464
465static int raid6_end_write_request (struct bio *bi, unsigned int bytes_done,
466 int error)
467{
468 struct stripe_head *sh = bi->bi_private;
469 raid6_conf_t *conf = sh->raid_conf;
470 int disks = conf->raid_disks, i;
471 unsigned long flags;
472 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
473
474 if (bi->bi_size)
475 return 1;
476
477 for (i=0 ; i<disks; i++)
478 if (bi == &sh->dev[i].req)
479 break;
480
481 PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n",
482 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
483 uptodate);
484 if (i == disks) {
485 BUG();
486 return 0;
487 }
488
489 spin_lock_irqsave(&conf->device_lock, flags);
490 if (!uptodate)
491 md_error(conf->mddev, conf->disks[i].rdev);
492
493 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
494
495 clear_bit(R5_LOCKED, &sh->dev[i].flags);
496 set_bit(STRIPE_HANDLE, &sh->state);
497 __release_stripe(conf, sh);
498 spin_unlock_irqrestore(&conf->device_lock, flags);
499 return 0;
500}
501
502
503static sector_t compute_blocknr(struct stripe_head *sh, int i);
504
505static void raid6_build_block (struct stripe_head *sh, int i)
506{
507 struct r5dev *dev = &sh->dev[i];
508 int pd_idx = sh->pd_idx;
509 int qd_idx = raid6_next_disk(pd_idx, sh->raid_conf->raid_disks);
510
511 bio_init(&dev->req);
512 dev->req.bi_io_vec = &dev->vec;
513 dev->req.bi_vcnt++;
514 dev->req.bi_max_vecs++;
515 dev->vec.bv_page = dev->page;
516 dev->vec.bv_len = STRIPE_SIZE;
517 dev->vec.bv_offset = 0;
518
519 dev->req.bi_sector = sh->sector;
520 dev->req.bi_private = sh;
521
522 dev->flags = 0;
523 if (i != pd_idx && i != qd_idx)
524 dev->sector = compute_blocknr(sh, i);
525}
526
527static void error(mddev_t *mddev, mdk_rdev_t *rdev)
528{
529 char b[BDEVNAME_SIZE];
530 raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
531 PRINTK("raid6: error called\n");
532
533 if (!test_bit(Faulty, &rdev->flags)) {
534 mddev->sb_dirty = 1;
535 if (test_bit(In_sync, &rdev->flags)) {
536 conf->working_disks--;
537 mddev->degraded++;
538 conf->failed_disks++;
539 clear_bit(In_sync, &rdev->flags);
540 /*
541 * if recovery was running, make sure it aborts.
542 */
543 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
544 }
545 set_bit(Faulty, &rdev->flags);
546 printk (KERN_ALERT
547 "raid6: Disk failure on %s, disabling device."
548 " Operation continuing on %d devices\n",
549 bdevname(rdev->bdev,b), conf->working_disks);
550 }
551}
552
553/*
554 * Input: a 'big' sector number,
555 * Output: index of the data and parity disk, and the sector # in them.
556 */
557static sector_t raid6_compute_sector(sector_t r_sector, unsigned int raid_disks,
558 unsigned int data_disks, unsigned int * dd_idx,
559 unsigned int * pd_idx, raid6_conf_t *conf)
560{
561 long stripe;
562 unsigned long chunk_number;
563 unsigned int chunk_offset;
564 sector_t new_sector;
565 int sectors_per_chunk = conf->chunk_size >> 9;
566
567 /* First compute the information on this sector */
568
569 /*
570 * Compute the chunk number and the sector offset inside the chunk
571 */
572 chunk_offset = sector_div(r_sector, sectors_per_chunk);
573 chunk_number = r_sector;
574 if ( r_sector != chunk_number ) {
575 printk(KERN_CRIT "raid6: ERROR: r_sector = %llu, chunk_number = %lu\n",
576 (unsigned long long)r_sector, (unsigned long)chunk_number);
577 BUG();
578 }
579
580 /*
581 * Compute the stripe number
582 */
583 stripe = chunk_number / data_disks;
584
585 /*
586 * Compute the data disk and parity disk indexes inside the stripe
587 */
588 *dd_idx = chunk_number % data_disks;
589
590 /*
591 * Select the parity disk based on the user selected algorithm.
592 */
593
594 /**** FIX THIS ****/
595 switch (conf->algorithm) {
596 case ALGORITHM_LEFT_ASYMMETRIC:
597 *pd_idx = raid_disks - 1 - (stripe % raid_disks);
598 if (*pd_idx == raid_disks-1)
599 (*dd_idx)++; /* Q D D D P */
600 else if (*dd_idx >= *pd_idx)
601 (*dd_idx) += 2; /* D D P Q D */
602 break;
603 case ALGORITHM_RIGHT_ASYMMETRIC:
604 *pd_idx = stripe % raid_disks;
605 if (*pd_idx == raid_disks-1)
606 (*dd_idx)++; /* Q D D D P */
607 else if (*dd_idx >= *pd_idx)
608 (*dd_idx) += 2; /* D D P Q D */
609 break;
610 case ALGORITHM_LEFT_SYMMETRIC:
611 *pd_idx = raid_disks - 1 - (stripe % raid_disks);
612 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
613 break;
614 case ALGORITHM_RIGHT_SYMMETRIC:
615 *pd_idx = stripe % raid_disks;
616 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
617 break;
618 default:
619 printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
620 conf->algorithm);
621 }
622
623 PRINTK("raid6: chunk_number = %lu, pd_idx = %u, dd_idx = %u\n",
624 chunk_number, *pd_idx, *dd_idx);
625
626 /*
627 * Finally, compute the new sector number
628 */
629 new_sector = (sector_t) stripe * sectors_per_chunk + chunk_offset;
630 return new_sector;
631}
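All four layouts above place Q on the drive immediately after P and push the data-disk index past both. A small stand-alone sketch of the ALGORITHM_LEFT_SYMMETRIC rotation for a hypothetical 5-drive set, printing which physical drive holds P, Q and each data block per stripe (illustrative only, not the kernel mapping code):

	#include <stdio.h>

	int main(void)
	{
		int raid_disks = 5, data_disks = raid_disks - 2;
		long stripe;

		for (stripe = 0; stripe < 5; stripe++) {
			int pd_idx = raid_disks - 1 - (int)(stripe % raid_disks);	/* P placement */
			int qd_idx = (pd_idx + 1) % raid_disks;				/* Q always follows P */
			int d;

			printf("stripe %ld: P=%d Q=%d data:", stripe, pd_idx, qd_idx);
			for (d = 0; d < data_disks; d++)
				printf(" %d", (pd_idx + 2 + d) % raid_disks);	/* LEFT_SYMMETRIC data rotation */
			printf("\n");
		}
		return 0;
	}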
632
633
634static sector_t compute_blocknr(struct stripe_head *sh, int i)
635{
636 raid6_conf_t *conf = sh->raid_conf;
637 int raid_disks = conf->raid_disks, data_disks = raid_disks - 2;
638 sector_t new_sector = sh->sector, check;
639 int sectors_per_chunk = conf->chunk_size >> 9;
640 sector_t stripe;
641 int chunk_offset;
642 int chunk_number, dummy1, dummy2, dd_idx = i;
643 sector_t r_sector;
644 int i0 = i;
645
646 chunk_offset = sector_div(new_sector, sectors_per_chunk);
647 stripe = new_sector;
648 if ( new_sector != stripe ) {
649 printk(KERN_CRIT "raid6: ERROR: new_sector = %llu, stripe = %lu\n",
650 (unsigned long long)new_sector, (unsigned long)stripe);
651 BUG();
652 }
653
654 switch (conf->algorithm) {
655 case ALGORITHM_LEFT_ASYMMETRIC:
656 case ALGORITHM_RIGHT_ASYMMETRIC:
657 if (sh->pd_idx == raid_disks-1)
658 i--; /* Q D D D P */
659 else if (i > sh->pd_idx)
660 i -= 2; /* D D P Q D */
661 break;
662 case ALGORITHM_LEFT_SYMMETRIC:
663 case ALGORITHM_RIGHT_SYMMETRIC:
664 if (sh->pd_idx == raid_disks-1)
665 i--; /* Q D D D P */
666 else {
667 /* D D P Q D */
668 if (i < sh->pd_idx)
669 i += raid_disks;
670 i -= (sh->pd_idx + 2);
671 }
672 break;
673 default:
674 printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
675 conf->algorithm);
676 }
677
678 PRINTK("raid6: compute_blocknr: pd_idx = %u, i0 = %u, i = %u\n", sh->pd_idx, i0, i);
679
680 chunk_number = stripe * data_disks + i;
681 r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
682
683 check = raid6_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
684 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
685 printk(KERN_CRIT "raid6: compute_blocknr: map not correct\n");
686 return 0;
687 }
688 return r_sector;
689}
690
691
692
693/*
694 * Copy data between a page in the stripe cache, and one or more bion
695 * The page could align with the middle of the bio, or there could be
696 * several bion, each with several bio_vecs, which cover part of the page
697 * Multiple bion are linked together on bi_next. There may be extras
698 * at the end of this list. We ignore them.
699 */
700static void copy_data(int frombio, struct bio *bio,
701 struct page *page,
702 sector_t sector)
703{
704 char *pa = page_address(page);
705 struct bio_vec *bvl;
706 int i;
707 int page_offset;
708
709 if (bio->bi_sector >= sector)
710 page_offset = (signed)(bio->bi_sector - sector) * 512;
711 else
712 page_offset = (signed)(sector - bio->bi_sector) * -512;
713 bio_for_each_segment(bvl, bio, i) {
714 int len = bio_iovec_idx(bio,i)->bv_len;
715 int clen;
716 int b_offset = 0;
717
718 if (page_offset < 0) {
719 b_offset = -page_offset;
720 page_offset += b_offset;
721 len -= b_offset;
722 }
723
724 if (len > 0 && page_offset + len > STRIPE_SIZE)
725 clen = STRIPE_SIZE - page_offset;
726 else clen = len;
727
728 if (clen > 0) {
729 char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
730 if (frombio)
731 memcpy(pa+page_offset, ba+b_offset, clen);
732 else
733 memcpy(ba+b_offset, pa+page_offset, clen);
734 __bio_kunmap_atomic(ba, KM_USER0);
735 }
736 if (clen < len) /* hit end of page */
737 break;
738 page_offset += len;
739 }
740}
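The offset arithmetic above can be seen in isolation: the signed byte offset between the bio start and the stripe sector decides where in the page to copy, and each segment is clipped at the page boundary. A worked sketch with illustrative sector numbers (not kernel code):

	#include <stdio.h>

	#define STRIPE_SIZE 4096	/* one stripe page */

	int main(void)
	{
		long long bio_sector = 1028, stripe_sector = 1024;	/* bio starts 4 sectors into the page */
		int page_offset = (int)(bio_sector - stripe_sector) * 512;
		int seg_len = 8192;					/* length of the first bio_vec, in bytes */
		int clen = (page_offset + seg_len > STRIPE_SIZE)
			   ? STRIPE_SIZE - page_offset : seg_len;	/* clip at the page boundary */

		printf("copy %d bytes at page offset %d; %d bytes belong to later stripes\n",
		       clen, page_offset, seg_len - clen);
		return 0;
	}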
741
742#define check_xor() do { \
743 if (count == MAX_XOR_BLOCKS) { \
744 xor_block(count, STRIPE_SIZE, ptr); \
745 count = 1; \
746 } \
747 } while(0)
748
749/* Compute P and Q syndromes */
750static void compute_parity(struct stripe_head *sh, int method)
751{
752 raid6_conf_t *conf = sh->raid_conf;
753 int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
754 struct bio *chosen;
755 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
756 void *ptrs[disks];
757
758 qd_idx = raid6_next_disk(pd_idx, disks);
759 d0_idx = raid6_next_disk(qd_idx, disks);
760
761 PRINTK("compute_parity, stripe %llu, method %d\n",
762 (unsigned long long)sh->sector, method);
763
764 switch(method) {
765 case READ_MODIFY_WRITE:
766 BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */
767 case RECONSTRUCT_WRITE:
768 for (i= disks; i-- ;)
769 if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
770 chosen = sh->dev[i].towrite;
771 sh->dev[i].towrite = NULL;
772
773 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
774 wake_up(&conf->wait_for_overlap);
775
776 BUG_ON(sh->dev[i].written);
777 sh->dev[i].written = chosen;
778 }
779 break;
780 case CHECK_PARITY:
781 BUG(); /* Not implemented yet */
782 }
783
784 for (i = disks; i--;)
785 if (sh->dev[i].written) {
786 sector_t sector = sh->dev[i].sector;
787 struct bio *wbi = sh->dev[i].written;
788 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
789 copy_data(1, wbi, sh->dev[i].page, sector);
790 wbi = r5_next_bio(wbi, sector);
791 }
792
793 set_bit(R5_LOCKED, &sh->dev[i].flags);
794 set_bit(R5_UPTODATE, &sh->dev[i].flags);
795 }
796
797// switch(method) {
798// case RECONSTRUCT_WRITE:
799// case CHECK_PARITY:
800// case UPDATE_PARITY:
801 /* Note that unlike RAID-5, the ordering of the disks matters greatly. */
802 /* FIX: Is this ordering of drives even remotely optimal? */
803 count = 0;
804 i = d0_idx;
805 do {
806 ptrs[count++] = page_address(sh->dev[i].page);
807 if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
808 printk("block %d/%d not uptodate on parity calc\n", i,count);
809 i = raid6_next_disk(i, disks);
810 } while ( i != d0_idx );
811// break;
812// }
813
814 raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs);
815
816 switch(method) {
817 case RECONSTRUCT_WRITE:
818 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
819 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
820 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
821 set_bit(R5_LOCKED, &sh->dev[qd_idx].flags);
822 break;
823 case UPDATE_PARITY:
824 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
825 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
826 break;
827 }
828}
829
830/* Compute one missing block */
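/*
 * When 'nozero' is set the target page is not cleared first: the XOR of
 * the other up-to-date blocks is accumulated on top of whatever the page
 * already holds.  The parity-check path relies on this (XOR the data
 * blocks into the existing P page, then test the result with
 * page_is_zero()), and R5_UPTODATE is cleared afterwards because the
 * page no longer contains valid block data.
 */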
831static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
832{
833 raid6_conf_t *conf = sh->raid_conf;
834 int i, count, disks = conf->raid_disks;
835 void *ptr[MAX_XOR_BLOCKS], *p;
836 int pd_idx = sh->pd_idx;
837 int qd_idx = raid6_next_disk(pd_idx, disks);
838
839 PRINTK("compute_block_1, stripe %llu, idx %d\n",
840 (unsigned long long)sh->sector, dd_idx);
841
842 if ( dd_idx == qd_idx ) {
843 /* We're actually computing the Q drive */
844 compute_parity(sh, UPDATE_PARITY);
845 } else {
846 ptr[0] = page_address(sh->dev[dd_idx].page);
847 if (!nozero) memset(ptr[0], 0, STRIPE_SIZE);
848 count = 1;
849 for (i = disks ; i--; ) {
850 if (i == dd_idx || i == qd_idx)
851 continue;
852 p = page_address(sh->dev[i].page);
853 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
854 ptr[count++] = p;
855 else
856				printk("compute_block_1() %d, stripe %llu, %d"
857 " not present\n", dd_idx,
858 (unsigned long long)sh->sector, i);
859
860 check_xor();
861 }
862 if (count != 1)
863 xor_block(count, STRIPE_SIZE, ptr);
864 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
865 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
866 }
867}
868
869/* Compute two missing blocks */
870static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
871{
872 raid6_conf_t *conf = sh->raid_conf;
873 int i, count, disks = conf->raid_disks;
874 int pd_idx = sh->pd_idx;
875 int qd_idx = raid6_next_disk(pd_idx, disks);
876 int d0_idx = raid6_next_disk(qd_idx, disks);
877 int faila, failb;
878
879 /* faila and failb are disk numbers relative to d0_idx */
880	/* pd_idx becomes disks-2 and qd_idx becomes disks-1 */
881 faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx;
882 failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx;
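	/* e.g. (illustrative values) with disks == 6 and pd_idx == 2:
	 * qd_idx == 3 and d0_idx == 4, so dd_idx 0 maps to 0+(6-4) = 2,
	 * dd_idx 5 maps to 5-4 = 1, pd_idx maps to 4 (disks-2) and
	 * qd_idx maps to 5 (disks-1), matching the comment above.
	 */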
883
884 BUG_ON(faila == failb);
885 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
886
887 PRINTK("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
888 (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
889
890 if ( failb == disks-1 ) {
891 /* Q disk is one of the missing disks */
892 if ( faila == disks-2 ) {
893 /* Missing P+Q, just recompute */
894 compute_parity(sh, UPDATE_PARITY);
895 return;
896 } else {
897 /* We're missing D+Q; recompute D from P */
898 compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
899 compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */
900 return;
901 }
902 }
903
904 /* We're missing D+P or D+D; build pointer table */
905 {
906 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
907 void *ptrs[disks];
908
909 count = 0;
910 i = d0_idx;
911 do {
912 ptrs[count++] = page_address(sh->dev[i].page);
913 i = raid6_next_disk(i, disks);
914 if (i != dd_idx1 && i != dd_idx2 &&
915 !test_bit(R5_UPTODATE, &sh->dev[i].flags))
916 printk("compute_2 with missing block %d/%d\n", count, i);
917 } while ( i != d0_idx );
918
919 if ( failb == disks-2 ) {
920 /* We're missing D+P. */
921 raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
922 } else {
923 /* We're missing D+D. */
924 raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
925 }
926
927		/* Either of the above calls updates both missing blocks */
928 set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
929 set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
930 }
931}
932
933
934/*
935 * Each stripe/dev can have one or more bion attached.
936 * toread/towrite point to the first in a chain.
937 * The bi_next chain must be in order.
938 */
939static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
940{
941 struct bio **bip;
942 raid6_conf_t *conf = sh->raid_conf;
943 int firstwrite=0;
944
945 PRINTK("adding bh b#%llu to stripe s#%llu\n",
946 (unsigned long long)bi->bi_sector,
947 (unsigned long long)sh->sector);
948
949
950 spin_lock(&sh->lock);
951 spin_lock_irq(&conf->device_lock);
952 if (forwrite) {
953 bip = &sh->dev[dd_idx].towrite;
954 if (*bip == NULL && sh->dev[dd_idx].written == NULL)
955 firstwrite = 1;
956 } else
957 bip = &sh->dev[dd_idx].toread;
958 while (*bip && (*bip)->bi_sector < bi->bi_sector) {
959 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
960 goto overlap;
961 bip = &(*bip)->bi_next;
962 }
963 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
964 goto overlap;
965
966 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
967 if (*bip)
968 bi->bi_next = *bip;
969 *bip = bi;
970 bi->bi_phys_segments ++;
971 spin_unlock_irq(&conf->device_lock);
972 spin_unlock(&sh->lock);
973
974 PRINTK("added bi b#%llu to stripe s#%llu, disk %d.\n",
975 (unsigned long long)bi->bi_sector,
976 (unsigned long long)sh->sector, dd_idx);
977
978 if (conf->mddev->bitmap && firstwrite) {
979 sh->bm_seq = conf->seq_write;
980 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
981 STRIPE_SECTORS, 0);
982 set_bit(STRIPE_BIT_DELAY, &sh->state);
983 }
984
985 if (forwrite) {
986 /* check if page is covered */
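		/* e.g. (illustrative values) with STRIPE_SECTORS == 8 and
		 * a stripe sector of 64, two queued writes covering
		 * sectors 64-67 and 68-71 advance 'sector' to 72, which
		 * is >= 64 + 8, so the whole page is covered and
		 * R5_OVERWRITE is set.
		 */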
987 sector_t sector = sh->dev[dd_idx].sector;
988 for (bi=sh->dev[dd_idx].towrite;
989 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
990 bi && bi->bi_sector <= sector;
991 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
992 if (bi->bi_sector + (bi->bi_size>>9) >= sector)
993 sector = bi->bi_sector + (bi->bi_size>>9);
994 }
995 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
996 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
997 }
998 return 1;
999
1000 overlap:
1001 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
1002 spin_unlock_irq(&conf->device_lock);
1003 spin_unlock(&sh->lock);
1004 return 0;
1005}
1006
1007
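/*
 * Return true if the whole STRIPE_SIZE page is zero: check the first
 * word, then memcmp() the page against itself shifted by four bytes,
 * which covers every remaining byte without needing a zeroed buffer.
 */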
1008static int page_is_zero(struct page *p)
1009{
1010 char *a = page_address(p);
1011 return ((*(u32*)a) == 0 &&
1012 memcmp(a, a+4, STRIPE_SIZE-4)==0);
1013}
1014/*
1015 * handle_stripe - do things to a stripe.
1016 *
1017 * We lock the stripe and then examine the state of various bits
1018 * to see what needs to be done.
1019 * Possible results:
1020 * return some read requests which now have data
1021 * return some write requests which are safely on disc
1022 * schedule a read on some buffers
1023 * schedule a write of some buffers
1024 * return confirmation of parity correctness
1025 *
1026 * Parity calculations are done inside the stripe lock
1027 * buffers are taken off the toread/towrite lists, and the stripe's
1028 * cache pages get R5_LOCKED set before the stripe lock is released.
1029 *
1030 */
1031
1032static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
1033{
1034 raid6_conf_t *conf = sh->raid_conf;
1035 int disks = conf->raid_disks;
1036 struct bio *return_bi= NULL;
1037 struct bio *bi;
1038 int i;
1039 int syncing;
1040 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
1041 int non_overwrite = 0;
1042 int failed_num[2] = {0, 0};
1043 struct r5dev *dev, *pdev, *qdev;
1044 int pd_idx = sh->pd_idx;
1045 int qd_idx = raid6_next_disk(pd_idx, disks);
1046 int p_failed, q_failed;
1047
1048 PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n",
1049 (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count),
1050 pd_idx, qd_idx);
1051
1052 spin_lock(&sh->lock);
1053 clear_bit(STRIPE_HANDLE, &sh->state);
1054 clear_bit(STRIPE_DELAYED, &sh->state);
1055
1056 syncing = test_bit(STRIPE_SYNCING, &sh->state);
1057 /* Now to look around and see what can be done */
1058
1059 rcu_read_lock();
1060 for (i=disks; i--; ) {
1061 mdk_rdev_t *rdev;
1062 dev = &sh->dev[i];
1063 clear_bit(R5_Insync, &dev->flags);
1064
1065 PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
1066 i, dev->flags, dev->toread, dev->towrite, dev->written);
1067 /* maybe we can reply to a read */
1068 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
1069 struct bio *rbi, *rbi2;
1070 PRINTK("Return read for disc %d\n", i);
1071 spin_lock_irq(&conf->device_lock);
1072 rbi = dev->toread;
1073 dev->toread = NULL;
1074 if (test_and_clear_bit(R5_Overlap, &dev->flags))
1075 wake_up(&conf->wait_for_overlap);
1076 spin_unlock_irq(&conf->device_lock);
1077 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
1078 copy_data(0, rbi, dev->page, dev->sector);
1079 rbi2 = r5_next_bio(rbi, dev->sector);
1080 spin_lock_irq(&conf->device_lock);
1081 if (--rbi->bi_phys_segments == 0) {
1082 rbi->bi_next = return_bi;
1083 return_bi = rbi;
1084 }
1085 spin_unlock_irq(&conf->device_lock);
1086 rbi = rbi2;
1087 }
1088 }
1089
1090 /* now count some things */
1091 if (test_bit(R5_LOCKED, &dev->flags)) locked++;
1092 if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
1093
1094
1095 if (dev->toread) to_read++;
1096 if (dev->towrite) {
1097 to_write++;
1098 if (!test_bit(R5_OVERWRITE, &dev->flags))
1099 non_overwrite++;
1100 }
1101 if (dev->written) written++;
1102 rdev = rcu_dereference(conf->disks[i].rdev);
1103 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
1104 /* The ReadError flag will just be confusing now */
1105 clear_bit(R5_ReadError, &dev->flags);
1106 clear_bit(R5_ReWrite, &dev->flags);
1107 }
1108 if (!rdev || !test_bit(In_sync, &rdev->flags)
1109 || test_bit(R5_ReadError, &dev->flags)) {
1110 if ( failed < 2 )
1111 failed_num[failed] = i;
1112 failed++;
1113 } else
1114 set_bit(R5_Insync, &dev->flags);
1115 }
1116 rcu_read_unlock();
1117 PRINTK("locked=%d uptodate=%d to_read=%d"
1118 " to_write=%d failed=%d failed_num=%d,%d\n",
1119 locked, uptodate, to_read, to_write, failed,
1120 failed_num[0], failed_num[1]);
1121 /* check if the array has lost >2 devices and, if so, some requests might
1122 * need to be failed
1123 */
1124 if (failed > 2 && to_read+to_write+written) {
1125 for (i=disks; i--; ) {
1126 int bitmap_end = 0;
1127
1128 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1129 mdk_rdev_t *rdev;
1130 rcu_read_lock();
1131 rdev = rcu_dereference(conf->disks[i].rdev);
1132 if (rdev && test_bit(In_sync, &rdev->flags))
1133 /* multiple read failures in one stripe */
1134 md_error(conf->mddev, rdev);
1135 rcu_read_unlock();
1136 }
1137
1138 spin_lock_irq(&conf->device_lock);
1139 /* fail all writes first */
1140 bi = sh->dev[i].towrite;
1141 sh->dev[i].towrite = NULL;
1142 if (bi) { to_write--; bitmap_end = 1; }
1143
1144 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1145 wake_up(&conf->wait_for_overlap);
1146
1147 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
1148 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1149 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1150 if (--bi->bi_phys_segments == 0) {
1151 md_write_end(conf->mddev);
1152 bi->bi_next = return_bi;
1153 return_bi = bi;
1154 }
1155 bi = nextbi;
1156 }
1157 /* and fail all 'written' */
1158 bi = sh->dev[i].written;
1159 sh->dev[i].written = NULL;
1160 if (bi) bitmap_end = 1;
1161 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
1162 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
1163 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1164 if (--bi->bi_phys_segments == 0) {
1165 md_write_end(conf->mddev);
1166 bi->bi_next = return_bi;
1167 return_bi = bi;
1168 }
1169 bi = bi2;
1170 }
1171
1172 /* fail any reads if this device is non-operational */
1173 if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
1174 test_bit(R5_ReadError, &sh->dev[i].flags)) {
1175 bi = sh->dev[i].toread;
1176 sh->dev[i].toread = NULL;
1177 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1178 wake_up(&conf->wait_for_overlap);
1179 if (bi) to_read--;
1180 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
1181 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1182 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1183 if (--bi->bi_phys_segments == 0) {
1184 bi->bi_next = return_bi;
1185 return_bi = bi;
1186 }
1187 bi = nextbi;
1188 }
1189 }
1190 spin_unlock_irq(&conf->device_lock);
1191 if (bitmap_end)
1192 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1193 STRIPE_SECTORS, 0, 0);
1194 }
1195 }
1196 if (failed > 2 && syncing) {
1197 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
1198 clear_bit(STRIPE_SYNCING, &sh->state);
1199 syncing = 0;
1200 }
1201
1202 /*
1203 * might be able to return some write requests if the parity blocks
1204 * are safe, or on a failed drive
1205 */
1206 pdev = &sh->dev[pd_idx];
1207 p_failed = (failed >= 1 && failed_num[0] == pd_idx)
1208 || (failed >= 2 && failed_num[1] == pd_idx);
1209 qdev = &sh->dev[qd_idx];
1210 q_failed = (failed >= 1 && failed_num[0] == qd_idx)
1211 || (failed >= 2 && failed_num[1] == qd_idx);
1212
1213 if ( written &&
1214 ( p_failed || ((test_bit(R5_Insync, &pdev->flags)
1215 && !test_bit(R5_LOCKED, &pdev->flags)
1216 && test_bit(R5_UPTODATE, &pdev->flags))) ) &&
1217 ( q_failed || ((test_bit(R5_Insync, &qdev->flags)
1218 && !test_bit(R5_LOCKED, &qdev->flags)
1219 && test_bit(R5_UPTODATE, &qdev->flags))) ) ) {
1220 /* any written block on an uptodate or failed drive can be
1221 * returned. Note that if we 'wrote' to a failed drive,
1222 * it will be UPTODATE, but never LOCKED, so we don't need
1223 * to test 'failed' directly.
1224 */
1225 for (i=disks; i--; )
1226 if (sh->dev[i].written) {
1227 dev = &sh->dev[i];
1228 if (!test_bit(R5_LOCKED, &dev->flags) &&
1229 test_bit(R5_UPTODATE, &dev->flags) ) {
1230 /* We can return any write requests */
1231 int bitmap_end = 0;
1232 struct bio *wbi, *wbi2;
1233 PRINTK("Return write for stripe %llu disc %d\n",
1234 (unsigned long long)sh->sector, i);
1235 spin_lock_irq(&conf->device_lock);
1236 wbi = dev->written;
1237 dev->written = NULL;
1238 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
1239 wbi2 = r5_next_bio(wbi, dev->sector);
1240 if (--wbi->bi_phys_segments == 0) {
1241 md_write_end(conf->mddev);
1242 wbi->bi_next = return_bi;
1243 return_bi = wbi;
1244 }
1245 wbi = wbi2;
1246 }
1247 if (dev->towrite == NULL)
1248 bitmap_end = 1;
1249 spin_unlock_irq(&conf->device_lock);
1250 if (bitmap_end)
1251 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1252 STRIPE_SECTORS,
1253 !test_bit(STRIPE_DEGRADED, &sh->state), 0);
1254 }
1255 }
1256 }
1257
1258 /* Now we might consider reading some blocks, either to check/generate
1259 * parity, or to satisfy requests
1260 * or to load a block that is being partially written.
1261 */
1262 if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) {
1263 for (i=disks; i--;) {
1264 dev = &sh->dev[i];
1265 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1266 (dev->toread ||
1267 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
1268 syncing ||
1269 (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
1270 (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
1271 )
1272 ) {
1273 /* we would like to get this block, possibly
1274 * by computing it, but we might not be able to
1275 */
1276 if (uptodate == disks-1) {
1277 PRINTK("Computing stripe %llu block %d\n",
1278 (unsigned long long)sh->sector, i);
1279 compute_block_1(sh, i, 0);
1280 uptodate++;
1281 } else if ( uptodate == disks-2 && failed >= 2 ) {
1282 /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
1283 int other;
1284 for (other=disks; other--;) {
1285 if ( other == i )
1286 continue;
1287 if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) )
1288 break;
1289 }
1290 BUG_ON(other < 0);
1291 PRINTK("Computing stripe %llu blocks %d,%d\n",
1292 (unsigned long long)sh->sector, i, other);
1293 compute_block_2(sh, i, other);
1294 uptodate += 2;
1295 } else if (test_bit(R5_Insync, &dev->flags)) {
1296 set_bit(R5_LOCKED, &dev->flags);
1297 set_bit(R5_Wantread, &dev->flags);
1298#if 0
1299 /* if I am just reading this block and we don't have
1300 a failed drive, or any pending writes then sidestep the cache */
1301 if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
1302 ! syncing && !failed && !to_write) {
1303 sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
1304 sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
1305 }
1306#endif
1307 locked++;
1308 PRINTK("Reading block %d (sync=%d)\n",
1309 i, syncing);
1310 }
1311 }
1312 }
1313 set_bit(STRIPE_HANDLE, &sh->state);
1314 }
1315
1316 /* now to consider writing and what else, if anything should be read */
1317 if (to_write) {
1318 int rcw=0, must_compute=0;
1319 for (i=disks ; i--;) {
1320 dev = &sh->dev[i];
1321 /* Would I have to read this buffer for reconstruct_write */
1322 if (!test_bit(R5_OVERWRITE, &dev->flags)
1323 && i != pd_idx && i != qd_idx
1324 && (!test_bit(R5_LOCKED, &dev->flags)
1325#if 0
1326 || sh->bh_page[i] != bh->b_page
1327#endif
1328 ) &&
1329 !test_bit(R5_UPTODATE, &dev->flags)) {
1330 if (test_bit(R5_Insync, &dev->flags)) rcw++;
1331 else {
1332 PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags);
1333 must_compute++;
1334 }
1335 }
1336 }
1337 PRINTK("for sector %llu, rcw=%d, must_compute=%d\n",
1338 (unsigned long long)sh->sector, rcw, must_compute);
1339 set_bit(STRIPE_HANDLE, &sh->state);
1340
1341 if (rcw > 0)
1342 /* want reconstruct write, but need to get some data */
1343 for (i=disks; i--;) {
1344 dev = &sh->dev[i];
1345 if (!test_bit(R5_OVERWRITE, &dev->flags)
1346 && !(failed == 0 && (i == pd_idx || i == qd_idx))
1347 && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1348 test_bit(R5_Insync, &dev->flags)) {
1349 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1350 {
1351 PRINTK("Read_old stripe %llu block %d for Reconstruct\n",
1352 (unsigned long long)sh->sector, i);
1353 set_bit(R5_LOCKED, &dev->flags);
1354 set_bit(R5_Wantread, &dev->flags);
1355 locked++;
1356 } else {
1357 PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
1358 (unsigned long long)sh->sector, i);
1359 set_bit(STRIPE_DELAYED, &sh->state);
1360 set_bit(STRIPE_HANDLE, &sh->state);
1361 }
1362 }
1363 }
1364 /* now if nothing is locked, and if we have enough data, we can start a write request */
1365 if (locked == 0 && rcw == 0 &&
1366 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
1367 if ( must_compute > 0 ) {
1368 /* We have failed blocks and need to compute them */
1369 switch ( failed ) {
1370 case 0: BUG();
1371 case 1: compute_block_1(sh, failed_num[0], 0); break;
1372 case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;
1373 default: BUG(); /* This request should have been failed? */
1374 }
1375 }
1376
1377 PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector);
1378 compute_parity(sh, RECONSTRUCT_WRITE);
1379 /* now every locked buffer is ready to be written */
1380 for (i=disks; i--;)
1381 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
1382 PRINTK("Writing stripe %llu block %d\n",
1383 (unsigned long long)sh->sector, i);
1384 locked++;
1385 set_bit(R5_Wantwrite, &sh->dev[i].flags);
1386 }
1387 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
1388 set_bit(STRIPE_INSYNC, &sh->state);
1389
1390 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1391 atomic_dec(&conf->preread_active_stripes);
1392 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
1393 md_wakeup_thread(conf->mddev->thread);
1394 }
1395 }
1396 }
1397
1398 /* maybe we need to check and possibly fix the parity for this stripe
1399 * Any reads will already have been scheduled, so we just see if enough data
1400 * is available
1401 */
1402 if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) {
1403 int update_p = 0, update_q = 0;
1404 struct r5dev *dev;
1405
1406 set_bit(STRIPE_HANDLE, &sh->state);
1407
1408 BUG_ON(failed>2);
1409 BUG_ON(uptodate < disks);
1410 /* Want to check and possibly repair P and Q.
1411 * However there could be one 'failed' device, in which
1412 * case we can only check one of them, possibly using the
1413 * other to generate missing data
1414 */
1415
1416 /* If !tmp_page, we cannot do the calculations,
1417 * but as we have set STRIPE_HANDLE, we will soon be called
1418	 * by handle_stripe with a tmp_page - just wait until then.
1419 */
1420 if (tmp_page) {
1421 if (failed == q_failed) {
1422 /* The only possible failed device holds 'Q', so it makes
1423 * sense to check P (If anything else were failed, we would
1424 * have used P to recreate it).
1425 */
1426 compute_block_1(sh, pd_idx, 1);
1427 if (!page_is_zero(sh->dev[pd_idx].page)) {
1428 compute_block_1(sh,pd_idx,0);
1429 update_p = 1;
1430 }
1431 }
1432 if (!q_failed && failed < 2) {
1433 /* q is not failed, and we didn't use it to generate
1434 * anything, so it makes sense to check it
1435 */
1436 memcpy(page_address(tmp_page),
1437 page_address(sh->dev[qd_idx].page),
1438 STRIPE_SIZE);
1439 compute_parity(sh, UPDATE_PARITY);
1440 if (memcmp(page_address(tmp_page),
1441 page_address(sh->dev[qd_idx].page),
1442 STRIPE_SIZE)!= 0) {
1443 clear_bit(STRIPE_INSYNC, &sh->state);
1444 update_q = 1;
1445 }
1446 }
1447 if (update_p || update_q) {
1448 conf->mddev->resync_mismatches += STRIPE_SECTORS;
1449 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
1450 /* don't try to repair!! */
1451 update_p = update_q = 0;
1452 }
1453
1454 /* now write out any block on a failed drive,
1455 * or P or Q if they need it
1456 */
1457
1458 if (failed == 2) {
1459 dev = &sh->dev[failed_num[1]];
1460 locked++;
1461 set_bit(R5_LOCKED, &dev->flags);
1462 set_bit(R5_Wantwrite, &dev->flags);
1463 }
1464 if (failed >= 1) {
1465 dev = &sh->dev[failed_num[0]];
1466 locked++;
1467 set_bit(R5_LOCKED, &dev->flags);
1468 set_bit(R5_Wantwrite, &dev->flags);
1469 }
1470
1471 if (update_p) {
1472 dev = &sh->dev[pd_idx];
1473 locked ++;
1474 set_bit(R5_LOCKED, &dev->flags);
1475 set_bit(R5_Wantwrite, &dev->flags);
1476 }
1477 if (update_q) {
1478 dev = &sh->dev[qd_idx];
1479 locked++;
1480 set_bit(R5_LOCKED, &dev->flags);
1481 set_bit(R5_Wantwrite, &dev->flags);
1482 }
1483 clear_bit(STRIPE_DEGRADED, &sh->state);
1484
1485 set_bit(STRIPE_INSYNC, &sh->state);
1486 }
1487 }
1488
1489 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
1490 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
1491 clear_bit(STRIPE_SYNCING, &sh->state);
1492 }
1493
1494 /* If the failed drives are just a ReadError, then we might need
1495 * to progress the repair/check process
1496 */
1497 if (failed <= 2 && ! conf->mddev->ro)
1498 for (i=0; i<failed;i++) {
1499 dev = &sh->dev[failed_num[i]];
1500 if (test_bit(R5_ReadError, &dev->flags)
1501 && !test_bit(R5_LOCKED, &dev->flags)
1502 && test_bit(R5_UPTODATE, &dev->flags)
1503 ) {
1504 if (!test_bit(R5_ReWrite, &dev->flags)) {
1505 set_bit(R5_Wantwrite, &dev->flags);
1506 set_bit(R5_ReWrite, &dev->flags);
1507 set_bit(R5_LOCKED, &dev->flags);
1508 } else {
1509 /* let's read it back */
1510 set_bit(R5_Wantread, &dev->flags);
1511 set_bit(R5_LOCKED, &dev->flags);
1512 }
1513 }
1514 }
1515 spin_unlock(&sh->lock);
1516
1517 while ((bi=return_bi)) {
1518 int bytes = bi->bi_size;
1519
1520 return_bi = bi->bi_next;
1521 bi->bi_next = NULL;
1522 bi->bi_size = 0;
1523 bi->bi_end_io(bi, bytes, 0);
1524 }
1525 for (i=disks; i-- ;) {
1526 int rw;
1527 struct bio *bi;
1528 mdk_rdev_t *rdev;
1529 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
1530 rw = 1;
1531 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
1532 rw = 0;
1533 else
1534 continue;
1535
1536 bi = &sh->dev[i].req;
1537
1538 bi->bi_rw = rw;
1539 if (rw)
1540 bi->bi_end_io = raid6_end_write_request;
1541 else
1542 bi->bi_end_io = raid6_end_read_request;
1543
1544 rcu_read_lock();
1545 rdev = rcu_dereference(conf->disks[i].rdev);
1546 if (rdev && test_bit(Faulty, &rdev->flags))
1547 rdev = NULL;
1548 if (rdev)
1549 atomic_inc(&rdev->nr_pending);
1550 rcu_read_unlock();
1551
1552 if (rdev) {
1553 if (syncing)
1554 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1555
1556 bi->bi_bdev = rdev->bdev;
1557 PRINTK("for %llu schedule op %ld on disc %d\n",
1558 (unsigned long long)sh->sector, bi->bi_rw, i);
1559 atomic_inc(&sh->count);
1560 bi->bi_sector = sh->sector + rdev->data_offset;
1561 bi->bi_flags = 1 << BIO_UPTODATE;
1562 bi->bi_vcnt = 1;
1563 bi->bi_max_vecs = 1;
1564 bi->bi_idx = 0;
1565 bi->bi_io_vec = &sh->dev[i].vec;
1566 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1567 bi->bi_io_vec[0].bv_offset = 0;
1568 bi->bi_size = STRIPE_SIZE;
1569 bi->bi_next = NULL;
1570 if (rw == WRITE &&
1571 test_bit(R5_ReWrite, &sh->dev[i].flags))
1572 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1573 generic_make_request(bi);
1574 } else {
1575 if (rw == 1)
1576 set_bit(STRIPE_DEGRADED, &sh->state);
1577 PRINTK("skip op %ld on disc %d for sector %llu\n",
1578 bi->bi_rw, i, (unsigned long long)sh->sector);
1579 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1580 set_bit(STRIPE_HANDLE, &sh->state);
1581 }
1582 }
1583}
1584
1585static void raid6_activate_delayed(raid6_conf_t *conf)
1586{
1587 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
1588 while (!list_empty(&conf->delayed_list)) {
1589 struct list_head *l = conf->delayed_list.next;
1590 struct stripe_head *sh;
1591 sh = list_entry(l, struct stripe_head, lru);
1592 list_del_init(l);
1593 clear_bit(STRIPE_DELAYED, &sh->state);
1594 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1595 atomic_inc(&conf->preread_active_stripes);
1596 list_add_tail(&sh->lru, &conf->handle_list);
1597 }
1598 }
1599}
1600
1601static void activate_bit_delay(raid6_conf_t *conf)
1602{
1603 /* device_lock is held */
1604 struct list_head head;
1605 list_add(&head, &conf->bitmap_list);
1606 list_del_init(&conf->bitmap_list);
1607 while (!list_empty(&head)) {
1608 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
1609 list_del_init(&sh->lru);
1610 atomic_inc(&sh->count);
1611 __release_stripe(conf, sh);
1612 }
1613}
1614
1615static void unplug_slaves(mddev_t *mddev)
1616{
1617 raid6_conf_t *conf = mddev_to_conf(mddev);
1618 int i;
1619
1620 rcu_read_lock();
1621 for (i=0; i<mddev->raid_disks; i++) {
1622 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
1623 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
1624 request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
1625
1626 atomic_inc(&rdev->nr_pending);
1627 rcu_read_unlock();
1628
1629 if (r_queue->unplug_fn)
1630 r_queue->unplug_fn(r_queue);
1631
1632 rdev_dec_pending(rdev, mddev);
1633 rcu_read_lock();
1634 }
1635 }
1636 rcu_read_unlock();
1637}
1638
1639static void raid6_unplug_device(request_queue_t *q)
1640{
1641 mddev_t *mddev = q->queuedata;
1642 raid6_conf_t *conf = mddev_to_conf(mddev);
1643 unsigned long flags;
1644
1645 spin_lock_irqsave(&conf->device_lock, flags);
1646
1647 if (blk_remove_plug(q)) {
1648 conf->seq_flush++;
1649 raid6_activate_delayed(conf);
1650 }
1651 md_wakeup_thread(mddev->thread);
1652
1653 spin_unlock_irqrestore(&conf->device_lock, flags);
1654
1655 unplug_slaves(mddev);
1656}
1657
1658static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk,
1659 sector_t *error_sector)
1660{
1661 mddev_t *mddev = q->queuedata;
1662 raid6_conf_t *conf = mddev_to_conf(mddev);
1663 int i, ret = 0;
1664
1665 rcu_read_lock();
1666 for (i=0; i<mddev->raid_disks && ret == 0; i++) {
1667 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
1668 if (rdev && !test_bit(Faulty, &rdev->flags)) {
1669 struct block_device *bdev = rdev->bdev;
1670 request_queue_t *r_queue = bdev_get_queue(bdev);
1671
1672 if (!r_queue->issue_flush_fn)
1673 ret = -EOPNOTSUPP;
1674 else {
1675 atomic_inc(&rdev->nr_pending);
1676 rcu_read_unlock();
1677 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
1678 error_sector);
1679 rdev_dec_pending(rdev, mddev);
1680 rcu_read_lock();
1681 }
1682 }
1683 }
1684 rcu_read_unlock();
1685 return ret;
1686}
1687
1688static inline void raid6_plug_device(raid6_conf_t *conf)
1689{
1690 spin_lock_irq(&conf->device_lock);
1691 blk_plug_device(conf->mddev->queue);
1692 spin_unlock_irq(&conf->device_lock);
1693}
1694
1695static int make_request (request_queue_t *q, struct bio * bi)
1696{
1697 mddev_t *mddev = q->queuedata;
1698 raid6_conf_t *conf = mddev_to_conf(mddev);
1699 const unsigned int raid_disks = conf->raid_disks;
1700 const unsigned int data_disks = raid_disks - 2;
1701 unsigned int dd_idx, pd_idx;
1702 sector_t new_sector;
1703 sector_t logical_sector, last_sector;
1704 struct stripe_head *sh;
1705 const int rw = bio_data_dir(bi);
1706
1707 if (unlikely(bio_barrier(bi))) {
1708 bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
1709 return 0;
1710 }
1711
1712 md_write_start(mddev, bi);
1713
1714 disk_stat_inc(mddev->gendisk, ios[rw]);
1715 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi));
1716
1717 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
1718 last_sector = bi->bi_sector + (bi->bi_size>>9);
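	/* e.g. (illustrative values) a 16 KiB write at sector 1037 with
	 * STRIPE_SECTORS == 8 gives logical_sector = 1032 and
	 * last_sector = 1069, so the loop below visits the stripes
	 * starting at sectors 1032, 1040, 1048, 1056 and 1064.
	 */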
1719
1720 bi->bi_next = NULL;
1721 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
1722
1723 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
1724 DEFINE_WAIT(w);
1725
1726 new_sector = raid6_compute_sector(logical_sector,
1727 raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1728
1729 PRINTK("raid6: make_request, sector %llu logical %llu\n",
1730 (unsigned long long)new_sector,
1731 (unsigned long long)logical_sector);
1732
1733 retry:
1734 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
1735 sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
1736 if (sh) {
1737 if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
1738 /* Add failed due to overlap. Flush everything
1739 * and wait a while
1740 */
1741 raid6_unplug_device(mddev->queue);
1742 release_stripe(sh);
1743 schedule();
1744 goto retry;
1745 }
1746 finish_wait(&conf->wait_for_overlap, &w);
1747 raid6_plug_device(conf);
1748 handle_stripe(sh, NULL);
1749 release_stripe(sh);
1750 } else {
1751			/* cannot get stripe for read-ahead, just give up */
1752 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1753 finish_wait(&conf->wait_for_overlap, &w);
1754 break;
1755 }
1756
1757 }
1758 spin_lock_irq(&conf->device_lock);
1759 if (--bi->bi_phys_segments == 0) {
1760 int bytes = bi->bi_size;
1761
1762 if (rw == WRITE )
1763 md_write_end(mddev);
1764 bi->bi_size = 0;
1765 bi->bi_end_io(bi, bytes, 0);
1766 }
1767 spin_unlock_irq(&conf->device_lock);
1768 return 0;
1769}
1770
1771/* FIXME go_faster isn't used */
1772static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1773{
1774 raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
1775 struct stripe_head *sh;
1776 int sectors_per_chunk = conf->chunk_size >> 9;
1777 sector_t x;
1778 unsigned long stripe;
1779 int chunk_offset;
1780 int dd_idx, pd_idx;
1781 sector_t first_sector;
1782 int raid_disks = conf->raid_disks;
1783 int data_disks = raid_disks - 2;
1784 sector_t max_sector = mddev->size << 1;
1785 int sync_blocks;
1786 int still_degraded = 0;
1787 int i;
1788
1789 if (sector_nr >= max_sector) {
1790 /* just being told to finish up .. nothing much to do */
1791 unplug_slaves(mddev);
1792
1793 if (mddev->curr_resync < max_sector) /* aborted */
1794 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1795 &sync_blocks, 1);
1796 else /* completed sync */
1797 conf->fullsync = 0;
1798 bitmap_close_sync(mddev->bitmap);
1799
1800 return 0;
1801 }
1802 /* if there are 2 or more failed drives and we are trying
1803 * to resync, then assert that we are finished, because there is
1804 * nothing we can do.
1805 */
1806 if (mddev->degraded >= 2 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1807 sector_t rv = (mddev->size << 1) - sector_nr;
1808 *skipped = 1;
1809 return rv;
1810 }
1811 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
1812 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
1813 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
1814 /* we can skip this block, and probably more */
1815 sync_blocks /= STRIPE_SECTORS;
1816 *skipped = 1;
1817 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
1818 }
1819
1820 x = sector_nr;
1821 chunk_offset = sector_div(x, sectors_per_chunk);
1822 stripe = x;
1823 BUG_ON(x != stripe);
1824
1825 first_sector = raid6_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk
1826 + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
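	/* e.g. (illustrative values) with a 64 KiB chunk
	 * (sectors_per_chunk = 128), 6 devices (data_disks = 4) and
	 * sector_nr = 1000: stripe = 7 and chunk_offset = 104, so the
	 * sector handed to raid6_compute_sector() above is
	 * 7 * 4 * 128 + 104 = 3688.
	 */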
1827 sh = get_active_stripe(conf, sector_nr, pd_idx, 1);
1828 if (sh == NULL) {
1829 sh = get_active_stripe(conf, sector_nr, pd_idx, 0);
1830 /* make sure we don't swamp the stripe cache if someone else
1831 * is trying to get access
1832 */
1833 schedule_timeout_uninterruptible(1);
1834 }
1835 /* Need to check if array will still be degraded after recovery/resync
1836 * We don't need to check the 'failed' flag as when that gets set,
1837 * recovery aborts.
1838 */
1839 for (i=0; i<mddev->raid_disks; i++)
1840 if (conf->disks[i].rdev == NULL)
1841 still_degraded = 1;
1842
1843 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
1844
1845 spin_lock(&sh->lock);
1846 set_bit(STRIPE_SYNCING, &sh->state);
1847 clear_bit(STRIPE_INSYNC, &sh->state);
1848 spin_unlock(&sh->lock);
1849
1850 handle_stripe(sh, NULL);
1851 release_stripe(sh);
1852
1853 return STRIPE_SECTORS;
1854}
1855
1856/*
1857 * This is our raid6 kernel thread.
1858 *
1859 * We scan the hash table for stripes which can be handled now.
1860 * During the scan, completed stripes are saved for us by the interrupt
1861 * handler, so that they will not have to wait for our next wakeup.
1862 */
1863static void raid6d (mddev_t *mddev)
1864{
1865 struct stripe_head *sh;
1866 raid6_conf_t *conf = mddev_to_conf(mddev);
1867 int handled;
1868
1869 PRINTK("+++ raid6d active\n");
1870
1871 md_check_recovery(mddev);
1872
1873 handled = 0;
1874 spin_lock_irq(&conf->device_lock);
1875 while (1) {
1876 struct list_head *first;
1877
1878 if (conf->seq_flush - conf->seq_write > 0) {
1879 int seq = conf->seq_flush;
1880 spin_unlock_irq(&conf->device_lock);
1881 bitmap_unplug(mddev->bitmap);
1882 spin_lock_irq(&conf->device_lock);
1883 conf->seq_write = seq;
1884 activate_bit_delay(conf);
1885 }
1886
1887 if (list_empty(&conf->handle_list) &&
1888 atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
1889 !blk_queue_plugged(mddev->queue) &&
1890 !list_empty(&conf->delayed_list))
1891 raid6_activate_delayed(conf);
1892
1893 if (list_empty(&conf->handle_list))
1894 break;
1895
1896 first = conf->handle_list.next;
1897 sh = list_entry(first, struct stripe_head, lru);
1898
1899 list_del_init(first);
1900 atomic_inc(&sh->count);
1901 BUG_ON(atomic_read(&sh->count)!= 1);
1902 spin_unlock_irq(&conf->device_lock);
1903
1904 handled++;
1905 handle_stripe(sh, conf->spare_page);
1906 release_stripe(sh);
1907
1908 spin_lock_irq(&conf->device_lock);
1909 }
1910 PRINTK("%d stripes handled\n", handled);
1911
1912 spin_unlock_irq(&conf->device_lock);
1913
1914 unplug_slaves(mddev);
1915
1916 PRINTK("--- raid6d inactive\n");
1917}
1918
1919static ssize_t
1920raid6_show_stripe_cache_size(mddev_t *mddev, char *page)
1921{
1922 raid6_conf_t *conf = mddev_to_conf(mddev);
1923 if (conf)
1924 return sprintf(page, "%d\n", conf->max_nr_stripes);
1925 else
1926 return 0;
1927}
1928
1929static ssize_t
1930raid6_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
1931{
1932 raid6_conf_t *conf = mddev_to_conf(mddev);
1933 char *end;
1934 int new;
1935 if (len >= PAGE_SIZE)
1936 return -EINVAL;
1937 if (!conf)
1938 return -ENODEV;
1939
1940 new = simple_strtoul(page, &end, 10);
1941 if (!*page || (*end && *end != '\n') )
1942 return -EINVAL;
1943 if (new <= 16 || new > 32768)
1944 return -EINVAL;
1945 while (new < conf->max_nr_stripes) {
1946 if (drop_one_stripe(conf))
1947 conf->max_nr_stripes--;
1948 else
1949 break;
1950 }
1951 while (new > conf->max_nr_stripes) {
1952 if (grow_one_stripe(conf))
1953 conf->max_nr_stripes++;
1954 else break;
1955 }
1956 return len;
1957}
1958
1959static struct md_sysfs_entry
1960raid6_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
1961 raid6_show_stripe_cache_size,
1962 raid6_store_stripe_cache_size);
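/*
 * The attribute is created in the array's md sysfs directory, so e.g.
 * (illustrative)  echo 512 > /sys/block/md0/md/stripe_cache_size
 * grows or shrinks the cache one stripe_head at a time via the store
 * routine above.
 */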
1963
1964static ssize_t
1965stripe_cache_active_show(mddev_t *mddev, char *page)
1966{
1967 raid6_conf_t *conf = mddev_to_conf(mddev);
1968 if (conf)
1969 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
1970 else
1971 return 0;
1972}
1973
1974static struct md_sysfs_entry
1975raid6_stripecache_active = __ATTR_RO(stripe_cache_active);
1976
1977static struct attribute *raid6_attrs[] = {
1978 &raid6_stripecache_size.attr,
1979 &raid6_stripecache_active.attr,
1980 NULL,
1981};
1982static struct attribute_group raid6_attrs_group = {
1983 .name = NULL,
1984 .attrs = raid6_attrs,
1985};
1986
1987static int run(mddev_t *mddev)
1988{
1989 raid6_conf_t *conf;
1990 int raid_disk, memory;
1991 mdk_rdev_t *rdev;
1992 struct disk_info *disk;
1993 struct list_head *tmp;
1994
1995 if (mddev->level != 6) {
1996 PRINTK("raid6: %s: raid level not set to 6 (%d)\n", mdname(mddev), mddev->level);
1997 return -EIO;
1998 }
1999
2000 mddev->private = kzalloc(sizeof (raid6_conf_t), GFP_KERNEL);
2001 if ((conf = mddev->private) == NULL)
2002 goto abort;
2003 conf->disks = kzalloc(mddev->raid_disks * sizeof(struct disk_info),
2004 GFP_KERNEL);
2005 if (!conf->disks)
2006 goto abort;
2007
2008 conf->mddev = mddev;
2009
2010 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
2011 goto abort;
2012
2013 conf->spare_page = alloc_page(GFP_KERNEL);
2014 if (!conf->spare_page)
2015 goto abort;
2016
2017 spin_lock_init(&conf->device_lock);
2018 init_waitqueue_head(&conf->wait_for_stripe);
2019 init_waitqueue_head(&conf->wait_for_overlap);
2020 INIT_LIST_HEAD(&conf->handle_list);
2021 INIT_LIST_HEAD(&conf->delayed_list);
2022 INIT_LIST_HEAD(&conf->bitmap_list);
2023 INIT_LIST_HEAD(&conf->inactive_list);
2024 atomic_set(&conf->active_stripes, 0);
2025 atomic_set(&conf->preread_active_stripes, 0);
2026
2027 PRINTK("raid6: run(%s) called.\n", mdname(mddev));
2028
2029 ITERATE_RDEV(mddev,rdev,tmp) {
2030 raid_disk = rdev->raid_disk;
2031 if (raid_disk >= mddev->raid_disks
2032 || raid_disk < 0)
2033 continue;
2034 disk = conf->disks + raid_disk;
2035
2036 disk->rdev = rdev;
2037
2038 if (test_bit(In_sync, &rdev->flags)) {
2039 char b[BDEVNAME_SIZE];
2040 printk(KERN_INFO "raid6: device %s operational as raid"
2041 " disk %d\n", bdevname(rdev->bdev,b),
2042 raid_disk);
2043 conf->working_disks++;
2044 }
2045 }
2046
2047 conf->raid_disks = mddev->raid_disks;
2048
2049 /*
2050 * 0 for a fully functional array, 1 or 2 for a degraded array.
2051 */
2052 mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks;
2053 conf->mddev = mddev;
2054 conf->chunk_size = mddev->chunk_size;
2055 conf->level = mddev->level;
2056 conf->algorithm = mddev->layout;
2057 conf->max_nr_stripes = NR_STRIPES;
2058
2059 /* device size must be a multiple of chunk size */
2060 mddev->size &= ~(mddev->chunk_size/1024 -1);
2061 mddev->resync_max_sectors = mddev->size << 1;
2062
2063 if (conf->raid_disks < 4) {
2064 printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
2065 mdname(mddev), conf->raid_disks);
2066 goto abort;
2067 }
2068 if (!conf->chunk_size || conf->chunk_size % 4) {
2069 printk(KERN_ERR "raid6: invalid chunk size %d for %s\n",
2070 conf->chunk_size, mdname(mddev));
2071 goto abort;
2072 }
2073 if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
2074 printk(KERN_ERR
2075 "raid6: unsupported parity algorithm %d for %s\n",
2076 conf->algorithm, mdname(mddev));
2077 goto abort;
2078 }
2079 if (mddev->degraded > 2) {
2080 printk(KERN_ERR "raid6: not enough operational devices for %s"
2081 " (%d/%d failed)\n",
2082 mdname(mddev), conf->failed_disks, conf->raid_disks);
2083 goto abort;
2084 }
2085
2086 if (mddev->degraded > 0 &&
2087 mddev->recovery_cp != MaxSector) {
2088 if (mddev->ok_start_degraded)
2089			printk(KERN_WARNING "raid6: starting dirty degraded array: %s"
2090			       " - data corruption possible.\n",
2091 mdname(mddev));
2092 else {
2093 printk(KERN_ERR "raid6: cannot start dirty degraded array"
2094 " for %s\n", mdname(mddev));
2095 goto abort;
2096 }
2097 }
2098
2099 {
2100 mddev->thread = md_register_thread(raid6d, mddev, "%s_raid6");
2101 if (!mddev->thread) {
2102 printk(KERN_ERR
2103 "raid6: couldn't allocate thread for %s\n",
2104 mdname(mddev));
2105 goto abort;
2106 }
2107 }
2108
2109 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
2110 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
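	/* e.g. (illustrative values) with 256 cached stripes and 6 devices
	 * on a 4 KiB-page machine, the dominant term is
	 * 256 * 6 * PAGE_SIZE = 6144 kB (6 MiB) of cache pages, plus the
	 * stripe_head and bio overhead.
	 */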
2111 if (grow_stripes(conf, conf->max_nr_stripes)) {
2112 printk(KERN_ERR
2113 "raid6: couldn't allocate %dkB for buffers\n", memory);
2114 shrink_stripes(conf);
2115 md_unregister_thread(mddev->thread);
2116 goto abort;
2117 } else
2118 printk(KERN_INFO "raid6: allocated %dkB for %s\n",
2119 memory, mdname(mddev));
2120
2121 if (mddev->degraded == 0)
2122 printk(KERN_INFO "raid6: raid level %d set %s active with %d out of %d"
2123 " devices, algorithm %d\n", conf->level, mdname(mddev),
2124 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
2125 conf->algorithm);
2126 else
2127 printk(KERN_ALERT "raid6: raid level %d set %s active with %d"
2128 " out of %d devices, algorithm %d\n", conf->level,
2129 mdname(mddev), mddev->raid_disks - mddev->degraded,
2130 mddev->raid_disks, conf->algorithm);
2131
2132 print_raid6_conf(conf);
2133
2134 /* read-ahead size must cover two whole stripes, which is
2135 * 2 * (n-2) * chunksize where 'n' is the number of raid devices
2136 */
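	/* e.g. (illustrative values) with 6 devices and a 64 KiB chunk,
	 * stripe = 4 * 65536 / 4096 = 64 pages, so ra_pages is raised
	 * to at least 128 pages (512 KiB).
	 */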
2137 {
2138 int stripe = (mddev->raid_disks-2) * mddev->chunk_size
2139 / PAGE_SIZE;
2140 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
2141 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
2142 }
2143
2144 /* Ok, everything is just fine now */
2145 sysfs_create_group(&mddev->kobj, &raid6_attrs_group);
2146
2147 mddev->array_size = mddev->size * (mddev->raid_disks - 2);
2148
2149 mddev->queue->unplug_fn = raid6_unplug_device;
2150 mddev->queue->issue_flush_fn = raid6_issue_flush;
2151 return 0;
2152abort:
2153 if (conf) {
2154 print_raid6_conf(conf);
2155 safe_put_page(conf->spare_page);
2156 kfree(conf->stripe_hashtbl);
2157 kfree(conf->disks);
2158 kfree(conf);
2159 }
2160 mddev->private = NULL;
2161 printk(KERN_ALERT "raid6: failed to run raid set %s\n", mdname(mddev));
2162 return -EIO;
2163}
2164
2165
2166
2167static int stop (mddev_t *mddev)
2168{
2169 raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
2170
2171 md_unregister_thread(mddev->thread);
2172 mddev->thread = NULL;
2173 shrink_stripes(conf);
2174 kfree(conf->stripe_hashtbl);
2175 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
2176 sysfs_remove_group(&mddev->kobj, &raid6_attrs_group);
2177 kfree(conf);
2178 mddev->private = NULL;
2179 return 0;
2180}
2181
2182#if RAID6_DUMPSTATE
2183static void print_sh (struct seq_file *seq, struct stripe_head *sh)
2184{
2185 int i;
2186
2187 seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n",
2188 (unsigned long long)sh->sector, sh->pd_idx, sh->state);
2189 seq_printf(seq, "sh %llu, count %d.\n",
2190 (unsigned long long)sh->sector, atomic_read(&sh->count));
2191 seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector);
2192 for (i = 0; i < sh->raid_conf->raid_disks; i++) {
2193 seq_printf(seq, "(cache%d: %p %ld) ",
2194 i, sh->dev[i].page, sh->dev[i].flags);
2195 }
2196 seq_printf(seq, "\n");
2197}
2198
2199static void printall (struct seq_file *seq, raid6_conf_t *conf)
2200{
2201 struct stripe_head *sh;
2202 struct hlist_node *hn;
2203 int i;
2204
2205 spin_lock_irq(&conf->device_lock);
2206 for (i = 0; i < NR_HASH; i++) {
2207 sh = conf->stripe_hashtbl[i];
2208 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
2209 if (sh->raid_conf != conf)
2210 continue;
2211 print_sh(seq, sh);
2212 }
2213 }
2214 spin_unlock_irq(&conf->device_lock);
2215}
2216#endif
2217
2218static void status (struct seq_file *seq, mddev_t *mddev)
2219{
2220 raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
2221 int i;
2222
2223 seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
2224 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
2225 for (i = 0; i < conf->raid_disks; i++)
2226 seq_printf (seq, "%s",
2227 conf->disks[i].rdev &&
2228 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
2229 seq_printf (seq, "]");
2230#if RAID6_DUMPSTATE
2231 seq_printf (seq, "\n");
2232 printall(seq, conf);
2233#endif
2234}
2235
2236static void print_raid6_conf (raid6_conf_t *conf)
2237{
2238 int i;
2239 struct disk_info *tmp;
2240
2241 printk("RAID6 conf printout:\n");
2242 if (!conf) {
2243 printk("(conf==NULL)\n");
2244 return;
2245 }
2246 printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
2247 conf->working_disks, conf->failed_disks);
2248
2249 for (i = 0; i < conf->raid_disks; i++) {
2250 char b[BDEVNAME_SIZE];
2251 tmp = conf->disks + i;
2252 if (tmp->rdev)
2253 printk(" disk %d, o:%d, dev:%s\n",
2254 i, !test_bit(Faulty, &tmp->rdev->flags),
2255 bdevname(tmp->rdev->bdev,b));
2256 }
2257}
2258
2259static int raid6_spare_active(mddev_t *mddev)
2260{
2261 int i;
2262 raid6_conf_t *conf = mddev->private;
2263 struct disk_info *tmp;
2264
2265 for (i = 0; i < conf->raid_disks; i++) {
2266 tmp = conf->disks + i;
2267 if (tmp->rdev
2268 && !test_bit(Faulty, &tmp->rdev->flags)
2269 && !test_bit(In_sync, &tmp->rdev->flags)) {
2270 mddev->degraded--;
2271 conf->failed_disks--;
2272 conf->working_disks++;
2273 set_bit(In_sync, &tmp->rdev->flags);
2274 }
2275 }
2276 print_raid6_conf(conf);
2277 return 0;
2278}
2279
2280static int raid6_remove_disk(mddev_t *mddev, int number)
2281{
2282 raid6_conf_t *conf = mddev->private;
2283 int err = 0;
2284 mdk_rdev_t *rdev;
2285 struct disk_info *p = conf->disks + number;
2286
2287 print_raid6_conf(conf);
2288 rdev = p->rdev;
2289 if (rdev) {
2290 if (test_bit(In_sync, &rdev->flags) ||
2291 atomic_read(&rdev->nr_pending)) {
2292 err = -EBUSY;
2293 goto abort;
2294 }
2295 p->rdev = NULL;
2296 synchronize_rcu();
2297 if (atomic_read(&rdev->nr_pending)) {
2298 /* lost the race, try later */
2299 err = -EBUSY;
2300 p->rdev = rdev;
2301 }
2302 }
2303
2304abort:
2305
2306 print_raid6_conf(conf);
2307 return err;
2308}
2309
2310static int raid6_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
2311{
2312 raid6_conf_t *conf = mddev->private;
2313 int found = 0;
2314 int disk;
2315 struct disk_info *p;
2316
2317 if (mddev->degraded > 2)
2318 /* no point adding a device */
2319 return 0;
2320 /*
2321 * find the disk ... but prefer rdev->saved_raid_disk
2322 * if possible.
2323 */
2324 if (rdev->saved_raid_disk >= 0 &&
2325 conf->disks[rdev->saved_raid_disk].rdev == NULL)
2326 disk = rdev->saved_raid_disk;
2327 else
2328 disk = 0;
2329 for ( ; disk < mddev->raid_disks; disk++)
2330 if ((p=conf->disks + disk)->rdev == NULL) {
2331 clear_bit(In_sync, &rdev->flags);
2332 rdev->raid_disk = disk;
2333 found = 1;
2334 if (rdev->saved_raid_disk != disk)
2335 conf->fullsync = 1;
2336 rcu_assign_pointer(p->rdev, rdev);
2337 break;
2338 }
2339 print_raid6_conf(conf);
2340 return found;
2341}
2342
2343static int raid6_resize(mddev_t *mddev, sector_t sectors)
2344{
2345 /* no resync is happening, and there is enough space
2346 * on all devices, so we can resize.
2347 * We need to make sure resync covers any new space.
2348 * If the array is shrinking we should possibly wait until
2349 * any io in the removed space completes, but it hardly seems
2350 * worth it.
2351 */
2352 sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
2353 mddev->array_size = (sectors * (mddev->raid_disks-2))>>1;
2354 set_capacity(mddev->gendisk, mddev->array_size << 1);
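	/* e.g. (illustrative values) six 1 TiB members (2147483648 sectors
	 * each, already chunk-aligned) give
	 * array_size = 2147483648 * 4 / 2 = 4294967296 KB (4 TiB), and
	 * set_capacity() sets the gendisk to twice that in 512-byte sectors.
	 */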
2355 mddev->changed = 1;
2356 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
2357 mddev->recovery_cp = mddev->size << 1;
2358 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2359 }
2360 mddev->size = sectors /2;
2361 mddev->resync_max_sectors = sectors;
2362 return 0;
2363}
2364
2365static void raid6_quiesce(mddev_t *mddev, int state)
2366{
2367 raid6_conf_t *conf = mddev_to_conf(mddev);
2368
2369 switch(state) {
2370 case 1: /* stop all writes */
2371 spin_lock_irq(&conf->device_lock);
2372 conf->quiesce = 1;
2373 wait_event_lock_irq(conf->wait_for_stripe,
2374 atomic_read(&conf->active_stripes) == 0,
2375 conf->device_lock, /* nothing */);
2376 spin_unlock_irq(&conf->device_lock);
2377 break;
2378
2379 case 0: /* re-enable writes */
2380 spin_lock_irq(&conf->device_lock);
2381 conf->quiesce = 0;
2382 wake_up(&conf->wait_for_stripe);
2383 spin_unlock_irq(&conf->device_lock);
2384 break;
2385 }
2386}
2387
2388static struct mdk_personality raid6_personality =
2389{
2390 .name = "raid6",
2391 .level = 6,
2392 .owner = THIS_MODULE,
2393 .make_request = make_request,
2394 .run = run,
2395 .stop = stop,
2396 .status = status,
2397 .error_handler = error,
2398 .hot_add_disk = raid6_add_disk,
2399 .hot_remove_disk= raid6_remove_disk,
2400 .spare_active = raid6_spare_active,
2401 .sync_request = sync_request,
2402 .resize = raid6_resize,
2403 .quiesce = raid6_quiesce,
2404};
2405
2406static int __init raid6_init(void)
2407{
2408 int e;
2409
2410 e = raid6_select_algo();
2411 if ( e )
2412 return e;
2413
2414 return register_md_personality(&raid6_personality);
2415}
2416
2417static void raid6_exit (void)
2418{
2419 unregister_md_personality(&raid6_personality);
2420}
2421
2422module_init(raid6_init);
2423module_exit(raid6_exit);
2424MODULE_LICENSE("GPL");
2425MODULE_ALIAS("md-personality-8"); /* RAID6 */
2426MODULE_ALIAS("md-raid6");
2427MODULE_ALIAS("md-level-6");