Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Makefile             |    5
-rw-r--r--  drivers/md/bitmap.c             |   11
-rw-r--r--  drivers/md/dm-crypt.c           |    8
-rw-r--r--  drivers/md/dm-delay.c           |    6
-rw-r--r--  drivers/md/dm-exception-store.c |  749
-rw-r--r--  drivers/md/dm-exception-store.h |  148
-rw-r--r--  drivers/md/dm-io.c              |    2
-rw-r--r--  drivers/md/dm-ioctl.c           |   16
-rw-r--r--  drivers/md/dm-linear.c          |    6
-rw-r--r--  drivers/md/dm-log.c             |   40
-rw-r--r--  drivers/md/dm-mpath.c           |   14
-rw-r--r--  drivers/md/dm-raid1.c           |   24
-rw-r--r--  drivers/md/dm-snap-persistent.c |  704
-rw-r--r--  drivers/md/dm-snap-transient.c  |   98
-rw-r--r--  drivers/md/dm-snap.c            |   48
-rw-r--r--  drivers/md/dm-snap.h            |  129
-rw-r--r--  drivers/md/dm-stripe.c          |    4
-rw-r--r--  drivers/md/dm-sysfs.c           |   99
-rw-r--r--  drivers/md/dm-table.c           |   47
-rw-r--r--  drivers/md/dm-target.c          |   15
-rw-r--r--  drivers/md/dm-zero.c            |    5
-rw-r--r--  drivers/md/dm.c                 |  111
-rw-r--r--  drivers/md/dm.h                 |   10
-rw-r--r--  drivers/md/faulty.c             |    3
-rw-r--r--  drivers/md/linear.c             |    3
-rw-r--r--  drivers/md/md.c                 |  416
-rw-r--r--  drivers/md/multipath.c          |    3
-rw-r--r--  drivers/md/raid0.c              |  178
-rw-r--r--  drivers/md/raid1.c              |   11
-rw-r--r--  drivers/md/raid10.c             |    3
-rw-r--r--  drivers/md/raid5.c              |    8
31 files changed, 1702 insertions(+), 1222 deletions(-)
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 1c615804ea76..72880b7e28d9 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -3,9 +3,10 @@
 #
 
 dm-mod-objs	:= dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
-		   dm-ioctl.o dm-io.o dm-kcopyd.o
+		   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o
 dm-multipath-objs := dm-path-selector.o dm-mpath.o
-dm-snapshot-objs := dm-snap.o dm-exception-store.o
+dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-snap-transient.o \
+		    dm-snap-persistent.o
 dm-mirror-objs	:= dm-raid1.o
 md-mod-objs	:= md.o bitmap.o
 raid456-objs	:= raid5.o raid6algos.o raid6recov.o raid6tables.o \
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index ab7c8e4a61f9..719943763391 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -215,7 +215,6 @@ static struct page *read_sb_page(mddev_t *mddev, long offset,
 	/* choose a good rdev and read the page from there */
 
 	mdk_rdev_t *rdev;
-	struct list_head *tmp;
 	sector_t target;
 
 	if (!page)
@@ -223,7 +222,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset,
 	if (!page)
 		return ERR_PTR(-ENOMEM);
 
-	rdev_for_each(rdev, tmp, mddev) {
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		if (! test_bit(In_sync, &rdev->flags)
 		    || test_bit(Faulty, &rdev->flags))
 			continue;
@@ -964,9 +963,11 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 				 */
 				page = bitmap->sb_page;
 				offset = sizeof(bitmap_super_t);
-				read_sb_page(bitmap->mddev, bitmap->offset,
-					     page,
-					     index, count);
+				if (!file)
+					read_sb_page(bitmap->mddev,
+						     bitmap->offset,
+						     page,
+						     index, count);
 			} else if (file) {
 				page = read_page(file, index, bitmap, count);
 				offset = 0;
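
Note on the bitmap.c change above: rdev_for_each() needed a caller-supplied struct list_head cursor, while list_for_each_entry() keeps the cursor internal, so the tmp variable can go. A minimal sketch of the pattern, with hypothetical struct and field names (not kernel code):

	#include <linux/list.h>

	struct item {
		int value;
		struct list_head same_set;	/* linkage in the owner's list */
	};

	/* Walk every item on 'disks' without a separate cursor variable. */
	static int sum_items(struct list_head *disks)
	{
		struct item *it;
		int total = 0;

		list_for_each_entry(it, disks, same_set)
			total += it->value;

		return total;
	}
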
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index ce26c84af064..35bda49796fb 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1060,7 +1060,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad_page_pool;
 	}
 
-	cc->bs = bioset_create(MIN_IOS, MIN_IOS);
+	cc->bs = bioset_create(MIN_IOS, 0);
 	if (!cc->bs) {
 		ti->error = "Cannot allocate crypt bioset";
 		goto bad_bs;
@@ -1322,11 +1322,7 @@ static int __init dm_crypt_init(void)
 
 static void __exit dm_crypt_exit(void)
 {
-	int r = dm_unregister_target(&crypt_target);
-
-	if (r < 0)
-		DMERR("unregister failed %d", r);
-
+	dm_unregister_target(&crypt_target);
 	kmem_cache_destroy(_crypt_io_pool);
 }
 
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 848b381f1173..59ee1b015d2d 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -364,11 +364,7 @@ bad_queue:
 
 static void __exit dm_delay_exit(void)
 {
-	int r = dm_unregister_target(&delay_target);
-
-	if (r < 0)
-		DMERR("unregister failed %d", r);
-
+	dm_unregister_target(&delay_target);
 	kmem_cache_destroy(delayed_cache);
 	destroy_workqueue(kdelayd_wq);
 }
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 01590f3e0009..dccbfb0e010f 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -1,756 +1,45 @@
 /*
- * dm-exception-store.c
- *
  * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
- * Copyright (C) 2006 Red Hat GmbH
+ * Copyright (C) 2006-2008 Red Hat GmbH
  *
  * This file is released under the GPL.
  */
 
-#include "dm-snap.h"
+#include "dm-exception-store.h"
 
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
-#include <linux/dm-io.h>
-#include <linux/dm-kcopyd.h>
-
-#define DM_MSG_PREFIX "snapshots"
-#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32	/* 16KB */
-
-/*-----------------------------------------------------------------
- * Persistent snapshots, by persistent we mean that the snapshot
- * will survive a reboot.
- *---------------------------------------------------------------*/
-
-/*
- * We need to store a record of which parts of the origin have
- * been copied to the snapshot device.  The snapshot code
- * requires that we copy exception chunks to chunk aligned areas
- * of the COW store.  It makes sense therefore, to store the
- * metadata in chunk size blocks.
- *
- * There is no backward or forward compatibility implemented,
- * snapshots with different disk versions than the kernel will
- * not be usable.  It is expected that "lvcreate" will blank out
- * the start of a fresh COW device before calling the snapshot
- * constructor.
- *
- * The first chunk of the COW device just contains the header.
- * After this there is a chunk filled with exception metadata,
- * followed by as many exception chunks as can fit in the
- * metadata areas.
- *
- * All on disk structures are in little-endian format.  The end
- * of the exceptions info is indicated by an exception with a
- * new_chunk of 0, which is invalid since it would point to the
- * header chunk.
- */
-
-/*
- * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
- */
-#define SNAP_MAGIC 0x70416e53
-
-/*
- * The on-disk version of the metadata.
- */
-#define SNAPSHOT_DISK_VERSION 1
-
-struct disk_header {
-	uint32_t magic;
-
-	/*
-	 * Is this snapshot valid.  There is no way of recovering
-	 * an invalid snapshot.
-	 */
-	uint32_t valid;
-
-	/*
-	 * Simple, incrementing version. no backward
-	 * compatibility.
-	 */
-	uint32_t version;
-
-	/* In sectors */
-	uint32_t chunk_size;
-};
-
-struct disk_exception {
-	uint64_t old_chunk;
-	uint64_t new_chunk;
-};
-
-struct commit_callback {
-	void (*callback)(void *, int success);
-	void *context;
-};
-
-/*
- * The top level structure for a persistent exception store.
- */
-struct pstore {
-	struct dm_snapshot *snap;	/* up pointer to my snapshot */
-	int version;
-	int valid;
-	uint32_t exceptions_per_area;
-
-	/*
-	 * Now that we have an asynchronous kcopyd there is no
-	 * need for large chunk sizes, so it won't hurt to have a
-	 * whole chunk's worth of metadata in memory at once.
-	 */
-	void *area;
-
-	/*
-	 * An area of zeros used to clear the next area.
-	 */
-	void *zero_area;
-
-	/*
-	 * Used to keep track of which metadata area the data in
-	 * 'chunk' refers to.
-	 */
-	chunk_t current_area;
-
-	/*
-	 * The next free chunk for an exception.
-	 */
-	chunk_t next_free;
-
-	/*
-	 * The index of next free exception in the current
-	 * metadata area.
-	 */
-	uint32_t current_committed;
-
-	atomic_t pending_count;
-	uint32_t callback_count;
-	struct commit_callback *callbacks;
-	struct dm_io_client *io_client;
-
-	struct workqueue_struct *metadata_wq;
-};
-
-static unsigned sectors_to_pages(unsigned sectors)
-{
-	return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9);
-}
-
-static int alloc_area(struct pstore *ps)
-{
-	int r = -ENOMEM;
-	size_t len;
-
-	len = ps->snap->chunk_size << SECTOR_SHIFT;
-
-	/*
-	 * Allocate the chunk_size block of memory that will hold
-	 * a single metadata area.
-	 */
-	ps->area = vmalloc(len);
-	if (!ps->area)
-		return r;
-
-	ps->zero_area = vmalloc(len);
-	if (!ps->zero_area) {
-		vfree(ps->area);
-		return r;
-	}
-	memset(ps->zero_area, 0, len);
-
-	return 0;
-}
-
-static void free_area(struct pstore *ps)
-{
-	vfree(ps->area);
-	ps->area = NULL;
-	vfree(ps->zero_area);
-	ps->zero_area = NULL;
-}
-
-struct mdata_req {
-	struct dm_io_region *where;
-	struct dm_io_request *io_req;
-	struct work_struct work;
-	int result;
-};
-
-static void do_metadata(struct work_struct *work)
-{
-	struct mdata_req *req = container_of(work, struct mdata_req, work);
-
-	req->result = dm_io(req->io_req, 1, req->where, NULL);
-}
-
-/*
- * Read or write a chunk aligned and sized block of data from a device.
- */
-static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata)
-{
-	struct dm_io_region where = {
-		.bdev = ps->snap->cow->bdev,
-		.sector = ps->snap->chunk_size * chunk,
-		.count = ps->snap->chunk_size,
-	};
-	struct dm_io_request io_req = {
-		.bi_rw = rw,
-		.mem.type = DM_IO_VMA,
-		.mem.ptr.vma = ps->area,
-		.client = ps->io_client,
-		.notify.fn = NULL,
-	};
-	struct mdata_req req;
-
-	if (!metadata)
-		return dm_io(&io_req, 1, &where, NULL);
-
-	req.where = &where;
-	req.io_req = &io_req;
-
-	/*
-	 * Issue the synchronous I/O from a different thread
-	 * to avoid generic_make_request recursion.
-	 */
-	INIT_WORK(&req.work, do_metadata);
-	queue_work(ps->metadata_wq, &req.work);
-	flush_workqueue(ps->metadata_wq);
-
-	return req.result;
-}
-
-/*
- * Convert a metadata area index to a chunk index.
- */
-static chunk_t area_location(struct pstore *ps, chunk_t area)
-{
-	return 1 + ((ps->exceptions_per_area + 1) * area);
-}
-
-/*
- * Read or write a metadata area.  Remembering to skip the first
- * chunk which holds the header.
- */
-static int area_io(struct pstore *ps, int rw)
-{
-	int r;
-	chunk_t chunk;
-
-	chunk = area_location(ps, ps->current_area);
-
-	r = chunk_io(ps, chunk, rw, 0);
-	if (r)
-		return r;
-
-	return 0;
-}
-
-static void zero_memory_area(struct pstore *ps)
-{
-	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
-}
-
-static int zero_disk_area(struct pstore *ps, chunk_t area)
-{
-	struct dm_io_region where = {
-		.bdev = ps->snap->cow->bdev,
-		.sector = ps->snap->chunk_size * area_location(ps, area),
-		.count = ps->snap->chunk_size,
-	};
-	struct dm_io_request io_req = {
-		.bi_rw = WRITE,
-		.mem.type = DM_IO_VMA,
-		.mem.ptr.vma = ps->zero_area,
-		.client = ps->io_client,
-		.notify.fn = NULL,
-	};
-
-	return dm_io(&io_req, 1, &where, NULL);
-}
-
-static int read_header(struct pstore *ps, int *new_snapshot)
-{
-	int r;
-	struct disk_header *dh;
-	chunk_t chunk_size;
-	int chunk_size_supplied = 1;
-
-	/*
-	 * Use default chunk size (or hardsect_size, if larger) if none supplied
-	 */
-	if (!ps->snap->chunk_size) {
-		ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
-		    bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
-		ps->snap->chunk_mask = ps->snap->chunk_size - 1;
-		ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
-		chunk_size_supplied = 0;
-	}
-
-	ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap->
-							     chunk_size));
-	if (IS_ERR(ps->io_client))
-		return PTR_ERR(ps->io_client);
-
-	r = alloc_area(ps);
-	if (r)
-		return r;
-
-	r = chunk_io(ps, 0, READ, 1);
-	if (r)
-		goto bad;
-
-	dh = (struct disk_header *) ps->area;
-
-	if (le32_to_cpu(dh->magic) == 0) {
-		*new_snapshot = 1;
-		return 0;
-	}
-
-	if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
-		DMWARN("Invalid or corrupt snapshot");
-		r = -ENXIO;
-		goto bad;
-	}
-
-	*new_snapshot = 0;
-	ps->valid = le32_to_cpu(dh->valid);
-	ps->version = le32_to_cpu(dh->version);
-	chunk_size = le32_to_cpu(dh->chunk_size);
-
-	if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
-		return 0;
-
-	DMWARN("chunk size %llu in device metadata overrides "
-	       "table chunk size of %llu.",
-	       (unsigned long long)chunk_size,
-	       (unsigned long long)ps->snap->chunk_size);
-
-	/* We had a bogus chunk_size. Fix stuff up. */
-	free_area(ps);
-
-	ps->snap->chunk_size = chunk_size;
-	ps->snap->chunk_mask = chunk_size - 1;
-	ps->snap->chunk_shift = ffs(chunk_size) - 1;
-
-	r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
-				ps->io_client);
-	if (r)
-		return r;
-
-	r = alloc_area(ps);
-	return r;
-
-bad:
-	free_area(ps);
-	return r;
-}
-
-static int write_header(struct pstore *ps)
-{
-	struct disk_header *dh;
-
-	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
-
-	dh = (struct disk_header *) ps->area;
-	dh->magic = cpu_to_le32(SNAP_MAGIC);
-	dh->valid = cpu_to_le32(ps->valid);
-	dh->version = cpu_to_le32(ps->version);
-	dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);
-
-	return chunk_io(ps, 0, WRITE, 1);
-}
-
-/*
- * Access functions for the disk exceptions, these do the endian conversions.
- */
-static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
-{
-	BUG_ON(index >= ps->exceptions_per_area);
-
-	return ((struct disk_exception *) ps->area) + index;
-}
 
-static void read_exception(struct pstore *ps,
-			   uint32_t index, struct disk_exception *result)
-{
-	struct disk_exception *e = get_exception(ps, index);
-
-	/* copy it */
-	result->old_chunk = le64_to_cpu(e->old_chunk);
-	result->new_chunk = le64_to_cpu(e->new_chunk);
-}
-
-static void write_exception(struct pstore *ps,
-			    uint32_t index, struct disk_exception *de)
-{
-	struct disk_exception *e = get_exception(ps, index);
-
-	/* copy it */
-	e->old_chunk = cpu_to_le64(de->old_chunk);
-	e->new_chunk = cpu_to_le64(de->new_chunk);
-}
+#define DM_MSG_PREFIX "snapshot exception stores"
 
-/*
- * Registers the exceptions that are present in the current area.
- * 'full' is filled in to indicate if the area has been
- * filled.
- */
-static int insert_exceptions(struct pstore *ps, int *full)
+int dm_exception_store_init(void)
 {
 	int r;
-	unsigned int i;
-	struct disk_exception de;
-
-	/* presume the area is full */
-	*full = 1;
-
-	for (i = 0; i < ps->exceptions_per_area; i++) {
-		read_exception(ps, i, &de);
-
-		/*
-		 * If the new_chunk is pointing at the start of
-		 * the COW device, where the first metadata area
-		 * is we know that we've hit the end of the
-		 * exceptions.  Therefore the area is not full.
-		 */
-		if (de.new_chunk == 0LL) {
-			ps->current_committed = i;
-			*full = 0;
-			break;
-		}
-
-		/*
-		 * Keep track of the start of the free chunks.
-		 */
-		if (ps->next_free <= de.new_chunk)
-			ps->next_free = de.new_chunk + 1;
-
-		/*
-		 * Otherwise we add the exception to the snapshot.
-		 */
-		r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
-		if (r)
-			return r;
-	}
-
-	return 0;
-}
-
-static int read_exceptions(struct pstore *ps)
-{
-	int r, full = 1;
-
-	/*
-	 * Keep reading chunks and inserting exceptions until
-	 * we find a partially full area.
-	 */
-	for (ps->current_area = 0; full; ps->current_area++) {
-		r = area_io(ps, READ);
-		if (r)
-			return r;
 
-		r = insert_exceptions(ps, &full);
-		if (r)
-			return r;
+	r = dm_transient_snapshot_init();
+	if (r) {
+		DMERR("Unable to register transient exception store type.");
+		goto transient_fail;
 	}
 
-	ps->current_area--;
-
-	return 0;
-}
+	r = dm_persistent_snapshot_init();
+	if (r) {
+		DMERR("Unable to register persistent exception store type");
+		goto persistent_fail;
+	}
 
-static struct pstore *get_info(struct exception_store *store)
-{
-	return (struct pstore *) store->context;
-}
-
-static void persistent_fraction_full(struct exception_store *store,
-				     sector_t *numerator, sector_t *denominator)
-{
-	*numerator = get_info(store)->next_free * store->snap->chunk_size;
-	*denominator = get_dev_size(store->snap->cow->bdev);
-}
-
-static void persistent_destroy(struct exception_store *store)
-{
-	struct pstore *ps = get_info(store);
-
-	destroy_workqueue(ps->metadata_wq);
-	dm_io_client_destroy(ps->io_client);
-	vfree(ps->callbacks);
-	free_area(ps);
-	kfree(ps);
-}
-
-static int persistent_read_metadata(struct exception_store *store)
-{
-	int r, uninitialized_var(new_snapshot);
-	struct pstore *ps = get_info(store);
-
-	/*
-	 * Read the snapshot header.
-	 */
-	r = read_header(ps, &new_snapshot);
-	if (r)
-		return r;
-
-	/*
-	 * Now we know correct chunk_size, complete the initialisation.
-	 */
-	ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
-				  sizeof(struct disk_exception);
-	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
-				   sizeof(*ps->callbacks));
-	if (!ps->callbacks)
-		return -ENOMEM;
-
-	/*
-	 * Do we need to setup a new snapshot ?
-	 */
-	if (new_snapshot) {
-		r = write_header(ps);
-		if (r) {
-			DMWARN("write_header failed");
-			return r;
-		}
-
-		ps->current_area = 0;
-		zero_memory_area(ps);
-		r = zero_disk_area(ps, 0);
-		if (r) {
-			DMWARN("zero_disk_area(0) failed");
-			return r;
-		}
-	} else {
-		/*
-		 * Sanity checks.
-		 */
-		if (ps->version != SNAPSHOT_DISK_VERSION) {
-			DMWARN("unable to handle snapshot disk version %d",
-			       ps->version);
-			return -EINVAL;
-		}
-
-		/*
-		 * Metadata are valid, but snapshot is invalidated
-		 */
-		if (!ps->valid)
-			return 1;
-
-		/*
-		 * Read the metadata.
-		 */
-		r = read_exceptions(ps);
-		if (r)
-			return r;
-	}
 
 	return 0;
-}
-
-static int persistent_prepare(struct exception_store *store,
-			      struct dm_snap_exception *e)
-{
-	struct pstore *ps = get_info(store);
-	uint32_t stride;
-	chunk_t next_free;
-	sector_t size = get_dev_size(store->snap->cow->bdev);
-
-	/* Is there enough room ? */
-	if (size < ((ps->next_free + 1) * store->snap->chunk_size))
-		return -ENOSPC;
 
-	e->new_chunk = ps->next_free;
-
-	/*
-	 * Move onto the next free pending, making sure to take
-	 * into account the location of the metadata chunks.
-	 */
-	stride = (ps->exceptions_per_area + 1);
-	next_free = ++ps->next_free;
-	if (sector_div(next_free, stride) == 1)
-		ps->next_free++;
-
-	atomic_inc(&ps->pending_count);
-	return 0;
-}
-
-static void persistent_commit(struct exception_store *store,
-			      struct dm_snap_exception *e,
-			      void (*callback) (void *, int success),
-			      void *callback_context)
-{
-	unsigned int i;
-	struct pstore *ps = get_info(store);
-	struct disk_exception de;
-	struct commit_callback *cb;
-
-	de.old_chunk = e->old_chunk;
-	de.new_chunk = e->new_chunk;
-	write_exception(ps, ps->current_committed++, &de);
-
-	/*
-	 * Add the callback to the back of the array.  This code
-	 * is the only place where the callback array is
-	 * manipulated, and we know that it will never be called
-	 * multiple times concurrently.
-	 */
-	cb = ps->callbacks + ps->callback_count++;
-	cb->callback = callback;
-	cb->context = callback_context;
-
-	/*
-	 * If there are exceptions in flight and we have not yet
-	 * filled this metadata area there's nothing more to do.
-	 */
-	if (!atomic_dec_and_test(&ps->pending_count) &&
-	    (ps->current_committed != ps->exceptions_per_area))
-		return;
-
-	/*
-	 * If we completely filled the current area, then wipe the next one.
-	 */
-	if ((ps->current_committed == ps->exceptions_per_area) &&
-	    zero_disk_area(ps, ps->current_area + 1))
-		ps->valid = 0;
-
-	/*
-	 * Commit exceptions to disk.
-	 */
-	if (ps->valid && area_io(ps, WRITE))
-		ps->valid = 0;
-
-	/*
-	 * Advance to the next area if this one is full.
-	 */
-	if (ps->current_committed == ps->exceptions_per_area) {
-		ps->current_committed = 0;
-		ps->current_area++;
-		zero_memory_area(ps);
-	}
-
-	for (i = 0; i < ps->callback_count; i++) {
-		cb = ps->callbacks + i;
-		cb->callback(cb->context, ps->valid);
-	}
-
-	ps->callback_count = 0;
-}
-
-static void persistent_drop(struct exception_store *store)
-{
-	struct pstore *ps = get_info(store);
-
-	ps->valid = 0;
-	if (write_header(ps))
-		DMWARN("write header failed");
-}
-
-int dm_create_persistent(struct exception_store *store)
-{
-	struct pstore *ps;
-
-	/* allocate the pstore */
-	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
-	if (!ps)
-		return -ENOMEM;
-
-	ps->snap = store->snap;
-	ps->valid = 1;
-	ps->version = SNAPSHOT_DISK_VERSION;
-	ps->area = NULL;
-	ps->next_free = 2;	/* skipping the header and first area */
-	ps->current_committed = 0;
-
-	ps->callback_count = 0;
-	atomic_set(&ps->pending_count, 0);
-	ps->callbacks = NULL;
-
-	ps->metadata_wq = create_singlethread_workqueue("ksnaphd");
-	if (!ps->metadata_wq) {
-		kfree(ps);
-		DMERR("couldn't start header metadata update thread");
-		return -ENOMEM;
-	}
-
-	store->destroy = persistent_destroy;
-	store->read_metadata = persistent_read_metadata;
-	store->prepare_exception = persistent_prepare;
-	store->commit_exception = persistent_commit;
-	store->drop_snapshot = persistent_drop;
-	store->fraction_full = persistent_fraction_full;
-	store->context = ps;
-
-	return 0;
-}
-
-/*-----------------------------------------------------------------
- * Implementation of the store for non-persistent snapshots.
- *---------------------------------------------------------------*/
-struct transient_c {
-	sector_t next_free;
-};
-
-static void transient_destroy(struct exception_store *store)
-{
-	kfree(store->context);
-}
-
-static int transient_read_metadata(struct exception_store *store)
-{
-	return 0;
-}
-
-static int transient_prepare(struct exception_store *store,
-			     struct dm_snap_exception *e)
-{
-	struct transient_c *tc = (struct transient_c *) store->context;
-	sector_t size = get_dev_size(store->snap->cow->bdev);
-
-	if (size < (tc->next_free + store->snap->chunk_size))
-		return -1;
-
-	e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
-	tc->next_free += store->snap->chunk_size;
-
-	return 0;
-}
-
-static void transient_commit(struct exception_store *store,
-			     struct dm_snap_exception *e,
-			     void (*callback) (void *, int success),
-			     void *callback_context)
-{
-	/* Just succeed */
-	callback(callback_context, 1);
-}
-
-static void transient_fraction_full(struct exception_store *store,
-				    sector_t *numerator, sector_t *denominator)
-{
-	*numerator = ((struct transient_c *) store->context)->next_free;
-	*denominator = get_dev_size(store->snap->cow->bdev);
+persistent_fail:
+	dm_persistent_snapshot_exit();
+transient_fail:
+	return r;
 }
 
-int dm_create_transient(struct exception_store *store)
+void dm_exception_store_exit(void)
 {
-	struct transient_c *tc;
-
-	store->destroy = transient_destroy;
-	store->read_metadata = transient_read_metadata;
-	store->prepare_exception = transient_prepare;
-	store->commit_exception = transient_commit;
-	store->drop_snapshot = NULL;
-	store->fraction_full = transient_fraction_full;
-
-	tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
-	if (!tc)
-		return -ENOMEM;
-
-	tc->next_free = 0;
-	store->context = tc;
-
-	return 0;
+	dm_persistent_snapshot_exit();
+	dm_transient_snapshot_exit();
 }
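
The rewritten dm_exception_store_init() above follows the usual kernel unwind idiom: register each store type in order and, on failure, fall through labels that undo what already succeeded. A generic sketch of the idiom with hypothetical step functions (not part of this commit):

	static int example_init(void)
	{
		int r;

		r = step_a_init();
		if (r)
			goto a_fail;

		r = step_b_init();
		if (r)
			goto b_fail;

		return 0;

	b_fail:
		step_a_exit();	/* undo the step that did succeed */
	a_fail:
		return r;
	}
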
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
new file mode 100644
index 000000000000..bb9f33d5daa2
--- /dev/null
+++ b/drivers/md/dm-exception-store.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2008 Red Hat, Inc. All rights reserved.
+ *
+ * Device-mapper snapshot exception store.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef _LINUX_DM_EXCEPTION_STORE
+#define _LINUX_DM_EXCEPTION_STORE
+
+#include <linux/blkdev.h>
+#include <linux/device-mapper.h>
+
+/*
+ * The snapshot code deals with largish chunks of the disk at a
+ * time. Typically 32k - 512k.
+ */
+typedef sector_t chunk_t;
+
+/*
+ * An exception is used where an old chunk of data has been
+ * replaced by a new one.
+ * If chunk_t is 64 bits in size, the top 8 bits of new_chunk hold the number
+ * of chunks that follow contiguously. Remaining bits hold the number of the
+ * chunk within the device.
+ */
+struct dm_snap_exception {
+	struct list_head hash_list;
+
+	chunk_t old_chunk;
+	chunk_t new_chunk;
+};
+
+/*
+ * Abstraction to handle the meta/layout of exception stores (the
+ * COW device).
+ */
+struct dm_exception_store {
+	/*
+	 * Destroys this object when you've finished with it.
+	 */
+	void (*destroy) (struct dm_exception_store *store);
+
+	/*
+	 * The target shouldn't read the COW device until this is
+	 * called. As exceptions are read from the COW, they are
+	 * reported back via the callback.
+	 */
+	int (*read_metadata) (struct dm_exception_store *store,
+			      int (*callback)(void *callback_context,
+					      chunk_t old, chunk_t new),
+			      void *callback_context);
+
+	/*
+	 * Find somewhere to store the next exception.
+	 */
+	int (*prepare_exception) (struct dm_exception_store *store,
+				  struct dm_snap_exception *e);
+
+	/*
+	 * Update the metadata with this exception.
+	 */
+	void (*commit_exception) (struct dm_exception_store *store,
+				  struct dm_snap_exception *e,
+				  void (*callback) (void *, int success),
+				  void *callback_context);
+
+	/*
+	 * The snapshot is invalid, note this in the metadata.
+	 */
+	void (*drop_snapshot) (struct dm_exception_store *store);
+
+	int (*status) (struct dm_exception_store *store, status_type_t status,
+		       char *result, unsigned int maxlen);
+
+	/*
+	 * Return how full the snapshot is.
+	 */
+	void (*fraction_full) (struct dm_exception_store *store,
+			       sector_t *numerator,
+			       sector_t *denominator);
+
+	struct dm_snapshot *snap;
+	void *context;
+};
+
+/*
+ * Functions to manipulate consecutive chunks
+ */
+# if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
+#  define DM_CHUNK_CONSECUTIVE_BITS 8
+#  define DM_CHUNK_NUMBER_BITS 56
+
+static inline chunk_t dm_chunk_number(chunk_t chunk)
+{
+	return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
+}
+
+static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
+{
+	return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
+}
+
+static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
+{
+	e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);
+
+	BUG_ON(!dm_consecutive_chunk_count(e));
+}
+
+# else
+#  define DM_CHUNK_CONSECUTIVE_BITS 0
+
+static inline chunk_t dm_chunk_number(chunk_t chunk)
+{
+	return chunk;
+}
+
+static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
+{
+	return 0;
+}
+
+static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
+{
+}
+
+# endif
+
+int dm_exception_store_init(void);
+void dm_exception_store_exit(void);
+
+/*
+ * Two exception store implementations.
+ */
+int dm_persistent_snapshot_init(void);
+void dm_persistent_snapshot_exit(void);
+
+int dm_transient_snapshot_init(void);
+void dm_transient_snapshot_exit(void);
+
+int dm_create_persistent(struct dm_exception_store *store);
+
+int dm_create_transient(struct dm_exception_store *store);
+
+#endif /* _LINUX_DM_EXCEPTION_STORE */
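
On 64-bit chunk_t configurations, the header above packs a run length into the top DM_CHUNK_CONSECUTIVE_BITS of new_chunk, leaving the low 56 bits for the chunk number. A standalone user-space sketch of the same packing arithmetic (not kernel code):

	#include <stdint.h>
	#include <stdio.h>

	#define NUMBER_BITS 56

	static uint64_t chunk_number(uint64_t chunk)
	{
		return chunk & ((1ULL << NUMBER_BITS) - 1);
	}

	static unsigned consecutive_count(uint64_t chunk)
	{
		return chunk >> NUMBER_BITS;
	}

	int main(void)
	{
		uint64_t new_chunk = 1000;		/* run starts at chunk 1000 */

		new_chunk += 2ULL << NUMBER_BITS;	/* two more chunks follow */
		printf("chunk %llu, %u consecutive\n",
		       (unsigned long long)chunk_number(new_chunk),
		       consecutive_count(new_chunk));	/* chunk 1000, 2 consecutive */
		return 0;
	}
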
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 2fd6d4450637..a34338567a2a 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -56,7 +56,7 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages)
 	if (!client->pool)
 		goto bad;
 
-	client->bios = bioset_create(16, 16);
+	client->bios = bioset_create(16, 0);
 	if (!client->bios)
 		goto bad;
 
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 777c948180f9..54d0588fc1f6 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -233,7 +233,7 @@ static void __hash_remove(struct hash_cell *hc)
 	}
 
 	if (hc->new_map)
-		dm_table_put(hc->new_map);
+		dm_table_destroy(hc->new_map);
 	dm_put(hc->md);
 	free_cell(hc);
 }
@@ -827,8 +827,8 @@ static int do_resume(struct dm_ioctl *param)
 
 	r = dm_swap_table(md, new_map);
 	if (r) {
+		dm_table_destroy(new_map);
 		dm_put(md);
-		dm_table_put(new_map);
 		return r;
 	}
 
@@ -836,8 +836,6 @@ static int do_resume(struct dm_ioctl *param)
 			set_disk_ro(dm_disk(md), 0);
 		else
 			set_disk_ro(dm_disk(md), 1);
-
-		dm_table_put(new_map);
 	}
 
 	if (dm_suspended(md))
@@ -1080,7 +1078,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 	}
 
 	if (hc->new_map)
-		dm_table_put(hc->new_map);
+		dm_table_destroy(hc->new_map);
 	hc->new_map = t;
 	up_write(&_hash_lock);
 
@@ -1109,7 +1107,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
 	}
 
 	if (hc->new_map) {
-		dm_table_put(hc->new_map);
+		dm_table_destroy(hc->new_map);
 		hc->new_map = NULL;
 	}
 
@@ -1550,8 +1548,10 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid)
 		goto out;
 	}
 
-	strcpy(name, hc->name);
-	strcpy(uuid, hc->uuid ? : "");
+	if (name)
+		strcpy(name, hc->name);
+	if (uuid)
+		strcpy(uuid, hc->uuid ? : "");
 
 out:
 	up_read(&_hash_lock);
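
With the dm-ioctl.c change above, dm_copy_name_and_uuid() tolerates a NULL name or uuid argument, so a caller wanting only one of the two values no longer has to supply scratch space for the other. A hypothetical caller sketch (buffer size as in the dm ioctl interface):

	char uuid[DM_UUID_LEN];

	/* Only the uuid is wanted; passing NULL for the name is now safe. */
	if (!dm_copy_name_and_uuid(md, NULL, uuid))
		DMINFO("device uuid: %s", uuid);
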
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 44042becad8a..bfa107f59d96 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -142,6 +142,7 @@ static struct target_type linear_target = {
 	.status = linear_status,
 	.ioctl  = linear_ioctl,
 	.merge  = linear_merge,
+	.features = DM_TARGET_SUPPORTS_BARRIERS,
 };
 
 int __init dm_linear_init(void)
@@ -156,8 +157,5 @@ int __init dm_linear_init(void)
 
 void dm_linear_exit(void)
 {
-	int r = dm_unregister_target(&linear_target);
-
-	if (r < 0)
-		DMERR("unregister failed %d", r);
+	dm_unregister_target(&linear_target);
 }
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index a8c0fc79ca78..737961f275c1 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -326,8 +326,6 @@ static void header_from_disk(struct log_header *core, struct log_header *disk)
 static int rw_header(struct log_c *lc, int rw)
 {
 	lc->io_req.bi_rw = rw;
-	lc->io_req.mem.ptr.vma = lc->disk_header;
-	lc->io_req.notify.fn = NULL;
 
 	return dm_io(&lc->io_req, 1, &lc->header_location, NULL);
 }
@@ -362,10 +360,15 @@ static int read_header(struct log_c *log)
 	return 0;
 }
 
-static inline int write_header(struct log_c *log)
+static int _check_region_size(struct dm_target *ti, uint32_t region_size)
 {
-	header_to_disk(&log->header, log->disk_header);
-	return rw_header(log, WRITE);
+	if (region_size < 2 || region_size > ti->len)
+		return 0;
+
+	if (!is_power_of_2(region_size))
+		return 0;
+
+	return 1;
 }
 
 /*----------------------------------------------------------------
@@ -403,8 +406,9 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		}
 	}
 
-	if (sscanf(argv[0], "%u", &region_size) != 1) {
-		DMWARN("invalid region size string");
+	if (sscanf(argv[0], "%u", &region_size) != 1 ||
+	    !_check_region_size(ti, region_size)) {
+		DMWARN("invalid region size %s", argv[0]);
 		return -EINVAL;
 	}
 
@@ -453,8 +457,18 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 	 */
 	buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) +
 			       bitset_size, ti->limits.hardsect_size);
+
+	if (buf_size > dev->bdev->bd_inode->i_size) {
+		DMWARN("log device %s too small: need %llu bytes",
+		       dev->name, (unsigned long long)buf_size);
+		kfree(lc);
+		return -EINVAL;
+	}
+
 	lc->header_location.count = buf_size >> SECTOR_SHIFT;
+
 	lc->io_req.mem.type = DM_IO_VMA;
+	lc->io_req.notify.fn = NULL;
 	lc->io_req.client = dm_io_client_create(dm_div_up(buf_size,
 							  PAGE_SIZE));
 	if (IS_ERR(lc->io_req.client)) {
@@ -467,10 +481,12 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		lc->disk_header = vmalloc(buf_size);
 		if (!lc->disk_header) {
 			DMWARN("couldn't allocate disk log buffer");
+			dm_io_client_destroy(lc->io_req.client);
 			kfree(lc);
 			return -ENOMEM;
 		}
 
+		lc->io_req.mem.ptr.vma = lc->disk_header;
 		lc->clean_bits = (void *)lc->disk_header +
 				 (LOG_OFFSET << SECTOR_SHIFT);
 	}
@@ -482,6 +498,8 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		DMWARN("couldn't allocate sync bitset");
 		if (!dev)
 			vfree(lc->clean_bits);
+		else
+			dm_io_client_destroy(lc->io_req.client);
 		vfree(lc->disk_header);
 		kfree(lc);
 		return -ENOMEM;
@@ -495,6 +513,8 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		vfree(lc->sync_bits);
 		if (!dev)
 			vfree(lc->clean_bits);
+		else
+			dm_io_client_destroy(lc->io_req.client);
 		vfree(lc->disk_header);
 		kfree(lc);
 		return -ENOMEM;
@@ -631,8 +651,10 @@ static int disk_resume(struct dm_dirty_log *log)
 	/* set the correct number of regions in the header */
 	lc->header.nr_regions = lc->region_count;
 
+	header_to_disk(&lc->header, lc->disk_header);
+
 	/* write the new header */
-	r = write_header(lc);
+	r = rw_header(lc, WRITE);
 	if (r) {
 		DMWARN("%s: Failed to write header on dirty region log device",
 		       lc->log_dev->name);
@@ -682,7 +704,7 @@ static int disk_flush(struct dm_dirty_log *log)
 	if (!lc->touched)
 		return 0;
 
-	r = write_header(lc);
+	r = rw_header(lc, WRITE);
 	if (r)
 		fail_log_device(lc);
 	else
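
The new _check_region_size() in dm-log.c centralizes the sanity rules for a user-supplied region size: at least 2 sectors, no larger than the target, and a power of two so region arithmetic can use shifts and masks. A standalone sketch of the same checks, with the power-of-two test open-coded:

	/* Mirrors the new check; plain integer types instead of dm ones. */
	static int check_region_size(unsigned long long ti_len,
				     unsigned int region_size)
	{
		if (region_size < 2 || region_size > ti_len)
			return 0;

		if (region_size & (region_size - 1))	/* !is_power_of_2() */
			return 0;

		return 1;
	}

	/* check_region_size(1024, 512) -> 1; check_region_size(1024, 384) -> 0. */
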
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 3d7f4923cd13..095f77bf9681 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -889,7 +889,7 @@ static int fail_path(struct pgpath *pgpath)
 	dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
 		      pgpath->path.dev->name, m->nr_valid_paths);
 
-	queue_work(kmultipathd, &m->trigger_event);
+	schedule_work(&m->trigger_event);
 	queue_work(kmultipathd, &pgpath->deactivate_path);
 
 out:
@@ -932,7 +932,7 @@ static int reinstate_path(struct pgpath *pgpath)
 	dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
 		      pgpath->path.dev->name, m->nr_valid_paths);
 
-	queue_work(kmultipathd, &m->trigger_event);
+	schedule_work(&m->trigger_event);
 
 out:
 	spin_unlock_irqrestore(&m->lock, flags);
@@ -976,7 +976,7 @@ static void bypass_pg(struct multipath *m, struct priority_group *pg,
 
 	spin_unlock_irqrestore(&m->lock, flags);
 
-	queue_work(kmultipathd, &m->trigger_event);
+	schedule_work(&m->trigger_event);
 }
 
 /*
@@ -1006,7 +1006,7 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
 	}
 	spin_unlock_irqrestore(&m->lock, flags);
 
-	queue_work(kmultipathd, &m->trigger_event);
+	schedule_work(&m->trigger_event);
 	return 0;
 }
 
@@ -1495,14 +1495,10 @@ static int __init dm_multipath_init(void)
 
 static void __exit dm_multipath_exit(void)
 {
-	int r;
-
 	destroy_workqueue(kmpath_handlerd);
 	destroy_workqueue(kmultipathd);
 
-	r = dm_unregister_target(&multipath_target);
-	if (r < 0)
-		DMERR("target unregister failed %d", r);
+	dm_unregister_target(&multipath_target);
 	kmem_cache_destroy(_mpio_cache);
 }
 
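
The dm-mpath.c hunks above move trigger_event onto the shared kernel workqueue via schedule_work(), leaving the private kmultipathd queue to path work such as deactivate_path. A minimal sketch of the two-queue split, with hypothetical work items:

	#include <linux/workqueue.h>

	static struct workqueue_struct *private_wq;	/* create_workqueue()d elsewhere */
	static struct work_struct event_work;		/* INIT_WORK()ed elsewhere */
	static struct work_struct path_work;

	static void dispatch(void)
	{
		schedule_work(&event_work);		/* shared system workqueue */
		queue_work(private_wq, &path_work);	/* dedicated workqueue */
	}
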
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index ec43f9fa4b2a..4d6bc101962e 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -197,9 +197,6 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
 	struct mirror_set *ms = m->ms;
 	struct mirror *new;
 
-	if (!errors_handled(ms))
-		return;
-
 	/*
 	 * error_count is used for nothing more than a
 	 * simple way to tell if a device has encountered
@@ -210,6 +207,9 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
 	if (test_and_set_bit(error_type, &m->error_type))
 		return;
 
+	if (!errors_handled(ms))
+		return;
+
 	if (m != get_default_mirror(ms))
 		goto out;
 
@@ -808,12 +808,6 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti,
 	kfree(ms);
 }
 
-static inline int _check_region_size(struct dm_target *ti, uint32_t size)
-{
-	return !(size % (PAGE_SIZE >> 9) || !is_power_of_2(size) ||
-		 size > ti->len);
-}
-
 static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
 		      unsigned int mirror, char **argv)
 {
@@ -872,12 +866,6 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
 		return NULL;
 	}
 
-	if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
-		ti->error = "Invalid region size";
-		dm_dirty_log_destroy(dl);
-		return NULL;
-	}
-
 	return dl;
 }
 
@@ -1300,11 +1288,7 @@ static int __init dm_mirror_init(void)
 
 static void __exit dm_mirror_exit(void)
 {
-	int r;
-
-	r = dm_unregister_target(&mirror_target);
-	if (r < 0)
-		DMERR("unregister failed %d", r);
+	dm_unregister_target(&mirror_target);
 }
 
 /* Module hooks */
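
In the reordered fail_mirror() above, the error bit is recorded with test_and_set_bit() before errors_handled() is consulted, so a fault is remembered even when no handler is configured, and repeat faults of the same type return early. A sketch of the pattern with a hypothetical flags word:

	#include <linux/bitops.h>

	static unsigned long error_type;	/* one bit per error kind */

	static void record_fault(int type, int handled)
	{
		/* Remember the fault first; only the first occurrence continues. */
		if (test_and_set_bit(type, &error_type))
			return;

		if (!handled)
			return;	/* recorded, but nothing more to do */

		/* ... switch the default mirror, wake recovery, etc. ... */
	}
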
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
new file mode 100644
index 000000000000..936b34e0959f
--- /dev/null
+++ b/drivers/md/dm-snap-persistent.c
@@ -0,0 +1,704 @@
1/*
2 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2006-2008 Red Hat GmbH
4 *
5 * This file is released under the GPL.
6 */
7
8#include "dm-exception-store.h"
9#include "dm-snap.h"
10
11#include <linux/mm.h>
12#include <linux/pagemap.h>
13#include <linux/vmalloc.h>
14#include <linux/slab.h>
15#include <linux/dm-io.h>
16
17#define DM_MSG_PREFIX "persistent snapshot"
18#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */
19
20/*-----------------------------------------------------------------
21 * Persistent snapshots, by persistent we mean that the snapshot
22 * will survive a reboot.
23 *---------------------------------------------------------------*/
24
25/*
26 * We need to store a record of which parts of the origin have
27 * been copied to the snapshot device. The snapshot code
28 * requires that we copy exception chunks to chunk aligned areas
29 * of the COW store. It makes sense therefore, to store the
30 * metadata in chunk size blocks.
31 *
32 * There is no backward or forward compatibility implemented,
33 * snapshots with different disk versions than the kernel will
34 * not be usable. It is expected that "lvcreate" will blank out
35 * the start of a fresh COW device before calling the snapshot
36 * constructor.
37 *
38 * The first chunk of the COW device just contains the header.
39 * After this there is a chunk filled with exception metadata,
40 * followed by as many exception chunks as can fit in the
41 * metadata areas.
42 *
43 * All on disk structures are in little-endian format. The end
44 * of the exceptions info is indicated by an exception with a
45 * new_chunk of 0, which is invalid since it would point to the
46 * header chunk.
47 */
48
49/*
50 * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
51 */
52#define SNAP_MAGIC 0x70416e53
53
54/*
55 * The on-disk version of the metadata.
56 */
57#define SNAPSHOT_DISK_VERSION 1
58
59struct disk_header {
60 uint32_t magic;
61
62 /*
63 * Is this snapshot valid. There is no way of recovering
64 * an invalid snapshot.
65 */
66 uint32_t valid;
67
68 /*
69 * Simple, incrementing version. no backward
70 * compatibility.
71 */
72 uint32_t version;
73
74 /* In sectors */
75 uint32_t chunk_size;
76};
77
78struct disk_exception {
79 uint64_t old_chunk;
80 uint64_t new_chunk;
81};
82
83struct commit_callback {
84 void (*callback)(void *, int success);
85 void *context;
86};
87
88/*
89 * The top level structure for a persistent exception store.
90 */
91struct pstore {
92 struct dm_snapshot *snap; /* up pointer to my snapshot */
93 int version;
94 int valid;
95 uint32_t exceptions_per_area;
96
97 /*
98 * Now that we have an asynchronous kcopyd there is no
99 * need for large chunk sizes, so it wont hurt to have a
100 * whole chunks worth of metadata in memory at once.
101 */
102 void *area;
103
104 /*
105 * An area of zeros used to clear the next area.
106 */
107 void *zero_area;
108
109 /*
110 * Used to keep track of which metadata area the data in
111 * 'chunk' refers to.
112 */
113 chunk_t current_area;
114
115 /*
116 * The next free chunk for an exception.
117 */
118 chunk_t next_free;
119
120 /*
121 * The index of next free exception in the current
122 * metadata area.
123 */
124 uint32_t current_committed;
125
126 atomic_t pending_count;
127 uint32_t callback_count;
128 struct commit_callback *callbacks;
129 struct dm_io_client *io_client;
130
131 struct workqueue_struct *metadata_wq;
132};
133
134static unsigned sectors_to_pages(unsigned sectors)
135{
136 return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9);
137}
138
139static int alloc_area(struct pstore *ps)
140{
141 int r = -ENOMEM;
142 size_t len;
143
144 len = ps->snap->chunk_size << SECTOR_SHIFT;
145
146 /*
147 * Allocate the chunk_size block of memory that will hold
148 * a single metadata area.
149 */
150 ps->area = vmalloc(len);
151 if (!ps->area)
152 return r;
153
154 ps->zero_area = vmalloc(len);
155 if (!ps->zero_area) {
156 vfree(ps->area);
157 return r;
158 }
159 memset(ps->zero_area, 0, len);
160
161 return 0;
162}
163
164static void free_area(struct pstore *ps)
165{
166 vfree(ps->area);
167 ps->area = NULL;
168 vfree(ps->zero_area);
169 ps->zero_area = NULL;
170}
171
172struct mdata_req {
173 struct dm_io_region *where;
174 struct dm_io_request *io_req;
175 struct work_struct work;
176 int result;
177};
178
179static void do_metadata(struct work_struct *work)
180{
181 struct mdata_req *req = container_of(work, struct mdata_req, work);
182
183 req->result = dm_io(req->io_req, 1, req->where, NULL);
184}
185
186/*
187 * Read or write a chunk aligned and sized block of data from a device.
188 */
189static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata)
190{
191 struct dm_io_region where = {
192 .bdev = ps->snap->cow->bdev,
193 .sector = ps->snap->chunk_size * chunk,
194 .count = ps->snap->chunk_size,
195 };
196 struct dm_io_request io_req = {
197 .bi_rw = rw,
198 .mem.type = DM_IO_VMA,
199 .mem.ptr.vma = ps->area,
200 .client = ps->io_client,
201 .notify.fn = NULL,
202 };
203 struct mdata_req req;
204
205 if (!metadata)
206 return dm_io(&io_req, 1, &where, NULL);
207
208 req.where = &where;
209 req.io_req = &io_req;
210
211 /*
212 * Issue the synchronous I/O from a different thread
213 * to avoid generic_make_request recursion.
214 */
215 INIT_WORK(&req.work, do_metadata);
216 queue_work(ps->metadata_wq, &req.work);
217 flush_workqueue(ps->metadata_wq);
218
219 return req.result;
220}
221
222/*
223 * Convert a metadata area index to a chunk index.
224 */
225static chunk_t area_location(struct pstore *ps, chunk_t area)
226{
227 return 1 + ((ps->exceptions_per_area + 1) * area);
228}
229
230/*
231 * Read or write a metadata area. Remembering to skip the first
232 * chunk which holds the header.
233 */
234static int area_io(struct pstore *ps, int rw)
235{
236 int r;
237 chunk_t chunk;
238
239 chunk = area_location(ps, ps->current_area);
240
241 r = chunk_io(ps, chunk, rw, 0);
242 if (r)
243 return r;
244
245 return 0;
246}
247
248static void zero_memory_area(struct pstore *ps)
249{
250 memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
251}
252
253static int zero_disk_area(struct pstore *ps, chunk_t area)
254{
255 struct dm_io_region where = {
256 .bdev = ps->snap->cow->bdev,
257 .sector = ps->snap->chunk_size * area_location(ps, area),
258 .count = ps->snap->chunk_size,
259 };
260 struct dm_io_request io_req = {
261 .bi_rw = WRITE,
262 .mem.type = DM_IO_VMA,
263 .mem.ptr.vma = ps->zero_area,
264 .client = ps->io_client,
265 .notify.fn = NULL,
266 };
267
268 return dm_io(&io_req, 1, &where, NULL);
269}
270
271static int read_header(struct pstore *ps, int *new_snapshot)
272{
273 int r;
274 struct disk_header *dh;
275 chunk_t chunk_size;
276 int chunk_size_supplied = 1;
277
278 /*
279 * Use default chunk size (or hardsect_size, if larger) if none supplied
280 */
281 if (!ps->snap->chunk_size) {
282 ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
283 bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
284 ps->snap->chunk_mask = ps->snap->chunk_size - 1;
285 ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
286 chunk_size_supplied = 0;
287 }
288
289 ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap->
290 chunk_size));
291 if (IS_ERR(ps->io_client))
292 return PTR_ERR(ps->io_client);
293
294 r = alloc_area(ps);
295 if (r)
296 return r;
297
298 r = chunk_io(ps, 0, READ, 1);
299 if (r)
300 goto bad;
301
302 dh = (struct disk_header *) ps->area;
303
304 if (le32_to_cpu(dh->magic) == 0) {
305 *new_snapshot = 1;
306 return 0;
307 }
308
309 if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
310 DMWARN("Invalid or corrupt snapshot");
311 r = -ENXIO;
312 goto bad;
313 }
314
315 *new_snapshot = 0;
316 ps->valid = le32_to_cpu(dh->valid);
317 ps->version = le32_to_cpu(dh->version);
318 chunk_size = le32_to_cpu(dh->chunk_size);
319
320 if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
321 return 0;
322
323 DMWARN("chunk size %llu in device metadata overrides "
324 "table chunk size of %llu.",
325 (unsigned long long)chunk_size,
326 (unsigned long long)ps->snap->chunk_size);
327
328 /* We had a bogus chunk_size. Fix stuff up. */
329 free_area(ps);
330
331 ps->snap->chunk_size = chunk_size;
332 ps->snap->chunk_mask = chunk_size - 1;
333 ps->snap->chunk_shift = ffs(chunk_size) - 1;
334
335 r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
336 ps->io_client);
337 if (r)
338 return r;
339
340 r = alloc_area(ps);
341 return r;
342
343bad:
344 free_area(ps);
345 return r;
346}
347
348static int write_header(struct pstore *ps)
349{
350 struct disk_header *dh;
351
352 memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
353
354 dh = (struct disk_header *) ps->area;
355 dh->magic = cpu_to_le32(SNAP_MAGIC);
356 dh->valid = cpu_to_le32(ps->valid);
357 dh->version = cpu_to_le32(ps->version);
358 dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);
359
360 return chunk_io(ps, 0, WRITE, 1);
361}
362
363/*
364 * Access functions for the disk exceptions, these do the endian conversions.
365 */
366static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
367{
368 BUG_ON(index >= ps->exceptions_per_area);
369
370 return ((struct disk_exception *) ps->area) + index;
371}
372
373static void read_exception(struct pstore *ps,
374 uint32_t index, struct disk_exception *result)
375{
376 struct disk_exception *e = get_exception(ps, index);
377
378 /* copy it */
379 result->old_chunk = le64_to_cpu(e->old_chunk);
380 result->new_chunk = le64_to_cpu(e->new_chunk);
381}
382
383static void write_exception(struct pstore *ps,
384 uint32_t index, struct disk_exception *de)
385{
386 struct disk_exception *e = get_exception(ps, index);
387
388 /* copy it */
389 e->old_chunk = cpu_to_le64(de->old_chunk);
390 e->new_chunk = cpu_to_le64(de->new_chunk);
391}
392
393/*
394 * Registers the exceptions that are present in the current area.
395 * 'full' is filled in to indicate if the area has been
396 * filled.
397 */
398static int insert_exceptions(struct pstore *ps,
399 int (*callback)(void *callback_context,
400 chunk_t old, chunk_t new),
401 void *callback_context,
402 int *full)
403{
404 int r;
405 unsigned int i;
406 struct disk_exception de;
407
408 /* presume the area is full */
409 *full = 1;
410
411 for (i = 0; i < ps->exceptions_per_area; i++) {
412 read_exception(ps, i, &de);
413
414 /*
415 * If the new_chunk is pointing at the start of
416 * the COW device, where the first metadata area
417 * is we know that we've hit the end of the
418 * exceptions. Therefore the area is not full.
419 */
420 if (de.new_chunk == 0LL) {
421 ps->current_committed = i;
422 *full = 0;
423 break;
424 }
425
426 /*
427 * Keep track of the start of the free chunks.
428 */
429 if (ps->next_free <= de.new_chunk)
430 ps->next_free = de.new_chunk + 1;
431
432 /*
433 * Otherwise we add the exception to the snapshot.
434 */
435 r = callback(callback_context, de.old_chunk, de.new_chunk);
436 if (r)
437 return r;
438 }
439
440 return 0;
441}
442
443static int read_exceptions(struct pstore *ps,
444 int (*callback)(void *callback_context, chunk_t old,
445 chunk_t new),
446 void *callback_context)
447{
448 int r, full = 1;
449
450 /*
451 * Keeping reading chunks and inserting exceptions until
452 * we find a partially full area.
453 */
454 for (ps->current_area = 0; full; ps->current_area++) {
455 r = area_io(ps, READ);
456 if (r)
457 return r;
458
459 r = insert_exceptions(ps, callback, callback_context, &full);
460 if (r)
461 return r;
462 }
463
464 ps->current_area--;
465
466 return 0;
467}
468
469static struct pstore *get_info(struct dm_exception_store *store)
470{
471 return (struct pstore *) store->context;
472}
473
474static void persistent_fraction_full(struct dm_exception_store *store,
475 sector_t *numerator, sector_t *denominator)
476{
477 *numerator = get_info(store)->next_free * store->snap->chunk_size;
478 *denominator = get_dev_size(store->snap->cow->bdev);
479}
480
481static void persistent_destroy(struct dm_exception_store *store)
482{
483 struct pstore *ps = get_info(store);
484
485 destroy_workqueue(ps->metadata_wq);
486 dm_io_client_destroy(ps->io_client);
487 vfree(ps->callbacks);
488 free_area(ps);
489 kfree(ps);
490}
491
492static int persistent_read_metadata(struct dm_exception_store *store,
493 int (*callback)(void *callback_context,
494 chunk_t old, chunk_t new),
495 void *callback_context)
496{
497 int r, uninitialized_var(new_snapshot);
498 struct pstore *ps = get_info(store);
499
500 /*
501 * Read the snapshot header.
502 */
503 r = read_header(ps, &new_snapshot);
504 if (r)
505 return r;
506
507 /*
508 * Now we know correct chunk_size, complete the initialisation.
509 */
510 ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
511 sizeof(struct disk_exception);
512 ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
513 sizeof(*ps->callbacks));
514 if (!ps->callbacks)
515 return -ENOMEM;
516
517 /*
518 * Do we need to setup a new snapshot ?
519 */
520 if (new_snapshot) {
521 r = write_header(ps);
522 if (r) {
523 DMWARN("write_header failed");
524 return r;
525 }
526
527 ps->current_area = 0;
528 zero_memory_area(ps);
529 r = zero_disk_area(ps, 0);
530 if (r) {
531 DMWARN("zero_disk_area(0) failed");
532 return r;
533 }
534 } else {
535 /*
536 * Sanity checks.
537 */
538 if (ps->version != SNAPSHOT_DISK_VERSION) {
539 DMWARN("unable to handle snapshot disk version %d",
540 ps->version);
541 return -EINVAL;
542 }
543
544 /*
545 * Metadata are valid, but snapshot is invalidated
546 */
547 if (!ps->valid)
548 return 1;
549
550 /*
551 * Read the metadata.
552 */
553 r = read_exceptions(ps, callback, callback_context);
554 if (r)
555 return r;
556 }
557
558 return 0;
559}
560
561static int persistent_prepare_exception(struct dm_exception_store *store,
562 struct dm_snap_exception *e)
563{
564 struct pstore *ps = get_info(store);
565 uint32_t stride;
566 chunk_t next_free;
567 sector_t size = get_dev_size(store->snap->cow->bdev);
568
569 /* Is there enough room? */
570 if (size < ((ps->next_free + 1) * store->snap->chunk_size))
571 return -ENOSPC;
572
573 e->new_chunk = ps->next_free;
574
575 /*
576 * Move on to the next free slot, making sure to take
577 * into account the location of the metadata chunks.
578 */
579 stride = (ps->exceptions_per_area + 1);
580 next_free = ++ps->next_free;
581 if (sector_div(next_free, stride) == 1)
582 ps->next_free++;
583
584 atomic_inc(&ps->pending_count);
585 return 0;
586}
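
The stride arithmetic above is easier to see with concrete numbers. Chunk 0 of the COW device holds the header, chunk 1 the first metadata area, and every (exceptions_per_area + 1)-th chunk after that is another metadata area, with data chunks filling the gaps. A hedged sketch in plain C, with an ordinary modulus standing in for sector_div() (which in the kernel also divides its first argument in place):

#include <stdio.h>
#include <stdint.h>

/* Return the next free data chunk strictly after 'next_free',
 * skipping the chunks reserved for metadata areas. */
static uint64_t advance_next_free(uint64_t next_free, uint32_t per_area)
{
        uint32_t stride = per_area + 1;  /* one metadata chunk per area */

        next_free++;
        if (next_free % stride == 1)     /* chunks 1, 1+stride, ... hold metadata */
                next_free++;
        return next_free;
}

int main(void)
{
        /* With 3 exceptions per area: metadata at chunks 1, 5, 9, ...
         * so data chunks come out as 2, 3, 4, 6, 7, 8, 10, 11. */
        uint64_t c = 2;                  /* ps->next_free starts at 2 */

        for (int i = 0; i < 8; i++) {
                printf("%llu ", (unsigned long long)c);
                c = advance_next_free(c, 3);
        }
        printf("\n");
        return 0;
}
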
587
588static void persistent_commit_exception(struct dm_exception_store *store,
589 struct dm_snap_exception *e,
590 void (*callback) (void *, int success),
591 void *callback_context)
592{
593 unsigned int i;
594 struct pstore *ps = get_info(store);
595 struct disk_exception de;
596 struct commit_callback *cb;
597
598 de.old_chunk = e->old_chunk;
599 de.new_chunk = e->new_chunk;
600 write_exception(ps, ps->current_committed++, &de);
601
602 /*
603 * Add the callback to the back of the array. This code
604 * is the only place where the callback array is
605 * manipulated, and we know that it will never be called
606 * multiple times concurrently.
607 */
608 cb = ps->callbacks + ps->callback_count++;
609 cb->callback = callback;
610 cb->context = callback_context;
611
612 /*
613 * If there are exceptions in flight and we have not yet
614 * filled this metadata area, there's nothing more to do.
615 */
616 if (!atomic_dec_and_test(&ps->pending_count) &&
617 (ps->current_committed != ps->exceptions_per_area))
618 return;
619
620 /*
621 * If we completely filled the current area, then wipe the next one.
622 */
623 if ((ps->current_committed == ps->exceptions_per_area) &&
624 zero_disk_area(ps, ps->current_area + 1))
625 ps->valid = 0;
626
627 /*
628 * Commit exceptions to disk.
629 */
630 if (ps->valid && area_io(ps, WRITE))
631 ps->valid = 0;
632
633 /*
634 * Advance to the next area if this one is full.
635 */
636 if (ps->current_committed == ps->exceptions_per_area) {
637 ps->current_committed = 0;
638 ps->current_area++;
639 zero_memory_area(ps);
640 }
641
642 for (i = 0; i < ps->callback_count; i++) {
643 cb = ps->callbacks + i;
644 cb->callback(cb->context, ps->valid);
645 }
646
647 ps->callback_count = 0;
648}
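
The callback handling in persistent_commit_exception() is a batching pattern: completions are queued per metadata area and fired only once the whole area has been written (or the store invalidated), so a single flush can acknowledge many exceptions. A minimal standalone sketch of that idea, with invented names and a fixed-size queue:

#include <stdio.h>

typedef void (*commit_fn)(void *context, int success);

struct commit_batch {
        struct { commit_fn fn; void *ctx; } cb[16];
        unsigned count;
};

static void batch_add(struct commit_batch *b, commit_fn fn, void *ctx)
{
        b->cb[b->count].fn = fn;
        b->cb[b->count].ctx = ctx;
        b->count++;
}

/* Called once the metadata area is safely on disk (or has failed). */
static void batch_fire(struct commit_batch *b, int success)
{
        for (unsigned i = 0; i < b->count; i++)
                b->cb[i].fn(b->cb[i].ctx, success);
        b->count = 0;
}

static void done(void *ctx, int success)
{
        printf("exception %s: %s\n", (const char *)ctx,
               success ? "committed" : "failed");
}

int main(void)
{
        struct commit_batch b = { .count = 0 };

        batch_add(&b, done, "A");
        batch_add(&b, done, "B");
        batch_fire(&b, 1);      /* one flush acknowledges both */
        return 0;
}
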
649
650static void persistent_drop_snapshot(struct dm_exception_store *store)
651{
652 struct pstore *ps = get_info(store);
653
654 ps->valid = 0;
655 if (write_header(ps))
656 DMWARN("write header failed");
657}
658
659int dm_create_persistent(struct dm_exception_store *store)
660{
661 struct pstore *ps;
662
663 /* allocate the pstore */
664 ps = kmalloc(sizeof(*ps), GFP_KERNEL);
665 if (!ps)
666 return -ENOMEM;
667
668 ps->snap = store->snap;
669 ps->valid = 1;
670 ps->version = SNAPSHOT_DISK_VERSION;
671 ps->area = NULL;
672 ps->next_free = 2; /* skipping the header and first area */
673 ps->current_committed = 0;
674
675 ps->callback_count = 0;
676 atomic_set(&ps->pending_count, 0);
677 ps->callbacks = NULL;
678
679 ps->metadata_wq = create_singlethread_workqueue("ksnaphd");
680 if (!ps->metadata_wq) {
681 kfree(ps);
682 DMERR("couldn't start header metadata update thread");
683 return -ENOMEM;
684 }
685
686 store->destroy = persistent_destroy;
687 store->read_metadata = persistent_read_metadata;
688 store->prepare_exception = persistent_prepare_exception;
689 store->commit_exception = persistent_commit_exception;
690 store->drop_snapshot = persistent_drop_snapshot;
691 store->fraction_full = persistent_fraction_full;
692 store->context = ps;
693
694 return 0;
695}
696
697int dm_persistent_snapshot_init(void)
698{
699 return 0;
700}
701
702void dm_persistent_snapshot_exit(void)
703{
704}
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c
new file mode 100644
index 000000000000..7f6e2e6dcb0d
--- /dev/null
+++ b/drivers/md/dm-snap-transient.c
@@ -0,0 +1,98 @@
1/*
2 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2006-2008 Red Hat GmbH
4 *
5 * This file is released under the GPL.
6 */
7
8#include "dm-exception-store.h"
9#include "dm-snap.h"
10
11#include <linux/mm.h>
12#include <linux/pagemap.h>
13#include <linux/vmalloc.h>
14#include <linux/slab.h>
15#include <linux/dm-io.h>
16
17#define DM_MSG_PREFIX "transient snapshot"
18
19/*-----------------------------------------------------------------
20 * Implementation of the store for non-persistent snapshots.
21 *---------------------------------------------------------------*/
22struct transient_c {
23 sector_t next_free;
24};
25
26static void transient_destroy(struct dm_exception_store *store)
27{
28 kfree(store->context);
29}
30
31static int transient_read_metadata(struct dm_exception_store *store,
32 int (*callback)(void *callback_context,
33 chunk_t old, chunk_t new),
34 void *callback_context)
35{
36 return 0;
37}
38
39static int transient_prepare_exception(struct dm_exception_store *store,
40 struct dm_snap_exception *e)
41{
42 struct transient_c *tc = (struct transient_c *) store->context;
43 sector_t size = get_dev_size(store->snap->cow->bdev);
44
45 if (size < (tc->next_free + store->snap->chunk_size))
46 return -1;
47
48 e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
49 tc->next_free += store->snap->chunk_size;
50
51 return 0;
52}
53
54static void transient_commit_exception(struct dm_exception_store *store,
55 struct dm_snap_exception *e,
56 void (*callback) (void *, int success),
57 void *callback_context)
58{
59 /* Just succeed */
60 callback(callback_context, 1);
61}
62
63static void transient_fraction_full(struct dm_exception_store *store,
64 sector_t *numerator, sector_t *denominator)
65{
66 *numerator = ((struct transient_c *) store->context)->next_free;
67 *denominator = get_dev_size(store->snap->cow->bdev);
68}
69
70int dm_create_transient(struct dm_exception_store *store)
71{
72 struct transient_c *tc;
73
74 store->destroy = transient_destroy;
75 store->read_metadata = transient_read_metadata;
76 store->prepare_exception = transient_prepare_exception;
77 store->commit_exception = transient_commit_exception;
78 store->drop_snapshot = NULL;
79 store->fraction_full = transient_fraction_full;
80
81 tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
82 if (!tc)
83 return -ENOMEM;
84
85 tc->next_free = 0;
86 store->context = tc;
87
88 return 0;
89}
90
91int dm_transient_snapshot_init(void)
92{
93 return 0;
94}
95
96void dm_transient_snapshot_exit(void)
97{
98}
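
Both store flavours plug into the same set of function pointers, so dm_create_persistent() and dm_create_transient() differ only in which functions they wire up and what private context they allocate. A self-contained sketch of that dispatch pattern, modelled on the transient store's bump allocator; all identifiers below are illustrative, not the kernel's:

#include <stdio.h>
#include <stdlib.h>

struct store;

/* Mirror of the dm_exception_store shape: function pointers plus a
 * private context pointer. */
struct store {
        int  (*prepare_exception)(struct store *s, unsigned long *new_chunk);
        void (*destroy)(struct store *s);
        void *context;
};

struct transient_ctx { unsigned long next_free; };

static int transient_prepare(struct store *s, unsigned long *new_chunk)
{
        struct transient_ctx *tc = s->context;

        *new_chunk = tc->next_free++;   /* bump allocation, memory only */
        return 0;
}

static void transient_destroy(struct store *s)
{
        free(s->context);
}

/* Analogue of dm_create_transient(): wire the ops, allocate the context. */
static int create_transient(struct store *s)
{
        struct transient_ctx *tc = malloc(sizeof(*tc));

        if (!tc)
                return -1;
        tc->next_free = 0;

        s->prepare_exception = transient_prepare;
        s->destroy = transient_destroy;
        s->context = tc;
        return 0;
}

int main(void)
{
        struct store s;
        unsigned long chunk;

        if (create_transient(&s))
                return 1;
        s.prepare_exception(&s, &chunk);   /* caller is flavour-agnostic */
        printf("allocated chunk %lu\n", chunk);
        s.destroy(&s);
        return 0;
}

The caller only ever goes through the function pointers, which is what lets dm-snap.c treat persistent and transient stores identically.
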
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 6c96db26b87c..65ff82ff124e 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -9,6 +9,7 @@
9#include <linux/blkdev.h> 9#include <linux/blkdev.h>
10#include <linux/ctype.h> 10#include <linux/ctype.h>
11#include <linux/device-mapper.h> 11#include <linux/device-mapper.h>
12#include <linux/delay.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13#include <linux/init.h> 14#include <linux/init.h>
14#include <linux/kdev_t.h> 15#include <linux/kdev_t.h>
@@ -20,6 +21,7 @@
20#include <linux/log2.h> 21#include <linux/log2.h>
21#include <linux/dm-kcopyd.h> 22#include <linux/dm-kcopyd.h>
22 23
24#include "dm-exception-store.h"
23#include "dm-snap.h" 25#include "dm-snap.h"
24#include "dm-bio-list.h" 26#include "dm-bio-list.h"
25 27
@@ -428,8 +430,13 @@ out:
428 list_add(&new_e->hash_list, e ? &e->hash_list : l); 430 list_add(&new_e->hash_list, e ? &e->hash_list : l);
429} 431}
430 432
431int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) 433/*
434 * Callback used by the exception stores to load exceptions when
435 * initialising.
436 */
437static int dm_add_exception(void *context, chunk_t old, chunk_t new)
432{ 438{
439 struct dm_snapshot *s = context;
433 struct dm_snap_exception *e; 440 struct dm_snap_exception *e;
434 441
435 e = alloc_exception(); 442 e = alloc_exception();
@@ -658,7 +665,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
658 spin_lock_init(&s->tracked_chunk_lock); 665 spin_lock_init(&s->tracked_chunk_lock);
659 666
660 /* Metadata must only be loaded into one table at once */ 667 /* Metadata must only be loaded into one table at once */
661 r = s->store.read_metadata(&s->store); 668 r = s->store.read_metadata(&s->store, dm_add_exception, (void *)s);
662 if (r < 0) { 669 if (r < 0) {
663 ti->error = "Failed to read snapshot metadata"; 670 ti->error = "Failed to read snapshot metadata";
664 goto bad_load_and_register; 671 goto bad_load_and_register;
@@ -735,7 +742,7 @@ static void snapshot_dtr(struct dm_target *ti)
735 unregister_snapshot(s); 742 unregister_snapshot(s);
736 743
737 while (atomic_read(&s->pending_exceptions_count)) 744 while (atomic_read(&s->pending_exceptions_count))
738 yield(); 745 msleep(1);
739 /* 746 /*
740 * Ensure instructions in mempool_destroy aren't reordered 747 * Ensure instructions in mempool_destroy aren't reordered
741 * before atomic_read. 748 * before atomic_read.
@@ -888,10 +895,10 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
888 895
889 /* 896 /*
890 * Check for conflicting reads. This is extremely improbable, 897 * Check for conflicting reads. This is extremely improbable,
891 * so yield() is sufficient and there is no need for a wait queue. 898 * so msleep(1) is sufficient and there is no need for a wait queue.
892 */ 899 */
893 while (__chunk_is_tracked(s, pe->e.old_chunk)) 900 while (__chunk_is_tracked(s, pe->e.old_chunk))
894 yield(); 901 msleep(1);
895 902
896 /* 903 /*
897 * Add a proper exception, and remove the 904 * Add a proper exception, and remove the
@@ -1404,6 +1411,12 @@ static int __init dm_snapshot_init(void)
1404{ 1411{
1405 int r; 1412 int r;
1406 1413
1414 r = dm_exception_store_init();
1415 if (r) {
1416 DMERR("Failed to initialize exception stores");
1417 return r;
1418 }
1419
1407 r = dm_register_target(&snapshot_target); 1420 r = dm_register_target(&snapshot_target);
1408 if (r) { 1421 if (r) {
1409 DMERR("snapshot target register failed %d", r); 1422 DMERR("snapshot target register failed %d", r);
@@ -1452,39 +1465,34 @@ static int __init dm_snapshot_init(void)
1452 1465
1453 return 0; 1466 return 0;
1454 1467
1455 bad_pending_pool: 1468bad_pending_pool:
1456 kmem_cache_destroy(tracked_chunk_cache); 1469 kmem_cache_destroy(tracked_chunk_cache);
1457 bad5: 1470bad5:
1458 kmem_cache_destroy(pending_cache); 1471 kmem_cache_destroy(pending_cache);
1459 bad4: 1472bad4:
1460 kmem_cache_destroy(exception_cache); 1473 kmem_cache_destroy(exception_cache);
1461 bad3: 1474bad3:
1462 exit_origin_hash(); 1475 exit_origin_hash();
1463 bad2: 1476bad2:
1464 dm_unregister_target(&origin_target); 1477 dm_unregister_target(&origin_target);
1465 bad1: 1478bad1:
1466 dm_unregister_target(&snapshot_target); 1479 dm_unregister_target(&snapshot_target);
1467 return r; 1480 return r;
1468} 1481}
1469 1482
1470static void __exit dm_snapshot_exit(void) 1483static void __exit dm_snapshot_exit(void)
1471{ 1484{
1472 int r;
1473
1474 destroy_workqueue(ksnapd); 1485 destroy_workqueue(ksnapd);
1475 1486
1476 r = dm_unregister_target(&snapshot_target); 1487 dm_unregister_target(&snapshot_target);
1477 if (r) 1488 dm_unregister_target(&origin_target);
1478 DMERR("snapshot unregister failed %d", r);
1479
1480 r = dm_unregister_target(&origin_target);
1481 if (r)
1482 DMERR("origin unregister failed %d", r);
1483 1489
1484 exit_origin_hash(); 1490 exit_origin_hash();
1485 kmem_cache_destroy(pending_cache); 1491 kmem_cache_destroy(pending_cache);
1486 kmem_cache_destroy(exception_cache); 1492 kmem_cache_destroy(exception_cache);
1487 kmem_cache_destroy(tracked_chunk_cache); 1493 kmem_cache_destroy(tracked_chunk_cache);
1494
1495 dm_exception_store_exit();
1488} 1496}
1489 1497
1490/* Module hooks */ 1498/* Module hooks */
diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h
index 99c0106ede2d..d9e62b43cf85 100644
--- a/drivers/md/dm-snap.h
+++ b/drivers/md/dm-snap.h
@@ -1,6 +1,4 @@
1/* 1/*
2 * dm-snapshot.c
3 *
4 * Copyright (C) 2001-2002 Sistina Software (UK) Limited. 2 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
5 * 3 *
6 * This file is released under the GPL. 4 * This file is released under the GPL.
@@ -10,6 +8,7 @@
10#define DM_SNAPSHOT_H 8#define DM_SNAPSHOT_H
11 9
12#include <linux/device-mapper.h> 10#include <linux/device-mapper.h>
11#include "dm-exception-store.h"
13#include "dm-bio-list.h" 12#include "dm-bio-list.h"
14#include <linux/blkdev.h> 13#include <linux/blkdev.h>
15#include <linux/workqueue.h> 14#include <linux/workqueue.h>
@@ -20,116 +19,6 @@ struct exception_table {
20 struct list_head *table; 19 struct list_head *table;
21}; 20};
22 21
23/*
24 * The snapshot code deals with largish chunks of the disk at a
25 * time. Typically 32k - 512k.
26 */
27typedef sector_t chunk_t;
28
29/*
30 * An exception is used where an old chunk of data has been
31 * replaced by a new one.
32 * If chunk_t is 64 bits in size, the top 8 bits of new_chunk hold the number
33 * of chunks that follow contiguously. Remaining bits hold the number of the
34 * chunk within the device.
35 */
36struct dm_snap_exception {
37 struct list_head hash_list;
38
39 chunk_t old_chunk;
40 chunk_t new_chunk;
41};
42
43/*
44 * Functions to manipulate consecutive chunks
45 */
46# if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
47# define DM_CHUNK_CONSECUTIVE_BITS 8
48# define DM_CHUNK_NUMBER_BITS 56
49
50static inline chunk_t dm_chunk_number(chunk_t chunk)
51{
52 return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
53}
54
55static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
56{
57 return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
58}
59
60static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
61{
62 e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);
63
64 BUG_ON(!dm_consecutive_chunk_count(e));
65}
66
67# else
68# define DM_CHUNK_CONSECUTIVE_BITS 0
69
70static inline chunk_t dm_chunk_number(chunk_t chunk)
71{
72 return chunk;
73}
74
75static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
76{
77 return 0;
78}
79
80static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
81{
82}
83
84# endif
85
86/*
87 * Abstraction to handle the meta/layout of exception stores (the
88 * COW device).
89 */
90struct exception_store {
91
92 /*
93 * Destroys this object when you've finished with it.
94 */
95 void (*destroy) (struct exception_store *store);
96
97 /*
98 * The target shouldn't read the COW device until this is
99 * called.
100 */
101 int (*read_metadata) (struct exception_store *store);
102
103 /*
104 * Find somewhere to store the next exception.
105 */
106 int (*prepare_exception) (struct exception_store *store,
107 struct dm_snap_exception *e);
108
109 /*
110 * Update the metadata with this exception.
111 */
112 void (*commit_exception) (struct exception_store *store,
113 struct dm_snap_exception *e,
114 void (*callback) (void *, int success),
115 void *callback_context);
116
117 /*
118 * The snapshot is invalid; note this in the metadata.
119 */
120 void (*drop_snapshot) (struct exception_store *store);
121
122 /*
123 * Return how full the snapshot is.
124 */
125 void (*fraction_full) (struct exception_store *store,
126 sector_t *numerator,
127 sector_t *denominator);
128
129 struct dm_snapshot *snap;
130 void *context;
131};
132
133#define DM_TRACKED_CHUNK_HASH_SIZE 16 22#define DM_TRACKED_CHUNK_HASH_SIZE 16
134#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ 23#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \
135 (DM_TRACKED_CHUNK_HASH_SIZE - 1)) 24 (DM_TRACKED_CHUNK_HASH_SIZE - 1))
@@ -172,7 +61,7 @@ struct dm_snapshot {
172 spinlock_t pe_lock; 61 spinlock_t pe_lock;
173 62
174 /* The on disk metadata handler */ 63 /* The on disk metadata handler */
175 struct exception_store store; 64 struct dm_exception_store store;
176 65
177 struct dm_kcopyd_client *kcopyd_client; 66 struct dm_kcopyd_client *kcopyd_client;
178 67
@@ -187,20 +76,6 @@ struct dm_snapshot {
187}; 76};
188 77
189/* 78/*
190 * Used by the exception stores to load exceptions when
191 * initialising.
192 */
193int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new);
194
195/*
196 * Constructor and destructor for the default persistent
197 * store.
198 */
199int dm_create_persistent(struct exception_store *store);
200
201int dm_create_transient(struct exception_store *store);
202
203/*
204 * Return the number of sectors in the device. 79 * Return the number of sectors in the device.
205 */ 80 */
206static inline sector_t get_dev_size(struct block_device *bdev) 81static inline sector_t get_dev_size(struct block_device *bdev)
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 9e4ef88d421e..41569bc60abc 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -337,9 +337,7 @@ int __init dm_stripe_init(void)
337 337
338void dm_stripe_exit(void) 338void dm_stripe_exit(void)
339{ 339{
340 if (dm_unregister_target(&stripe_target)) 340 dm_unregister_target(&stripe_target);
341 DMWARN("target unregistration failed");
342
343 destroy_workqueue(kstriped); 341 destroy_workqueue(kstriped);
344 342
345 return; 343 return;
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
new file mode 100644
index 000000000000..a2a45e6c7c8b
--- /dev/null
+++ b/drivers/md/dm-sysfs.c
@@ -0,0 +1,99 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc. All rights reserved.
3 *
4 * This file is released under the GPL.
5 */
6
7#include <linux/sysfs.h>
8#include <linux/dm-ioctl.h>
9#include "dm.h"
10
11struct dm_sysfs_attr {
12 struct attribute attr;
13 ssize_t (*show)(struct mapped_device *, char *);
14 ssize_t (*store)(struct mapped_device *, char *);
15};
16
17#define DM_ATTR_RO(_name) \
18struct dm_sysfs_attr dm_attr_##_name = \
19 __ATTR(_name, S_IRUGO, dm_attr_##_name##_show, NULL)
20
21static ssize_t dm_attr_show(struct kobject *kobj, struct attribute *attr,
22 char *page)
23{
24 struct dm_sysfs_attr *dm_attr;
25 struct mapped_device *md;
26 ssize_t ret;
27
28 dm_attr = container_of(attr, struct dm_sysfs_attr, attr);
29 if (!dm_attr->show)
30 return -EIO;
31
32 md = dm_get_from_kobject(kobj);
33 if (!md)
34 return -EINVAL;
35
36 ret = dm_attr->show(md, page);
37 dm_put(md);
38
39 return ret;
40}
41
42static ssize_t dm_attr_name_show(struct mapped_device *md, char *buf)
43{
44 if (dm_copy_name_and_uuid(md, buf, NULL))
45 return -EIO;
46
47 strcat(buf, "\n");
48 return strlen(buf);
49}
50
51static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf)
52{
53 if (dm_copy_name_and_uuid(md, NULL, buf))
54 return -EIO;
55
56 strcat(buf, "\n");
57 return strlen(buf);
58}
59
60static DM_ATTR_RO(name);
61static DM_ATTR_RO(uuid);
62
63static struct attribute *dm_attrs[] = {
64 &dm_attr_name.attr,
65 &dm_attr_uuid.attr,
66 NULL,
67};
68
69static struct sysfs_ops dm_sysfs_ops = {
70 .show = dm_attr_show,
71};
72
73/*
74 * The dm kobject is embedded in the mapped_device structure,
75 * so there is no need to define a release function here.
76 */
77static struct kobj_type dm_ktype = {
78 .sysfs_ops = &dm_sysfs_ops,
79 .default_attrs = dm_attrs,
80};
81
82/*
83 * Initialize kobj.
84 * Because nobody is using md yet, there is no need to call dm_get/put explicitly.
85 */
86int dm_sysfs_init(struct mapped_device *md)
87{
88 return kobject_init_and_add(dm_kobject(md), &dm_ktype,
89 &disk_to_dev(dm_disk(md))->kobj,
90 "%s", "dm");
91}
92
93/*
94 * Remove kobj; called after all references have been dropped.
95 */
96void dm_sysfs_exit(struct mapped_device *md)
97{
98 kobject_put(dm_kobject(md));
99}
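
dm-sysfs.c uses the standard kobject attribute trick: a generic struct attribute is embedded in a wrapper that also carries a typed show() method, and the dispatch routine recovers the wrapper with container_of(). A userspace model of that dispatch, with invented names and a local container_of macro:

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct attribute { const char *name; };

struct dev_attr {
        struct attribute attr;              /* generic part, passed around  */
        int (*show)(char *buf, size_t len); /* typed part, via container_of */
};

static int name_show(char *buf, size_t len)
{
        return snprintf(buf, len, "dm-0\n");
}

static struct dev_attr name_attr = {
        .attr = { .name = "name" },
        .show = name_show,
};

/* The generic layer only ever sees 'struct attribute *'. */
static int attr_show(struct attribute *attr, char *buf, size_t len)
{
        struct dev_attr *da = container_of(attr, struct dev_attr, attr);

        if (!da->show)
                return -1;
        return da->show(buf, len);
}

int main(void)
{
        char buf[32];

        if (attr_show(&name_attr.attr, buf, sizeof(buf)) > 0)
                fputs(buf, stdout);
        return 0;
}
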
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 04e5fd742c2c..2fd66c30f7f8 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) 2001 Sistina Software (UK) Limited. 2 * Copyright (C) 2001 Sistina Software (UK) Limited.
3 * Copyright (C) 2004 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
@@ -15,6 +15,7 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/mutex.h> 17#include <linux/mutex.h>
18#include <linux/delay.h>
18#include <asm/atomic.h> 19#include <asm/atomic.h>
19 20
20#define DM_MSG_PREFIX "table" 21#define DM_MSG_PREFIX "table"
@@ -24,6 +25,19 @@
24#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) 25#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
25#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1) 26#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
26 27
28/*
29 * The table always has exactly one reference, from either mapped_device->map
30 * or hash_cell->new_map. This reference is not counted in table->holders.
31 * A pair of dm_table_create/dm_table_destroy functions is used for table
32 * creation/destruction.
33 *
34 * Temporary references from the other code increase table->holders. A pair
35 * of dm_table_get/dm_table_put functions is used to manipulate it.
36 *
37 * When the table is about to be destroyed, we wait for table->holders to
38 * drop to zero.
39 */
40
27struct dm_table { 41struct dm_table {
28 struct mapped_device *md; 42 struct mapped_device *md;
29 atomic_t holders; 43 atomic_t holders;
@@ -38,6 +52,8 @@ struct dm_table {
38 sector_t *highs; 52 sector_t *highs;
39 struct dm_target *targets; 53 struct dm_target *targets;
40 54
55 unsigned barriers_supported:1;
56
41 /* 57 /*
42 * Indicates the rw permissions for the new logical 58 * Indicates the rw permissions for the new logical
43 * device. This should be a combination of FMODE_READ 59 * device. This should be a combination of FMODE_READ
@@ -226,7 +242,8 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
226 return -ENOMEM; 242 return -ENOMEM;
227 243
228 INIT_LIST_HEAD(&t->devices); 244 INIT_LIST_HEAD(&t->devices);
229 atomic_set(&t->holders, 1); 245 atomic_set(&t->holders, 0);
246 t->barriers_supported = 1;
230 247
231 if (!num_targets) 248 if (!num_targets)
232 num_targets = KEYS_PER_NODE; 249 num_targets = KEYS_PER_NODE;
@@ -256,10 +273,14 @@ static void free_devices(struct list_head *devices)
256 } 273 }
257} 274}
258 275
259static void table_destroy(struct dm_table *t) 276void dm_table_destroy(struct dm_table *t)
260{ 277{
261 unsigned int i; 278 unsigned int i;
262 279
280 while (atomic_read(&t->holders))
281 msleep(1);
282 smp_mb();
283
263 /* free the indexes (see dm_table_complete) */ 284 /* free the indexes (see dm_table_complete) */
264 if (t->depth >= 2) 285 if (t->depth >= 2)
265 vfree(t->index[t->depth - 2]); 286 vfree(t->index[t->depth - 2]);
@@ -297,8 +318,8 @@ void dm_table_put(struct dm_table *t)
297 if (!t) 318 if (!t)
298 return; 319 return;
299 320
300 if (atomic_dec_and_test(&t->holders)) 321 smp_mb__before_atomic_dec();
301 table_destroy(t); 322 atomic_dec(&t->holders);
302} 323}
303 324
304/* 325/*
@@ -728,6 +749,10 @@ int dm_table_add_target(struct dm_table *t, const char *type,
728 /* FIXME: the plan is to combine high here and then have 749 /* FIXME: the plan is to combine high here and then have
729 * the merge fn apply the target level restrictions. */ 750 * the merge fn apply the target level restrictions. */
730 combine_restrictions_low(&t->limits, &tgt->limits); 751 combine_restrictions_low(&t->limits, &tgt->limits);
752
753 if (!(tgt->type->features & DM_TARGET_SUPPORTS_BARRIERS))
754 t->barriers_supported = 0;
755
731 return 0; 756 return 0;
732 757
733 bad: 758 bad:
@@ -772,6 +797,12 @@ int dm_table_complete(struct dm_table *t)
772 797
773 check_for_valid_limits(&t->limits); 798 check_for_valid_limits(&t->limits);
774 799
800 /*
801 * We only support barriers if there is exactly one underlying device.
802 */
803 if (!list_is_singular(&t->devices))
804 t->barriers_supported = 0;
805
775 /* how many indexes will the btree have ? */ 806 /* how many indexes will the btree have ? */
776 leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); 807 leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
777 t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); 808 t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
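
Taken together, the barrier hunks implement a capability intersection: a table advertises barrier support only if every target sets DM_TARGET_SUPPORTS_BARRIERS and the table maps exactly one underlying device, and __split_bio() then fails barrier bios with -EOPNOTSUPP whenever dm_table_barrier_ok() says no. A small sketch of the aggregation rule (flag value and names illustrative):

#include <stdio.h>

#define TARGET_SUPPORTS_BARRIERS 0x1

struct target { unsigned features; };

static int table_barriers_ok(const struct target *tgts, unsigned n_targets,
                             unsigned n_devices)
{
        /* Only a single underlying device can preserve barrier ordering. */
        if (n_devices != 1)
                return 0;

        for (unsigned i = 0; i < n_targets; i++)
                if (!(tgts[i].features & TARGET_SUPPORTS_BARRIERS))
                        return 0;

        return 1;
}

int main(void)
{
        struct target t[2] = { { TARGET_SUPPORTS_BARRIERS }, { 0 } };

        printf("%d\n", table_barriers_ok(t, 1, 1)); /* 1: supported     */
        printf("%d\n", table_barriers_ok(t, 2, 1)); /* 0: t[1] lacks it */
        printf("%d\n", table_barriers_ok(t, 1, 2)); /* 0: two devices   */
        return 0;
}
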
@@ -986,6 +1017,12 @@ struct mapped_device *dm_table_get_md(struct dm_table *t)
986 return t->md; 1017 return t->md;
987} 1018}
988 1019
1020int dm_table_barrier_ok(struct dm_table *t)
1021{
1022 return t->barriers_supported;
1023}
1024EXPORT_SYMBOL(dm_table_barrier_ok);
1025
989EXPORT_SYMBOL(dm_vcalloc); 1026EXPORT_SYMBOL(dm_vcalloc);
990EXPORT_SYMBOL(dm_get_device); 1027EXPORT_SYMBOL(dm_get_device);
991EXPORT_SYMBOL(dm_put_device); 1028EXPORT_SYMBOL(dm_put_device);
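
The new reference scheme is worth restating: the single structural reference (mapped_device->map or hash_cell->new_map) is implicit, temporary users go through dm_table_get()/dm_table_put() which only adjust holders, and dm_table_destroy() waits for holders to drain before freeing anything. A simplified userspace model using C11 atomics, with usleep() standing in for msleep() and the explicit smp_mb() calls folded into the atomics' default ordering:

#include <stdio.h>
#include <stdatomic.h>
#include <unistd.h>

struct table {
        atomic_int holders;     /* temporary references only */
        /* ... targets, devices, indexes ... */
};

static void table_get(struct table *t) { atomic_fetch_add(&t->holders, 1); }
static void table_put(struct table *t) { atomic_fetch_sub(&t->holders, 1); }

/* Called by the single structural owner; waits out temporary users. */
static void table_destroy(struct table *t)
{
        while (atomic_load(&t->holders))
                usleep(1000);   /* the kernel code uses msleep(1) */
        /* now safe to free indexes, drop devices, etc. */
        printf("destroyed\n");
}

int main(void)
{
        struct table t = { .holders = 0 };

        table_get(&t);          /* some other code path peeks at the table */
        table_put(&t);
        table_destroy(&t);      /* returns once no holders remain */
        return 0;
}
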
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 835cf95b857f..7decf10006e4 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -130,26 +130,26 @@ int dm_register_target(struct target_type *t)
130 return rv; 130 return rv;
131} 131}
132 132
133int dm_unregister_target(struct target_type *t) 133void dm_unregister_target(struct target_type *t)
134{ 134{
135 struct tt_internal *ti; 135 struct tt_internal *ti;
136 136
137 down_write(&_lock); 137 down_write(&_lock);
138 if (!(ti = __find_target_type(t->name))) { 138 if (!(ti = __find_target_type(t->name))) {
139 up_write(&_lock); 139 DMCRIT("Unregistering unrecognised target: %s", t->name);
140 return -EINVAL; 140 BUG();
141 } 141 }
142 142
143 if (ti->use) { 143 if (ti->use) {
144 up_write(&_lock); 144 DMCRIT("Attempt to unregister target still in use: %s",
145 return -ETXTBSY; 145 t->name);
146 BUG();
146 } 147 }
147 148
148 list_del(&ti->list); 149 list_del(&ti->list);
149 kfree(ti); 150 kfree(ti);
150 151
151 up_write(&_lock); 152 up_write(&_lock);
152 return 0;
153} 153}
154 154
155/* 155/*
@@ -187,8 +187,7 @@ int __init dm_target_init(void)
187 187
188void dm_target_exit(void) 188void dm_target_exit(void)
189{ 189{
190 if (dm_unregister_target(&error_target)) 190 dm_unregister_target(&error_target);
191 DMWARN("error target unregistration failed");
192} 191}
193 192
194EXPORT_SYMBOL(dm_register_target); 193EXPORT_SYMBOL(dm_register_target);
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index cdbf126ec106..bbc97030c0c2 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -69,10 +69,7 @@ static int __init dm_zero_init(void)
69 69
70static void __exit dm_zero_exit(void) 70static void __exit dm_zero_exit(void)
71{ 71{
72 int r = dm_unregister_target(&zero_target); 72 dm_unregister_target(&zero_target);
73
74 if (r < 0)
75 DMERR("unregister failed %d", r);
76} 73}
77 74
78module_init(dm_zero_init) 75module_init(dm_zero_init)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index c99e4728ff41..51ba1db4b3e7 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
@@ -21,6 +21,7 @@
21#include <linux/idr.h> 21#include <linux/idr.h>
22#include <linux/hdreg.h> 22#include <linux/hdreg.h>
23#include <linux/blktrace_api.h> 23#include <linux/blktrace_api.h>
24#include <trace/block.h>
24 25
25#define DM_MSG_PREFIX "core" 26#define DM_MSG_PREFIX "core"
26 27
@@ -31,6 +32,7 @@ static unsigned int _major = 0;
31 32
32static DEFINE_SPINLOCK(_minor_lock); 33static DEFINE_SPINLOCK(_minor_lock);
33/* 34/*
35 * For bio-based dm.
34 * One of these is allocated per bio. 36 * One of these is allocated per bio.
35 */ 37 */
36struct dm_io { 38struct dm_io {
@@ -42,6 +44,7 @@ struct dm_io {
42}; 44};
43 45
44/* 46/*
47 * For bio-based dm.
45 * One of these is allocated per target within a bio. Hopefully 48 * One of these is allocated per target within a bio. Hopefully
46 * this will be simplified out one day. 49 * this will be simplified out one day.
47 */ 50 */
@@ -51,6 +54,29 @@ struct dm_target_io {
51 union map_info info; 54 union map_info info;
52}; 55};
53 56
57DEFINE_TRACE(block_bio_complete);
58
59/*
60 * For request-based dm.
61 * One of these is allocated per request.
62 */
63struct dm_rq_target_io {
64 struct mapped_device *md;
65 struct dm_target *ti;
66 struct request *orig, clone;
67 int error;
68 union map_info info;
69};
70
71/*
72 * For request-based dm.
73 * One of these is allocated per bio.
74 */
75struct dm_rq_clone_bio_info {
76 struct bio *orig;
77 struct request *rq;
78};
79
54union map_info *dm_get_mapinfo(struct bio *bio) 80union map_info *dm_get_mapinfo(struct bio *bio)
55{ 81{
56 if (bio && bio->bi_private) 82 if (bio && bio->bi_private)
@@ -141,11 +167,16 @@ struct mapped_device {
141 167
142 /* forced geometry settings */ 168 /* forced geometry settings */
143 struct hd_geometry geometry; 169 struct hd_geometry geometry;
170
171 /* sysfs handle */
172 struct kobject kobj;
144}; 173};
145 174
146#define MIN_IOS 256 175#define MIN_IOS 256
147static struct kmem_cache *_io_cache; 176static struct kmem_cache *_io_cache;
148static struct kmem_cache *_tio_cache; 177static struct kmem_cache *_tio_cache;
178static struct kmem_cache *_rq_tio_cache;
179static struct kmem_cache *_rq_bio_info_cache;
149 180
150static int __init local_init(void) 181static int __init local_init(void)
151{ 182{
@@ -161,9 +192,17 @@ static int __init local_init(void)
161 if (!_tio_cache) 192 if (!_tio_cache)
162 goto out_free_io_cache; 193 goto out_free_io_cache;
163 194
195 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
196 if (!_rq_tio_cache)
197 goto out_free_tio_cache;
198
199 _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
200 if (!_rq_bio_info_cache)
201 goto out_free_rq_tio_cache;
202
164 r = dm_uevent_init(); 203 r = dm_uevent_init();
165 if (r) 204 if (r)
166 goto out_free_tio_cache; 205 goto out_free_rq_bio_info_cache;
167 206
168 _major = major; 207 _major = major;
169 r = register_blkdev(_major, _name); 208 r = register_blkdev(_major, _name);
@@ -177,6 +216,10 @@ static int __init local_init(void)
177 216
178out_uevent_exit: 217out_uevent_exit:
179 dm_uevent_exit(); 218 dm_uevent_exit();
219out_free_rq_bio_info_cache:
220 kmem_cache_destroy(_rq_bio_info_cache);
221out_free_rq_tio_cache:
222 kmem_cache_destroy(_rq_tio_cache);
180out_free_tio_cache: 223out_free_tio_cache:
181 kmem_cache_destroy(_tio_cache); 224 kmem_cache_destroy(_tio_cache);
182out_free_io_cache: 225out_free_io_cache:
@@ -187,6 +230,8 @@ out_free_io_cache:
187 230
188static void local_exit(void) 231static void local_exit(void)
189{ 232{
233 kmem_cache_destroy(_rq_bio_info_cache);
234 kmem_cache_destroy(_rq_tio_cache);
190 kmem_cache_destroy(_tio_cache); 235 kmem_cache_destroy(_tio_cache);
191 kmem_cache_destroy(_io_cache); 236 kmem_cache_destroy(_io_cache);
192 unregister_blkdev(_major, _name); 237 unregister_blkdev(_major, _name);
@@ -504,8 +549,7 @@ static void dec_pending(struct dm_io *io, int error)
504 end_io_acct(io); 549 end_io_acct(io);
505 550
506 if (io->error != DM_ENDIO_REQUEUE) { 551 if (io->error != DM_ENDIO_REQUEUE) {
507 blk_add_trace_bio(io->md->queue, io->bio, 552 trace_block_bio_complete(io->md->queue, io->bio);
508 BLK_TA_COMPLETE);
509 553
510 bio_endio(io->bio, io->error); 554 bio_endio(io->bio, io->error);
511 } 555 }
@@ -598,7 +642,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
598 if (r == DM_MAPIO_REMAPPED) { 642 if (r == DM_MAPIO_REMAPPED) {
599 /* the bio has been remapped so dispatch it */ 643 /* the bio has been remapped so dispatch it */
600 644
601 blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, 645 trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
602 tio->io->bio->bi_bdev->bd_dev, 646 tio->io->bio->bi_bdev->bd_dev,
603 clone->bi_sector, sector); 647 clone->bi_sector, sector);
604 648
@@ -794,7 +838,11 @@ static int __split_bio(struct mapped_device *md, struct bio *bio)
794 ci.map = dm_get_table(md); 838 ci.map = dm_get_table(md);
795 if (unlikely(!ci.map)) 839 if (unlikely(!ci.map))
796 return -EIO; 840 return -EIO;
797 841 if (unlikely(bio_barrier(bio) && !dm_table_barrier_ok(ci.map))) {
842 dm_table_put(ci.map);
843 bio_endio(bio, -EOPNOTSUPP);
844 return 0;
845 }
798 ci.md = md; 846 ci.md = md;
799 ci.bio = bio; 847 ci.bio = bio;
800 ci.io = alloc_io(md); 848 ci.io = alloc_io(md);
@@ -878,15 +926,6 @@ static int dm_request(struct request_queue *q, struct bio *bio)
878 struct mapped_device *md = q->queuedata; 926 struct mapped_device *md = q->queuedata;
879 int cpu; 927 int cpu;
880 928
881 /*
882 * There is no use in forwarding any barrier request since we can't
883 * guarantee it is (or can be) handled by the targets correctly.
884 */
885 if (unlikely(bio_barrier(bio))) {
886 bio_endio(bio, -EOPNOTSUPP);
887 return 0;
888 }
889
890 down_read(&md->io_lock); 929 down_read(&md->io_lock);
891 930
892 cpu = part_stat_lock(); 931 cpu = part_stat_lock();
@@ -941,8 +980,6 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
941 struct mapped_device *md = congested_data; 980 struct mapped_device *md = congested_data;
942 struct dm_table *map; 981 struct dm_table *map;
943 982
944 atomic_inc(&md->pending);
945
946 if (!test_bit(DMF_BLOCK_IO, &md->flags)) { 983 if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
947 map = dm_get_table(md); 984 map = dm_get_table(md);
948 if (map) { 985 if (map) {
@@ -951,10 +988,6 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
951 } 988 }
952 } 989 }
953 990
954 if (!atomic_dec_return(&md->pending))
955 /* nudge anyone waiting on suspend queue */
956 wake_up(&md->wait);
957
958 return r; 991 return r;
959} 992}
960 993
@@ -1091,7 +1124,7 @@ static struct mapped_device *alloc_dev(int minor)
1091 if (!md->tio_pool) 1124 if (!md->tio_pool)
1092 goto bad_tio_pool; 1125 goto bad_tio_pool;
1093 1126
1094 md->bs = bioset_create(16, 16); 1127 md->bs = bioset_create(16, 0);
1095 if (!md->bs) 1128 if (!md->bs)
1096 goto bad_no_bioset; 1129 goto bad_no_bioset;
1097 1130
@@ -1214,10 +1247,12 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
1214 1247
1215 if (md->suspended_bdev) 1248 if (md->suspended_bdev)
1216 __set_size(md, size); 1249 __set_size(md, size);
1217 if (size == 0) 1250
1251 if (!size) {
1252 dm_table_destroy(t);
1218 return 0; 1253 return 0;
1254 }
1219 1255
1220 dm_table_get(t);
1221 dm_table_event_callback(t, event_callback, md); 1256 dm_table_event_callback(t, event_callback, md);
1222 1257
1223 write_lock(&md->map_lock); 1258 write_lock(&md->map_lock);
@@ -1239,7 +1274,7 @@ static void __unbind(struct mapped_device *md)
1239 write_lock(&md->map_lock); 1274 write_lock(&md->map_lock);
1240 md->map = NULL; 1275 md->map = NULL;
1241 write_unlock(&md->map_lock); 1276 write_unlock(&md->map_lock);
1242 dm_table_put(map); 1277 dm_table_destroy(map);
1243} 1278}
1244 1279
1245/* 1280/*
@@ -1253,6 +1288,8 @@ int dm_create(int minor, struct mapped_device **result)
1253 if (!md) 1288 if (!md)
1254 return -ENXIO; 1289 return -ENXIO;
1255 1290
1291 dm_sysfs_init(md);
1292
1256 *result = md; 1293 *result = md;
1257 return 0; 1294 return 0;
1258} 1295}
@@ -1328,8 +1365,9 @@ void dm_put(struct mapped_device *md)
1328 dm_table_presuspend_targets(map); 1365 dm_table_presuspend_targets(map);
1329 dm_table_postsuspend_targets(map); 1366 dm_table_postsuspend_targets(map);
1330 } 1367 }
1331 __unbind(md); 1368 dm_sysfs_exit(md);
1332 dm_table_put(map); 1369 dm_table_put(map);
1370 __unbind(md);
1333 free_dev(md); 1371 free_dev(md);
1334 } 1372 }
1335} 1373}
@@ -1667,6 +1705,27 @@ struct gendisk *dm_disk(struct mapped_device *md)
1667 return md->disk; 1705 return md->disk;
1668} 1706}
1669 1707
1708struct kobject *dm_kobject(struct mapped_device *md)
1709{
1710 return &md->kobj;
1711}
1712
1713/*
1714 * struct mapped_device should not be exported outside of dm.c
1715 * so use this check to verify that kobj is part of the md structure.
1716 */
1717struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
1718{
1719 struct mapped_device *md;
1720
1721 md = container_of(kobj, struct mapped_device, kobj);
1722 if (&md->kobj != kobj)
1723 return NULL;
1724
1725 dm_get(md);
1726 return md;
1727}
1728
1670int dm_suspended(struct mapped_device *md) 1729int dm_suspended(struct mapped_device *md)
1671{ 1730{
1672 return test_bit(DMF_SUSPENDED, &md->flags); 1731 return test_bit(DMF_SUSPENDED, &md->flags);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 0ade60cdef42..20194e000c5a 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -36,6 +36,7 @@ struct dm_table;
36/*----------------------------------------------------------------- 36/*-----------------------------------------------------------------
37 * Internal table functions. 37 * Internal table functions.
38 *---------------------------------------------------------------*/ 38 *---------------------------------------------------------------*/
39void dm_table_destroy(struct dm_table *t);
39void dm_table_event_callback(struct dm_table *t, 40void dm_table_event_callback(struct dm_table *t,
40 void (*fn)(void *), void *context); 41 void (*fn)(void *), void *context);
41struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); 42struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
@@ -51,6 +52,7 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits);
51 * To check the return value from dm_table_find_target(). 52 * To check the return value from dm_table_find_target().
52 */ 53 */
53#define dm_target_is_valid(t) ((t)->table) 54#define dm_target_is_valid(t) ((t)->table)
55int dm_table_barrier_ok(struct dm_table *t);
54 56
55/*----------------------------------------------------------------- 57/*-----------------------------------------------------------------
56 * A registry of target types. 58 * A registry of target types.
@@ -72,6 +74,14 @@ int dm_interface_init(void);
72void dm_interface_exit(void); 74void dm_interface_exit(void);
73 75
74/* 76/*
77 * sysfs interface
78 */
79int dm_sysfs_init(struct mapped_device *md);
80void dm_sysfs_exit(struct mapped_device *md);
81struct kobject *dm_kobject(struct mapped_device *md);
82struct mapped_device *dm_get_from_kobject(struct kobject *kobj);
83
84/*
75 * Targets for linear and striped mappings 85 * Targets for linear and striped mappings
76 */ 86 */
77int dm_linear_init(void); 87int dm_linear_init(void);
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index f26c1f9a475b..86d9adf90e79 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -283,7 +283,6 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size)
283static int run(mddev_t *mddev) 283static int run(mddev_t *mddev)
284{ 284{
285 mdk_rdev_t *rdev; 285 mdk_rdev_t *rdev;
286 struct list_head *tmp;
287 int i; 286 int i;
288 287
289 conf_t *conf = kmalloc(sizeof(*conf), GFP_KERNEL); 288 conf_t *conf = kmalloc(sizeof(*conf), GFP_KERNEL);
@@ -296,7 +295,7 @@ static int run(mddev_t *mddev)
296 } 295 }
297 conf->nfaults = 0; 296 conf->nfaults = 0;
298 297
299 rdev_for_each(rdev, tmp, mddev) 298 list_for_each_entry(rdev, &mddev->disks, same_set)
300 conf->rdev = rdev; 299 conf->rdev = rdev;
301 300
302 mddev->array_sectors = mddev->size * 2; 301 mddev->array_sectors = mddev->size * 2;
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 3b90c5c924ec..1e3aea9eecf1 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -105,7 +105,6 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
105 int i, nb_zone, cnt; 105 int i, nb_zone, cnt;
106 sector_t min_sectors; 106 sector_t min_sectors;
107 sector_t curr_sector; 107 sector_t curr_sector;
108 struct list_head *tmp;
109 108
110 conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t), 109 conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t),
111 GFP_KERNEL); 110 GFP_KERNEL);
@@ -115,7 +114,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
115 cnt = 0; 114 cnt = 0;
116 conf->array_sectors = 0; 115 conf->array_sectors = 0;
117 116
118 rdev_for_each(rdev, tmp, mddev) { 117 list_for_each_entry(rdev, &mddev->disks, same_set) {
119 int j = rdev->raid_disk; 118 int j = rdev->raid_disk;
120 dev_info_t *disk = conf->disks + j; 119 dev_info_t *disk = conf->disks + j;
121 120
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1b1d32694f6f..41e2509bf896 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -214,20 +214,33 @@ static inline mddev_t *mddev_get(mddev_t *mddev)
214 return mddev; 214 return mddev;
215} 215}
216 216
217static void mddev_delayed_delete(struct work_struct *ws)
218{
219 mddev_t *mddev = container_of(ws, mddev_t, del_work);
220 kobject_del(&mddev->kobj);
221 kobject_put(&mddev->kobj);
222}
223
217static void mddev_put(mddev_t *mddev) 224static void mddev_put(mddev_t *mddev)
218{ 225{
219 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 226 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
220 return; 227 return;
221 if (!mddev->raid_disks && list_empty(&mddev->disks)) { 228 if (!mddev->raid_disks && list_empty(&mddev->disks) &&
229 !mddev->hold_active) {
222 list_del(&mddev->all_mddevs); 230 list_del(&mddev->all_mddevs);
223 spin_unlock(&all_mddevs_lock); 231 if (mddev->gendisk) {
224 blk_cleanup_queue(mddev->queue); 232 /* we did a probe so need to clean up.
225 if (mddev->sysfs_state) 233 * Call schedule_work inside the spinlock
226 sysfs_put(mddev->sysfs_state); 234 * so that flush_scheduled_work() after
227 mddev->sysfs_state = NULL; 235 * mddev_find will succeed in waiting for the
228 kobject_put(&mddev->kobj); 236 * work to be done.
229 } else 237 */
230 spin_unlock(&all_mddevs_lock); 238 INIT_WORK(&mddev->del_work, mddev_delayed_delete);
239 schedule_work(&mddev->del_work);
240 } else
241 kfree(mddev);
242 }
243 spin_unlock(&all_mddevs_lock);
231} 244}
232 245
233static mddev_t * mddev_find(dev_t unit) 246static mddev_t * mddev_find(dev_t unit)
@@ -236,15 +249,50 @@ static mddev_t * mddev_find(dev_t unit)
236 249
237 retry: 250 retry:
238 spin_lock(&all_mddevs_lock); 251 spin_lock(&all_mddevs_lock);
239 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 252
240 if (mddev->unit == unit) { 253 if (unit) {
241 mddev_get(mddev); 254 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
255 if (mddev->unit == unit) {
256 mddev_get(mddev);
257 spin_unlock(&all_mddevs_lock);
258 kfree(new);
259 return mddev;
260 }
261
262 if (new) {
263 list_add(&new->all_mddevs, &all_mddevs);
242 spin_unlock(&all_mddevs_lock); 264 spin_unlock(&all_mddevs_lock);
243 kfree(new); 265 new->hold_active = UNTIL_IOCTL;
244 return mddev; 266 return new;
245 } 267 }
246 268 } else if (new) {
247 if (new) { 269 /* find an unused unit number */
270 static int next_minor = 512;
271 int start = next_minor;
272 int is_free = 0;
273 int dev = 0;
274 while (!is_free) {
275 dev = MKDEV(MD_MAJOR, next_minor);
276 next_minor++;
277 if (next_minor > MINORMASK)
278 next_minor = 0;
279 if (next_minor == start) {
280 /* Oh dear, all in use. */
281 spin_unlock(&all_mddevs_lock);
282 kfree(new);
283 return NULL;
284 }
285
286 is_free = 1;
287 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
288 if (mddev->unit == dev) {
289 is_free = 0;
290 break;
291 }
292 }
293 new->unit = dev;
294 new->md_minor = MINOR(dev);
295 new->hold_active = UNTIL_STOP;
248 list_add(&new->all_mddevs, &all_mddevs); 296 list_add(&new->all_mddevs, &all_mddevs);
249 spin_unlock(&all_mddevs_lock); 297 spin_unlock(&all_mddevs_lock);
250 return new; 298 return new;
@@ -275,16 +323,6 @@ static mddev_t * mddev_find(dev_t unit)
275 new->resync_max = MaxSector; 323 new->resync_max = MaxSector;
276 new->level = LEVEL_NONE; 324 new->level = LEVEL_NONE;
277 325
278 new->queue = blk_alloc_queue(GFP_KERNEL);
279 if (!new->queue) {
280 kfree(new);
281 return NULL;
282 }
283 /* Can be unlocked because the queue is new: no concurrency */
284 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, new->queue);
285
286 blk_queue_make_request(new->queue, md_fail_request);
287
288 goto retry; 326 goto retry;
289} 327}
290 328
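
When mddev_find() is passed unit == 0 it now hands out an unused minor itself, scanning circularly from a static starting position and giving up if it wraps all the way around. A standalone sketch of that scan, with the in-use test reduced to a flat array for brevity (the kernel walks all_mddevs instead, and its minor space is larger):

#include <stdio.h>

#define MINORMASK 255           /* illustrative; the kernel's is larger */

static int next_minor = 128;    /* static scan position, as in md.c */

/* Return a free minor, or -1 if every minor is taken. */
static int find_free_minor(const unsigned char *in_use)
{
        int start = next_minor;

        for (;;) {
                int dev = next_minor;

                next_minor++;
                if (next_minor > MINORMASK)
                        next_minor = 0;
                if (next_minor == start)
                        return -1;      /* wrapped: all in use */
                if (!in_use[dev])
                        return dev;
        }
}

int main(void)
{
        unsigned char in_use[MINORMASK + 1] = { 0 };

        in_use[128] = in_use[129] = 1;
        printf("got minor %d\n", find_free_minor(in_use)); /* 130 */
        return 0;
}
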
@@ -307,25 +345,23 @@ static inline void mddev_unlock(mddev_t * mddev)
307 345
308static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 346static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
309{ 347{
310 mdk_rdev_t * rdev; 348 mdk_rdev_t *rdev;
311 struct list_head *tmp;
312 349
313 rdev_for_each(rdev, tmp, mddev) { 350 list_for_each_entry(rdev, &mddev->disks, same_set)
314 if (rdev->desc_nr == nr) 351 if (rdev->desc_nr == nr)
315 return rdev; 352 return rdev;
316 } 353
317 return NULL; 354 return NULL;
318} 355}
319 356
320static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 357static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
321{ 358{
322 struct list_head *tmp;
323 mdk_rdev_t *rdev; 359 mdk_rdev_t *rdev;
324 360
325 rdev_for_each(rdev, tmp, mddev) { 361 list_for_each_entry(rdev, &mddev->disks, same_set)
326 if (rdev->bdev->bd_dev == dev) 362 if (rdev->bdev->bd_dev == dev)
327 return rdev; 363 return rdev;
328 } 364
329 return NULL; 365 return NULL;
330} 366}
331 367
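
The recurring rdev_for_each to list_for_each_entry conversion in this patch leans on the kernel's intrusive list: a struct list_head embedded in each member, with container_of() recovering the member from the node, so the separate cursor variable ('tmp') is no longer needed. A compact userspace rendition of the idiom; it uses GCC's typeof, as the kernel does, and omits the deletion-safe variant:

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct list_head { struct list_head *next, *prev; };

static void list_add_tail(struct list_head *n, struct list_head *head)
{
        n->prev = head->prev;
        n->next = head;
        head->prev->next = n;
        head->prev = n;
}

#define list_for_each_entry(pos, head, member)                          \
        for (pos = container_of((head)->next, typeof(*pos), member);    \
             &pos->member != (head);                                    \
             pos = container_of(pos->member.next, typeof(*pos), member))

struct rdev {
        int desc_nr;
        struct list_head same_set;      /* links rdevs on one array */
};

int main(void)
{
        struct list_head disks = { &disks, &disks };
        struct rdev a = { .desc_nr = 0 }, b = { .desc_nr = 1 };
        struct rdev *rdev;

        list_add_tail(&a.same_set, &disks);
        list_add_tail(&b.same_set, &disks);

        list_for_each_entry(rdev, &disks, same_set)
                printf("rdev %d\n", rdev->desc_nr);
        return 0;
}
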
@@ -861,7 +897,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
861static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 897static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
862{ 898{
863 mdp_super_t *sb; 899 mdp_super_t *sb;
864 struct list_head *tmp;
865 mdk_rdev_t *rdev2; 900 mdk_rdev_t *rdev2;
866 int next_spare = mddev->raid_disks; 901 int next_spare = mddev->raid_disks;
867 902
@@ -933,7 +968,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
933 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 968 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
934 969
935 sb->disks[0].state = (1<<MD_DISK_REMOVED); 970 sb->disks[0].state = (1<<MD_DISK_REMOVED);
936 rdev_for_each(rdev2, tmp, mddev) { 971 list_for_each_entry(rdev2, &mddev->disks, same_set) {
937 mdp_disk_t *d; 972 mdp_disk_t *d;
938 int desc_nr; 973 int desc_nr;
939 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 974 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
@@ -1259,7 +1294,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1259static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) 1294static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1260{ 1295{
1261 struct mdp_superblock_1 *sb; 1296 struct mdp_superblock_1 *sb;
1262 struct list_head *tmp;
1263 mdk_rdev_t *rdev2; 1297 mdk_rdev_t *rdev2;
1264 int max_dev, i; 1298 int max_dev, i;
1265 /* make rdev->sb match mddev and rdev data. */ 1299 /* make rdev->sb match mddev and rdev data. */
@@ -1307,7 +1341,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1307 } 1341 }
1308 1342
1309 max_dev = 0; 1343 max_dev = 0;
1310 rdev_for_each(rdev2, tmp, mddev) 1344 list_for_each_entry(rdev2, &mddev->disks, same_set)
1311 if (rdev2->desc_nr+1 > max_dev) 1345 if (rdev2->desc_nr+1 > max_dev)
1312 max_dev = rdev2->desc_nr+1; 1346 max_dev = rdev2->desc_nr+1;
1313 1347
@@ -1316,7 +1350,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1316 for (i=0; i<max_dev;i++) 1350 for (i=0; i<max_dev;i++)
1317 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1351 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1318 1352
1319 rdev_for_each(rdev2, tmp, mddev) { 1353 list_for_each_entry(rdev2, &mddev->disks, same_set) {
1320 i = rdev2->desc_nr; 1354 i = rdev2->desc_nr;
1321 if (test_bit(Faulty, &rdev2->flags)) 1355 if (test_bit(Faulty, &rdev2->flags))
1322 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1356 sb->dev_roles[i] = cpu_to_le16(0xfffe);
@@ -1466,6 +1500,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1466 1500
1467 list_add_rcu(&rdev->same_set, &mddev->disks); 1501 list_add_rcu(&rdev->same_set, &mddev->disks);
1468 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); 1502 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
1503
1504 /* May as well allow recovery to be retried once */
1505 mddev->recovery_disabled = 0;
1469 return 0; 1506 return 0;
1470 1507
1471 fail: 1508 fail:
@@ -1571,8 +1608,7 @@ static void kick_rdev_from_array(mdk_rdev_t * rdev)
1571 1608
1572static void export_array(mddev_t *mddev) 1609static void export_array(mddev_t *mddev)
1573{ 1610{
1574 struct list_head *tmp; 1611 mdk_rdev_t *rdev, *tmp;
1575 mdk_rdev_t *rdev;
1576 1612
1577 rdev_for_each(rdev, tmp, mddev) { 1613 rdev_for_each(rdev, tmp, mddev) {
1578 if (!rdev->mddev) { 1614 if (!rdev->mddev) {
@@ -1593,7 +1629,7 @@ static void print_desc(mdp_disk_t *desc)
1593 desc->major,desc->minor,desc->raid_disk,desc->state); 1629 desc->major,desc->minor,desc->raid_disk,desc->state);
1594} 1630}
1595 1631
1596static void print_sb(mdp_super_t *sb) 1632static void print_sb_90(mdp_super_t *sb)
1597{ 1633{
1598 int i; 1634 int i;
1599 1635
@@ -1624,10 +1660,57 @@ static void print_sb(mdp_super_t *sb)
1624 } 1660 }
1625 printk(KERN_INFO "md: THIS: "); 1661 printk(KERN_INFO "md: THIS: ");
1626 print_desc(&sb->this_disk); 1662 print_desc(&sb->this_disk);
1627
1628} 1663}
1629 1664
1630static void print_rdev(mdk_rdev_t *rdev) 1665static void print_sb_1(struct mdp_superblock_1 *sb)
1666{
1667 __u8 *uuid;
1668
1669 uuid = sb->set_uuid;
1670 printk(KERN_INFO "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x"
1671 ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n"
1672 KERN_INFO "md: Name: \"%s\" CT:%llu\n",
1673 le32_to_cpu(sb->major_version),
1674 le32_to_cpu(sb->feature_map),
1675 uuid[0], uuid[1], uuid[2], uuid[3],
1676 uuid[4], uuid[5], uuid[6], uuid[7],
1677 uuid[8], uuid[9], uuid[10], uuid[11],
1678 uuid[12], uuid[13], uuid[14], uuid[15],
1679 sb->set_name,
1680 (unsigned long long)le64_to_cpu(sb->ctime)
1681 & MD_SUPERBLOCK_1_TIME_SEC_MASK);
1682
1683 uuid = sb->device_uuid;
1684 printk(KERN_INFO "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
1685 " RO:%llu\n"
1686 KERN_INFO "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x"
1687 ":%02x%02x%02x%02x%02x%02x\n"
1688 KERN_INFO "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
1689 KERN_INFO "md: (MaxDev:%u) \n",
1690 le32_to_cpu(sb->level),
1691 (unsigned long long)le64_to_cpu(sb->size),
1692 le32_to_cpu(sb->raid_disks),
1693 le32_to_cpu(sb->layout),
1694 le32_to_cpu(sb->chunksize),
1695 (unsigned long long)le64_to_cpu(sb->data_offset),
1696 (unsigned long long)le64_to_cpu(sb->data_size),
1697 (unsigned long long)le64_to_cpu(sb->super_offset),
1698 (unsigned long long)le64_to_cpu(sb->recovery_offset),
1699 le32_to_cpu(sb->dev_number),
1700 uuid[0], uuid[1], uuid[2], uuid[3],
1701 uuid[4], uuid[5], uuid[6], uuid[7],
1702 uuid[8], uuid[9], uuid[10], uuid[11],
1703 uuid[12], uuid[13], uuid[14], uuid[15],
1704 sb->devflags,
1705 (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK,
1706 (unsigned long long)le64_to_cpu(sb->events),
1707 (unsigned long long)le64_to_cpu(sb->resync_offset),
1708 le32_to_cpu(sb->sb_csum),
1709 le32_to_cpu(sb->max_dev)
1710 );
1711}
1712
1713static void print_rdev(mdk_rdev_t *rdev, int major_version)
1631{ 1714{
1632 char b[BDEVNAME_SIZE]; 1715 char b[BDEVNAME_SIZE];
1633 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", 1716 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
@@ -1635,15 +1718,22 @@ static void print_rdev(mdk_rdev_t *rdev)
1635 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 1718 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
1636 rdev->desc_nr); 1719 rdev->desc_nr);
1637 if (rdev->sb_loaded) { 1720 if (rdev->sb_loaded) {
1638 printk(KERN_INFO "md: rdev superblock:\n"); 1721 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
1639 print_sb((mdp_super_t*)page_address(rdev->sb_page)); 1722 switch (major_version) {
1723 case 0:
1724 print_sb_90((mdp_super_t*)page_address(rdev->sb_page));
1725 break;
1726 case 1:
1727 print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page));
1728 break;
1729 }
1640 } else 1730 } else
1641 printk(KERN_INFO "md: no rdev superblock!\n"); 1731 printk(KERN_INFO "md: no rdev superblock!\n");
1642} 1732}
1643 1733
1644static void md_print_devices(void) 1734static void md_print_devices(void)
1645{ 1735{
1646 struct list_head *tmp, *tmp2; 1736 struct list_head *tmp;
1647 mdk_rdev_t *rdev; 1737 mdk_rdev_t *rdev;
1648 mddev_t *mddev; 1738 mddev_t *mddev;
1649 char b[BDEVNAME_SIZE]; 1739 char b[BDEVNAME_SIZE];
@@ -1658,12 +1748,12 @@ static void md_print_devices(void)
1658 bitmap_print_sb(mddev->bitmap); 1748 bitmap_print_sb(mddev->bitmap);
1659 else 1749 else
1660 printk("%s: ", mdname(mddev)); 1750 printk("%s: ", mdname(mddev));
1661 rdev_for_each(rdev, tmp2, mddev) 1751 list_for_each_entry(rdev, &mddev->disks, same_set)
1662 printk("<%s>", bdevname(rdev->bdev,b)); 1752 printk("<%s>", bdevname(rdev->bdev,b));
1663 printk("\n"); 1753 printk("\n");
1664 1754
1665 rdev_for_each(rdev, tmp2, mddev) 1755 list_for_each_entry(rdev, &mddev->disks, same_set)
1666 print_rdev(rdev); 1756 print_rdev(rdev, mddev->major_version);
1667 } 1757 }
1668 printk("md: **********************************\n"); 1758 printk("md: **********************************\n");
1669 printk("\n"); 1759 printk("\n");
@@ -1679,9 +1769,8 @@ static void sync_sbs(mddev_t * mddev, int nospares)
1679 * with the rest of the array) 1769 * with the rest of the array)
1680 */ 1770 */
1681 mdk_rdev_t *rdev; 1771 mdk_rdev_t *rdev;
1682 struct list_head *tmp;
1683 1772
1684 rdev_for_each(rdev, tmp, mddev) { 1773 list_for_each_entry(rdev, &mddev->disks, same_set) {
1685 if (rdev->sb_events == mddev->events || 1774 if (rdev->sb_events == mddev->events ||
1686 (nospares && 1775 (nospares &&
1687 rdev->raid_disk < 0 && 1776 rdev->raid_disk < 0 &&
@@ -1699,7 +1788,6 @@ static void sync_sbs(mddev_t * mddev, int nospares)
1699 1788
1700static void md_update_sb(mddev_t * mddev, int force_change) 1789static void md_update_sb(mddev_t * mddev, int force_change)
1701{ 1790{
1702 struct list_head *tmp;
1703 mdk_rdev_t *rdev; 1791 mdk_rdev_t *rdev;
1704 int sync_req; 1792 int sync_req;
1705 int nospares = 0; 1793 int nospares = 0;
@@ -1790,7 +1878,7 @@ repeat:
1790 mdname(mddev),mddev->in_sync); 1878 mdname(mddev),mddev->in_sync);
1791 1879
1792 bitmap_update_sb(mddev->bitmap); 1880 bitmap_update_sb(mddev->bitmap);
1793 rdev_for_each(rdev, tmp, mddev) { 1881 list_for_each_entry(rdev, &mddev->disks, same_set) {
1794 char b[BDEVNAME_SIZE]; 1882 char b[BDEVNAME_SIZE];
1795 dprintk(KERN_INFO "md: "); 1883 dprintk(KERN_INFO "md: ");
1796 if (rdev->sb_loaded != 1) 1884 if (rdev->sb_loaded != 1)
@@ -1999,7 +2087,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1999 md_wakeup_thread(rdev->mddev->thread); 2087 md_wakeup_thread(rdev->mddev->thread);
2000 } else if (rdev->mddev->pers) { 2088 } else if (rdev->mddev->pers) {
2001 mdk_rdev_t *rdev2; 2089 mdk_rdev_t *rdev2;
2002 struct list_head *tmp;
2003 /* Activating a spare .. or possibly reactivating 2090 /* Activating a spare .. or possibly reactivating
2004 * if we ever get bitmaps working here. 2091 * if we ever get bitmaps working here.
2005 */ 2092 */
@@ -2010,7 +2097,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2010 if (rdev->mddev->pers->hot_add_disk == NULL) 2097 if (rdev->mddev->pers->hot_add_disk == NULL)
2011 return -EINVAL; 2098 return -EINVAL;
2012 2099
2013 rdev_for_each(rdev2, tmp, rdev->mddev) 2100 list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
2014 if (rdev2->raid_disk == slot) 2101 if (rdev2->raid_disk == slot)
2015 return -EEXIST; 2102 return -EEXIST;
2016 2103
@@ -2125,14 +2212,14 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2125 */ 2212 */
2126 mddev_t *mddev; 2213 mddev_t *mddev;
2127 int overlap = 0; 2214 int overlap = 0;
2128 struct list_head *tmp, *tmp2; 2215 struct list_head *tmp;
2129 2216
2130 mddev_unlock(my_mddev); 2217 mddev_unlock(my_mddev);
2131 for_each_mddev(mddev, tmp) { 2218 for_each_mddev(mddev, tmp) {
2132 mdk_rdev_t *rdev2; 2219 mdk_rdev_t *rdev2;
2133 2220
2134 mddev_lock(mddev); 2221 mddev_lock(mddev);
2135 rdev_for_each(rdev2, tmp2, mddev) 2222 list_for_each_entry(rdev2, &mddev->disks, same_set)
2136 if (test_bit(AllReserved, &rdev2->flags) || 2223 if (test_bit(AllReserved, &rdev2->flags) ||
2137 (rdev->bdev == rdev2->bdev && 2224 (rdev->bdev == rdev2->bdev &&
2138 rdev != rdev2 && 2225 rdev != rdev2 &&
@@ -2328,8 +2415,7 @@ abort_free:
2328static void analyze_sbs(mddev_t * mddev) 2415static void analyze_sbs(mddev_t * mddev)
2329{ 2416{
2330 int i; 2417 int i;
2331 struct list_head *tmp; 2418 mdk_rdev_t *rdev, *freshest, *tmp;
2332 mdk_rdev_t *rdev, *freshest;
2333 char b[BDEVNAME_SIZE]; 2419 char b[BDEVNAME_SIZE];
2334 2420
2335 freshest = NULL; 2421 freshest = NULL;
@@ -3046,7 +3132,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
3046 } 3132 }
3047 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3133 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3048 md_wakeup_thread(mddev->thread); 3134 md_wakeup_thread(mddev->thread);
3049 sysfs_notify(&mddev->kobj, NULL, "sync_action"); 3135 sysfs_notify_dirent(mddev->sysfs_action);
3050 return len; 3136 return len;
3051} 3137}
3052 3138
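
sysfs_notify(&mddev->kobj, NULL, "sync_action") has to look the attribute up by name on every event; the pattern introduced here caches the sysfs_dirent once and signals it directly with sysfs_notify_dirent(), skipping the per-event lookup. Pieced together from the hunks in this patch, the lifecycle of the cached dirent looks like:

    /* at setup (md_alloc()/do_md_run()): resolve the attribute once */
    mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");

    /* on each state change: no name lookup needed */
    if (mddev->sysfs_action)
            sysfs_notify_dirent(mddev->sysfs_action);

    /* at teardown (do_md_stop()): drop the reference */
    sysfs_put(mddev->sysfs_action);
    mddev->sysfs_action = NULL;
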
@@ -3404,6 +3490,8 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
3404 if (!capable(CAP_SYS_ADMIN)) 3490 if (!capable(CAP_SYS_ADMIN))
3405 return -EACCES; 3491 return -EACCES;
3406 rv = mddev_lock(mddev); 3492 rv = mddev_lock(mddev);
3493 if (mddev->hold_active == UNTIL_IOCTL)
3494 mddev->hold_active = 0;
3407 if (!rv) { 3495 if (!rv) {
3408 rv = entry->store(mddev, page, length); 3496 rv = entry->store(mddev, page, length);
3409 mddev_unlock(mddev); 3497 mddev_unlock(mddev);
@@ -3414,6 +3502,17 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
3414static void md_free(struct kobject *ko) 3502static void md_free(struct kobject *ko)
3415{ 3503{
3416 mddev_t *mddev = container_of(ko, mddev_t, kobj); 3504 mddev_t *mddev = container_of(ko, mddev_t, kobj);
3505
3506 if (mddev->sysfs_state)
3507 sysfs_put(mddev->sysfs_state);
3508
3509 if (mddev->gendisk) {
3510 del_gendisk(mddev->gendisk);
3511 put_disk(mddev->gendisk);
3512 }
3513 if (mddev->queue)
3514 blk_cleanup_queue(mddev->queue);
3515
3417 kfree(mddev); 3516 kfree(mddev);
3418} 3517}
3419 3518
@@ -3429,34 +3528,74 @@ static struct kobj_type md_ktype = {
3429 3528
3430int mdp_major = 0; 3529int mdp_major = 0;
3431 3530
3432static struct kobject *md_probe(dev_t dev, int *part, void *data) 3531static int md_alloc(dev_t dev, char *name)
3433{ 3532{
3434 static DEFINE_MUTEX(disks_mutex); 3533 static DEFINE_MUTEX(disks_mutex);
3435 mddev_t *mddev = mddev_find(dev); 3534 mddev_t *mddev = mddev_find(dev);
3436 struct gendisk *disk; 3535 struct gendisk *disk;
3437 int partitioned = (MAJOR(dev) != MD_MAJOR); 3536 int partitioned;
3438 int shift = partitioned ? MdpMinorShift : 0; 3537 int shift;
3439 int unit = MINOR(dev) >> shift; 3538 int unit;
3440 int error; 3539 int error;
3441 3540
3442 if (!mddev) 3541 if (!mddev)
3443 return NULL; 3542 return -ENODEV;
3543
3544 partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
3545 shift = partitioned ? MdpMinorShift : 0;
3546 unit = MINOR(mddev->unit) >> shift;
3547
3548 /* wait for any previous instance of this device
3549 * to be completely removed (mddev_delayed_delete).
3550 */
3551 flush_scheduled_work();
3444 3552
3445 mutex_lock(&disks_mutex); 3553 mutex_lock(&disks_mutex);
3446 if (mddev->gendisk) { 3554 if (mddev->gendisk) {
3447 mutex_unlock(&disks_mutex); 3555 mutex_unlock(&disks_mutex);
3448 mddev_put(mddev); 3556 mddev_put(mddev);
3449 return NULL; 3557 return -EEXIST;
3558 }
3559
3560 if (name) {
3561 /* Need to ensure that 'name' is not a duplicate.
3562 */
3563 mddev_t *mddev2;
3564 spin_lock(&all_mddevs_lock);
3565
3566 list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
3567 if (mddev2->gendisk &&
3568 strcmp(mddev2->gendisk->disk_name, name) == 0) {
3569 spin_unlock(&all_mddevs_lock);
3570 return -EEXIST;
3571 }
3572 spin_unlock(&all_mddevs_lock);
3573 }
3574
3575 mddev->queue = blk_alloc_queue(GFP_KERNEL);
3576 if (!mddev->queue) {
3577 mutex_unlock(&disks_mutex);
3578 mddev_put(mddev);
3579 return -ENOMEM;
3450 } 3580 }
3581 /* Can be unlocked because the queue is new: no concurrency */
3582 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
3583
3584 blk_queue_make_request(mddev->queue, md_fail_request);
3585
3451 disk = alloc_disk(1 << shift); 3586 disk = alloc_disk(1 << shift);
3452 if (!disk) { 3587 if (!disk) {
3453 mutex_unlock(&disks_mutex); 3588 mutex_unlock(&disks_mutex);
3589 blk_cleanup_queue(mddev->queue);
3590 mddev->queue = NULL;
3454 mddev_put(mddev); 3591 mddev_put(mddev);
3455 return NULL; 3592 return -ENOMEM;
3456 } 3593 }
3457 disk->major = MAJOR(dev); 3594 disk->major = MAJOR(mddev->unit);
3458 disk->first_minor = unit << shift; 3595 disk->first_minor = unit << shift;
3459 if (partitioned) 3596 if (name)
3597 strcpy(disk->disk_name, name);
3598 else if (partitioned)
3460 sprintf(disk->disk_name, "md_d%d", unit); 3599 sprintf(disk->disk_name, "md_d%d", unit);
3461 else 3600 else
3462 sprintf(disk->disk_name, "md%d", unit); 3601 sprintf(disk->disk_name, "md%d", unit);
@@ -3464,7 +3603,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
3464 disk->private_data = mddev; 3603 disk->private_data = mddev;
3465 disk->queue = mddev->queue; 3604 disk->queue = mddev->queue;
3466 /* Allow extended partitions. This makes the 3605 /* Allow extended partitions. This makes the
3467 * 'mdp' device redundant, but we can really 3606 * 'mdp' device redundant, but we can't really
3468 * remove it now. 3607 * remove it now.
3469 */ 3608 */
3470 disk->flags |= GENHD_FL_EXT_DEVT; 3609 disk->flags |= GENHD_FL_EXT_DEVT;
@@ -3480,9 +3619,35 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
3480 kobject_uevent(&mddev->kobj, KOBJ_ADD); 3619 kobject_uevent(&mddev->kobj, KOBJ_ADD);
3481 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); 3620 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state");
3482 } 3621 }
3622 mddev_put(mddev);
3623 return 0;
3624}
3625
3626static struct kobject *md_probe(dev_t dev, int *part, void *data)
3627{
3628 md_alloc(dev, NULL);
3483 return NULL; 3629 return NULL;
3484} 3630}
3485 3631
3632static int add_named_array(const char *val, struct kernel_param *kp)
3633{
3634 /* val must be "md_*" where * is not all digits.
3635 * We allocate an array with a large free minor number, and
3636 * set the name to val. val must not already be an active name.
3637 */
3638 int len = strlen(val);
3639 char buf[DISK_NAME_LEN];
3640
3641 while (len && val[len-1] == '\n')
3642 len--;
3643 if (len >= DISK_NAME_LEN)
3644 return -E2BIG;
3645 strlcpy(buf, val, len+1);
3646 if (strncmp(buf, "md_", 3) != 0)
3647 return -EINVAL;
3648 return md_alloc(0, buf);
3649}
3650
3486static void md_safemode_timeout(unsigned long data) 3651static void md_safemode_timeout(unsigned long data)
3487{ 3652{
3488 mddev_t *mddev = (mddev_t *) data; 3653 mddev_t *mddev = (mddev_t *) data;
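
add_named_array() backs the new_array module parameter registered near the end of this file's diff (module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR)): writing a string of the form md_<name> allocates an array whose gendisk carries that literal name, via md_alloc(0, buf) with a dynamically chosen minor, instead of the classic md%d/md_d%d numbering. Assuming the standard sysfs layout for module parameters, usage would look like:

    echo md_home > /sys/module/md_mod/parameters/new_array

after which /dev/md_home exists and can be assembled like any other array.
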
@@ -3501,7 +3666,6 @@ static int do_md_run(mddev_t * mddev)
3501{ 3666{
3502 int err; 3667 int err;
3503 int chunk_size; 3668 int chunk_size;
3504 struct list_head *tmp;
3505 mdk_rdev_t *rdev; 3669 mdk_rdev_t *rdev;
3506 struct gendisk *disk; 3670 struct gendisk *disk;
3507 struct mdk_personality *pers; 3671 struct mdk_personality *pers;
@@ -3540,7 +3704,7 @@ static int do_md_run(mddev_t * mddev)
3540 } 3704 }
3541 3705
3542 /* devices must have minimum size of one chunk */ 3706 /* devices must have minimum size of one chunk */
3543 rdev_for_each(rdev, tmp, mddev) { 3707 list_for_each_entry(rdev, &mddev->disks, same_set) {
3544 if (test_bit(Faulty, &rdev->flags)) 3708 if (test_bit(Faulty, &rdev->flags))
3545 continue; 3709 continue;
3546 if (rdev->size < chunk_size / 1024) { 3710 if (rdev->size < chunk_size / 1024) {
@@ -3565,7 +3729,7 @@ static int do_md_run(mddev_t * mddev)
3565 * the only valid external interface is through the md 3729 * the only valid external interface is through the md
3566 * device. 3730 * device.
3567 */ 3731 */
3568 rdev_for_each(rdev, tmp, mddev) { 3732 list_for_each_entry(rdev, &mddev->disks, same_set) {
3569 if (test_bit(Faulty, &rdev->flags)) 3733 if (test_bit(Faulty, &rdev->flags))
3570 continue; 3734 continue;
3571 sync_blockdev(rdev->bdev); 3735 sync_blockdev(rdev->bdev);
@@ -3630,10 +3794,10 @@ static int do_md_run(mddev_t * mddev)
3630 */ 3794 */
3631 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 3795 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
3632 mdk_rdev_t *rdev2; 3796 mdk_rdev_t *rdev2;
3633 struct list_head *tmp2;
3634 int warned = 0; 3797 int warned = 0;
3635 rdev_for_each(rdev, tmp, mddev) { 3798
3636 rdev_for_each(rdev2, tmp2, mddev) { 3799 list_for_each_entry(rdev, &mddev->disks, same_set)
3800 list_for_each_entry(rdev2, &mddev->disks, same_set) {
3637 if (rdev < rdev2 && 3801 if (rdev < rdev2 &&
3638 rdev->bdev->bd_contains == 3802 rdev->bdev->bd_contains ==
3639 rdev2->bdev->bd_contains) { 3803 rdev2->bdev->bd_contains) {
@@ -3647,7 +3811,7 @@ static int do_md_run(mddev_t * mddev)
3647 warned = 1; 3811 warned = 1;
3648 } 3812 }
3649 } 3813 }
3650 } 3814
3651 if (warned) 3815 if (warned)
3652 printk(KERN_WARNING 3816 printk(KERN_WARNING
3653 "True protection against single-disk" 3817 "True protection against single-disk"
@@ -3684,6 +3848,7 @@ static int do_md_run(mddev_t * mddev)
3684 printk(KERN_WARNING 3848 printk(KERN_WARNING
3685 "md: cannot register extra attributes for %s\n", 3849 "md: cannot register extra attributes for %s\n",
3686 mdname(mddev)); 3850 mdname(mddev));
3851 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
3687 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 3852 } else if (mddev->ro == 2) /* auto-readonly not meaningful */
3688 mddev->ro = 0; 3853 mddev->ro = 0;
3689 3854
@@ -3694,7 +3859,7 @@ static int do_md_run(mddev_t * mddev)
3694 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 3859 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
3695 mddev->in_sync = 1; 3860 mddev->in_sync = 1;
3696 3861
3697 rdev_for_each(rdev, tmp, mddev) 3862 list_for_each_entry(rdev, &mddev->disks, same_set)
3698 if (rdev->raid_disk >= 0) { 3863 if (rdev->raid_disk >= 0) {
3699 char nm[20]; 3864 char nm[20];
3700 sprintf(nm, "rd%d", rdev->raid_disk); 3865 sprintf(nm, "rd%d", rdev->raid_disk);
@@ -3725,9 +3890,8 @@ static int do_md_run(mddev_t * mddev)
3725 * it will remove the drives and not do the right thing 3890 * it will remove the drives and not do the right thing
3726 */ 3891 */
3727 if (mddev->degraded && !mddev->sync_thread) { 3892 if (mddev->degraded && !mddev->sync_thread) {
3728 struct list_head *rtmp;
3729 int spares = 0; 3893 int spares = 0;
3730 rdev_for_each(rdev, rtmp, mddev) 3894 list_for_each_entry(rdev, &mddev->disks, same_set)
3731 if (rdev->raid_disk >= 0 && 3895 if (rdev->raid_disk >= 0 &&
3732 !test_bit(In_sync, &rdev->flags) && 3896 !test_bit(In_sync, &rdev->flags) &&
3733 !test_bit(Faulty, &rdev->flags)) 3897 !test_bit(Faulty, &rdev->flags))
@@ -3754,7 +3918,8 @@ static int do_md_run(mddev_t * mddev)
3754 mddev->changed = 1; 3918 mddev->changed = 1;
3755 md_new_event(mddev); 3919 md_new_event(mddev);
3756 sysfs_notify_dirent(mddev->sysfs_state); 3920 sysfs_notify_dirent(mddev->sysfs_state);
3757 sysfs_notify(&mddev->kobj, NULL, "sync_action"); 3921 if (mddev->sysfs_action)
3922 sysfs_notify_dirent(mddev->sysfs_action);
3758 sysfs_notify(&mddev->kobj, NULL, "degraded"); 3923 sysfs_notify(&mddev->kobj, NULL, "degraded");
3759 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 3924 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
3760 return 0; 3925 return 0;
@@ -3854,9 +4019,12 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
3854 mddev->queue->merge_bvec_fn = NULL; 4019 mddev->queue->merge_bvec_fn = NULL;
3855 mddev->queue->unplug_fn = NULL; 4020 mddev->queue->unplug_fn = NULL;
3856 mddev->queue->backing_dev_info.congested_fn = NULL; 4021 mddev->queue->backing_dev_info.congested_fn = NULL;
3857 if (mddev->pers->sync_request) 4022 if (mddev->pers->sync_request) {
3858 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 4023 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
3859 4024 if (mddev->sysfs_action)
4025 sysfs_put(mddev->sysfs_action);
4026 mddev->sysfs_action = NULL;
4027 }
3860 module_put(mddev->pers->owner); 4028 module_put(mddev->pers->owner);
3861 mddev->pers = NULL; 4029 mddev->pers = NULL;
3862 /* tell userspace to handle 'inactive' */ 4030 /* tell userspace to handle 'inactive' */
@@ -3883,7 +4051,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
3883 */ 4051 */
3884 if (mode == 0) { 4052 if (mode == 0) {
3885 mdk_rdev_t *rdev; 4053 mdk_rdev_t *rdev;
3886 struct list_head *tmp;
3887 4054
3888 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 4055 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
3889 4056
@@ -3895,7 +4062,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
3895 } 4062 }
3896 mddev->bitmap_offset = 0; 4063 mddev->bitmap_offset = 0;
3897 4064
3898 rdev_for_each(rdev, tmp, mddev) 4065 list_for_each_entry(rdev, &mddev->disks, same_set)
3899 if (rdev->raid_disk >= 0) { 4066 if (rdev->raid_disk >= 0) {
3900 char nm[20]; 4067 char nm[20];
3901 sprintf(nm, "rd%d", rdev->raid_disk); 4068 sprintf(nm, "rd%d", rdev->raid_disk);
@@ -3941,6 +4108,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
3941 mddev->barriers_work = 0; 4108 mddev->barriers_work = 0;
3942 mddev->safemode = 0; 4109 mddev->safemode = 0;
3943 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4110 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4111 if (mddev->hold_active == UNTIL_STOP)
4112 mddev->hold_active = 0;
3944 4113
3945 } else if (mddev->pers) 4114 } else if (mddev->pers)
3946 printk(KERN_INFO "md: %s switched to read-only mode.\n", 4115 printk(KERN_INFO "md: %s switched to read-only mode.\n",
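
The hold_active manipulation added here and in md_ioctl() below is one half of a delayed-deallocation scheme: an mddev can now be created in a pinned state (UNTIL_IOCTL and UNTIL_STOP are the two pin values visible in these hunks) so that it survives while it still has no member disks; once the array has really been used and stopped, or the first meaningful ioctl has completed, the pin is dropped and the final mddev_put() can schedule mddev_delayed_delete(), the work item that md_alloc() flushes before reusing a unit. The mddev_find()/mddev_put() side is not visible in this patch, so this reading is inferred from the hunks shown here.
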
@@ -3956,7 +4125,6 @@ out:
3956static void autorun_array(mddev_t *mddev) 4125static void autorun_array(mddev_t *mddev)
3957{ 4126{
3958 mdk_rdev_t *rdev; 4127 mdk_rdev_t *rdev;
3959 struct list_head *tmp;
3960 int err; 4128 int err;
3961 4129
3962 if (list_empty(&mddev->disks)) 4130 if (list_empty(&mddev->disks))
@@ -3964,7 +4132,7 @@ static void autorun_array(mddev_t *mddev)
3964 4132
3965 printk(KERN_INFO "md: running: "); 4133 printk(KERN_INFO "md: running: ");
3966 4134
3967 rdev_for_each(rdev, tmp, mddev) { 4135 list_for_each_entry(rdev, &mddev->disks, same_set) {
3968 char b[BDEVNAME_SIZE]; 4136 char b[BDEVNAME_SIZE];
3969 printk("<%s>", bdevname(rdev->bdev,b)); 4137 printk("<%s>", bdevname(rdev->bdev,b));
3970 } 4138 }
@@ -3991,8 +4159,7 @@ static void autorun_array(mddev_t *mddev)
3991 */ 4159 */
3992static void autorun_devices(int part) 4160static void autorun_devices(int part)
3993{ 4161{
3994 struct list_head *tmp; 4162 mdk_rdev_t *rdev0, *rdev, *tmp;
3995 mdk_rdev_t *rdev0, *rdev;
3996 mddev_t *mddev; 4163 mddev_t *mddev;
3997 char b[BDEVNAME_SIZE]; 4164 char b[BDEVNAME_SIZE];
3998 4165
@@ -4007,7 +4174,7 @@ static void autorun_devices(int part)
4007 printk(KERN_INFO "md: considering %s ...\n", 4174 printk(KERN_INFO "md: considering %s ...\n",
4008 bdevname(rdev0->bdev,b)); 4175 bdevname(rdev0->bdev,b));
4009 INIT_LIST_HEAD(&candidates); 4176 INIT_LIST_HEAD(&candidates);
4010 rdev_for_each_list(rdev, tmp, pending_raid_disks) 4177 rdev_for_each_list(rdev, tmp, &pending_raid_disks)
4011 if (super_90_load(rdev, rdev0, 0) >= 0) { 4178 if (super_90_load(rdev, rdev0, 0) >= 0) {
4012 printk(KERN_INFO "md: adding %s ...\n", 4179 printk(KERN_INFO "md: adding %s ...\n",
4013 bdevname(rdev->bdev,b)); 4180 bdevname(rdev->bdev,b));
@@ -4053,7 +4220,7 @@ static void autorun_devices(int part)
4053 } else { 4220 } else {
4054 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 4221 printk(KERN_INFO "md: created %s\n", mdname(mddev));
4055 mddev->persistent = 1; 4222 mddev->persistent = 1;
4056 rdev_for_each_list(rdev, tmp, candidates) { 4223 rdev_for_each_list(rdev, tmp, &candidates) {
4057 list_del_init(&rdev->same_set); 4224 list_del_init(&rdev->same_set);
4058 if (bind_rdev_to_array(rdev, mddev)) 4225 if (bind_rdev_to_array(rdev, mddev))
4059 export_rdev(rdev); 4226 export_rdev(rdev);
@@ -4064,7 +4231,7 @@ static void autorun_devices(int part)
4064 /* on success, candidates will be empty, on error 4231 /* on success, candidates will be empty, on error
4065 * it won't... 4232 * it won't...
4066 */ 4233 */
4067 rdev_for_each_list(rdev, tmp, candidates) { 4234 rdev_for_each_list(rdev, tmp, &candidates) {
4068 list_del_init(&rdev->same_set); 4235 list_del_init(&rdev->same_set);
4069 export_rdev(rdev); 4236 export_rdev(rdev);
4070 } 4237 }
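
Note that rdev_for_each_list() survives the cleanup but now takes a pointer to the list head (&pending_raid_disks, &candidates) and, per the declaration change in autorun_devices() above, a typed mdk_rdev_t *tmp cursor instead of a struct list_head *. That is consistent with it being redefined on top of the generic deletion-safe iterator, roughly:

    /* presumed new definition in md.h; the old one open-coded the
     * walk around a bare struct list_head cursor */
    #define rdev_for_each_list(rdev, tmp, head) \
            list_for_each_entry_safe(rdev, tmp, head, same_set)

the _safe variant being what lets the loop bodies here list_del_init() the current entry while iterating.
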
@@ -4093,10 +4260,9 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
4093 mdu_array_info_t info; 4260 mdu_array_info_t info;
4094 int nr,working,active,failed,spare; 4261 int nr,working,active,failed,spare;
4095 mdk_rdev_t *rdev; 4262 mdk_rdev_t *rdev;
4096 struct list_head *tmp;
4097 4263
4098 nr=working=active=failed=spare=0; 4264 nr=working=active=failed=spare=0;
4099 rdev_for_each(rdev, tmp, mddev) { 4265 list_for_each_entry(rdev, &mddev->disks, same_set) {
4100 nr++; 4266 nr++;
4101 if (test_bit(Faulty, &rdev->flags)) 4267 if (test_bit(Faulty, &rdev->flags))
4102 failed++; 4268 failed++;
@@ -4614,9 +4780,8 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
4614 4780
4615static int update_size(mddev_t *mddev, sector_t num_sectors) 4781static int update_size(mddev_t *mddev, sector_t num_sectors)
4616{ 4782{
4617 mdk_rdev_t * rdev; 4783 mdk_rdev_t *rdev;
4618 int rv; 4784 int rv;
4619 struct list_head *tmp;
4620 int fit = (num_sectors == 0); 4785 int fit = (num_sectors == 0);
4621 4786
4622 if (mddev->pers->resize == NULL) 4787 if (mddev->pers->resize == NULL)
@@ -4638,7 +4803,7 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
4638 * grow, and re-add. 4803 * grow, and re-add.
4639 */ 4804 */
4640 return -EBUSY; 4805 return -EBUSY;
4641 rdev_for_each(rdev, tmp, mddev) { 4806 list_for_each_entry(rdev, &mddev->disks, same_set) {
4642 sector_t avail; 4807 sector_t avail;
4643 avail = rdev->size * 2; 4808 avail = rdev->size * 2;
4644 4809
@@ -5000,6 +5165,9 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
5000 5165
5001done_unlock: 5166done_unlock:
5002abort_unlock: 5167abort_unlock:
5168 if (mddev->hold_active == UNTIL_IOCTL &&
5169 err != -EINVAL)
5170 mddev->hold_active = 0;
5003 mddev_unlock(mddev); 5171 mddev_unlock(mddev);
5004 5172
5005 return err; 5173 return err;
@@ -5016,14 +5184,25 @@ static int md_open(struct block_device *bdev, fmode_t mode)
5016 * Succeed if we can lock the mddev, which confirms that 5184 * Succeed if we can lock the mddev, which confirms that
5017 * it isn't being stopped right now. 5185 * it isn't being stopped right now.
5018 */ 5186 */
5019 mddev_t *mddev = bdev->bd_disk->private_data; 5187 mddev_t *mddev = mddev_find(bdev->bd_dev);
5020 int err; 5188 int err;
5021 5189
5190 if (mddev->gendisk != bdev->bd_disk) {
5191 /* we are racing with mddev_put which is discarding this
5192 * bd_disk.
5193 */
5194 mddev_put(mddev);
5195 /* Wait until bdev->bd_disk is definitely gone */
5196 flush_scheduled_work();
5197 /* Then retry the open from the top */
5198 return -ERESTARTSYS;
5199 }
5200 BUG_ON(mddev != bdev->bd_disk->private_data);
5201
5022 if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) 5202 if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
5023 goto out; 5203 goto out;
5024 5204
5025 err = 0; 5205 err = 0;
5026 mddev_get(mddev);
5027 atomic_inc(&mddev->openers); 5206 atomic_inc(&mddev->openers);
5028 mddev_unlock(mddev); 5207 mddev_unlock(mddev);
5029 5208
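
md_open() previously trusted bdev->bd_disk->private_data and only took its reference with mddev_get() after locking; it now obtains the mddev (and the reference) up front through mddev_find(), and can therefore detect the race where the same minor has been re-found while the old gendisk is still being torn down by mddev_delayed_delete(). In that window it flushes the pending work and returns -ERESTARTSYS, which, as the comment says, makes the caller retry the open from the top against the fresh disk.
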
@@ -5187,11 +5366,10 @@ static void status_unused(struct seq_file *seq)
5187{ 5366{
5188 int i = 0; 5367 int i = 0;
5189 mdk_rdev_t *rdev; 5368 mdk_rdev_t *rdev;
5190 struct list_head *tmp;
5191 5369
5192 seq_printf(seq, "unused devices: "); 5370 seq_printf(seq, "unused devices: ");
5193 5371
5194 rdev_for_each_list(rdev, tmp, pending_raid_disks) { 5372 list_for_each_entry(rdev, &pending_raid_disks, same_set) {
5195 char b[BDEVNAME_SIZE]; 5373 char b[BDEVNAME_SIZE];
5196 i++; 5374 i++;
5197 seq_printf(seq, "%s ", 5375 seq_printf(seq, "%s ",
@@ -5350,7 +5528,6 @@ static int md_seq_show(struct seq_file *seq, void *v)
5350{ 5528{
5351 mddev_t *mddev = v; 5529 mddev_t *mddev = v;
5352 sector_t size; 5530 sector_t size;
5353 struct list_head *tmp2;
5354 mdk_rdev_t *rdev; 5531 mdk_rdev_t *rdev;
5355 struct mdstat_info *mi = seq->private; 5532 struct mdstat_info *mi = seq->private;
5356 struct bitmap *bitmap; 5533 struct bitmap *bitmap;
@@ -5387,7 +5564,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
5387 } 5564 }
5388 5565
5389 size = 0; 5566 size = 0;
5390 rdev_for_each(rdev, tmp2, mddev) { 5567 list_for_each_entry(rdev, &mddev->disks, same_set) {
5391 char b[BDEVNAME_SIZE]; 5568 char b[BDEVNAME_SIZE];
5392 seq_printf(seq, " %s[%d]", 5569 seq_printf(seq, " %s[%d]",
5393 bdevname(rdev->bdev,b), rdev->desc_nr); 5570 bdevname(rdev->bdev,b), rdev->desc_nr);
@@ -5694,7 +5871,6 @@ void md_do_sync(mddev_t *mddev)
5694 struct list_head *tmp; 5871 struct list_head *tmp;
5695 sector_t last_check; 5872 sector_t last_check;
5696 int skipped = 0; 5873 int skipped = 0;
5697 struct list_head *rtmp;
5698 mdk_rdev_t *rdev; 5874 mdk_rdev_t *rdev;
5699 char *desc; 5875 char *desc;
5700 5876
@@ -5799,7 +5975,7 @@ void md_do_sync(mddev_t *mddev)
5799 /* recovery follows the physical size of devices */ 5975 /* recovery follows the physical size of devices */
5800 max_sectors = mddev->size << 1; 5976 max_sectors = mddev->size << 1;
5801 j = MaxSector; 5977 j = MaxSector;
5802 rdev_for_each(rdev, rtmp, mddev) 5978 list_for_each_entry(rdev, &mddev->disks, same_set)
5803 if (rdev->raid_disk >= 0 && 5979 if (rdev->raid_disk >= 0 &&
5804 !test_bit(Faulty, &rdev->flags) && 5980 !test_bit(Faulty, &rdev->flags) &&
5805 !test_bit(In_sync, &rdev->flags) && 5981 !test_bit(In_sync, &rdev->flags) &&
@@ -5949,7 +6125,7 @@ void md_do_sync(mddev_t *mddev)
5949 } else { 6125 } else {
5950 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6126 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5951 mddev->curr_resync = MaxSector; 6127 mddev->curr_resync = MaxSector;
5952 rdev_for_each(rdev, rtmp, mddev) 6128 list_for_each_entry(rdev, &mddev->disks, same_set)
5953 if (rdev->raid_disk >= 0 && 6129 if (rdev->raid_disk >= 0 &&
5954 !test_bit(Faulty, &rdev->flags) && 6130 !test_bit(Faulty, &rdev->flags) &&
5955 !test_bit(In_sync, &rdev->flags) && 6131 !test_bit(In_sync, &rdev->flags) &&
@@ -5985,10 +6161,9 @@ EXPORT_SYMBOL_GPL(md_do_sync);
5985static int remove_and_add_spares(mddev_t *mddev) 6161static int remove_and_add_spares(mddev_t *mddev)
5986{ 6162{
5987 mdk_rdev_t *rdev; 6163 mdk_rdev_t *rdev;
5988 struct list_head *rtmp;
5989 int spares = 0; 6164 int spares = 0;
5990 6165
5991 rdev_for_each(rdev, rtmp, mddev) 6166 list_for_each_entry(rdev, &mddev->disks, same_set)
5992 if (rdev->raid_disk >= 0 && 6167 if (rdev->raid_disk >= 0 &&
5993 !test_bit(Blocked, &rdev->flags) && 6168 !test_bit(Blocked, &rdev->flags) &&
5994 (test_bit(Faulty, &rdev->flags) || 6169 (test_bit(Faulty, &rdev->flags) ||
@@ -6003,8 +6178,8 @@ static int remove_and_add_spares(mddev_t *mddev)
6003 } 6178 }
6004 } 6179 }
6005 6180
6006 if (mddev->degraded && ! mddev->ro) { 6181 if (mddev->degraded && ! mddev->ro && !mddev->recovery_disabled) {
6007 rdev_for_each(rdev, rtmp, mddev) { 6182 list_for_each_entry(rdev, &mddev->disks, same_set) {
6008 if (rdev->raid_disk >= 0 && 6183 if (rdev->raid_disk >= 0 &&
6009 !test_bit(In_sync, &rdev->flags) && 6184 !test_bit(In_sync, &rdev->flags) &&
6010 !test_bit(Blocked, &rdev->flags)) 6185 !test_bit(Blocked, &rdev->flags))
@@ -6056,7 +6231,6 @@ static int remove_and_add_spares(mddev_t *mddev)
6056void md_check_recovery(mddev_t *mddev) 6231void md_check_recovery(mddev_t *mddev)
6057{ 6232{
6058 mdk_rdev_t *rdev; 6233 mdk_rdev_t *rdev;
6059 struct list_head *rtmp;
6060 6234
6061 6235
6062 if (mddev->bitmap) 6236 if (mddev->bitmap)
@@ -6120,7 +6294,7 @@ void md_check_recovery(mddev_t *mddev)
6120 if (mddev->flags) 6294 if (mddev->flags)
6121 md_update_sb(mddev, 0); 6295 md_update_sb(mddev, 0);
6122 6296
6123 rdev_for_each(rdev, rtmp, mddev) 6297 list_for_each_entry(rdev, &mddev->disks, same_set)
6124 if (test_and_clear_bit(StateChanged, &rdev->flags)) 6298 if (test_and_clear_bit(StateChanged, &rdev->flags))
6125 sysfs_notify_dirent(rdev->sysfs_state); 6299 sysfs_notify_dirent(rdev->sysfs_state);
6126 6300
@@ -6149,13 +6323,13 @@ void md_check_recovery(mddev_t *mddev)
6149 * information must be scrapped 6323 * information must be scrapped
6150 */ 6324 */
6151 if (!mddev->degraded) 6325 if (!mddev->degraded)
6152 rdev_for_each(rdev, rtmp, mddev) 6326 list_for_each_entry(rdev, &mddev->disks, same_set)
6153 rdev->saved_raid_disk = -1; 6327 rdev->saved_raid_disk = -1;
6154 6328
6155 mddev->recovery = 0; 6329 mddev->recovery = 0;
6156 /* flag recovery needed just to double check */ 6330 /* flag recovery needed just to double check */
6157 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6331 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6158 sysfs_notify(&mddev->kobj, NULL, "sync_action"); 6332 sysfs_notify_dirent(mddev->sysfs_action);
6159 md_new_event(mddev); 6333 md_new_event(mddev);
6160 goto unlock; 6334 goto unlock;
6161 } 6335 }
@@ -6216,7 +6390,7 @@ void md_check_recovery(mddev_t *mddev)
6216 mddev->recovery = 0; 6390 mddev->recovery = 0;
6217 } else 6391 } else
6218 md_wakeup_thread(mddev->sync_thread); 6392 md_wakeup_thread(mddev->sync_thread);
6219 sysfs_notify(&mddev->kobj, NULL, "sync_action"); 6393 sysfs_notify_dirent(mddev->sysfs_action);
6220 md_new_event(mddev); 6394 md_new_event(mddev);
6221 } 6395 }
6222 unlock: 6396 unlock:
@@ -6224,7 +6398,8 @@ void md_check_recovery(mddev_t *mddev)
6224 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 6398 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6225 if (test_and_clear_bit(MD_RECOVERY_RECOVER, 6399 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
6226 &mddev->recovery)) 6400 &mddev->recovery))
6227 sysfs_notify(&mddev->kobj, NULL, "sync_action"); 6401 if (mddev->sysfs_action)
6402 sysfs_notify_dirent(mddev->sysfs_action);
6228 } 6403 }
6229 mddev_unlock(mddev); 6404 mddev_unlock(mddev);
6230 } 6405 }
@@ -6386,14 +6561,8 @@ static __exit void md_exit(void)
6386 unregister_sysctl_table(raid_table_header); 6561 unregister_sysctl_table(raid_table_header);
6387 remove_proc_entry("mdstat", NULL); 6562 remove_proc_entry("mdstat", NULL);
6388 for_each_mddev(mddev, tmp) { 6563 for_each_mddev(mddev, tmp) {
6389 struct gendisk *disk = mddev->gendisk;
6390 if (!disk)
6391 continue;
6392 export_array(mddev); 6564 export_array(mddev);
6393 del_gendisk(disk); 6565 mddev->hold_active = 0;
6394 put_disk(disk);
6395 mddev->gendisk = NULL;
6396 mddev_put(mddev);
6397 } 6566 }
6398} 6567}
6399 6568
@@ -6418,6 +6587,7 @@ static int set_ro(const char *val, struct kernel_param *kp)
6418module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 6587module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
6419module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); 6588module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
6420 6589
6590module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
6421 6591
6422EXPORT_SYMBOL(register_md_personality); 6592EXPORT_SYMBOL(register_md_personality);
6423EXPORT_SYMBOL(unregister_md_personality); 6593EXPORT_SYMBOL(unregister_md_personality);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index d4ac47d11279..f6d08f241671 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -408,7 +408,6 @@ static int multipath_run (mddev_t *mddev)
408 int disk_idx; 408 int disk_idx;
409 struct multipath_info *disk; 409 struct multipath_info *disk;
410 mdk_rdev_t *rdev; 410 mdk_rdev_t *rdev;
411 struct list_head *tmp;
412 411
413 if (mddev->level != LEVEL_MULTIPATH) { 412 if (mddev->level != LEVEL_MULTIPATH) {
414 printk("multipath: %s: raid level not set to multipath IO (%d)\n", 413 printk("multipath: %s: raid level not set to multipath IO (%d)\n",
@@ -441,7 +440,7 @@ static int multipath_run (mddev_t *mddev)
441 } 440 }
442 441
443 conf->working_disks = 0; 442 conf->working_disks = 0;
444 rdev_for_each(rdev, tmp, mddev) { 443 list_for_each_entry(rdev, &mddev->disks, same_set) {
445 disk_idx = rdev->raid_disk; 444 disk_idx = rdev->raid_disk;
446 if (disk_idx < 0 || 445 if (disk_idx < 0 ||
447 disk_idx >= mddev->raid_disks) 446 disk_idx >= mddev->raid_disks)
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 8ac6488ad0dc..c605ba805586 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -53,11 +53,10 @@ static int raid0_congested(void *data, int bits)
53static int create_strip_zones (mddev_t *mddev) 53static int create_strip_zones (mddev_t *mddev)
54{ 54{
55 int i, c, j; 55 int i, c, j;
56 sector_t current_offset, curr_zone_offset; 56 sector_t current_start, curr_zone_start;
57 sector_t min_spacing; 57 sector_t min_spacing;
58 raid0_conf_t *conf = mddev_to_conf(mddev); 58 raid0_conf_t *conf = mddev_to_conf(mddev);
59 mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev; 59 mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
60 struct list_head *tmp1, *tmp2;
61 struct strip_zone *zone; 60 struct strip_zone *zone;
62 int cnt; 61 int cnt;
63 char b[BDEVNAME_SIZE]; 62 char b[BDEVNAME_SIZE];
@@ -67,19 +66,19 @@ static int create_strip_zones (mddev_t *mddev)
67 */ 66 */
68 conf->nr_strip_zones = 0; 67 conf->nr_strip_zones = 0;
69 68
70 rdev_for_each(rdev1, tmp1, mddev) { 69 list_for_each_entry(rdev1, &mddev->disks, same_set) {
71 printk("raid0: looking at %s\n", 70 printk(KERN_INFO "raid0: looking at %s\n",
72 bdevname(rdev1->bdev,b)); 71 bdevname(rdev1->bdev,b));
73 c = 0; 72 c = 0;
74 rdev_for_each(rdev2, tmp2, mddev) { 73 list_for_each_entry(rdev2, &mddev->disks, same_set) {
75 printk("raid0: comparing %s(%llu)", 74 printk(KERN_INFO "raid0: comparing %s(%llu)",
76 bdevname(rdev1->bdev,b), 75 bdevname(rdev1->bdev,b),
77 (unsigned long long)rdev1->size); 76 (unsigned long long)rdev1->size);
78 printk(" with %s(%llu)\n", 77 printk(KERN_INFO " with %s(%llu)\n",
79 bdevname(rdev2->bdev,b), 78 bdevname(rdev2->bdev,b),
80 (unsigned long long)rdev2->size); 79 (unsigned long long)rdev2->size);
81 if (rdev2 == rdev1) { 80 if (rdev2 == rdev1) {
82 printk("raid0: END\n"); 81 printk(KERN_INFO "raid0: END\n");
83 break; 82 break;
84 } 83 }
85 if (rdev2->size == rdev1->size) 84 if (rdev2->size == rdev1->size)
@@ -88,19 +87,20 @@ static int create_strip_zones (mddev_t *mddev)
88 * Not unique, don't count it as a new 87 * Not unique, don't count it as a new
89 * group 88 * group
90 */ 89 */
91 printk("raid0: EQUAL\n"); 90 printk(KERN_INFO "raid0: EQUAL\n");
92 c = 1; 91 c = 1;
93 break; 92 break;
94 } 93 }
95 printk("raid0: NOT EQUAL\n"); 94 printk(KERN_INFO "raid0: NOT EQUAL\n");
96 } 95 }
97 if (!c) { 96 if (!c) {
98 printk("raid0: ==> UNIQUE\n"); 97 printk(KERN_INFO "raid0: ==> UNIQUE\n");
99 conf->nr_strip_zones++; 98 conf->nr_strip_zones++;
100 printk("raid0: %d zones\n", conf->nr_strip_zones); 99 printk(KERN_INFO "raid0: %d zones\n",
100 conf->nr_strip_zones);
101 } 101 }
102 } 102 }
103 printk("raid0: FINAL %d zones\n", conf->nr_strip_zones); 103 printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones);
104 104
105 conf->strip_zone = kzalloc(sizeof(struct strip_zone)* 105 conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
106 conf->nr_strip_zones, GFP_KERNEL); 106 conf->nr_strip_zones, GFP_KERNEL);
@@ -119,16 +119,17 @@ static int create_strip_zones (mddev_t *mddev)
119 cnt = 0; 119 cnt = 0;
120 smallest = NULL; 120 smallest = NULL;
121 zone->dev = conf->devlist; 121 zone->dev = conf->devlist;
122 rdev_for_each(rdev1, tmp1, mddev) { 122 list_for_each_entry(rdev1, &mddev->disks, same_set) {
123 int j = rdev1->raid_disk; 123 int j = rdev1->raid_disk;
124 124
125 if (j < 0 || j >= mddev->raid_disks) { 125 if (j < 0 || j >= mddev->raid_disks) {
126 printk("raid0: bad disk number %d - aborting!\n", j); 126 printk(KERN_ERR "raid0: bad disk number %d - "
127 "aborting!\n", j);
127 goto abort; 128 goto abort;
128 } 129 }
129 if (zone->dev[j]) { 130 if (zone->dev[j]) {
130 printk("raid0: multiple devices for %d - aborting!\n", 131 printk(KERN_ERR "raid0: multiple devices for %d - "
131 j); 132 "aborting!\n", j);
132 goto abort; 133 goto abort;
133 } 134 }
134 zone->dev[j] = rdev1; 135 zone->dev[j] = rdev1;
@@ -149,16 +150,16 @@ static int create_strip_zones (mddev_t *mddev)
149 cnt++; 150 cnt++;
150 } 151 }
151 if (cnt != mddev->raid_disks) { 152 if (cnt != mddev->raid_disks) {
152 printk("raid0: too few disks (%d of %d) - aborting!\n", 153 printk(KERN_ERR "raid0: too few disks (%d of %d) - "
153 cnt, mddev->raid_disks); 154 "aborting!\n", cnt, mddev->raid_disks);
154 goto abort; 155 goto abort;
155 } 156 }
156 zone->nb_dev = cnt; 157 zone->nb_dev = cnt;
157 zone->size = smallest->size * cnt; 158 zone->sectors = smallest->size * cnt * 2;
158 zone->zone_offset = 0; 159 zone->zone_start = 0;
159 160
160 current_offset = smallest->size; 161 current_start = smallest->size * 2;
161 curr_zone_offset = zone->size; 162 curr_zone_start = zone->sectors;
162 163
163 /* now do the other zones */ 164 /* now do the other zones */
164 for (i = 1; i < conf->nr_strip_zones; i++) 165 for (i = 1; i < conf->nr_strip_zones; i++)
@@ -166,40 +167,41 @@ static int create_strip_zones (mddev_t *mddev)
166 zone = conf->strip_zone + i; 167 zone = conf->strip_zone + i;
167 zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks; 168 zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks;
168 169
169 printk("raid0: zone %d\n", i); 170 printk(KERN_INFO "raid0: zone %d\n", i);
170 zone->dev_offset = current_offset; 171 zone->dev_start = current_start;
171 smallest = NULL; 172 smallest = NULL;
172 c = 0; 173 c = 0;
173 174
174 for (j=0; j<cnt; j++) { 175 for (j=0; j<cnt; j++) {
175 char b[BDEVNAME_SIZE]; 176 char b[BDEVNAME_SIZE];
176 rdev = conf->strip_zone[0].dev[j]; 177 rdev = conf->strip_zone[0].dev[j];
177 printk("raid0: checking %s ...", bdevname(rdev->bdev,b)); 178 printk(KERN_INFO "raid0: checking %s ...",
178 if (rdev->size > current_offset) 179 bdevname(rdev->bdev, b));
179 { 180 if (rdev->size > current_start / 2) {
180 printk(" contained as device %d\n", c); 181 printk(KERN_INFO " contained as device %d\n",
182 c);
181 zone->dev[c] = rdev; 183 zone->dev[c] = rdev;
182 c++; 184 c++;
183 if (!smallest || (rdev->size <smallest->size)) { 185 if (!smallest || (rdev->size <smallest->size)) {
184 smallest = rdev; 186 smallest = rdev;
185 printk(" (%llu) is smallest!.\n", 187 printk(KERN_INFO " (%llu) is smallest!.\n",
186 (unsigned long long)rdev->size); 188 (unsigned long long)rdev->size);
187 } 189 }
188 } else 190 } else
189 printk(" nope.\n"); 191 printk(KERN_INFO " nope.\n");
190 } 192 }
191 193
192 zone->nb_dev = c; 194 zone->nb_dev = c;
193 zone->size = (smallest->size - current_offset) * c; 195 zone->sectors = (smallest->size * 2 - current_start) * c;
194 printk("raid0: zone->nb_dev: %d, size: %llu\n", 196 printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n",
195 zone->nb_dev, (unsigned long long)zone->size); 197 zone->nb_dev, (unsigned long long)zone->sectors);
196 198
197 zone->zone_offset = curr_zone_offset; 199 zone->zone_start = curr_zone_start;
198 curr_zone_offset += zone->size; 200 curr_zone_start += zone->sectors;
199 201
200 current_offset = smallest->size; 202 current_start = smallest->size * 2;
201 printk("raid0: current zone offset: %llu\n", 203 printk(KERN_INFO "raid0: current zone start: %llu\n",
202 (unsigned long long)current_offset); 204 (unsigned long long)current_start);
203 } 205 }
204 206
205 /* Now find appropriate hash spacing. 207 /* Now find appropriate hash spacing.
@@ -210,16 +212,16 @@ static int create_strip_zones (mddev_t *mddev)
210 * strip though as its size has no bearing on the efficacy of the hash 212 * strip though as its size has no bearing on the efficacy of the hash
211 * table. 213 * table.
212 */ 214 */
213 conf->hash_spacing = curr_zone_offset; 215 conf->spacing = curr_zone_start;
214 min_spacing = curr_zone_offset; 216 min_spacing = curr_zone_start;
215 sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*)); 217 sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*));
216 for (i=0; i < conf->nr_strip_zones-1; i++) { 218 for (i=0; i < conf->nr_strip_zones-1; i++) {
217 sector_t sz = 0; 219 sector_t s = 0;
218 for (j=i; j<conf->nr_strip_zones-1 && 220 for (j = i; j < conf->nr_strip_zones - 1 &&
219 sz < min_spacing ; j++) 221 s < min_spacing; j++)
220 sz += conf->strip_zone[j].size; 222 s += conf->strip_zone[j].sectors;
221 if (sz >= min_spacing && sz < conf->hash_spacing) 223 if (s >= min_spacing && s < conf->spacing)
222 conf->hash_spacing = sz; 224 conf->spacing = s;
223 } 225 }
224 226
225 mddev->queue->unplug_fn = raid0_unplug; 227 mddev->queue->unplug_fn = raid0_unplug;
@@ -227,7 +229,7 @@ static int create_strip_zones (mddev_t *mddev)
227 mddev->queue->backing_dev_info.congested_fn = raid0_congested; 229 mddev->queue->backing_dev_info.congested_fn = raid0_congested;
228 mddev->queue->backing_dev_info.congested_data = mddev; 230 mddev->queue->backing_dev_info.congested_data = mddev;
229 231
230 printk("raid0: done.\n"); 232 printk(KERN_INFO "raid0: done.\n");
231 return 0; 233 return 0;
232 abort: 234 abort:
233 return 1; 235 return 1;
@@ -262,10 +264,9 @@ static int raid0_mergeable_bvec(struct request_queue *q,
262static int raid0_run (mddev_t *mddev) 264static int raid0_run (mddev_t *mddev)
263{ 265{
264 unsigned cur=0, i=0, nb_zone; 266 unsigned cur=0, i=0, nb_zone;
265 s64 size; 267 s64 sectors;
266 raid0_conf_t *conf; 268 raid0_conf_t *conf;
267 mdk_rdev_t *rdev; 269 mdk_rdev_t *rdev;
268 struct list_head *tmp;
269 270
270 if (mddev->chunk_size == 0) { 271 if (mddev->chunk_size == 0) {
271 printk(KERN_ERR "md/raid0: non-zero chunk size required.\n"); 272 printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
@@ -291,54 +292,54 @@ static int raid0_run (mddev_t *mddev)
291 292
292 /* calculate array device size */ 293 /* calculate array device size */
293 mddev->array_sectors = 0; 294 mddev->array_sectors = 0;
294 rdev_for_each(rdev, tmp, mddev) 295 list_for_each_entry(rdev, &mddev->disks, same_set)
295 mddev->array_sectors += rdev->size * 2; 296 mddev->array_sectors += rdev->size * 2;
296 297
297 printk("raid0 : md_size is %llu blocks.\n", 298 printk(KERN_INFO "raid0 : md_size is %llu sectors.\n",
298 (unsigned long long)mddev->array_sectors / 2); 299 (unsigned long long)mddev->array_sectors);
299 printk("raid0 : conf->hash_spacing is %llu blocks.\n", 300 printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n",
300 (unsigned long long)conf->hash_spacing); 301 (unsigned long long)conf->spacing);
301 { 302 {
302 sector_t s = mddev->array_sectors / 2; 303 sector_t s = mddev->array_sectors;
303 sector_t space = conf->hash_spacing; 304 sector_t space = conf->spacing;
304 int round; 305 int round;
305 conf->preshift = 0; 306 conf->sector_shift = 0;
306 if (sizeof(sector_t) > sizeof(u32)) { 307 if (sizeof(sector_t) > sizeof(u32)) {
307 /*shift down space and s so that sector_div will work */ 308 /*shift down space and s so that sector_div will work */
308 while (space > (sector_t) (~(u32)0)) { 309 while (space > (sector_t) (~(u32)0)) {
309 s >>= 1; 310 s >>= 1;
310 space >>= 1; 311 space >>= 1;
311 s += 1; /* force round-up */ 312 s += 1; /* force round-up */
312 conf->preshift++; 313 conf->sector_shift++;
313 } 314 }
314 } 315 }
315 round = sector_div(s, (u32)space) ? 1 : 0; 316 round = sector_div(s, (u32)space) ? 1 : 0;
316 nb_zone = s + round; 317 nb_zone = s + round;
317 } 318 }
318 printk("raid0 : nb_zone is %d.\n", nb_zone); 319 printk(KERN_INFO "raid0 : nb_zone is %d.\n", nb_zone);
319 320
320 printk("raid0 : Allocating %Zd bytes for hash.\n", 321 printk(KERN_INFO "raid0 : Allocating %zu bytes for hash.\n",
321 nb_zone*sizeof(struct strip_zone*)); 322 nb_zone*sizeof(struct strip_zone*));
322 conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL); 323 conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL);
323 if (!conf->hash_table) 324 if (!conf->hash_table)
324 goto out_free_conf; 325 goto out_free_conf;
325 size = conf->strip_zone[cur].size; 326 sectors = conf->strip_zone[cur].sectors;
326 327
327 conf->hash_table[0] = conf->strip_zone + cur; 328 conf->hash_table[0] = conf->strip_zone + cur;
328 for (i=1; i< nb_zone; i++) { 329 for (i=1; i< nb_zone; i++) {
329 while (size <= conf->hash_spacing) { 330 while (sectors <= conf->spacing) {
330 cur++; 331 cur++;
331 size += conf->strip_zone[cur].size; 332 sectors += conf->strip_zone[cur].sectors;
332 } 333 }
333 size -= conf->hash_spacing; 334 sectors -= conf->spacing;
334 conf->hash_table[i] = conf->strip_zone + cur; 335 conf->hash_table[i] = conf->strip_zone + cur;
335 } 336 }
336 if (conf->preshift) { 337 if (conf->sector_shift) {
337 conf->hash_spacing >>= conf->preshift; 338 conf->spacing >>= conf->sector_shift;
338 /* round hash_spacing up so when we divide by it, we 339 /* round spacing up so when we divide by it, we
339 * err on the side of too-low, which is safest 340 * err on the side of too-low, which is safest
340 */ 341 */
341 conf->hash_spacing++; 342 conf->spacing++;
342 } 343 }
343 344
344 /* calculate the max read-ahead size. 345 /* calculate the max read-ahead size.
@@ -387,12 +388,12 @@ static int raid0_stop (mddev_t *mddev)
387static int raid0_make_request (struct request_queue *q, struct bio *bio) 388static int raid0_make_request (struct request_queue *q, struct bio *bio)
388{ 389{
389 mddev_t *mddev = q->queuedata; 390 mddev_t *mddev = q->queuedata;
390 unsigned int sect_in_chunk, chunksize_bits, chunk_size, chunk_sects; 391 unsigned int sect_in_chunk, chunksect_bits, chunk_sects;
391 raid0_conf_t *conf = mddev_to_conf(mddev); 392 raid0_conf_t *conf = mddev_to_conf(mddev);
392 struct strip_zone *zone; 393 struct strip_zone *zone;
393 mdk_rdev_t *tmp_dev; 394 mdk_rdev_t *tmp_dev;
394 sector_t chunk; 395 sector_t chunk;
395 sector_t block, rsect; 396 sector_t sector, rsect;
396 const int rw = bio_data_dir(bio); 397 const int rw = bio_data_dir(bio);
397 int cpu; 398 int cpu;
398 399
@@ -407,11 +408,9 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
407 bio_sectors(bio)); 408 bio_sectors(bio));
408 part_stat_unlock(); 409 part_stat_unlock();
409 410
410 chunk_size = mddev->chunk_size >> 10;
411 chunk_sects = mddev->chunk_size >> 9; 411 chunk_sects = mddev->chunk_size >> 9;
412 chunksize_bits = ffz(~chunk_size); 412 chunksect_bits = ffz(~chunk_sects);
413 block = bio->bi_sector >> 1; 413 sector = bio->bi_sector;
414
415 414
416 if (unlikely(chunk_sects < (bio->bi_sector & (chunk_sects - 1)) + (bio->bi_size >> 9))) { 415 if (unlikely(chunk_sects < (bio->bi_sector & (chunk_sects - 1)) + (bio->bi_size >> 9))) {
417 struct bio_pair *bp; 416 struct bio_pair *bp;
@@ -434,28 +433,27 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
434 433
435 434
436 { 435 {
437 sector_t x = block >> conf->preshift; 436 sector_t x = sector >> conf->sector_shift;
438 sector_div(x, (u32)conf->hash_spacing); 437 sector_div(x, (u32)conf->spacing);
439 zone = conf->hash_table[x]; 438 zone = conf->hash_table[x];
440 } 439 }
441 440
442 while (block >= (zone->zone_offset + zone->size)) 441 while (sector >= zone->zone_start + zone->sectors)
443 zone++; 442 zone++;
444 443
445 sect_in_chunk = bio->bi_sector & ((chunk_size<<1) -1); 444 sect_in_chunk = bio->bi_sector & (chunk_sects - 1);
446 445
447 446
448 { 447 {
449 sector_t x = (block - zone->zone_offset) >> chunksize_bits; 448 sector_t x = (sector - zone->zone_start) >> chunksect_bits;
450 449
451 sector_div(x, zone->nb_dev); 450 sector_div(x, zone->nb_dev);
452 chunk = x; 451 chunk = x;
453 452
454 x = block >> chunksize_bits; 453 x = sector >> chunksect_bits;
455 tmp_dev = zone->dev[sector_div(x, zone->nb_dev)]; 454 tmp_dev = zone->dev[sector_div(x, zone->nb_dev)];
456 } 455 }
457 rsect = (((chunk << chunksize_bits) + zone->dev_offset)<<1) 456 rsect = (chunk << chunksect_bits) + zone->dev_start + sect_in_chunk;
458 + sect_in_chunk;
459 457
460 bio->bi_bdev = tmp_dev->bdev; 458 bio->bi_bdev = tmp_dev->bdev;
461 bio->bi_sector = rsect + tmp_dev->data_offset; 459 bio->bi_sector = rsect + tmp_dev->data_offset;
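
With the 1 KiB block variable gone, the remap in raid0_make_request() runs entirely in sectors: hash to a zone, locate the chunk and the member device, then compute the member-relative sector. A compilable userspace sketch of that arithmetic for a single zone (the names mirror the kernel code, but zone_start and dev_start are fixed at zero, chunk_sects is assumed a power of two, and plain C division replaces sector_div(), so this is an illustration rather than the driver logic verbatim):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t sector = 10000;     /* logical array sector */
            unsigned chunk_sects = 128;  /* 64 KiB chunk, in sectors */
            unsigned chunksect_bits = __builtin_ctz(chunk_sects);
            unsigned nb_dev = 3;         /* devices in this zone */
            uint64_t zone_start = 0, dev_start = 0;

            /* offset within the chunk */
            unsigned sect_in_chunk = sector & (chunk_sects - 1);
            /* which stripe-chunk on the target device */
            uint64_t chunk = ((sector - zone_start) >> chunksect_bits) / nb_dev;
            /* which member device the chunk lands on */
            unsigned dev = (unsigned)((sector >> chunksect_bits) % nb_dev);
            /* member-relative sector, as assigned to bio->bi_sector */
            uint64_t rsect = (chunk << chunksect_bits) + dev_start + sect_in_chunk;

            printf("array sector %llu -> device %u, sector %llu\n",
                   (unsigned long long)sector, dev,
                   (unsigned long long)rsect);
            return 0;
    }
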
@@ -467,7 +465,7 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
467 465
468bad_map: 466bad_map:
469 printk("raid0_make_request bug: can't convert block across chunks" 467 printk("raid0_make_request bug: can't convert block across chunks"
470 " or bigger than %dk %llu %d\n", chunk_size, 468 " or bigger than %dk %llu %d\n", chunk_sects / 2,
471 (unsigned long long)bio->bi_sector, bio->bi_size >> 10); 469 (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
472 470
473 bio_io_error(bio); 471 bio_io_error(bio);
@@ -492,10 +490,10 @@ static void raid0_status (struct seq_file *seq, mddev_t *mddev)
492 seq_printf(seq, "%s/", bdevname( 490 seq_printf(seq, "%s/", bdevname(
493 conf->strip_zone[j].dev[k]->bdev,b)); 491 conf->strip_zone[j].dev[k]->bdev,b));
494 492
495 seq_printf(seq, "] zo=%d do=%d s=%d\n", 493 seq_printf(seq, "] zs=%d ds=%d s=%d\n",
496 conf->strip_zone[j].zone_offset, 494 conf->strip_zone[j].zone_start,
497 conf->strip_zone[j].dev_offset, 495 conf->strip_zone[j].dev_start,
498 conf->strip_zone[j].size); 496 conf->strip_zone[j].sectors);
499 } 497 }
500#endif 498#endif
501 seq_printf(seq, " %dk chunks", mddev->chunk_size/1024); 499 seq_printf(seq, " %dk chunks", mddev->chunk_size/1024);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 9c788e2489b1..7b4f5f7155d8 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1016,12 +1016,16 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1016 * else mark the drive as failed 1016 * else mark the drive as failed
1017 */ 1017 */
1018 if (test_bit(In_sync, &rdev->flags) 1018 if (test_bit(In_sync, &rdev->flags)
1019 && (conf->raid_disks - mddev->degraded) == 1) 1019 && (conf->raid_disks - mddev->degraded) == 1) {
1020 /* 1020 /*
1021 * Don't fail the drive, act as though we were just a 1021 * Don't fail the drive, act as though we were just a
1022 * normal single drive 1022 * normal single drive.
1023 * However don't try a recovery from this drive as
1024 * it is very likely to fail.
1023 */ 1025 */
1026 mddev->recovery_disabled = 1;
1024 return; 1027 return;
1028 }
1025 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1029 if (test_and_clear_bit(In_sync, &rdev->flags)) {
1026 unsigned long flags; 1030 unsigned long flags;
1027 spin_lock_irqsave(&conf->device_lock, flags); 1031 spin_lock_irqsave(&conf->device_lock, flags);
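
This is the consumer of the mddev->recovery_disabled flag tested in the md.c hunk of remove_and_add_spares() earlier in this patch: when the failing drive is the last working leg of the mirror, raid1 keeps running on it as if it were a plain disk, and the flag stops md from immediately starting a recovery onto a spare, since a full read of the suspect drive would very likely fail the same way.
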
@@ -1919,7 +1923,6 @@ static int run(mddev_t *mddev)
1919 int i, j, disk_idx; 1923 int i, j, disk_idx;
1920 mirror_info_t *disk; 1924 mirror_info_t *disk;
1921 mdk_rdev_t *rdev; 1925 mdk_rdev_t *rdev;
1922 struct list_head *tmp;
1923 1926
1924 if (mddev->level != 1) { 1927 if (mddev->level != 1) {
1925 printk("raid1: %s: raid level not set to mirroring (%d)\n", 1928 printk("raid1: %s: raid level not set to mirroring (%d)\n",
@@ -1964,7 +1967,7 @@ static int run(mddev_t *mddev)
1964 spin_lock_init(&conf->device_lock); 1967 spin_lock_init(&conf->device_lock);
1965 mddev->queue->queue_lock = &conf->device_lock; 1968 mddev->queue->queue_lock = &conf->device_lock;
1966 1969
1967 rdev_for_each(rdev, tmp, mddev) { 1970 list_for_each_entry(rdev, &mddev->disks, same_set) {
1968 disk_idx = rdev->raid_disk; 1971 disk_idx = rdev->raid_disk;
1969 if (disk_idx >= mddev->raid_disks 1972 if (disk_idx >= mddev->raid_disks
1970 || disk_idx < 0) 1973 || disk_idx < 0)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 970a96ef9b18..6736d6dff981 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2025,7 +2025,6 @@ static int run(mddev_t *mddev)
2025 int i, disk_idx; 2025 int i, disk_idx;
2026 mirror_info_t *disk; 2026 mirror_info_t *disk;
2027 mdk_rdev_t *rdev; 2027 mdk_rdev_t *rdev;
2028 struct list_head *tmp;
2029 int nc, fc, fo; 2028 int nc, fc, fo;
2030 sector_t stride, size; 2029 sector_t stride, size;
2031 2030
@@ -2108,7 +2107,7 @@ static int run(mddev_t *mddev)
2108 spin_lock_init(&conf->device_lock); 2107 spin_lock_init(&conf->device_lock);
2109 mddev->queue->queue_lock = &conf->device_lock; 2108 mddev->queue->queue_lock = &conf->device_lock;
2110 2109
2111 rdev_for_each(rdev, tmp, mddev) { 2110 list_for_each_entry(rdev, &mddev->disks, same_set) {
2112 disk_idx = rdev->raid_disk; 2111 disk_idx = rdev->raid_disk;
2113 if (disk_idx >= mddev->raid_disks 2112 if (disk_idx >= mddev->raid_disks
2114 || disk_idx < 0) 2113 || disk_idx < 0)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a36a7435edf5..a5ba080d303b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3998,7 +3998,6 @@ static int run(mddev_t *mddev)
3998 int raid_disk, memory; 3998 int raid_disk, memory;
3999 mdk_rdev_t *rdev; 3999 mdk_rdev_t *rdev;
4000 struct disk_info *disk; 4000 struct disk_info *disk;
4001 struct list_head *tmp;
4002 int working_disks = 0; 4001 int working_disks = 0;
4003 4002
4004 if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) { 4003 if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) {
@@ -4108,7 +4107,7 @@ static int run(mddev_t *mddev)
4108 4107
4109 pr_debug("raid5: run(%s) called.\n", mdname(mddev)); 4108 pr_debug("raid5: run(%s) called.\n", mdname(mddev));
4110 4109
4111 rdev_for_each(rdev, tmp, mddev) { 4110 list_for_each_entry(rdev, &mddev->disks, same_set) {
4112 raid_disk = rdev->raid_disk; 4111 raid_disk = rdev->raid_disk;
4113 if (raid_disk >= conf->raid_disks 4112 if (raid_disk >= conf->raid_disks
4114 || raid_disk < 0) 4113 || raid_disk < 0)
@@ -4533,7 +4532,6 @@ static int raid5_start_reshape(mddev_t *mddev)
4533{ 4532{
4534 raid5_conf_t *conf = mddev_to_conf(mddev); 4533 raid5_conf_t *conf = mddev_to_conf(mddev);
4535 mdk_rdev_t *rdev; 4534 mdk_rdev_t *rdev;
4536 struct list_head *rtmp;
4537 int spares = 0; 4535 int spares = 0;
4538 int added_devices = 0; 4536 int added_devices = 0;
4539 unsigned long flags; 4537 unsigned long flags;
@@ -4541,7 +4539,7 @@ static int raid5_start_reshape(mddev_t *mddev)
4541 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4539 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4542 return -EBUSY; 4540 return -EBUSY;
4543 4541
4544 rdev_for_each(rdev, rtmp, mddev) 4542 list_for_each_entry(rdev, &mddev->disks, same_set)
4545 if (rdev->raid_disk < 0 && 4543 if (rdev->raid_disk < 0 &&
4546 !test_bit(Faulty, &rdev->flags)) 4544 !test_bit(Faulty, &rdev->flags))
4547 spares++; 4545 spares++;
@@ -4563,7 +4561,7 @@ static int raid5_start_reshape(mddev_t *mddev)
4563 /* Add some new drives, as many as will fit. 4561 /* Add some new drives, as many as will fit.
4564 * We know there are enough to make the newly sized array work. 4562 * We know there are enough to make the newly sized array work.
4565 */ 4563 */
4566 rdev_for_each(rdev, rtmp, mddev) 4564 list_for_each_entry(rdev, &mddev->disks, same_set)
4567 if (rdev->raid_disk < 0 && 4565 if (rdev->raid_disk < 0 &&
4568 !test_bit(Faulty, &rdev->flags)) { 4566 !test_bit(Faulty, &rdev->flags)) {
4569 if (raid5_add_disk(mddev, rdev) == 0) { 4567 if (raid5_add_disk(mddev, rdev) == 0) {